# KNN benchmark

## NB This is a 'scratch' notebook, do not use. 
Instead, run `knn_basline.py` to run on a single dataset or `knn_train_tcga.py` to train on all the tcga classification datasets.

This notebook repeats the KNN benchmark reported in [Smith et al.](<https://www.biorxiv.org/content/10.1101/574723v2>)

In [1]:
from importlib import reload
import os
from pathlib import Path
from time import time
from tqdm import tqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import roc_auc_score, accuracy_score
import torch

import resampling
from utilities import hdf_keys

In [2]:
DATA_PATH = Path("/data/pfizer_tx")

Benchmark classification tasks only. Save filenames for specifc classes of task as .csv files to make life easier later.

In [3]:
# tcga_classification_strings = ['stage', 'grade']
# sra_classification_strings = ['GSE']
# all_tcga_files = [f for f in os.listdir(DATA_PATH/"tasks_all_clr")\
#                             if any([e in f for e in tcga_classification_strings])]
# all_sra_files = [f for f in os.listdir(DATA_PATH/"tasks_all_clr")\
#                             if any([e in f for e in sra_classification_strings])]

In [4]:
# pd.DataFrame(all_tcga_files, columns=["filename"]).to_csv("tcga_classification_tasks.csv", index=False)
# pd.DataFrame(all_sra_files, columns=["filename"]).to_csv("sra_classification_tasks.csv", index=False)

In [5]:
tcga_classification_tasks = pd.read_csv("tcga_classification_tasks.csv")

In [24]:
hparams = dict()
# hparams['k'] = [1, 3, 5, 7, 9]
hparams['k'] = [1, 9]

In [21]:
reload(resampling)

<module 'resampling' from '/home/ubuntu/tx-metalearning/resampling.py'>

In [22]:
tcga_classification_tasks.iloc[0,0]

'all_clr_train_LUAD_stage.h5'

In [25]:
f = tcga_classification_tasks.iloc[0,0]
dataset_path = DATA_PATH/"tasks_all_clr"/f
keys = hdf_keys(dataset_path)
test_data = {key : pd.read_hdf(dataset_path, key = key) for key in keys}
def model(params):
    return KNN(n_neighbors=params['k'], n_jobs=-1)
nestedCV = resampling.NestedCV(model, hparams, 2, 2)
performance, params = nestedCV.train(test_data['/expression'], test_data['/labels'])
final_performance = np.mean(performance)

Outer fold 1 of 2
Inner fold 1 of 2
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 9}
Inner fold 2 of 2
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 9}
Best params: {'k': 9}, training final model
Outer fold 2 of 2
Inner fold 1 of 2
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 9}
Inner fold 2 of 2
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 9}
Best params: {'k': 1}, training final model
Total time taken: 19.68738579750061
Mean performance across 2 outer splits: 0.5436284022490919


In [49]:
results = pd.DataFrame([performance, params]).transpose()
results.columns = ["auc", "params"]

In [50]:
results

Unnamed: 0,auc,params
0,0.604789,{'k': 9}
1,0.482468,{'k': 1}


In [36]:
dataset_path.stem

'all_clr_train_LUAD_stage'

In [8]:
for f in tcga_classification_tasks['filename']:
    print(f"Training on {f}")
    dataset_path = DATA_PATH/"tasks_all_clr"/f
    keys = hdf_keys(dataset_path)
    test_data = {key : pd.read_hdf(dataset_path, key = key) for key in keys}
    def model(params):
        return KNN(n_neighbors=params['k'], n_jobs=-1)
    nestedCV = resampling.NestedCV(model, hparams, 2, 2)
    performance, params = nestedCV.train(test_data['/expression'], test_data['/labels'])
    final_performance = np.mean(performance)
    print(f"Final average model performance: {final_performance}")
    results.append(final_performance)
    best_params.append(params)

Training on all_clr_train_LUAD_stage.h5
Number of inner splits (product of all hparam values): 2
Outer fold 1 of 2
Inner fold 1 of 2
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
Fitting model with params: {'k': 7}
Fitting model with params: {'k': 9}
Inner fold 2 of 2
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
Fitting model with params: {'k': 7}
Fitting model with params: {'k': 9}
Best params: {'k': 7}, training final model
Outer fold 2 of 2
Inner fold 1 of 2
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
Fitting model with params: {'k': 7}
Fitting model with params: {'k': 9}
Inner fold 2 of 2
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
Fitting model with params: {'k': 7}
Fitting model with params: {'k': 9}
Best params: {'k': 1}, training

KeyboardInterrupt: 

In [11]:
best_params[0]

[{'algorithm': 'auto',
  'leaf_size': 30,
  'metric': 'minkowski',
  'metric_params': None,
  'n_jobs': -1,
  'n_neighbors': 7,
  'p': 2,
  'weights': 'uniform'},
 {'algorithm': 'auto',
  'leaf_size': 30,
  'metric': 'minkowski',
  'metric_params': None,
  'n_jobs': -1,
  'n_neighbors': 1,
  'p': 2,
  'weights': 'uniform'}]

In [None]:
pd.DataFrame(results, columns=["results"]).to_csv("test.csv", index=False)

Training on all_clr_train_LUAD_stage.h5
Number of inner splits (product of all hparam values): 5
Outer fold 1 of 5
Inner fold 1 of 5
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
Fitting model with params: {'k': 7}
Fitting model with params: {'k': 9}
Inner fold 2 of 5
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
Fitting model with params: {'k': 7}
Fitting model with params: {'k': 9}
Inner fold 3 of 5
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
Fitting model with params: {'k': 7}
Fitting model with params: {'k': 9}
Inner fold 4 of 5
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
Fitting model with params: {'k': 7}
Fitting model with params: {'k': 9}
Inner fold 5 of 5
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fit