# KNN benchmark

This notebook reproduces the k-nearest-neighbours (KNN) benchmark reported in [Smith et al.](<https://www.biorxiv.org/content/10.1101/574723v2>).

In [1]:
from collections import OrderedDict
import os
from pathlib import Path
from time import time
from tqdm import tqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import roc_auc_score, accuracy_score
import torch

from resampling import NestedCV
from utilities import hdf_keys

In [2]:
# Root directory of the benchmark data; the per-task files are read from
# DATA_PATH/"tasks_all_clr" by the benchmark loop further down.
DATA_PATH = Path("/data/pfizer_tx")

Only classification tasks are benchmarked. The filenames for specific classes of task are saved as .csv files so that the task lists can simply be reloaded later.

In [3]:
# tcga_classification_strings = ['stage', 'grade']
# sra_classification_strings = ['GSE']
# all_tcga_files = [f for f in os.listdir(DATA_PATH/"tasks_all_clr")\
#                             if any([e in f for e in tcga_classification_strings])]
# all_sra_files = [f for f in os.listdir(DATA_PATH/"tasks_all_clr")\
#                             if any([e in f for e in sra_classification_strings])]

In [4]:
# pd.DataFrame(all_tcga_files, columns=["filename"]).to_csv("tcga_classification_tasks.csv", index=False)
# pd.DataFrame(all_sra_files, columns=["filename"]).to_csv("sra_classification_tasks.csv", index=False)

In [10]:
# Load the list of TCGA classification task files; the benchmark loop iterates
# over its "filename" column (path is relative to the working directory).
tcga_classification_tasks = pd.read_csv("tcga_classification_tasks.csv")

In [16]:
# Accumulators filled by the benchmark loop: one entry is appended per task file.
results, best_params = [], []

# Hyperparameter grid: 'k' is passed to KNN as n_neighbors.
hparams = {'k': [1, 3, 5, 7, 9]}

In [None]:
def model(params):
    """Build an unfitted KNN classifier for one hyperparameter setting.

    Parameters
    ----------
    params : dict
        Must contain 'k', which is used as the number of neighbours.

    Returns
    -------
    KNeighborsClassifier
        A new classifier; n_jobs=-1 lets scikit-learn use all available cores.
    """
    return KNN(n_neighbors=params['k'], n_jobs=-1)


# Start from empty accumulators so that re-running this cell does not append a
# second copy of every task's result to lists left over from an earlier run.
results = []
best_params = []

for f in tcga_classification_tasks['filename']:
    print(f"Training on {f}")
    dataset_path = DATA_PATH/"tasks_all_clr"/f
    # Read every table in the file into a dict keyed by its HDF key
    # (hdf_keys presumably lists the keys stored in the file -- TODO confirm).
    keys = hdf_keys(dataset_path)
    # NOTE(review): despite its name, `test_data` is the full dataset handed to
    # nested CV; the name is kept because later cells may refer to it.
    test_data = {key : pd.read_hdf(dataset_path, key = key) for key in keys}
    nestedCV = NestedCV(model, hparams)
    performance, params = nestedCV.train(test_data['/expression'], test_data['/labels'])
    # Collapse the scores returned by NestedCV.train into a single number
    # (presumably one score per outer fold -- TODO confirm).
    final_performance = np.mean(performance)
    print(f"Final average model performance: {final_performance}")
    results.append(final_performance)
    best_params.append(params)

# One row per task, in the same order as tcga_classification_tasks['filename'];
# best_params is kept in memory only and is not written here.
pd.DataFrame(results, columns=["results"]).to_csv("tcga_classification_tasks_results.csv", index=False)

Training on all_clr_train_LUAD_stage.h5
Number of inner splits (product of all hparam values): 5
Fold 1
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
Fitting model with params: {'k': 7}
Fitting model with params: {'k': 9}
Best params: {'k': 5}, training final model
Fold 2
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
Fitting model with params: {'k': 7}
Fitting model with params: {'k': 9}
Best params: {'k': 3}, training final model
Fold 3
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
Fitting model with params: {'k': 7}
Fitting model with params: {'k': 9}
Best params: {'k': 7}, training final model
Fold 4
Fitting model with params: {'k': 1}
Fitting model with params: {'k': 3}
Fitting model with params: {'k': 5}
