# KNN benchmark

This notebook repeats the KNN benchmark reported in Smith et al.

In [64]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import roc_auc_score, accuracy_score
import torch
from utilities import hdf_keys

In [2]:
# Root directory holding the benchmark task files.
DATA_PATH = Path("/data/pfizer_tx")
# Candidate neighbour counts (odd values 1..9); matches the search used in the original paper.
k = list(range(1, 10, 2))

Only the classification tasks are benchmarked; the task files are selected by keywords in their file names.

In [23]:
# Substrings that mark a task file as a classification task.
classification_strings = ['stage', 'grade', 'GSE']
# Keep every file in the task directory whose name contains at least one of those substrings.
# NOTE(review): os.listdir returns entries in arbitrary order, so the position of a given
# file in this list (e.g. index 0) is not guaranteed to be stable across machines.
all_classification_files = [
    fname
    for fname in os.listdir(DATA_PATH / "tasks_all_clr")
    if any(tag in fname for tag in classification_strings)
]

Work up a cross-validated classifier on a trial dataset: `all_clr_train_LUAD_stage.h5`

In [24]:
# Display the first matching file name; the next cell loads this file as the trial dataset.
all_classification_files[0]

'all_clr_train_LUAD_stage.h5'

In [25]:
# Path of the trial dataset: the first classification task file found above.
test_data_path = DATA_PATH / "tasks_all_clr" / all_classification_files[0]
# Fixed: the original passed the undefined name `test_DATA_PATH` here, which raises
# NameError on a fresh kernel. The path variable defined on the line above is what is meant.
keys = hdf_keys(test_data_path)
print(keys)
# Read each stored object into a dict keyed by its HDF key (e.g. '/expression', '/labels').
test_data = {key: pd.read_hdf(test_data_path, key=key) for key in keys}

['/expression', '/labels']


In [26]:
# Two identically configured 5-fold splitters (no shuffling requested).
# Presumably `outer` and `inner` are the two levels of a nested cross-validation — confirm.
outer, inner = KFold(n_splits=5), KFold(n_splits=5)

In [40]:
# Trial KNN classifier for the single-fold walkthrough in the cells after this one.
# NOTE(review): n_neighbors=10 is not one of the values in the `k` search grid
# ([1, 3, 5, 7, 9]); presumably a placeholder for this trial run — confirm.
knn = KNN(
    n_jobs=-1,  # per the scikit-learn docs, -1 means use all available processors
    n_neighbors=10,
)

In [41]:
# Skeleton of the inner cross-validation loop: nothing is done per fold yet.
# After the loop, `train` and `valid` hold the index arrays of the last fold.
for train, valid in inner.split(test_data["/expression"]):
    continue

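`cross_val_score` is probably the easiest way to run the outer loop. Note that its first argument must be an estimator, so the inner loop (a placeholder in the sketch below) has to be wrapped in one, e.g. `GridSearchCV`.  
<https://scikit-learn.org/stable/modules/cross_validation.html>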
```python
from sklearn.model_selection import cross_val_score
ilf = our_inner_loop_function
scores = cross_val_score(ilf, X, y, cv=5)
scores
# array([0.96..., 1.  ..., 0.96..., 0.96..., 1.        ])
```

In [42]:
# Take only the first fold of the inner splitter: `t` = train indices, `v` = validation indices.
# `split` already yields folds lazily, so it can be advanced with `next` directly.
t, v = next(inner.split(test_data["/expression"]))

In [43]:
# Slice features and labels positionally with the first fold's index arrays.
x_train = test_data["/expression"].iloc[t]
y_train = test_data["/labels"].iloc[t]
x_valid = test_data["/expression"].iloc[v]
y_valid = test_data["/labels"].iloc[v]

In [44]:
# Fit the trial classifier on the training fold.
# NOTE(review): per the scikit-learn docs `fit` returns the estimator itself, so
# `fitted_model1` should be the same object as `knn` — confirm.
fitted_model1 = knn.fit(X=x_train, y=y_train)

In [59]:
# Class-probability estimates for the training fold, then the validation fold.
train_prob, valid_prob = (
    fitted_model1.predict_proba(features) for features in (x_train, x_valid)
)

In [67]:
# Accuracy and ROC AUC on the training and validation folds.
#
# The columns of `predict_proba` follow the order of `fitted_model1.classes_`, so the
# arg-max column index is mapped back to the actual class label before it is compared
# with the true labels. Using the raw column index as the prediction (as before) is only
# correct when the labels happen to be exactly 0..n_classes-1; for such labels the result
# here is unchanged.
train_pred = fitted_model1.classes_[np.argmax(train_prob, axis=1)]
valid_pred = fitted_model1.classes_[np.argmax(valid_prob, axis=1)]
train_acc = accuracy_score(y_true=y_train, y_pred=train_pred)
valid_acc = accuracy_score(y_true=y_valid, y_pred=valid_pred)
# Column 1 is the probability of `classes_[1]`; this assumes a binary task —
# TODO confirm before running on tasks with more than two classes.
train_auc = roc_auc_score(y_true=y_train, y_score=train_prob[:, 1])
valid_auc = roc_auc_score(y_true=y_valid, y_score=valid_prob[:, 1])

In [70]:
# Report accuracy on the first line and ROC AUC on the second.
print(
    f"Train acc: {train_acc}, valid acc: {valid_acc}\n"
    f"Train_auc: {train_auc}, valid_auc: {valid_auc}"
)

Train acc: 0.6651270207852193, valid acc: 0.5504587155963303
Train_auc: 0.7845793374019181, valid_auc: 0.5645706558485464
