In [17]:
import os
from pathlib import Path
from time import time
from tqdm import tqdm
from importlib import reload

from fire import Fire
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

import resampling
from utilities import hdf_keys

In [22]:
DEFAULT_DATASET_PATH = Path("/data/pfizer_tx/tasks_all_clr/all_clr_train_LUAD_stage.h5")
keys = hdf_keys(DEFAULT_DATASET_PATH)
test_data = {key : pd.read_hdf(DEFAULT_DATASET_PATH, key = key) for key in keys}
HPARAMS = dict()
HPARAMS['k'] = [1]

In [23]:
class KNN(resampling.BaseModel):
    def __init__(self, params):
        super().__init__()
        self.params = params
        print(params['k'])
        self.model = KNeighborsClassifier(n_neighbors=self.params['k'], n_jobs=-1)
    def fit(self, X, y):
        self.model.fit(X, y)
    def predict_proba(self,X):
        return self.model.predict_proba(X)

In [26]:
reload(resampling)

<module 'resampling' from '/home/ubuntu/tx-metalearning/models/resampling.py'>

In [27]:
nestedCV = resampling.NestedCV(KNN, HPARAMS, n_splits_outer=2, n_splits_inner=2)
performance, params = nestedCV.train(test_data['/expression'], test_data['/labels'])

Outer fold 1 of 2
Inner fold 1 of 2
1
Fitting model with params: {'k': 1}
Inner fold 2 of 2
1
Fitting model with params: {'k': 1}
Best params: {'k': 1}, training final model
1
Outer fold 2 of 2
Inner fold 1 of 2
1
Fitting model with params: {'k': 1}
Inner fold 2 of 2
1
Fitting model with params: {'k': 1}
Best params: {'k': 1}, training final model
1
Total time taken: 14.469674825668335
Mean performance across 2 outer splits: 0.5137707120465741


In [29]:
params

[{'k': 1}, {'k': 1}]

In [8]:
def knn_model(params):
        return KNN(n_neighbors=params['k'], n_jobs=-1)

TypeError: __init__() missing 1 required positional argument: 'params'

In [4]:
def cross_val_loop(n_splits_outer=5, n_splits_inner=5, dataset_path=DEFAULT_DATASET_PATH, hparams=DEFAULT_HPARAMS):
    """Sends a single data set (stored as an h5 file by Pandas) to a nested CV loop.  
    A KNN model is trained on multiple hyperparameters in the inner loop.  
    Unbiased performance is assessed in the outer loop.  
    Output is saved to a file named "<dataset_path>_knn.csv" in the local folder ./results/  
    
    Parameters
    ----------
    n_splits_outer : int
    n_splits_inner : int
    dataset_path : str or Path
    hparams : dict
    
    """
    results_dir =  Path("./results/l2/")
    results_dir.mkdir(parents=True, exist_ok=True)
    results = []
    best_params = []
    dataset_path = Path(dataset_path)
    results_path = Path(results_dir/f"{dataset_path.stem}_knn.csv")
    print(f"Training on {dataset_path}")
    keys = hdf_keys(dataset_path)
    test_data = {key : pd.read_hdf(dataset_path, key = key) for key in keys}
    def knn_model(params):
        return KNN(n_neighbors=params['k'], n_jobs=-1)
    nestedCV = NestedCV(knn_model, hparams, n_splits_outer, n_splits_inner)
    performance, params = nestedCV.train(test_data['/expression'], test_data['/labels'])
    results = pd.DataFrame([performance, params]).transpose()
    results.columns = ["auc", "params"]
    results.to_csv(results_path, index=False)