In [58]:
import argparse
from pathlib import Path
import pandas as pd
import os
import math
import numpy as np
from random import shuffle
from NTK import kernel_value, kernel_value_batch
from resampling import NestedCV, BaseModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

In [40]:
DEFAULT_DATASET_PATH = Path("/data/pfizer_tx/tasks_all_clr/all_clr_train_LUAD_stage.h5")
keys = ['/expression', '/labels']
test_data = {key : pd.read_hdf(DEFAULT_DATASET_PATH, key = key) for key in keys}

In [41]:
X_tx = test_data['/expression'].values
Y_tx = test_data['/labels'].values
X_tx.shape, Y_tx.shape

((542, 57992), (542,))

In [51]:
MAX_DEPTH = 5 
C_LIST = [10.0 ** i for i in range(-2, 5)] # hyperparameter for NTK
n_classes = len(set(Y_tx)) # n classes
n_features = X_tx.shape[1] # n features

In [52]:
def svm(K1, K2, y1, y2, C, c):
    n_val, n_train = K2.shape
    clf = SVC(kernel = "precomputed", C = C, cache_size = 100000, probability=True)
    clf.fit(K1, y1)
    y_hat = clf.predict_proba(K2)[:,1]
    return roc_auc_score(y2, y_hat)

In [53]:
# calculate NTK
Ks = kernel_value_batch(X_tx, MAX_DEPTH)

In [54]:
idxs = [e for e in range(len(Y_tx))]
shuffle(idxs)
train_fold, val_fold = idxs[:350], idxs[350:]
y = Y_tx

I think train_fold and val_fold are indices for these folds of data

In [68]:
# load training and validation set
best_auc = 0.0
best_value = 0
best_depth = 0
best_ker = 0

# enumerate kernels and cost values to find the best hyperparameters
for depth in range(MAX_DEPTH):
    for fix_depth in range(depth + 1):
        K = Ks[depth][fix_depth]
        for c in C_LIST:
            auc = svm(K[train_fold][:, train_fold], K[val_fold][:, train_fold], y[train_fold], y[val_fold], c, n_classes)
            if auc > best_auc:
                best_auc = auc
                best_c = c
                best_depth = depth
                best_fix = fix_depth

K = Ks[best_depth][best_fix]

print ("best auc:", best_auc, "\tC:", best_c, "\tdepth:", best_depth, "\tfix:", best_fix)

best auc: 0.6926274944567627 	C: 0.01 	depth: 24 	fix: 21


In [69]:
svm(K[train_fold][:, train_fold], K[val_fold][:, train_fold], y[train_fold], y[val_fold], c, n_classes)

0.6720620842572063

In [90]:
class NTK(BaseModel):
    def __init__(self, params):
        super().__init__()
        self.params = params
    def fit(self, X, y):
        Ks = kernel_value_batch(X, self.params['max_depth'])
        self.best_auc = 0.0
        self.best_depth = 0
        self.best_fix = 0
        self.clf = SVC(kernel = "precomputed", C = self.params['C'], cache_size = 100000, probability=True)
        for depth in range(self.params['max_depth']):
            for fix_depth in range(depth + 1):
                K = Ks[depth][fix_depth]
                self.clf.fit(K, y)
                y_hat = self.clf.predict_proba(K)[:,1]
                auc = roc_auc_score(y, y_hat)
                if auc > best_auc:
                    self.best_auc = auc
                    self.best_depth = depth
                    self.best_fix = fix_depth
        # fit the best model
        print ("Best AUC:", self.best_auc, "\tDepth:", self.best_depth, "\tFix:", self.best_fix)
        K = Ks[self.best_depth][self.best_fix]
        self.clf.fit(K, y)

    def predict_proba(self,X):
        Ks = kernel_value_batch(X, self.params['max_depth'])
        K = Ks[self.best_depth][self.best_fix]
        y_hat = self.clf.predict_proba(K)[:,1]
        return y_hat

In [91]:
params = {"max_depth" : 5, "C" : 10}

In [92]:
model = NTK(params)

In [93]:
model.fit(X_tx, Y_tx)

Best AUC: 1.0 	Depth: 4 	Fix: 4


In [94]:
y_hat = model.predict_proba(X_tx)

In [95]:
roc_auc_score(Y_tx, y_hat)

1.0