In [1]:
import argparse
from pathlib import Path
import pandas as pd
import os
import math
import numpy as np
from random import shuffle
import NTK

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Absolute path of the HDF5 dataset read below.
# NOTE(review): hard-coded, machine-specific path — consider making it configurable.
DEFAULT_DATASET_PATH = Path("/users/lindsayedwards/data/pfizer_tx/tasks_all_clr/all_clr_train_LUAD_stage.h5")
# HDF5 keys to load from the file.
keys = ['/expression', '/labels']
# Map each key to the pandas object stored under it.
test_data = dict(
    (key, pd.read_hdf(DEFAULT_DATASET_PATH, key = key))
    for key in keys
)

In [3]:
# Extract the underlying NumPy arrays: the expression matrix and the labels.
X_tx, Y_tx = (test_data[name].values for name in ('/expression', '/labels'))
# Show both shapes as the cell output.
X_tx.shape, Y_tx.shape

((542, 57992), (542,))

In [4]:
# Depth passed to NTK.kernel_value_batch; also bounds the depth search.
MAX_DEPTH = 5
# Candidate values for the SVM cost parameter C: 1e-2, 1e-1, ..., 1e4.
C_LIST = [10.0 ** exponent for exponent in range(-2, 5)]
# Number of distinct label values.
n_classes = len(set(Y_tx))
# Number of columns in the expression matrix.
n_features = X_tx.shape[1]

In [5]:
def svm(K1, K2, y1, y2, C, c):
    """Fit a precomputed-kernel SVM and return its accuracy on held-out data.

    Parameters
    ----------
    K1 : kernel matrix between the training samples, used to fit the SVM.
    K2 : 2-D kernel matrix of shape (n_val, n_train) between the held-out
        samples and the training samples, used for prediction.
    y1 : labels of the training samples.
    y2 : labels of the held-out samples.
    C : SVM cost (regularisation) parameter.
    c : unused by this function; kept so existing call sites (which pass the
        number of classes) keep working.

    Returns
    -------
    Fraction of held-out samples whose predicted label equals ``y2``.
    """
    # Unpacking also enforces that K2 is two-dimensional; the column count
    # itself is not needed.
    n_val, _ = K2.shape
    # NOTE(review): scikit-learn documents cache_size in MB, so this asks for
    # roughly 100 GB of kernel cache — confirm this is intended.
    classifier = SVC(kernel = "precomputed", C = C, cache_size = 100000)
    classifier.fit(K1, y1)
    predictions = classifier.predict(K2)
    n_correct = np.sum(predictions == y2)
    return 1.0 * n_correct / n_val

In [6]:
# X = np.random.randint(0, 200, size = [100, 58000])

In [7]:
# Compute the NTK values for the samples in X_tx using the project-local NTK
# module. The result is indexed later as Ks[depth][fix_depth], and each such
# entry is sliced by sample indices and fed to the SVM as a precomputed kernel.
# NOTE(review): presumably each entry is an (n_samples, n_samples) matrix —
# confirm against NTK.kernel_value_batch.
Ks = NTK.kernel_value_batch(X_tx, MAX_DEPTH)

In [8]:
# Randomly split the sample indices into a training fold and a validation fold.
# A dedicated, seeded generator makes the split — and therefore every accuracy
# computed from it — reproducible across kernel restarts.
SPLIT_SEED = 0
# Number of samples in the training fold; the remainder form the validation fold.
N_TRAIN = 350
# .tolist() keeps idxs (and the folds) plain Python lists of ints, as before.
idxs = np.random.RandomState(SPLIT_SEED).permutation(len(Y_tx)).tolist()
train_fold, val_fold = idxs[:N_TRAIN], idxs[N_TRAIN:]
# Short alias for the label array, used when scoring the SVM.
y = Y_tx

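`train_fold` and `val_fold` hold the row indices of the samples in each fold: the first 350 shuffled indices form the training fold and the remaining indices form the validation fold.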

In [9]:
# Grid-search the kernel (depth, fix_depth) and the SVM cost C, scoring every
# combination by its accuracy on the validation fold.
best_acc = 0.0
best_value = 0  # not used in this cell; kept in case other cells reference it
best_depth = 0
best_ker = 0  # not used in this cell; kept in case other cells reference it
# Initialise the remaining "best" trackers so the kernel lookup and print below
# cannot raise NameError when no combination scores above 0.0.
best_c = None
best_fix = 0

# enumerate kernels and cost values to find the best hyperparameters
for depth in range(MAX_DEPTH):
    for fix_depth in range(depth + 1):
        K = Ks[depth][fix_depth]
        # The kernel slices do not depend on C, so build them once per kernel.
        K_train = K[train_fold][:, train_fold]
        K_val = K[val_fold][:, train_fold]
        # NOTE: after the loops `c` is just the last value tried; the selected
        # cost is `best_c`.
        for c in C_LIST:
            acc = svm(K_train, K_val, y[train_fold], y[val_fold], c, n_classes)
            # Strict '>' keeps the first combination that reaches the best score.
            if acc > best_acc:
                best_acc = acc
                best_c = c
                best_depth = depth
                best_fix = fix_depth

# Kernel of the winning (depth, fix_depth) pair, reused by later cells.
K = Ks[best_depth][best_fix]

print ("best acc:", best_acc, "\tC:", best_c, "\tdepth:", best_depth, "\tfix:", best_fix)

best acc: 0.6145833333333334 	C: 0.01 	depth: 3 	fix: 0


In [10]:
# Re-score the selected kernel K with the selected cost. `best_c` is used rather
# than `c`, which after the search loop is only the last value in C_LIST.
svm(K[train_fold][:, train_fold], K[val_fold][:, train_fold], y[train_fold], y[val_fold], best_c, n_classes)

0.6145833333333334