In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import py_stringmatching as sm
import osqp
from collections import Counter
from scipy.sparse import csc_matrix

from Local_packages.kernels import compute_kernel_matrix, gaussian_kernel, normalize
from Local_packages.run import KernelMethod
from Local_packages.optimizer import KLR_solver, SVM_solver
from scipy.optimize import minimize

# 1 - Load Data

In [2]:
# Load the sequence tables: one DataFrame per dataset, first CSV column used as index
Xtr0, Xtr1, Xtr2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/Xtr0.csv', 'data/Xtr1.csv', 'data/Xtr2.csv')
)
Xte0, Xte1, Xte2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/Xte0.csv', 'data/Xte1.csv', 'data/Xte2.csv')
)

# Stack train rows followed by test rows, with a fresh 0..n-1 index
Xtr0_Xte0, Xtr1_Xte1, Xtr2_Xte2 = (
    pd.concat([train_df, test_df], ignore_index=True)
    for train_df, test_df in ((Xtr0, Xte0), (Xtr1, Xte1), (Xtr2, Xte2))
)

# Load the labels and convert the 'Bound' column to -1, 1 (2*Bound - 1)
Ytr0, Ytr1, Ytr2 = (
    2*pd.read_csv(csv_path, index_col=0)['Bound'].values - 1
    for csv_path in ('data/Ytr0.csv', 'data/Ytr1.csv', 'data/Ytr2.csv')
)

In [3]:
# Load the matrix representation of the sequences (space-separated, no header row)
Xtr0_mat100, Xtr1_mat100, Xtr2_mat100 = (
    pd.read_csv(csv_path, header=None, sep=' ').values
    for csv_path in ('data/Xtr0_mat100.csv', 'data/Xtr1_mat100.csv', 'data/Xtr2_mat100.csv')
)
Xte0_mat100, Xte1_mat100, Xte2_mat100 = (
    pd.read_csv(csv_path, header=None, sep=' ').values
    for csv_path in ('data/Xte0_mat100.csv', 'data/Xte1_mat100.csv', 'data/Xte2_mat100.csv')
)

# Train rows followed by test rows, per dataset
Xtr0_Xte0_mat100, Xtr1_Xte1_mat100, Xtr2_Xte2_mat100 = (
    np.concatenate([train_mat, test_mat], axis=0)
    for train_mat, test_mat in (
        (Xtr0_mat100, Xte0_mat100),
        (Xtr1_mat100, Xte1_mat100),
        (Xtr2_mat100, Xte2_mat100),
    )
)

# 2 - Compute Kernel Matrix

In [None]:
def gkm_kernel(seq1, seq2, k=3, gap=1):
    """Count the distinct gapped k-mers shared by two sequences.

    A gapped k-mer starting at position ``i`` is the character ``seq[i]``
    followed by the ``k - 1`` characters that come after skipping ``gap``
    positions, so each one is read from a window of ``k + gap`` characters.

    Args:
        seq1: First sequence (string).
        seq2: Second sequence (string).
        k: Number of characters kept in each gapped k-mer.
        gap: Number of positions skipped after the first character.

    Returns:
        int: Size of the intersection of the two sets of gapped k-mers.
        Sequences shorter than ``k + gap`` contribute an empty set.
    """
    span = k + gap  # length of the window each gapped k-mer is read from

    def gapped_kmer_set(seq):
        return {
            seq[start] + seq[start + gap + 1 : start + span]
            for start in range(len(seq) - span + 1)
        }

    return len(gapped_kmer_set(seq1) & gapped_kmer_set(seq2))

def compute_row(i, X_left, X_right, k, gap):
    """Return row ``i`` of the gapped k-mer kernel matrix as a list.

    Entry ``j`` is ``gkm_kernel(X_left[i], X_right[j], k, gap)``.
    """
    row = []
    for j in range(len(X_right)):
        row.append(gkm_kernel(X_left[i], X_right[j], k, gap))
    return row

def compute_gkm_kernel_matrix(X_left, X_right, k=3, gap=1, n_jobs=-1):
    """Build the gapped k-mer kernel matrix between two sequence collections.

    Rows are computed independently with joblib (one ``compute_row`` task per
    element of ``X_left``), with a tqdm progress bar over the row indices.

    Args:
        X_left: Indexable collection of sequences (rows of the result).
        X_right: Indexable collection of sequences (columns of the result).
        k: Forwarded to ``compute_row``.
        gap: Forwarded to ``compute_row``.
        n_jobs: Forwarded to ``joblib.Parallel``.

    Returns:
        np.ndarray built from the list of rows.
    """
    row_tasks = (
        delayed(compute_row)(row_idx, X_left, X_right, k, gap)
        for row_idx in tqdm(range(len(X_left)))
    )
    rows = Parallel(n_jobs=n_jobs)(row_tasks)
    return np.array(rows)

# Example usage
# NOTE(review): X_left / X_right are not referenced by name in the visible cells;
# the next cell rebuilds the same lists inline.
X_left = Xtr0_Xte0['seq'].to_list()
X_right = Xtr0_Xte0['seq'].to_list()

In [4]:
# Gapped k-mer kernel (k=8, gap=5) on the train+test sequences of each dataset,
# each matrix then passed through `normalize`.
seqs_0 = Xtr0_Xte0['seq'].to_list()
kernel_matrix_0 = compute_gkm_kernel_matrix(seqs_0, seqs_0, k=8, gap=5)
K_0 = normalize(kernel_matrix_0)

seqs_1 = Xtr1_Xte1['seq'].to_list()
kernel_matrix_1 = compute_gkm_kernel_matrix(seqs_1, seqs_1, k=8, gap=5)
K_1 = normalize(kernel_matrix_1)

seqs_2 = Xtr2_Xte2['seq'].to_list()
kernel_matrix_2 = compute_gkm_kernel_matrix(seqs_2, seqs_2, k=8, gap=5)
K_2 = normalize(kernel_matrix_2)

100%|██████████| 3000/3000 [00:42<00:00, 71.11it/s]
100%|██████████| 3000/3000 [00:45<00:00, 65.77it/s]
100%|██████████| 3000/3000 [00:48<00:00, 62.21it/s]


In [8]:
# Name of the kernel to compute; `args` holds the hyper-parameters forwarded
# to compute_kernel_matrix as keyword arguments.
kernel = 'mis_sub'

#Gaussian Kernel - On the matrix representation of the sequences
if kernel=='exp':
    args = {'sigma': 0.13}
#Smith-Waterman Local Alignment Score
elif kernel=='sw':
    args = {'sw': sm.SmithWaterman()}
#Spectrum Kernel
elif kernel=='spect':
    args = {'k': 5}
#Mismatch Kernel
elif kernel=='mismatch':
    args = {'k': 10, 'm': 2}
# 'mis_sub' variant: same hyper-parameters as 'mismatch'
elif kernel=='mis_sub':
    args = {'k': 10, 'm': 2}
#LA Kernel
elif kernel=='LA':
    args = {'beta': 0.5, 'd': 11, 'e': 1}
elif kernel=='LA_gpu':
    args = {'beta': 0.5, 'd': 1, 'e': 0.5}
else:
    # Fail loudly: otherwise `args` would be undefined, or silently keep the
    # value left in the kernel namespace by a previous run of this cell.
    raise ValueError(f"Unknown kernel name: {kernel!r}")

In [None]:
# Kernel matrices for the `kernel` / `args` selected above.
# NOTE(review): K_0 is train x train while K_1 and K_2 are train x test --
# presumably this cell was re-run with different argument pairs to produce
# each saved file; confirm before relying on Restart & Run All.
K_0 = compute_kernel_matrix(Xtr0, Xtr0, kernel, **args)
K_1 = compute_kernel_matrix(Xtr1, Xte1, kernel, **args)
K_2 = compute_kernel_matrix(Xtr2, Xte2, kernel, **args)

#K_0 = compute_kernel_matrix(Xtr0_Xte0, Xtr0_Xte0, kernel, **args)
#K_1 = compute_kernel_matrix(Xtr1_Xte1, Xtr1_Xte1, kernel, **args)
#K_2 = compute_kernel_matrix(Xtr2_Xte2, Xtr2_Xte2, kernel, **args)

In [None]:
# Persist the matrices computed above; K_1 and K_2 are transposed before saving.
# NOTE(review): the file names hard-code "mismatch_10-2" and a tr/te suffix per
# dataset, independent of the `kernel` / `args` currently selected -- keep in sync by hand.
np.save('features/K_0_tr_mismatch_10-2.npy', K_0)
np.save('features/K_1_te_mismatch_10-2.npy', K_1.T)
np.save('features/K_2_te_mismatch_10-2.npy', K_2.T)

In [None]:
#Compute only the diagonal of the kernel matrix for the test set
def compute_kernel_for_row(i, Xte, kernel, args):
    """Kernel of the ``i``-th row of ``Xte`` with itself.

    Selects row ``i`` as a one-row DataFrame on both sides and forwards it to
    ``compute_kernel_matrix`` together with ``kernel`` and ``**args``.
    """
    left_row = Xte.iloc[[i]]
    right_row = Xte.iloc[[i]]
    return compute_kernel_matrix(left_row, right_row, kernel, **args)

#K_te_0 = np.concatenate(Parallel(n_jobs=-1)(delayed(compute_kernel_for_row)(i, Xte0, kernel, args) for i in tqdm(range(len(Xte0)))), axis=0)

# One self-kernel block per test row, computed in parallel and stacked along axis 0
diag_blocks_1 = Parallel(n_jobs=-1)(
    delayed(compute_kernel_for_row)(i, Xte1, kernel, args)
    for i in tqdm(range(len(Xte1)))
)
K_te_1 = np.concatenate(diag_blocks_1, axis=0)

diag_blocks_2 = Parallel(n_jobs=-1)(
    delayed(compute_kernel_for_row)(i, Xte2, kernel, args)
    for i in tqdm(range(len(Xte2)))
)
K_te_2 = np.concatenate(diag_blocks_2, axis=0)

In [12]:
#np.save('features/K_0_tr_mismatch_10-2.npy', K_0)
# Save the per-test-point self-kernel values for datasets 1 and 2
# (dataset 0 is not saved here because its computation is commented out above).
np.save('features/K_1_te_diag_mismatch_10-2.npy', K_te_1)
np.save('features/K_2_te_diag_mismatch_10-2.npy', K_te_2)

## 2.1 - Merge sub-kernels into one kernel

In [19]:
# Load the kernel matrices on Xtr x Xtr
K_tr_0 = np.load('features/K_0_tr_mismatch_10-2.npy')
K_tr_1 = np.load('features/K_1_tr_mismatch_10-2.npy')
K_tr_2 = np.load('features/K_2_tr_mismatch_10-2.npy')
# Load the kernel vector on {Xte_i, Xte_i}_i
K_te_0 = np.load('features/K_0_te_diag_mismatch_10-2.npy')
K_te_1 = np.load('features/K_1_te_diag_mismatch_10-2.npy')
K_te_2 = np.load('features/K_2_te_diag_mismatch_10-2.npy')
# Concatenate the kernel vector on {Xtr_i, Xtr_i}_i and {Xte_i, Xte_i}_i to get the diagonal of the whole kernel matrix K
K_diag_0 = np.concatenate([np.diag(K_tr_0),K_te_0.flatten()], axis=0)
K_diag_1 = np.concatenate([np.diag(K_tr_1),K_te_1.flatten()], axis=0)
K_diag_2 = np.concatenate([np.diag(K_tr_2),K_te_2.flatten()], axis=0)
# Load the kernel matrices on Xte x Xtr
K_tr_te_0 = np.load('features/K_0_te_mismatch_10-2.npy')
K_tr_te_1 = np.load('features/K_1_te_mismatch_10-2.npy')
K_tr_te_2 = np.load('features/K_2_te_mismatch_10-2.npy')
# Concatenate the kernel matrices on Xtr x Xtr and Xte x Xtr to get the whole kernel matrix K on (Xtr U Xte) x Xtr
K_0 = np.concatenate([K_tr_0, K_tr_te_0], axis=0)
K_1 = np.concatenate([K_tr_1, K_tr_te_1], axis=0)
K_2 = np.concatenate([K_tr_2, K_tr_te_2], axis=0)
# Normalize the kernel matrix K: K[i, j] / sqrt(K_diag[i] * K_diag[j]).
# Rows span Xtr U Xte but columns span Xtr only, so the right-hand factor is the
# leading train-sized block of D. Each dataset uses its OWN train size (the
# previous version sliced all three with K_tr_0.shape[0]).
n_tr_0 = K_tr_0.shape[0]
n_tr_1 = K_tr_1.shape[0]
n_tr_2 = K_tr_2.shape[0]
D_0 = np.diag(1/np.sqrt(K_diag_0))
D_1 = np.diag(1/np.sqrt(K_diag_1))
D_2 = np.diag(1/np.sqrt(K_diag_2))
K_0 = np.dot(np.dot(D_0, K_0), D_0[:n_tr_0, :n_tr_0])
K_1 = np.dot(np.dot(D_1, K_1), D_1[:n_tr_1, :n_tr_1])
K_2 = np.dot(np.dot(D_2, K_2), D_2[:n_tr_2, :n_tr_2])

In [24]:
# Save the normalized (Xtr U Xte) x Xtr kernels; these are the files read back
# by the "Load kernels" cell for version '10-2'.
np.save('features/K_0_mismatch_10-2.npy', K_0)
np.save('features/K_1_mismatch_10-2.npy', K_1)
np.save('features/K_2_mismatch_10-2.npy', K_2)

## 2.2 - Load kernels

In [3]:
# Kernel versions to load; each tag appears verbatim in the file name
kernel_versions = ['5-1', '5-2', '6-1', '6-2', '7-1', '7-2', '8-1', '8-2', '9-1', '9-2', '10-2']

# One {version: matrix} dictionary per dataset
K_0_dict = {version: np.load(f'features/K_0_mismatch_{version}.npy') for version in kernel_versions}
K_1_dict = {version: np.load(f'features/K_1_mismatch_{version}.npy') for version in kernel_versions}
K_2_dict = {version: np.load(f'features/K_2_mismatch_{version}.npy') for version in kernel_versions}

# Load the additional kernel matrix
#K_0_dict['exp-0_1'] = np.load('features/K_0_exp-0_1.npy')
#K_1_dict['exp-0_1'] = np.load('features/K_1_exp-0_1.npy')
#K_2_dict['exp-0_1'] = np.load('features/K_2_exp-0_1.npy')

In [101]:
# Sanity check: max / min of the first column (excluding row 0) for several kernel versions.
# NOTE(review): the second line mixes K_0_dict and K_1_dict while the first uses
# only K_0_dict -- looks like a copy-paste slip; confirm which dataset was intended.
print(K_0_dict['8-1'][1:,0].max(), K_0_dict['8-2'][1:,0].max(), K_0_dict['9-1'][1:,0].max(), K_0_dict['9-2'][1:,0].max(), K_0_dict['10-2'][1:,0].max())
print(K_0_dict['8-1'][1:,0].min(), K_1_dict['8-2'][1:,0].min(), K_1_dict['9-1'][1:,0].min(), K_0_dict['9-2'][1:,0].min(), K_1_dict['10-2'][1:,0].min())

0.50571245 0.68502593 0.454753 0.5665562 0.4964234
0.006316263 0.045187116 0.0 0.034760356 0.0020040865


## 2.3 - Combine kernels

In [79]:
kernel_versions = ['9-2', '10-2']
# Element-wise product of the selected kernels, restricted to the first 2000 columns
K_0_prod_2 = np.stack([K_0_dict[tag][:, :2000] for tag in kernel_versions], axis=0).prod(axis=0)
K_1_prod_2 = np.stack([K_1_dict[tag][:, :2000] for tag in kernel_versions], axis=0).prod(axis=0)
K_2_prod_2 = np.stack([K_2_dict[tag][:, :2000] for tag in kernel_versions], axis=0).prod(axis=0)

In [None]:
### Exponential kernel from concatenation of kernels
# Stack the selected kernels along a new trailing axis
kernel_versions = ['5-2', '6-2', '9-2', '8-2']
K_0_concat = np.stack([K_0_dict[tag][:, :2000] for tag in kernel_versions], axis=-1)
K_1_concat = np.stack([K_1_dict[tag][:, :2000] for tag in kernel_versions], axis=-1)
K_2_concat = np.stack([K_2_dict[tag][:, :2000] for tag in kernel_versions], axis=-1)
# Per-entry Euclidean norm across the stacked kernels
K_0_norm = np.linalg.norm(K_0_concat, axis=-1)
K_1_norm = np.linalg.norm(K_1_concat, axis=-1)
K_2_norm = np.linalg.norm(K_2_concat, axis=-1)
# Exponential of that norm
K_0_exp = np.exp(K_0_norm)
K_1_exp = np.exp(K_1_norm)
K_2_exp = np.exp(K_2_norm)

In [306]:
# Optimal kernel (gives 100% accuracy on the training set)
# Outer product of (Y + 1) / 2 with itself: with labels in {-1, +1}, entry (i, j)
# is 1 when both samples are labelled +1 and 0 otherwise.
K_0_opt = np.outer((Ytr0+1)/2, (Ytr0+1)/2)
K_1_opt = np.outer((Ytr1+1)/2, (Ytr1+1)/2)
K_2_opt = np.outer((Ytr2+1)/2, (Ytr2+1)/2)

In [100]:
def alignment(K_a, K_a_opt):
    """Frobenius alignment between a kernel and a target kernel.

    Computes ``<K, K_opt>_F / sqrt(<K, K>_F * <K_opt, K_opt>_F)`` where ``K`` is
    the leading block of ``K_a`` with the same shape as ``K_a_opt``. Taking the
    block size from ``K_a_opt`` (instead of a hard-coded 2000) lets ``K_a``
    carry extra rows/columns, e.g. test rows stacked below the train block.

    Args:
        K_a: 2-D array whose leading block is compared with the target.
        K_a_opt: 2-D target kernel.

    Returns:
        Scalar alignment score. The result is NaN if either matrix is all
        zeros (division by zero), as with the previous implementation.
    """
    K_a_opt = np.asarray(K_a_opt)
    n_rows, n_cols = K_a_opt.shape
    K_block = np.asarray(K_a)[:n_rows, :n_cols]
    return np.sum(K_block*K_a_opt)/np.sqrt(np.sum(K_block**2)*np.sum(K_a_opt**2))

In [None]:
# Print the alignment of each dataset-2 kernel version with the dataset-2 target kernel
kernel_versions = ['5-1', '5-2', '6-1', '6-2', '7-1', '7-2', '8-1', '8-2', '9-1', '9-2']
for kernel_version in kernel_versions:
    print(alignment(K_2_dict[kernel_version][:,:2000],K_2_opt))

In [134]:
# Combine two product kernels per dataset as a weighted geometric mean plus 1:
# the exponents are the alignment scores normalised to sum to 1.
# NOTE(review): K_*_prod_1 is not defined in any visible cell -- presumably left
# over from an earlier run of the product cell with other versions; this cell
# fails on a fresh kernel until it is defined.

# For dataset 0
w_0_prod_1 = alignment(K_0_prod_1, K_0_opt)
w_0_prod_2 = alignment(K_0_prod_2, K_0_opt)
w_0 = w_0_prod_1 + w_0_prod_2
K_0 = K_0_prod_1**(w_0_prod_1/w_0)*K_0_prod_2**(w_0_prod_2/w_0) + 1

# For dataset 1
w_1_prod_1 = alignment(K_1_prod_1, K_1_opt)
w_1_prod_2 = alignment(K_1_prod_2, K_1_opt)
w_1 = w_1_prod_1 + w_1_prod_2
K_1 = K_1_prod_1**(w_1_prod_1/w_1)*K_1_prod_2**(w_1_prod_2/w_1) + 1

# For dataset 2
w_2_prod_1 = alignment(K_2_prod_1, K_2_opt)
w_2_prod_2 = alignment(K_2_prod_2, K_2_opt)
w_2 = w_2_prod_1 + w_2_prod_2
K_2 = K_2_prod_1**(w_2_prod_1/w_2)*K_2_prod_2**(w_2_prod_2/w_2) + 1

## 2.4 - Local Alignment Kernel: <span style="color:green">TODO / time complexity too high + kernel values too high</span>

The Local Alignment Kernel defined as:
$$K_{LA}^{(\beta)}(x,y) = \sum_{\pi\in\Pi(x,y)} \exp\left(\beta\, s_{S,g}(\pi)\right)$$

is symmetric positive definite.

We assume an affine gap penalty:
$$\left\{\begin{aligned}
&g(0) = 0 \\
&g(n) = d + e(n-1) \quad \text{for } n>0
\end{aligned}\right.$$

where $l(\pi)$ is the length of the alignment $\pi$.

We use the formula for the Local Alignment Kernel:
$$K_{LA}^{(\beta)}(x,y) = 1 + X_2(|x|,|y|)+ Y_2(|x|,|y|) + M(|x|,|y|)$$ 
where $X_2$, $Y_2$ and $M$ are defined recursively.

# 3 - Run Kernel Method on Train Data

## 3.1 - Dataset 0

In [136]:
# Dataset 0 kernel: exp(norm - 2), where the norm is taken per entry across the
# three stacked kernel versions (first 2000 columns of each).
K_0_concat = np.stack([K_0_dict[version][:, :2000] for version in ['8-2', '9-2', '5-1']], axis=-1)
K_0_norm = np.exp(np.linalg.norm(K_0_concat, axis=-1)-2)
#K_0_norm = np.exp(np.linalg.norm(K_0_concat, axis=-1))
K_0 = K_0_norm

In [137]:
K_0[0]

array([0.7649466 , 0.28438875, 0.29345217, ..., 0.23277202, 0.2676232 ,
       0.24363628], dtype=float32)

In [157]:
#Method
# Fit on the 2000 x 2000 leading block of K_0 with a 10% hold-out split (fixed seed);
# the cell output is the value returned by evaluate().
lambd = 5e-5
method_0 = KernelMethod(K_0[:2000, :2000], Ytr0, solver=SVM_solver)
method_0.lambd = lambd
method_0.train_test_split(test_size=0.1, random_state=1000)
method_0.fit()
method_0.evaluate()

(0.9994444444444445, 0.625)

In [128]:
# Repeated validation (200 splits, 10% held out); summary statistics are printed below.
average_accuracy = method_0.validate(test_size=0.1, n_splits=200) 

100%|██████████| 200/200 [00:38<00:00,  5.23it/s]


Average Accuracy: 0.6357250000000001
Min Accuracy: 0.555 Max Accuracy: 0.735


In [158]:
# Same validation call as the previous cell, re-run (its execution count is later).
average_accuracy = method_0.validate(test_size=0.1, n_splits=200) 

100%|██████████| 200/200 [00:39<00:00,  5.06it/s]


Average Accuracy: 0.633675
Min Accuracy: 0.54 Max Accuracy: 0.72


In [None]:
# Register the current K_0 (minus 1) under the key 'special', then stack four
# kernels (each +1) along axis 0 for the ensemble below.
# NOTE(review): this mutates K_0_dict, so the result depends on the K_0 left by
# earlier cells; train_model adds a further +1 to every slice of K_0_s.
K_0_dict['special'] = K_0-1
K_0_s = np.stack([K_0_dict[version][:, :2000]+1 for version in ['special', '10-2', '9-1', '8-1']], axis=0)

def train_model(seed):
    """Fit one SVM-solver ``KernelMethod`` per kernel in ``K_0_s`` on a common split.

    Every kernel slice is offset by +1, cropped to its 2000 x 2000 leading
    block and trained against ``Ytr0`` with ``lambd = 5e-5`` on a 10% hold-out
    split seeded with ``seed``.

    Args:
        seed: ``random_state`` forwarded to ``train_test_split``.

    Returns:
        list of the fitted ``KernelMethod`` objects, in the order of ``K_0_s``.
    """
    lambd = 5e-5

    def _fit_single(K):
        shifted = K[:, :2000] + 1
        method = KernelMethod(shifted[:2000, :2000], Ytr0, solver=SVM_solver)
        method.lambd = lambd
        method.train_test_split(test_size=0.1, random_state=seed)
        method.fit()
        method.evaluate()
        return method

    return [_fit_single(K) for K in K_0_s]

# List of seeds
# Each seed defines one train/test split shared by all kernels inside train_model.
seeds = [1, 2, 3, 4, 5]

# Run in parallel
# all_methods[s] is the list of fitted methods returned for seeds[s].
all_methods = Parallel(n_jobs=-1)(delayed(train_model)(seed) for seed in tqdm(seeds))

def evaluate(models):
    """Majority-vote accuracy of an ensemble on its train and test indices.

    Each model contributes the sign of ``kernel[rows][:, train_indices] . alpha``;
    the ensemble prediction is the sign of the mean of those votes (a tie gives
    0, which matches no +/-1 label). Labels and row indices are taken from
    ``models[0]``, so all models are expected to share the same split.

    Args:
        models: Sequence of fitted objects exposing ``kernel``, ``alpha``, ``Y``,
            ``train_indices`` and ``test_indices``.

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``.
    """
    reference = models[0]

    def _vote_accuracy(rows_attr):
        votes = []
        for model in models:
            rows = getattr(model, rows_attr)
            scores = np.dot(model.kernel[rows][:, model.train_indices], model.alpha)
            votes.append(np.sign(scores))
        majority = np.sign(np.mean(np.array(votes), axis=0))
        return np.mean(majority == reference.Y[getattr(reference, rows_attr)])

    train_accuracy = _vote_accuracy('train_indices')
    test_accuracy = _vote_accuracy('test_indices')
    return train_accuracy, test_accuracy

# One (train_accuracy, test_accuracy) pair per seed, then the mean test accuracy
evaluations = [evaluate(list(methods)) for methods in all_methods]
np.array(evaluations)[:, 1].mean()

100%|██████████| 5/5 [00:00<00:00, 4976.63it/s]


0.5820000000000001

In [None]:
#Grid Search
# 10 log-spaced candidate values between 1e-5 and 1e-3, passed to method_0.grid_search
method_0.grid_search(np.logspace(-5, -3, 10), test_size=0.1, n_folds=10)

In [None]:
# Grid search for pow: exponent applied element-wise to K_0_exp
pow_values = np.linspace(0.01, 0.1, 3)

# Perform grid search
best_pow = None
best_accuracy = -np.inf

# `power` instead of `pow` so the builtin is not shadowed in the notebook namespace
for power in pow_values:
    # Scratch model: keeps the fitted `method_0` from the cells above intact
    candidate = KernelMethod((K_0_exp**power)[:2000][:, :2000], Ytr0, lambd=lambd, solver=SVM_solver)
    candidate.train_test_split(test_size=0.1)
    accuracy = candidate.validate(test_size=0.1, n_splits=10)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pow = power

print(f"Best pow: {best_pow}")
print(f"Best average accuracy: {best_accuracy}")

- BEST AVERAGE ACCURACY : 64.72

In [None]:
# Alternative dataset 0 kernel: per-entry norm across the three stacked versions, plus 1.
# Overwrites the K_0 defined at the start of section 3.1.
K_0_concat = np.stack([K_0_dict[version][:, :2000] for version in ['8-2', '9-2', '5-1']], axis=-1)
K_0_norm = np.linalg.norm(K_0_concat, axis=-1)
K_0 = K_0_norm+1

## 3.2 - Dataset 1

In [264]:
# Dataset 1 kernel: element-wise square of version '9-2' (first 2000 columns)
K_1 = K_1_dict['9-2'][:,:2000]**2 # lambd_2 = 1e-4 ?

In [265]:
K_1

array([[1.        , 0.00653302, 0.02531799, ..., 0.02429392, 0.04268699,
        0.00788511],
       [0.00653302, 1.        , 0.01486757, ..., 0.00698799, 0.00678186,
        0.01230289],
       [0.02531799, 0.01486757, 1.        , ..., 0.02443128, 0.01686409,
        0.0100904 ],
       ...,
       [0.03528198, 0.01087151, 0.01675584, ..., 0.04444053, 0.02422663,
        0.01020658],
       [0.00957606, 0.01095784, 0.02077249, ..., 0.04816468, 0.01277906,
        0.01438896],
       [0.00749621, 0.01976574, 0.01067438, ..., 0.01590941, 0.01205017,
        0.02235378]], dtype=float32)

In [306]:
# Fit on the first 2000 rows of K_1 + 1 with a 10% hold-out split (fixed seed);
# the cell output is the value returned by evaluate().
lambd_1 = 1e-4

method_1 = KernelMethod((K_1+1)[:2000], Ytr1, lambd=lambd_1, solver=SVM_solver)
method_1.train_test_split(test_size=0.1, random_state=10)
method_1.fit()
method_1.evaluate()

(1.0, 0.81)

In [None]:
#Grid Search for lambda
# 10 log-spaced candidate values between 1e-9 and 1e-7
method_1.grid_search(np.logspace(-9, -7, 10), test_size=0.25, n_folds=4)

In [None]:
# Grid search for pow: exponent applied element-wise to (K_1_exp - 1), then +1
pow_values = np.linspace(0.5, 2, 5)

# Perform grid search
best_pow = None
best_accuracy = -np.inf

# `power` instead of `pow` so the builtin is not shadowed in the notebook namespace
for power in pow_values:
    # Scratch model for dataset 1 (previously bound to the misleading name
    # `method_2`); `method_1` and `method_2` are left untouched.
    candidate = KernelMethod(((K_1_exp-1)**power+1)[:2000][:, :2000], Ytr1, lambd=lambd_1, solver=SVM_solver)
    candidate.train_test_split(test_size=0.1)
    accuracy = candidate.validate(test_size=0.1, n_splits=10)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pow = power

print(f"Best pow: {best_pow}")
print(f"Best average accuracy: {best_accuracy}")

In [301]:
# Repeated validation (50 splits, 10% held out); summary statistics are printed below.
average_accuracy = method_1.validate(test_size=0.1, n_splits=50)

100%|██████████| 50/50 [00:04<00:00, 11.63it/s]


Average Accuracy: 0.7844
Min Accuracy: 0.72 Max Accuracy: 0.855


- BEST AVERAGE ACCURACY : 79.05

In [None]:
# Shorter validation run (10 splits, 10% held out)
average_accuracy = method_1.validate(test_size=0.1, n_splits=10)

Average Accuracy: 0.7905
Min Accuracy: 0.755 Max Accuracy: 0.815


## 3.3 - Dataset 2

In [307]:
# Dataset 2 kernel (despite the name `test`): version '9-1' times the square of '9-2', plus 1
test = K_2_dict['9-1'][:,:2000]*K_2_dict['9-2'][:,:2000]**2+1

In [317]:
# Fit on the 2000 x 2000 leading block of `test` with a 10% hold-out split (fixed seed);
# the cell output is the value returned by evaluate().
lambd_2 = 2e-4

method_2 = KernelMethod((test)[:2000, :2000], Ytr2, lambd=lambd_2, solver=SVM_solver)
method_2.train_test_split(test_size=0.1, random_state=10)
method_2.fit()
method_2.evaluate()

(0.9994444444444445, 0.68)

In [None]:
#Grid Search
# 8 log-spaced candidate values between 1e-5 and 10**-3.5
method_2.grid_search(np.logspace(-5, -3.5, 8), test_size=0.1, n_folds=10)

In [None]:
# Grid search for pow: exponent applied element-wise to K_2
pow_values = np.linspace(0.5, 3, 10)

# Perform grid search
best_pow = None
best_accuracy = -np.inf

# `power` instead of `pow` so the builtin is not shadowed in the notebook namespace
for power in pow_values:
    # Scratch model: keeps the fitted `method_2` intact, since the test-set
    # prediction cell reads its alpha and train_indices.
    candidate = KernelMethod((K_2**power)[:2000][:, :2000], Ytr2, lambd=lambd_2, solver=SVM_solver)
    candidate.train_test_split(test_size=0.1)
    accuracy = candidate.validate(test_size=0.1, n_splits=20)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pow = power

print(f"Best pow: {best_pow}")
print(f"Best average accuracy: {best_accuracy}")

In [316]:
# Repeated validation (50 splits, 5% held out); summary statistics are printed below.
average_accuracy = method_2.validate(test_size=0.05, n_splits=50)

100%|██████████| 50/50 [00:05<00:00,  9.17it/s]


Average Accuracy: 0.6972
Min Accuracy: 0.61 Max Accuracy: 0.85


- BEST AVERAGE ACCURACY : 69.94

In [None]:
# NOTE(review): reassigning `test` here does not rebuild method_2, so the
# validation below uses whatever kernel method_2 was constructed with.
test = K_2_dict['9-1'][:,:2000]*K_2_dict['9-2'][:,:2000]**2+1 # lambd_2 = 1e-4 ?
average_accuracy = method_2.validate(test_size=0.1, n_splits=20)

Average Accuracy: 0.69575
Min Accuracy: 0.64 Max Accuracy: 0.75


# 4 - Apply Kernel Predictor on Test Data

In [97]:
def predict_test_labels(K, method):
    """Predict signed labels from a kernel block and a fitted method.

    Args:
        K: Kernel block whose columns line up with the entries of ``method.alpha``.
        method: Fitted object exposing the coefficient vector ``alpha``.

    Returns:
        ``np.sign(K @ method.alpha)``; an entry is 0 where the decision value is exactly 0.
    """
    decision_values = K @ method.alpha
    return np.sign(decision_values)

In [98]:
# NOTE(review): this file name is overwritten in the final cell before anything is written.
Yte_file_name = 'Yte_mismatch_exp_8_9_5.csv'

# Only dataset 2 is predicted here; rows from index 2000 on are used, with
# columns restricted to the training indices of the fitted method.
#Yte0 = predict_test_labels(K_0[2000:][:, method_0.train_indices], method_0)
#Yte1 = predict_test_labels(K_1[2000:][:,method_1.train_indices], method_1)
Yte2 = predict_test_labels(test[2000:][:,method_2.train_indices], method_2)

In [None]:
# Reload an earlier submission file and split its 'Bound' column into three
# blocks (rows 0-999, 1000-1999, 2000+), rescaled with 2*x - 1.
# NOTE(review): run in notebook order, this overwrites the Yte2 predicted in the
# previous cell -- confirm which predictions are meant to be exported.
Yte = pd.read_csv('Yte_mismatch_9_10.csv', index_col=0)
Yte0 = Yte['Bound'].values[:1000]*2-1
Yte1 = Yte['Bound'].values[1000:2000]*2-1
Yte2 = Yte['Bound'].values[2000:]*2-1

In [59]:
# Output file for the submission
Yte_file_name = 'Yte_mismatch_9_10_bias_1.csv'

# Concatenate and add Id column
Yte = np.concatenate([Yte0, Yte1, Yte2])
# (Yte + 1) // 2 maps -1 -> 0 and +1 -> 1 (a 0 prediction also maps to 0)
Yte = pd.DataFrame(data=(Yte + 1) // 2, columns=['Bound'], dtype='int64')
Yte.insert(0, 'Id', Yte.index)

# Save the predictions
Yte.to_csv(Yte_file_name, index=False)