In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import py_stringmatching as sm
import osqp
from collections import Counter
from scipy.sparse import csc_matrix

from Local_packages.kernels import compute_kernel_matrix, gaussian_kernel, normalize
from Local_packages.run import KernelMethod, KernelMethodBias
from Local_packages.optimizer import KLR_solver, SVM_solver, SVM_solver_with_bias
from scipy.optimize import minimize

# 1 - Load Data

In [2]:
# Raw sequences: three train/test pairs, read in the order train 0-2 then test 0-2.
# The first CSV column is used as the index of each frame.
(Xtr0, Xtr1, Xtr2, Xte0, Xte1, Xte2) = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/Xtr0.csv', 'data/Xtr1.csv', 'data/Xtr2.csv',
        'data/Xte0.csv', 'data/Xte1.csv', 'data/Xte2.csv',
    )
)

# Train rows first, then test rows, re-indexed 0..n-1 so that kernel matrices
# computed on these frames have the training block in their leading rows/columns.
Xtr0_Xte0, Xtr1_Xte1, Xtr2_Xte2 = (
    pd.concat([train_part, test_part], ignore_index=True)
    for train_part, test_part in ((Xtr0, Xte0), (Xtr1, Xte1), (Xtr2, Xte2))
)

# Labels: the 'Bound' column is mapped to -1 / +1 via 2*y - 1.
Ytr0, Ytr1, Ytr2 = (
    2 * pd.read_csv(csv_path, index_col=0)['Bound'].values - 1
    for csv_path in ('data/Ytr0.csv', 'data/Ytr1.csv', 'data/Ytr2.csv')
)

In [3]:
# Numeric ("mat100") representation of the sequences: space-separated, no header row.
(Xtr0_mat100, Xtr1_mat100, Xtr2_mat100,
 Xte0_mat100, Xte1_mat100, Xte2_mat100) = (
    pd.read_csv(mat_path, header=None, sep=' ').values
    for mat_path in (
        'data/Xtr0_mat100.csv', 'data/Xtr1_mat100.csv', 'data/Xtr2_mat100.csv',
        'data/Xte0_mat100.csv', 'data/Xte1_mat100.csv', 'data/Xte2_mat100.csv',
    )
)

# Stack train rows above test rows for each dataset.
Xtr0_Xte0_mat100, Xtr1_Xte1_mat100, Xtr2_Xte2_mat100 = (
    np.concatenate([train_mat, test_mat], axis=0)
    for train_mat, test_mat in (
        (Xtr0_mat100, Xte0_mat100),
        (Xtr1_mat100, Xte1_mat100),
        (Xtr2_mat100, Xte2_mat100),
    )
)

# 2 - Compute Kernel Matrix

In [None]:
def gkm_kernel(seq1, seq2, k=3, gap=1):
    """Gapped k-mer kernel: count the distinct gapped k-mers two sequences share.

    A gapped k-mer anchored at position ``i`` is the character ``seq[i]``
    followed by the ``k - 1`` characters starting at ``i + gap + 1``; the
    ``gap`` characters in between are skipped.

    Returns the size of the intersection of the two sequences' gapped k-mer sets
    (0 when a sequence is shorter than ``k + gap``).
    """
    span = k + gap  # number of sequence positions covered by one gapped k-mer

    def gapped_kmer_set(sequence):
        # One k-mer per valid anchor; duplicates collapse because this is a set.
        return {
            sequence[start] + sequence[start + gap + 1:start + span]
            for start in range(len(sequence) - span + 1)
        }

    shared = gapped_kmer_set(seq1) & gapped_kmer_set(seq2)
    return len(shared)

def compute_row(i, X_left, X_right, k, gap):
    """Return row ``i`` of the kernel matrix as a list.

    Entry ``j`` is ``gkm_kernel(X_left[i], X_right[j], k, gap)``.
    """
    row = []
    for j in range(len(X_right)):
        row.append(gkm_kernel(X_left[i], X_right[j], k, gap))
    return row

def compute_gkm_kernel_matrix(X_left, X_right, k=3, gap=1, n_jobs=-1):
    """Build the gapped k-mer kernel matrix between ``X_left`` and ``X_right``.

    Each row is computed by ``compute_row`` as a separate joblib task; ``n_jobs``
    is forwarded to ``joblib.Parallel``. A tqdm bar tracks the rows as they are
    dispatched. Returns ``np.array`` of the collected rows.
    """
    row_indices = tqdm(range(len(X_left)))
    tasks = (
        delayed(compute_row)(row_idx, X_left, X_right, k, gap)
        for row_idx in row_indices
    )
    rows = Parallel(n_jobs=n_jobs)(tasks)
    return np.array(rows)

# Example usage
# NOTE(review): X_left / X_right are not referenced by any later cell shown in this
# notebook (the next cell rebuilds the lists inline) — candidates for removal.
X_left = Xtr0_Xte0['seq'].to_list()
X_right = Xtr0_Xte0['seq'].to_list()

In [4]:
# Gapped k-mer Gram matrices (k=8, gap=5) on train+test sequences of each dataset,
# followed by normalisation with Local_packages.kernels.normalize.
seqs_0 = Xtr0_Xte0['seq'].to_list()
seqs_1 = Xtr1_Xte1['seq'].to_list()
seqs_2 = Xtr2_Xte2['seq'].to_list()

kernel_matrix_0 = compute_gkm_kernel_matrix(seqs_0, seqs_0, k=8, gap=5)
K_0 = normalize(kernel_matrix_0)

kernel_matrix_1 = compute_gkm_kernel_matrix(seqs_1, seqs_1, k=8, gap=5)
K_1 = normalize(kernel_matrix_1)

kernel_matrix_2 = compute_gkm_kernel_matrix(seqs_2, seqs_2, k=8, gap=5)
K_2 = normalize(kernel_matrix_2)

100%|██████████| 3000/3000 [00:42<00:00, 71.11it/s]
100%|██████████| 3000/3000 [00:45<00:00, 65.77it/s]
100%|██████████| 3000/3000 [00:48<00:00, 62.21it/s]


In [51]:
# Name of the kernel to compute; selects the hyper-parameter set below.
kernel = 'mismatch'

# `args` is forwarded as keyword arguments to compute_kernel_matrix.
#Gaussian Kernel - On the matrix representation of the sequences
if kernel=='exp':
    args = {'sigma': 0.13}
#Smith-Waterman Local Alignment Score
elif kernel=='sw':
    args = {'sw': sm.SmithWaterman()}
#Spectrum Kernel
elif kernel=='spect':
    args = {'k': 5}
#Mismatch Kernel
elif kernel=='mismatch':
    args = {'k': 5, 'm': 0}
elif kernel=='mis_sub':
    args = {'k': 5, 'm': 3}
#LA Kernel
elif kernel=='LA':
    args = {'beta': 0.5, 'd': 11, 'e': 1}
elif kernel=='LA_gpu':
    args = {'beta': 0.5, 'd': 1, 'e': 0.5}
else:
    # Fail loudly: otherwise `args` would be undefined, or worse, silently keep
    # the value left over from a previous run of this cell.
    raise ValueError(f"Unknown kernel name: {kernel!r}")

In [52]:
# Gram matrix of the selected kernel on (train + test) x (train + test) for each dataset.
# `kernel` and `args` come from the configuration cell above.
K_0 = compute_kernel_matrix(Xtr0_Xte0, Xtr0_Xte0, kernel, **args)
K_1 = compute_kernel_matrix(Xtr1_Xte1, Xtr1_Xte1, kernel, **args)
K_2 = compute_kernel_matrix(Xtr2_Xte2, Xtr2_Xte2, kernel, **args)

Computing feature vectors: 100%|██████████| 3000/3000 [00:00<00:00, 9854.85it/s]
Collecting k-mers: 100%|██████████| 3000/3000 [00:00<00:00, 3746.87it/s]
Building sparse matrix entries: 100%|██████████| 3000/3000 [00:01<00:00, 2864.62it/s]
Computing feature vectors: 100%|██████████| 3000/3000 [00:00<00:00, 9603.00it/s]
Collecting k-mers: 100%|██████████| 3000/3000 [00:00<00:00, 4479.12it/s]
Building sparse matrix entries: 100%|██████████| 3000/3000 [00:00<00:00, 3462.01it/s]
Computing feature vectors: 100%|██████████| 3000/3000 [00:00<00:00, 11092.23it/s]
Collecting k-mers: 100%|██████████| 3000/3000 [00:00<00:00, 5277.98it/s]
Building sparse matrix entries: 100%|██████████| 3000/3000 [00:00<00:00, 3571.70it/s]


In [53]:
# Replace each raw Gram matrix by the output of Local_packages.kernels.normalize.
# NOTE(review): this overwrites K_0/K_1/K_2 in place, so the raw matrices are lost;
# presumably normalize is a cosine normalisation (re-running would then be harmless) — confirm.
K_0 = normalize(K_0)
K_1 = normalize(K_1)
K_2 = normalize(K_2)

In [57]:
# Persist the current K_0/K_1/K_2 (normalised by the previous cell when run in order).
# The "mismatch_5-0" suffix is hard-coded: it has to be kept in sync by hand with the
# `kernel` / `args` values (mismatch, k=5, m=0) chosen in the configuration cell.
np.save('features/K_0_mismatch_5-0.npy', K_0)
np.save('features/K_1_mismatch_5-0.npy', K_1)
np.save('features/K_2_mismatch_5-0.npy', K_2)

In [None]:
# Only the diagonal of the test kernel matrix is needed: k(x_i, x_i) for each test row.
def compute_kernel_for_row(i, Xte, kernel, args):
    """Evaluate the kernel of row ``i`` of ``Xte`` against itself.

    ``args`` is a dict of keyword arguments forwarded to ``compute_kernel_matrix``.
    """
    single_row = Xte.iloc[[i]]  # list indexer keeps a one-row DataFrame
    return compute_kernel_matrix(single_row, single_row, kernel, **args)

# One joblib task per test row; the per-row results are concatenated along axis 0.
# NOTE(review): the dataset-0 line is commented out, so K_te_0 is not produced by this
# cell even though a later cell loads 'features/K_0_te_diag_mismatch_10-2.npy'.
#K_te_0 = np.concatenate(Parallel(n_jobs=-1)(delayed(compute_kernel_for_row)(i, Xte0, kernel, args) for i in tqdm(range(len(Xte0)))), axis=0)
K_te_1 = np.concatenate(Parallel(n_jobs=-1)(delayed(compute_kernel_for_row)(i, Xte1, kernel, args) for i in tqdm(range(len(Xte1)))), axis=0)
K_te_2 = np.concatenate(Parallel(n_jobs=-1)(delayed(compute_kernel_for_row)(i, Xte2, kernel, args) for i in tqdm(range(len(Xte2)))), axis=0)

In [12]:
# Save the test-set diagonals for datasets 1 and 2.
# NOTE(review): the file names say "mismatch_10-2" while the configuration cell shown
# above sets k=5, m=0 — presumably this cell was executed under a different
# configuration; confirm before re-running.
#np.save('features/K_0_tr_mismatch_10-2.npy', K_0)
np.save('features/K_1_te_diag_mismatch_10-2.npy', K_te_1)
np.save('features/K_2_te_diag_mismatch_10-2.npy', K_te_2)

## 2.1 - Merge sub-kernels into one kernel

In [19]:
def _cosine_normalize_block(K_block, K_diag):
    """Cosine-normalise a (train + test) x train kernel block.

    Returns ``K_block[i, j] / sqrt(K_diag[i] * K_diag[j])``, where ``K_diag`` holds
    the self-similarities k(x_i, x_i) of all points (train first, then test) and the
    columns of ``K_block`` correspond to the leading (train) entries of ``K_diag``.
    """
    inv_norm = 1 / np.sqrt(K_diag)
    n_train = K_block.shape[1]  # per-dataset train size, taken from the block itself
    # Broadcasting rescales rows, then columns, without building dense diagonal matrices.
    return (inv_norm[:, None] * K_block) * inv_norm[None, :n_train]


# Load the kernel matrices on Xtr x Xtr
K_tr_0 = np.load('features/K_0_tr_mismatch_10-2.npy')
K_tr_1 = np.load('features/K_1_tr_mismatch_10-2.npy')
K_tr_2 = np.load('features/K_2_tr_mismatch_10-2.npy')
# Load the kernel vector on {Xte_i, Xte_i}_i
K_te_0 = np.load('features/K_0_te_diag_mismatch_10-2.npy')
K_te_1 = np.load('features/K_1_te_diag_mismatch_10-2.npy')
K_te_2 = np.load('features/K_2_te_diag_mismatch_10-2.npy')
# Concatenate the kernel vectors on {Xtr_i, Xtr_i}_i and {Xte_i, Xte_i}_i to get the diagonal of the whole kernel matrix K
K_diag_0 = np.concatenate([np.diag(K_tr_0), K_te_0.flatten()], axis=0)
K_diag_1 = np.concatenate([np.diag(K_tr_1), K_te_1.flatten()], axis=0)
K_diag_2 = np.concatenate([np.diag(K_tr_2), K_te_2.flatten()], axis=0)
# Load the kernel matrices on Xte x Xtr
K_tr_te_0 = np.load('features/K_0_te_mismatch_10-2.npy')
K_tr_te_1 = np.load('features/K_1_te_mismatch_10-2.npy')
K_tr_te_2 = np.load('features/K_2_te_mismatch_10-2.npy')
# Concatenate the kernel matrices on Xtr x Xtr and Xte x Xtr to get the whole kernel matrix K on (Xtr U Xte) x Xtr
K_0 = np.concatenate([K_tr_0, K_tr_te_0], axis=0)
K_1 = np.concatenate([K_tr_1, K_tr_te_1], axis=0)
K_2 = np.concatenate([K_tr_2, K_tr_te_2], axis=0)
# Normalize the kernel matrix K. Each dataset uses its own train size for the column
# scaling (the previous version sliced datasets 1 and 2 with dataset 0's size).
K_0 = _cosine_normalize_block(K_0, K_diag_0)
K_1 = _cosine_normalize_block(K_1, K_diag_1)
K_2 = _cosine_normalize_block(K_2, K_diag_2)

In [24]:
# Save the merged, normalised (train + test) x train matrices built in the previous cell.
# NOTE(review): files with the same naming pattern written by the direct route above
# (e.g. "..._5-0.npy") hold (train + test) x (train + test) matrices instead; later
# cells slice `[:, :2000]`, which presumably makes both layouts interchangeable — confirm.
np.save('features/K_0_mismatch_10-2.npy', K_0)
np.save('features/K_1_mismatch_10-2.npy', K_1)
np.save('features/K_2_mismatch_10-2.npy', K_2)

## 2.2 - Load kernels

In [3]:
# Mismatch-kernel variants available on disk, named "<k>-<m>".
kernel_versions = ['5-1', '5-2', '6-1', '6-2', '7-1', '7-2', '8-1', '8-2', '9-1', '9-2', '10-2', '5-0', '6-0', '7-0', '8-0', '9-0']

# One {version: matrix} dictionary per dataset.
K_0_dict = {version: np.load(f'features/K_0_mismatch_{version}.npy') for version in kernel_versions}
K_1_dict = {version: np.load(f'features/K_1_mismatch_{version}.npy') for version in kernel_versions}
K_2_dict = {version: np.load(f'features/K_2_mismatch_{version}.npy') for version in kernel_versions}

# Load the additional kernel matrix
#K_0_dict['exp-0_1'] = np.load('features/K_0_exp-0_1.npy')
#K_1_dict['exp-0_1'] = np.load('features/K_1_exp-0_1.npy')
#K_2_dict['exp-0_1'] = np.load('features/K_2_exp-0_1.npy')

In [101]:
# Largest / smallest similarity between sample 0 and every other sample (rows 1..end of column 0).
# NOTE(review): the max line reads only K_0_dict, but the min line mixes K_0_dict and
# K_1_dict — looks like a copy-paste slip; confirm which dataset was intended.
print(K_0_dict['8-1'][1:,0].max(), K_0_dict['8-2'][1:,0].max(), K_0_dict['9-1'][1:,0].max(), K_0_dict['9-2'][1:,0].max(), K_0_dict['10-2'][1:,0].max())
print(K_0_dict['8-1'][1:,0].min(), K_1_dict['8-2'][1:,0].min(), K_1_dict['9-1'][1:,0].min(), K_0_dict['9-2'][1:,0].min(), K_1_dict['10-2'][1:,0].min())

0.50571245 0.68502593 0.454753 0.5665562 0.4964234
0.006316263 0.045187116 0.0 0.034760356 0.0020040865


## 2.3 - Combine kernels

In [79]:
# Element-wise product of the selected kernels, restricted to the first 2000 columns.
kernel_versions = ['9-2', '10-2']
K_0_prod_2 = np.stack([K_0_dict[version][:,:2000] for version in kernel_versions], axis=0).prod(axis=0)
K_1_prod_2 = np.stack([K_1_dict[version][:,:2000] for version in kernel_versions], axis=0).prod(axis=0)
K_2_prod_2 = np.stack([K_2_dict[version][:,:2000] for version in kernel_versions], axis=0).prod(axis=0)

In [None]:
### Exponential kernel from concatenation of kernels
kernel_versions = ['5-2', '6-2', '9-2', '8-2']

# Stack the selected kernels along a new trailing axis: entry [i, j, :] collects the
# values of every selected kernel for the pair (i, j).
K_0_concat = np.stack([K_0_dict[version][:, :2000] for version in kernel_versions], axis=-1)
K_1_concat = np.stack([K_1_dict[version][:, :2000] for version in kernel_versions], axis=-1)
K_2_concat = np.stack([K_2_dict[version][:, :2000] for version in kernel_versions], axis=-1)

# Euclidean norm across the stacked kernels for each pair (i, j) ...
K_0_norm = np.linalg.norm(K_0_concat, axis=-1)
K_1_norm = np.linalg.norm(K_1_concat, axis=-1)
K_2_norm = np.linalg.norm(K_2_concat, axis=-1)

# ... and its exponential.
K_0_exp = np.exp(K_0_norm)
K_1_exp = np.exp(K_1_norm)
K_2_exp = np.exp(K_2_norm)

In [306]:
# Optimal kernel (gives 100% accuracy on the training set)
# Labels are mapped back from {-1, +1} to {0, 1}; the outer product is therefore 1
# exactly for pairs of positive samples and 0 everywhere else.
K_0_opt = np.outer((Ytr0+1)/2, (Ytr0+1)/2)
K_1_opt = np.outer((Ytr1+1)/2, (Ytr1+1)/2)
K_2_opt = np.outer((Ytr2+1)/2, (Ytr2+1)/2)

In [100]:
def alignment(K_a, K_a_opt):
    """Kernel-target alignment between a kernel matrix and a target kernel.

    Computes ``<K, K_opt>_F / sqrt(<K, K>_F * <K_opt, K_opt>_F)`` where ``K`` is the
    leading block of ``K_a`` that has the same shape as ``K_a_opt``. This lets the
    caller pass a full (train + test) x train matrix together with a train x train target.

    Parameters
    ----------
    K_a : array-like
        Kernel matrix whose leading block covers the samples of ``K_a_opt``.
    K_a_opt : array-like
        Target kernel matrix.

    Returns
    -------
    float
        The alignment; ``nan`` if either matrix is identically zero.
    """
    K_a = np.asarray(K_a)
    K_a_opt = np.asarray(K_a_opt)
    # Size of the training block comes from the target instead of a hard-coded 2000.
    K_train = K_a[tuple(slice(0, size) for size in K_a_opt.shape)]
    inner = np.sum(K_train * K_a_opt)
    return inner / np.sqrt(np.sum(K_train**2) * np.sum(K_a_opt**2))

In [None]:
# Print the alignment of each listed dataset-2 kernel with the dataset-2 target kernel.
# Note that this rebinds the notebook-level `kernel_versions` list.
kernel_versions = ['5-1', '5-2', '6-1', '6-2', '7-1', '7-2', '8-1', '8-2', '9-1', '9-2']
for kernel_version in kernel_versions:
    print(alignment(K_2_dict[kernel_version][:,:2000],K_2_opt))

In [134]:
def _alignment_weighted_blend(K_first, K_second, K_target):
    """Blend two kernels as a weighted geometric mean, shifted by +1.

    The weights are the alignments of each kernel with ``K_target``, divided by
    their sum. Returns ``(w_first, w_second, w_total, K_blend)``.
    """
    w_first = alignment(K_first, K_target)
    w_second = alignment(K_second, K_target)
    w_total = w_first + w_second
    K_blend = K_first**(w_first/w_total)*K_second**(w_second/w_total) + 1
    return w_first, w_second, w_total, K_blend


# NOTE(review): K_0_prod_1 / K_1_prod_1 / K_2_prod_1 are not defined in any cell shown
# in this notebook (only the *_prod_2 matrices are) — they must come from a deleted or
# edited cell; restore their definition before a fresh run.

# For dataset 0
w_0_prod_1, w_0_prod_2, w_0, K_0 = _alignment_weighted_blend(K_0_prod_1, K_0_prod_2, K_0_opt)

# For dataset 1
w_1_prod_1, w_1_prod_2, w_1, K_1 = _alignment_weighted_blend(K_1_prod_1, K_1_prod_2, K_1_opt)

# For dataset 2
w_2_prod_1, w_2_prod_2, w_2, K_2 = _alignment_weighted_blend(K_2_prod_1, K_2_prod_2, K_2_opt)

## 2.4 - Local Alignment Kernel: <span style="color:green">TODO / time complexity too high + kernel values too large</span>

The Local Alignment Kernel, defined as:
$$K_{LA}^{(\beta)}(x,y) = \sum_{\pi\in\Pi(x,y)} \exp\left(\beta\, s_{S,g}(x,y,\pi)\right)$$

is symmetric positive definite.

We assume an affine gap penalty:
$$\left\{\begin{aligned}
&g(0) = 0 \\
&g(n) = d + e(n-1) \quad \text{for } n>0
\end{aligned}\right.$$

where $n$ is the length of a gap, $d$ is the gap-opening penalty, $e$ is the gap-extension penalty, and $l(\pi)$ is the length of the alignment $\pi$.

We use the formula for the Local Alignment Kernel:
$$K_{LA}^{(\beta)}(x,y) = 1 + X_2(|x|,|y|)+ Y_2(|x|,|y|) + M(|x|,|y|)$$ 
where $X_2$, $Y_2$ and $M$ are defined recursively.

# 3 - Run Kernel Method on Train Data

## 3.1 - Dataset 0

In [47]:
K_0_concat = np.stack([K_0_dict[version][:, :2000] for version in ['10-2', '5-0', '9-2']], axis=0)
K_0_norm = np.exp(np.linalg.norm(K_0_concat, axis=0)-1)
K_0 = K_0_norm+1
K_0.min(), K_0.max()

(1.3682092, 3.079341)

In [146]:
#Method
#K_0 = K_0_dict['10-2'][:, :2000]**1*K_0_dict['5-0'][:, :2000]**1+K_0_dict['10-2'][:, :2000]**2
lambd = 1.1e-1
#method_0 = KernelMethod((K_0)[:2000, :2000], Ytr0, solver=SVM_solver)
# Only the leading 2000 x 2000 block of K_0 (paired with the training labels Ytr0) is used for fitting.
method_0 = KernelMethodBias(K_0[:2000, :2000], Ytr0, solver=SVM_solver_with_bias)
# NOTE(review): lambd is set on the instance after construction here, whereas the
# dataset 1 / 2 cells pass `lambd=` to the constructor — confirm both routes are
# equivalent in KernelMethodBias.
method_0.lambd = lambd
# Hold out 2% of the training samples, with a fixed seed for the split.
method_0.train_test_split(test_size=0.02, random_state=12)
method_0.fit()
# Displays a pair of scores — presumably (train accuracy, held-out accuracy); confirm in Local_packages.run.
method_0.evaluate()

(1.0, 0.675)

In [49]:
average_accuracy = method_0.validate(test_size=0.1, n_splits=10) 

100%|██████████| 10/10 [00:00<00:00, 3103.67it/s]


Average Accuracy: 0.649
Min Accuracy: 0.615 Max Accuracy: 0.69


- BEST AVERAGE ACCURACY : 64.72

In [None]:
K_0_concat = np.stack([K_0_dict[version][:, :2000] for version in ['8-2', '9-2', '5-1']], axis=-1)
K_0_norm = np.linalg.norm(K_0_concat, axis=-1)
K_0 = K_0_norm+1

## 3.2 - Dataset 1

In [654]:
K_1 = K_1_dict['9-2'][:2000,:2000]**2 + K_1_dict['6-1'][:2000,:2000]**2+K_1_dict['5-1'][:2000,:2000]**2

In [364]:
K_1 = K_1_dict['9-2'][:,:2000]**2

0.76, 0.81, 0.705, 0.755, 0.78, 0.835, 0.81, 0.82, 0.78, 0.795, 0.785


In [753]:
K_1 = (K_1_dict['9-2'][:,:2000]*K_1_dict['10-2'][:,:2000]+K_1_dict['6-1'][:,:2000]**2)

In [144]:
lambd_1 = 1e-1
K_1 = K_1_dict['9-2'][:,:2000]**2
#method_1 = KernelMethod((K_1)[:2000, :2000], Ytr1, lambd=lambd_1, solver=SVM_solver)
method_1 = KernelMethodBias(K_1[:2000, :2000], Ytr1,lambd=lambd_1, solver=SVM_solver_with_bias)
method_1.train_test_split(test_size=0.02, random_state=12)
method_1.fit()
method_1.evaluate()

(1.0, 0.8)

In [59]:
# Repeat the fit over several split seeds and report the second value returned by
# evaluate() for each seed, followed by the mean over seeds.
lambd_1 = 1e-1
K_1 = K_1_dict['9-2'][:,:2000]**2
#method_1 = KernelMethod((K_1)[:2000, :2000], Ytr1, lambd=lambd_1, solver=SVM_solver)
method_1 = KernelMethodBias(K_1[:2000, :2000], Ytr1,lambd=lambd_1, solver=SVM_solver_with_bias)
seeds = [1, 10, 42, 50, 100, 12, 15, 20, 25, 30]
val = []
for seed in seeds:
    # A new 90/10 split per seed; the same method object is re-split and re-fitted.
    method_1.train_test_split(test_size=0.1, random_state=seed)
    method_1.fit()
    # evaluate()[1] is presumably the held-out accuracy — confirm in Local_packages.run.
    val.append(method_1.evaluate()[1])
    print(val[-1], end=', ')
print(np.mean(val))

0.76, 0.815, 0.705, 0.755, 0.78, 0.835, 0.81, 0.815, 0.785, 0.795, 0.7855000000000001


In [60]:
average_accuracy = method_1.validate(test_size=0.02, n_splits=100)

100%|██████████| 100/100 [00:17<00:00,  5.87it/s]


Average Accuracy: 0.7912499999999999
Min Accuracy: 0.6 Max Accuracy: 0.95


In [56]:
average_accuracy = method_1.validate(test_size=0.02, n_splits=100)

100%|██████████| 100/100 [00:22<00:00,  4.44it/s]


Average Accuracy: 0.8032499999999999
Min Accuracy: 0.6 Max Accuracy: 0.975


## 3.3 - Dataset 2

In [511]:
K_2_s = []
K_2_s.append(K_2_dict['9-1'][:,:2000]*K_2_dict['9-2'][:,:2000]**2+1)
K_2_s.append(K_2_dict['9-1'][:,:2000]*K_2_dict['9-2'][:,:2000]**2+K_2_dict['5-0'][:,:2000]**3+1)
K_2_s.append(K_2_dict['5-0'][:,:2000]**3+1)
K_2_s.append(K_2_dict['10-2'][:,:2000]**1.5+1)
K_2_s.append(K_2_dict['7-1'][:,:2000]**2+1)

In [482]:
K_2 = K_2_dict['10-2'][:,:2000]**2.5+1
#K_2 = normalize(K_2[:2000])

In [None]:
K_2 = K_2_dict['9-1'][:,:2000]*K_2_dict['9-2'][:,:2000]**2+1

0.69, 0.68, 0.67, 0.68, 0.71, 0.665, 0.66, 0.685, 0.635, 0.765, 0.6839999999999999


In [None]:
K_2 = K_2_dict['9-1'][:,:2000]*K_2_dict['9-2'][:,:2000]**2+K_2_dict['5-0'][:,:2000]**3+1

0.705, 0.68, 0.69, 0.67, 0.74, 0.65, 0.655, 0.66, 0.62, 0.765, 0.6835


In [None]:
K_2 = K_2_dict['5-0'][:,:2000]**3+1

0.675, 0.67, 0.695, 0.68, 0.72, 0.655, 0.665, 0.66, 0.655, 0.74, 0.6815


In [None]:
K_2 = K_2_dict['10-2'][:,:2000]**1.5+1

0.715, 0.665, 0.7, 0.685, 0.72, 0.685, 0.67, 0.665, 0.64, 0.785, 0.693


In [61]:
K_2 = K_2_dict['10-2'][:,:2000]**2+1
lambd_2 = 1e-1
#method_2 = KernelMethod((K_2)[:2000, :2000], Ytr2, lambd=lambd_2, solver=SVM_solver)
method_2 = KernelMethodBias(K_2[:2000, :2000], Ytr2,lambd=lambd_2, solver=SVM_solver_with_bias)
seeds = [1, 10, 42, 50, 100, 12, 15, 20, 25, 30]
val = []
for seed in seeds:
    method_2.train_test_split(test_size=0.1, random_state=seed)
    method_2.fit()
    val.append(method_2.evaluate()[1])
    print(val[-1], end=', ')
print(np.mean(val))

0.68, 0.675, 0.68, 0.675, 0.72, 0.665, 0.665, 0.675, 0.63, 0.78, 0.6845000000000001


In [142]:
K_2 = K_2_dict['10-2'][:,:2000]**1.5
lambd_2 = 1e-1
#method_2 = KernelMethod((K_2)[:2000, :2000], Ytr2, lambd=lambd_2, solver=SVM_solver)
method_2 = KernelMethodBias(K_2[:2000, :2000], Ytr2,lambd=lambd_2, solver=SVM_solver_with_bias)
method_2.train_test_split(test_size=0.02, random_state=12)
method_2.fit()
method_2.evaluate()

(1.0, 0.7)

In [113]:
average_accuracy = method_2.validate(test_size=0.02, n_splits=100)

100%|██████████| 100/100 [00:18<00:00,  5.50it/s]


Average Accuracy: 0.6835
Min Accuracy: 0.525 Max Accuracy: 0.825


In [108]:
average_accuracy = method_2.validate(test_size=0.02, n_splits=100)

100%|██████████| 100/100 [00:20<00:00,  5.00it/s]


Average Accuracy: 0.6900000000000002
Min Accuracy: 0.475 Max Accuracy: 0.85


In [722]:
average_accuracy = method_2.validate(test_size=0.02, n_splits=100)

100%|██████████| 100/100 [00:16<00:00,  6.10it/s]


Average Accuracy: 0.6999999999999998
Min Accuracy: 0.5 Max Accuracy: 0.875


- BEST AVERAGE ACCURACY : 69.94

In [None]:
# NOTE(review): `test` is assigned but never passed to anything. validate() is called
# on the existing method_2, which was constructed from a different K_2 in an earlier
# cell, so the score below presumably does not reflect this candidate kernel — confirm.
test = K_2_dict['9-1'][:,:2000]*K_2_dict['9-2'][:,:2000]**2+1 # lambd_2 = 1e-4 ?
average_accuracy = method_2.validate(test_size=0.1, n_splits=20)

Average Accuracy: 0.69575
Min Accuracy: 0.64 Max Accuracy: 0.75


# 4 - Apply Kernel Predictor on Test Data

In [147]:
def predict_test_labels(K, method):
    """Predict labels for test samples from a fitted kernel method with a bias term.

    Parameters
    ----------
    K : array
        Kernel values between test samples (rows) and the samples the method was
        fitted on (columns, in the order of ``method.train_indices``).
    method : object
        Fitted method exposing ``alpha``, ``b``, ``Y`` and ``train_indices``.

    Returns
    -------
    array
        ``np.sign`` of the decision values ``K @ (alpha * y_train) + b``.
    """
    signed_alpha = method.alpha * method.Y[method.train_indices]
    decision_values = np.dot(K, signed_alpha) + method.b
    return np.sign(decision_values)

In [148]:
# Output file for the submission.
Yte_file_name = 'Yte_mismatches_bias_SVM_2.csv'

# Rows from index 2000 on are taken as the test samples; columns are restricted to the
# samples each method was actually fitted on (its train_indices after the split).
# Relies on K_0 / K_1 / K_2 still being the matrices the methods were built from.
Yte0 = predict_test_labels(K_0[2000:][:, method_0.train_indices], method_0)
Yte1 = predict_test_labels(K_1[2000:][:,method_1.train_indices], method_1)
Yte2 = predict_test_labels(K_2[2000:][:,method_2.train_indices], method_2)

In [149]:
# Concatenate and add Id column
Yte = np.concatenate([Yte0, Yte1, Yte2])
# (y + 1) // 2 maps -1 -> 0 and +1 -> 1 (a 0 returned by np.sign also maps to 0).
Yte = pd.DataFrame(data=(Yte + 1) // 2, columns=['Bound'], dtype='int64')
# The Id column is the row position over the three concatenated test sets.
Yte.insert(0, 'Id', Yte.index)

# Save the predictions
Yte.to_csv(Yte_file_name, index=False)

In [None]:
import pandas as pd
import numpy as np
from Local_packages.run import KernelMethod
from Local_packages.optimizer import SVM_solver


# Load the labels
Ytr0 = pd.read_csv('data/Ytr0.csv', index_col=0)
Ytr1 = pd.read_csv('data/Ytr1.csv', index_col=0)
Ytr2 = pd.read_csv('data/Ytr2.csv', index_col=0)
# Convert the labels to -1, 1
Ytr0 = 2*Ytr0['Bound'].values - 1
Ytr1 = 2*Ytr1['Bound'].values - 1
Ytr2 = 2*Ytr2['Bound'].values - 1

# Pre-computed mismatch kernels, one dictionary per dataset, keyed by "<k>-<m>".
kernel_versions = ['5-1', '5-2', '6-1', '6-2', '7-1', '7-2', '8-1', '8-2', '9-1', '9-2', '10-2']
K_0_dict, K_1_dict, K_2_dict = {}, {}, {}

for version in kernel_versions:
    K_0_dict[version] = np.load(f'features/K_0_mismatch_{version}.npy')
    K_1_dict[version] = np.load(f'features/K_1_mismatch_{version}.npy')
    K_2_dict[version] = np.load(f'features/K_2_mismatch_{version}.npy')

# Every kernel below carries a constant +1 offset. The prediction code of this script
# has no bias term, so the offset must be part of the kernel used for BOTH fitting
# (rows :2000) and prediction (rows 2000:).

### DATASET 0 ###
K_0_concat = np.stack([K_0_dict[version][:, :2000] for version in ['8-2', '9-2', '5-1']], axis=0)
K_0_norm = np.exp(np.linalg.norm(K_0_concat, axis=0)-1)
K_0 = K_0_norm+1

lambd_0 = 1e-5
method_0 = KernelMethod(K_0[:2000, :2000], Ytr0, lambd=lambd_0, solver=SVM_solver)
method_0.train_test_split(test_size=0.1, random_state=42)
method_0.fit()

### DATASET 1 ###
# The +1 is folded into K_1 itself: previously it was added only to the training block,
# so test predictions were computed with a different kernel than the one fitted.
K_1 = K_1_dict['9-2'][:,:2000]**2 + 1

lambd_1 = 1e-4
method_1 = KernelMethod(K_1[:2000, :2000], Ytr1, lambd=lambd_1, solver=SVM_solver)
method_1.train_test_split(test_size=0.1, random_state=10)
method_1.fit()

### DATASET 2 ###
K_2 = K_2_dict['9-1'][:,:2000]*K_2_dict['9-2'][:,:2000]**2+1

lambd_2 = 2e-4
method_2 = KernelMethod((K_2)[:2000, :2000], Ytr2, lambd=lambd_2, solver=SVM_solver)
method_2.train_test_split(test_size=0.1, random_state=10)
method_2.fit()
### Predictions ###
def predict_test_labels(K, method):
    """Predict labels for test samples from a fitted kernel method without bias.

    ``K`` holds kernel values between test samples (rows) and the samples the
    method was fitted on (columns). Returns ``np.sign(K @ method.alpha)``.
    """
    decision_values = np.matmul(K, method.alpha)
    return np.sign(decision_values)

# Rows from index 2000 on are taken as the test samples; columns are restricted to the
# samples each method was fitted on (its train_indices after the split).
Yte0 = predict_test_labels(K_0[2000:][:, method_0.train_indices], method_0)
Yte1 = predict_test_labels(K_1[2000:][:,method_1.train_indices], method_1)
Yte2 = predict_test_labels(K_2[2000:][:,method_2.train_indices], method_2)

Yte_file_name = 'Yte.csv'

# Concatenate and add Id column
Yte = np.concatenate([Yte0, Yte1, Yte2])
# (y + 1) // 2 maps -1 -> 0 and +1 -> 1 (a 0 returned by np.sign also maps to 0).
Yte = pd.DataFrame(data=(Yte + 1) // 2, columns=['Bound'], dtype='int64')
Yte.insert(0, 'Id', Yte.index)

# Save the predictions
Yte.to_csv(Yte_file_name, index=False)