In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import py_stringmatching as sm
import osqp
from collections import Counter
from scipy.sparse import csc_matrix

from Local_packages.kernels import compute_kernel_matrix, gaussian_kernel, normalize
from Local_packages.run import KernelMethod
from Local_packages.optimizer import KLR_solver, SVM_solver
from scipy.optimize import minimize

# 1 - Load Data

In [3]:
# Load the dataset
Xtr0 = pd.read_csv('data/Xtr0.csv', index_col=0)
Xtr1 = pd.read_csv('data/Xtr1.csv', index_col=0)
Xtr2 = pd.read_csv('data/Xtr2.csv',  index_col=0)
Xte0 = pd.read_csv('data/Xte0.csv', index_col=0)
Xte1 = pd.read_csv('data/Xte1.csv', index_col=0)
Xte2 = pd.read_csv('data/Xte2.csv', index_col=0)

Xtr0_Xte0 = pd.concat([Xtr0, Xte0], ignore_index=True)
Xtr1_Xte1 = pd.concat([Xtr1, Xte1], ignore_index=True)
Xtr2_Xte2 = pd.concat([Xtr2, Xte2], ignore_index=True)

# Load the labels
Ytr0 = pd.read_csv('data/Ytr0.csv', index_col=0)
Ytr1 = pd.read_csv('data/Ytr1.csv', index_col=0)
Ytr2 = pd.read_csv('data/Ytr2.csv', index_col=0)
# Convert the labels to -1, 1
Ytr0 = 2*Ytr0['Bound'].values - 1
Ytr1 = 2*Ytr1['Bound'].values - 1
Ytr2 = 2*Ytr2['Bound'].values - 1

In [None]:
# Load the matrix representation of the sequences
Xtr0_mat100 = pd.read_csv('data/Xtr0_mat100.csv', header=None, sep=' ').values
Xtr1_mat100 = pd.read_csv('data/Xtr1_mat100.csv', header=None, sep=' ').values
Xtr2_mat100 = pd.read_csv('data/Xtr2_mat100.csv', header=None, sep=' ').values
Xte0_mat100 = pd.read_csv('data/Xte0_mat100.csv', header=None, sep=' ').values
Xte1_mat100 = pd.read_csv('data/Xte1_mat100.csv', header=None, sep=' ').values
Xte2_mat100 = pd.read_csv('data/Xte2_mat100.csv', header=None, sep=' ').values

Xtr0_Xte0_mat100 = np.concatenate([Xtr0_mat100, Xte0_mat100], axis=0)
Xtr1_Xte1_mat100 = np.concatenate([Xtr1_mat100, Xte1_mat100], axis=0)
Xtr2_Xte2_mat100 = np.concatenate([Xtr2_mat100, Xte2_mat100], axis=0)

# 2 - Compute Kernel Matrix

In [64]:
kernel = 'mismatch'

#Gaussian Kernel - On the matrix representation of the sequences
if kernel=='exp':
    args = {'sigma': 1}
#Smith-Waterman Local Alignment Score
elif kernel=='sw':
    args = {'sw': sm.SmithWaterman()}
#Spectrum Kernel
elif kernel=='spect':
    args = {'k': 5}
#Mismatch Kernel
elif kernel=='mismatch':
    args = {'k': 9, 'm': 1}
elif kernel=='mis_sub':
    args = {'k': 9, 'm': 2}
#LA Kernel
elif kernel=='LA':
    args = {'beta': 0.5, 'd': 11, 'e': 1}
elif kernel=='LA_gpu':
    args = {'beta': 0.5, 'd': 1, 'e': 0.5}

In [65]:
#K_exp_0 = compute_kernel_matrix(Xtr0_mat100, Xtr0_mat100, kernel, **args)
#K_exp_1 = compute_kernel_matrix(Xtr1_mat100, Xtr1_mat100, kernel, **args)
#K_exp_2 = compute_kernel_matrix(Xtr2_mat100, Xtr2_mat100, kernel, **args)

K_0 = compute_kernel_matrix(Xtr0_Xte0, Xtr0_Xte0, kernel, **args)
K_1 = compute_kernel_matrix(Xtr1_Xte1, Xtr1_Xte1, kernel, **args)
K_2 = compute_kernel_matrix(Xtr2_Xte2, Xtr2_Xte2, kernel, **args)

Computing feature vectors: 100%|██████████| 3000/3000 [00:04<00:00, 686.81it/s]
Collecting k-mers: 100%|██████████| 3000/3000 [00:17<00:00, 170.71it/s]
Building sparse matrix entries: 100%|██████████| 3000/3000 [06:16<00:00,  7.98it/s]
Computing feature vectors: 100%|██████████| 3000/3000 [00:04<00:00, 630.40it/s]
Collecting k-mers: 100%|██████████| 3000/3000 [00:16<00:00, 183.99it/s]
Building sparse matrix entries: 100%|██████████| 3000/3000 [04:45<00:00, 10.49it/s]
Computing feature vectors: 100%|██████████| 3000/3000 [00:03<00:00, 765.72it/s]
Collecting k-mers: 100%|██████████| 3000/3000 [00:17<00:00, 171.34it/s]
Building sparse matrix entries: 100%|██████████| 3000/3000 [05:01<00:00,  9.94it/s]


In [66]:
np.save('features/K_0_mismatch_9-1.npy', K_0)
np.save('features/K_1_mismatch_9-1.npy', K_1)
np.save('features/K_2_mismatch_9-1.npy', K_2)

In [None]:
def compute_kernel_for_row(i, Xte, kernel, args):
    return compute_kernel_matrix(Xte.iloc[[i]], Xte.iloc[[i]], kernel, **args)

K_te_0 = np.concatenate(Parallel(n_jobs=-1)(delayed(compute_kernel_for_row)(i, Xte0, kernel, args) for i in tqdm(range(len(Xte0)))), axis=0)
K_te_1 = np.concatenate(Parallel(n_jobs=-1)(delayed(compute_kernel_for_row)(i, Xte1, kernel, args) for i in tqdm(range(len(Xte1)))), axis=0)
K_te_2 = np.concatenate(Parallel(n_jobs=-1)(delayed(compute_kernel_for_row)(i, Xte2, kernel, args) for i in tqdm(range(len(Xte2)))), axis=0)

## 2.1 - Load kernels

In [None]:
# Load the kernel matrices on Xtr x Xtr
K_tr_0 = np.load('features/K_0_tr_mismatch_9-2.npy')
K_tr_1 = np.load('features/K_1_tr_mismatch_9-2.npy')
K_tr_2 = np.load('features/K_2_tr_mismatch_9-2.npy')
# Load the kernel vector on {Xte_i, Xte_i}_i
K_te_0 = np.load('features/K_0_te_diag_mismatch_9-2.npy')
K_te_1 = np.load('features/K_1_te_diag_mismatch_9-2.npy')
K_te_2 = np.load('features/K_2_te_diag_mismatch_9-2.npy')
# Concatenate the kernel vector on {Xtr_i, Xte_i}_i and {Xte_i, Xte_i}_i to get the diagonal of the whole kernel matrix K 
K_diag_0 = np.concatenate([np.diag(K_tr_0),K_te_0.flatten()], axis=0)
K_diag_1 = np.concatenate([np.diag(K_tr_1),K_te_1.flatten()], axis=0)
K_diag_2 = np.concatenate([np.diag(K_tr_2),K_te_2.flatten()], axis=0)
# Load the kernel matrices on Xte x Xtr
K_tr_te_0 = np.load('features/K_0_te_mismatch_9-2.npy')
K_tr_te_1 = np.load('features/K_1_te_mismatch_9-2.npy')
K_tr_te_2 = np.load('features/K_2_te_mismatch_9-2.npy')
# Concatenate the kernel matrices on Xtr x Xtr and Xte x Xtr to get the whole kernel matrix K on (Xtr U Xte) x Xtr
K_0 = np.concatenate([K_tr_0, K_tr_te_0], axis=0)
K_1 = np.concatenate([K_tr_1, K_tr_te_1], axis=0)
K_2 = np.concatenate([K_tr_2, K_tr_te_2], axis=0)
# Normalize the kernel matrix K
D_0 = np.diag(1/np.sqrt(K_diag_0))
D_1 = np.diag(1/np.sqrt(K_diag_1))
D_2 = np.diag(1/np.sqrt(K_diag_2))
K_0 = np.dot(np.dot(D_0, K_0), D_0[:K_tr_0.shape[0]][:,:K_tr_0.shape[0]]) + 1
K_1 = np.dot(np.dot(D_1, K_1), D_1[:K_tr_0.shape[0]][:,:K_tr_0.shape[0]]) + 1
K_2 = np.dot(np.dot(D_2, K_2), D_2[:K_tr_0.shape[0]][:,:K_tr_0.shape[0]]) + 1

In [57]:
np.save('features/K_0_mis_sub_9-2.npy', K_0)
np.save('features/K_1_mis_sub_9-2.npy', K_1)
np.save('features/K_2_mis_sub_9-2.npy', K_2)

## 2.2 - Combine kernels

In [44]:
K_0_9 = np.load('features/K_0_mis_sub_9-2.npy')
K_1_9 = np.load('features/K_1_mis_sub_9-2.npy')
K_2_9 = np.load('features/K_2_mis_sub_9-2.npy')

K_0_8 = np.load('features/K_0_mismatch_8-2.npy')[:,:2000]
K_1_8 = np.load('features/K_1_mismatch_8-2.npy')[:,:2000]
K_2_8 = np.load('features/K_2_mismatch_8-2.npy')[:,:2000]

In [57]:
np.save('features/K_0_mismatch_8-2.npy', K_0_8-1)
np.save('features/K_1_mismatch_8-2.npy', K_1_8-1)
np.save('features/K_2_mismatch_8-2.npy', K_2_8-1)

np.save('features/K_0_mismatch_9-2.npy', K_0_9-1)
np.save('features/K_1_mismatch_9-2.npy', K_1_9-1)
np.save('features/K_2_mismatch_9-2.npy', K_2_9-1)

In [312]:
kernel_versions = ['5-1', '5-2', '6-1', '6-2', '7-1', '7-2', '8-1', '8-2', '9-1', '9-2']
K_0_dict, K_1_dict, K_2_dict = {}, {}, {}

for version in kernel_versions:
    K_0_dict[version] = np.load(f'features/K_0_mismatch_{version}.npy')
    K_1_dict[version] = np.load(f'features/K_1_mismatch_{version}.npy')
    K_2_dict[version] = np.load(f'features/K_2_mismatch_{version}.npy')


In [71]:
print(K_0_dict['5-1'][0,1:].max(), K_0_dict['5-2'][0,1:].max(), K_0_dict['6-1'][1:,0].max(), K_0_dict['6-2'][1:,0].max(), K_0_dict['7-1'][1:,0].max(), K_0_dict['7-2'][1:,0].max(), K_0_dict['8-1'][1:,0].max(), K_0_dict['8-2'][1:,0].max(), K_0_dict['9-1'][1:,0].max(), K_0_dict['9-2'][1:,0].max())
print(K_1_dict['5-1'][0,1:].min(), K_1_dict['5-2'][0,1:].min(), K_1_dict['6-1'][1:,0].min(), K_1_dict['6-2'][1:,0].min(), K_1_dict['7-1'][1:,0].min(), K_1_dict['7-2'][1:,0].min(), K_0_dict['8-1'][1:,0].min(), K_1_dict['8-2'][1:,0].min(), K_1_dict['9-1'][1:,0].min(), K_0_dict['9-2'][1:,0].min())

0.8666736 0.9812609 0.71469575 0.9334555 0.58508676 0.83338505 0.50571245 0.68502593 0.454753 0.5665562
0.13756227 0.41658652 0.044690106 0.24268545 0.00861483 0.13032505 0.006316263 0.045187116 0.0 0.034760356


In [337]:
#Mean of kernels
kernel_versions = ['8-2', '9-2']
K_0_mean = np.mean([K_0_dict[version][:,:2000] for version in kernel_versions], axis=0) + 1
K_1_mean = np.mean([K_1_dict[version][:,:2000] for version in kernel_versions], axis=0) + 1
K_2_mean = np.mean([K_2_dict[version][:,:2000] for version in kernel_versions], axis=0) + 1

In [None]:
#Product of kernels
kernel_versions_1 = ['8-1', '9-1']
K_0_prod_1 = np.prod([K_0_dict[version][:,:2000] for version in kernel_versions_1], axis=0)
K_1_prod_1 = np.prod([K_1_dict[version][:,:2000] for version in kernel_versions_1], axis=0)
K_2_prod_1 = np.prod([K_2_dict[version][:,:2000] for version in kernel_versions_1], axis=0)
kernel_versions_2 = ['8-2', '9-2']
K_0_prod_2 = np.prod([K_0_dict[version][:,:2000] for version in kernel_versions_2], axis=0)
K_1_prod_2 = np.prod([K_1_dict[version][:,:2000] for version in kernel_versions_2], axis=0)
K_2_prod_2 = np.prod([K_2_dict[version][:,:2000] for version in kernel_versions_2], axis=0)

In [437]:
# Optimal kernel (gives 100% accuracy on the training set)
K_0_opt = Ytr0[:,None]*Ytr0[None,:]
K_1_opt = Ytr1[:,None]*Ytr1[None,:]
K_2_opt = Ytr2[:,None]*Ytr2[None,:]

In [438]:
def alignment(K_a, K_a_opt):
    return np.sum(K_a[:2000]*K_a_opt)/np.sqrt(np.sum(K_a[:2000]**2)*np.sum(K_a_opt**2))

In [629]:
# For dataset 0
w_0_prod_1 = alignment(K_0_prod_1, K_0_opt)
w_0_prod_2 = alignment(K_0_prod_2, K_0_opt)
w_0 = w_0_prod_1 + w_0_prod_2
K_0 = K_0_prod_1**(w_0_prod_1/w_0)*K_0_prod_2**(w_0_prod_2/w_0) + 1

# For dataset 1
w_1_prod_1 = alignment(K_1_prod_1, K_1_opt)
w_1_prod_2 = alignment(K_1_prod_2, K_1_opt)
w_1 = w_1_prod_1 + w_1_prod_2
K_1 = K_1_prod_1**(w_1_prod_1/w_1)*K_1_prod_2**(w_1_prod_2/w_1) + 1

# For dataset 2
w_2_prod_1 = alignment(K_2_prod_1, K_2_opt)
w_2_prod_2 = alignment(K_2_prod_2, K_2_opt)
w_2 = w_2_prod_1 + w_2_prod_2
K_2 = K_2_prod_1**(w_2_prod_1/w_2)*K_2_prod_2**(w_2_prod_2/w_2) + 1

In [613]:
K_0 = (K_0_prod_1*K_0_prod_2)**(1/2) + 1
K_1 = (K_1_prod_1*K_1_prod_2)**(1/2) + 1
K_2 = (K_2_prod_1*K_2_prod_2)**(1/2) + 1

## 2.3 - Local Alignment Kernel  : <span style="color:green">TODO / Time Complexity too high + value too high</span>

The Local Alignment Kernel defined as:
$$K_{LA}^{(\beta)}(x,y) = \sum_{\pi\in\Pi(x,y)} s_{S,g}(\pi)$$

is symmetric positive definite.

We assume an affine gap penalty:
$$\left\{\begin{aligned}
&g(0) = 0 \\
&g(n) = d + e(n-1) \quad \text{for } n>0
\end{aligned}\right.$$

where $l(\pi)$ is the length of the alignment $\pi$.

We use the formula for the Local Alignment Kernel:
$$K_{LA}^{(\beta)}(x,y) = 1 + X_2(|x|,|y|)+ Y_2(|x|,|y|) + M(|x|,|y|)$$ 
where $X_2$, $Y_2$ and $M$ are defined recursively.

# 3 - Run Kernel Method on Train Data

In [90]:
# Compute the eigenvalues of K_0
eigenvalues, _ = np.linalg.eigh(K_gaussian)

# Find the smallest eigenvalue
min_eigenvalue = np.min(eigenvalues)

# If the smallest eigenvalue is negative, adjust the diagonal
if min_eigenvalue < 0:
    K_gaussian += np.eye(K_gaussian.shape[0]) * (-min_eigenvalue)

In [78]:
#Parameters
solver=SVM_solver #quad_solver

## 3.1 - Dataset 0

In [668]:
#Method
lambd = 1e-4
method_0 = KernelMethod(K_0[:, :2000], Ytr0, solver=solver)
method_0.lambd = lambd
method_0.train_test_split(test_size=0.1)
method_0.fit()
method_0.evaluate()

(0.9994444444444445, 0.65)

In [None]:
#Grid Search
method_0.grid_search(np.logspace(-5, -3, 10), test_size=0.1)

In [None]:
# Grid search for pow
pow_values = np.linspace(2, 3, 5)

# Perform grid search
best_pow = None
best_accuracy = -np.inf

for pow in pow_values:
    method_2 = KernelMethod((K_0_dict['9-2']**pow)[:2000][:, :2000], Ytr0, lambd=lambd, solver=SVM_solver)
    method_2.train_test_split(test_size=0.1)
    accuracy = method_2.validate(test_size=0.1, n_splits=10)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pow = pow

print(f"Best pow: {best_pow}")
print(f"Best average accuracy: {best_accuracy}")

In [669]:
average_accuracy = method_0.validate(test_size=0.1, n_splits=10)

Average Accuracy: 0.609
Min Accuracy: 0.585 Max Accuracy: 0.65


In [108]:
average_accuracy = method_0.validate(test_size=0.1, n_splits=20)

Average Accuracy: 0.63825
Min Accuracy: 0.57 Max Accuracy: 0.7


## 3.2 - Dataset 1

In [640]:
lambd_1 = 1e-4

method_1 = KernelMethod((K_1)[:2000][:, :2000], Ytr1, lambd=lambd_1, solver=SVM_solver)
method_1.train_test_split(test_size=0.1)
method_1.fit()
method_1.evaluate()

(1.0, 0.845)

In [631]:
#Grid Search for lambda
method_1.grid_search(np.logspace(-5, -3, 10), test_size=0.1)

Lambda: 1e-05, Accuracy: 0.79
Lambda: 1.6681005372000593e-05, Accuracy: 0.80
Lambda: 2.782559402207126e-05, Accuracy: 0.79
Lambda: 4.641588833612782e-05, Accuracy: 0.80
Lambda: 7.742636826811278e-05, Accuracy: 0.79
Lambda: 0.0001291549665014884, Accuracy: 0.79
Lambda: 0.00021544346900318823, Accuracy: 0.79
Lambda: 0.00035938136638046257, Accuracy: 0.79
Lambda: 0.0005994842503189409, Accuracy: 0.74
Lambda: 0.001, Accuracy: 0.71


(4.641588833612782e-05, 80.30000000000001)

In [None]:
# Grid search for pow
pow_values = np.linspace(0.5, 2, 5)

# Perform grid search
best_pow = None
best_accuracy = -np.inf

for pow in pow_values:
    method_2 = KernelMethod(((K_1-1)**pow+1)[:2000][:, :2000], Ytr1, lambd=lambd_1, solver=SVM_solver)
    method_2.train_test_split(test_size=0.1)
    accuracy = method_2.validate(test_size=0.1, n_splits=10)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pow = pow

print(f"Best pow: {best_pow}")
print(f"Best average accuracy: {best_accuracy}")

In [635]:
average_accuracy = method_1.validate(test_size=0.1, n_splits=10)

Average Accuracy: 0.7905
Min Accuracy: 0.755 Max Accuracy: 0.815


## 3.3 - Dataset 2

In [643]:
test = K_2_dict['9-1'][:,:2000]*K_2_dict['9-2'][:,:2000]+1

In [659]:
lambd_2 = 1e-4

method_2 = KernelMethod(test[:2000][:, :2000], Ytr2, lambd=lambd_2, solver=SVM_solver)
method_2.train_test_split(test_size=0.1)
method_2.fit()
method_2.evaluate()

(1.0, 0.69)

In [649]:
#Grid Search
method_2.grid_search(np.logspace(-5, -3, 10), test_size=0.1)

Lambda: 1e-05, Accuracy: 0.68
Lambda: 1.6681005372000593e-05, Accuracy: 0.67
Lambda: 2.782559402207126e-05, Accuracy: 0.68
Lambda: 4.641588833612782e-05, Accuracy: 0.71
Lambda: 7.742636826811278e-05, Accuracy: 0.69
Lambda: 0.0001291549665014884, Accuracy: 0.68
Lambda: 0.00021544346900318823, Accuracy: 0.67
Lambda: 0.00035938136638046257, Accuracy: 0.67
Lambda: 0.0005994842503189409, Accuracy: 0.58
Lambda: 0.001, Accuracy: 0.58


(4.641588833612782e-05, 70.6)

In [None]:
# Grid search for pow
pow_values = np.linspace(0.5, 3, 10)

# Perform grid search
best_pow = None
best_accuracy = -np.inf

for pow in pow_values:
    method_2 = KernelMethod((K_2**pow)[:2000][:, :2000], Ytr2, lambd=lambd_2, solver=SVM_solver)
    method_2.train_test_split(test_size=0.1)
    accuracy = method_2.validate(test_size=0.1, n_splits=20)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pow = pow

print(f"Best pow: {best_pow}")
print(f"Best average accuracy: {best_accuracy}")

In [657]:
average_accuracy = method_2.validate(test_size=0.1, n_splits=10)

Average Accuracy: 0.6880000000000001
Min Accuracy: 0.645 Max Accuracy: 0.735


# 4 - Apply Kernel Predictor on Test Data

In [670]:
def predict_test_labels(K, method):
    K_te = K
    alpha = method.alpha
    # Predictions
    Yte0 = np.sign(K_te @ alpha)
    return Yte0

In [679]:
Yte_file_name = 'Yte_mismatch_9x8-1-2.csv'

Yte0 = predict_test_labels(K_0[2000:][:, method_0.train_indices], method_0)
Yte1 = predict_test_labels(K_1[2000:][:,method_1.train_indices], method_1)
Yte2 = predict_test_labels(test[2000:][:,method_2.train_indices], method_2)

In [680]:
Yte = pd.read_csv('Yte_mismatch_9x8.csv', index_col=0)
Yte0 = Yte['Bound'].values[:1000]*2-1

In [681]:
Yte_file_name = 'Yte_mismatch_9x8-1-2.csv'

# Concatenate and add Id column
Yte = np.concatenate([Yte0, Yte1, Yte2])
Yte = pd.DataFrame(data=(Yte + 1) // 2, columns=['Bound'], dtype='int64')
Yte.insert(0, 'Id', Yte.index)

# Save the predictions
Yte.to_csv(Yte_file_name, index=False)