In [15]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import py_stringmatching as sm
import osqp
from collections import Counter
from scipy.sparse import csc_matrix

from Local_packages.kernels import compute_kernel_matrix, gaussian_kernel
from Local_packages.run import KernelMethod
from Local_packages.optimizer import KLR_solver, SVM_solver

# 1 - Load Data

In [16]:
# Raw sequences: three training sets followed by the three matching test sets.
Xtr0, Xtr1, Xtr2 = (
    pd.read_csv(path, index_col=0)
    for path in ('data/Xtr0.csv', 'data/Xtr1.csv', 'data/Xtr2.csv')
)
Xte0, Xte1, Xte2 = (
    pd.read_csv(path, index_col=0)
    for path in ('data/Xte0.csv', 'data/Xte1.csv', 'data/Xte2.csv')
)

# Stack train rows on top of test rows (fresh 0..n-1 index) for each dataset.
Xtr0_Xte0, Xtr1_Xte1, Xtr2_Xte2 = (
    pd.concat([train, test], ignore_index=True)
    for train, test in ((Xtr0, Xte0), (Xtr1, Xte1), (Xtr2, Xte2))
)

# Labels: read the 'Bound' column and remap it with y -> 2*y - 1,
# i.e. 0 becomes -1 and 1 stays 1.
Ytr0, Ytr1, Ytr2 = (
    2 * pd.read_csv(path, index_col=0)['Bound'].values - 1
    for path in ('data/Ytr0.csv', 'data/Ytr1.csv', 'data/Ytr2.csv')
)

In [None]:
# Numeric ("mat100") representation of the sequences: space-separated,
# header-less CSV files, loaded as plain NumPy arrays.
Xtr0_mat100, Xtr1_mat100, Xtr2_mat100 = (
    pd.read_csv(path, header=None, sep=' ').values
    for path in (
        'data/Xtr0_mat100.csv',
        'data/Xtr1_mat100.csv',
        'data/Xtr2_mat100.csv',
    )
)
Xte0_mat100, Xte1_mat100, Xte2_mat100 = (
    pd.read_csv(path, header=None, sep=' ').values
    for path in (
        'data/Xte0_mat100.csv',
        'data/Xte1_mat100.csv',
        'data/Xte2_mat100.csv',
    )
)

# Train rows first, then test rows, for each dataset.
Xtr0_Xte0_mat100, Xtr1_Xte1_mat100, Xtr2_Xte2_mat100 = (
    np.concatenate([train, test], axis=0)
    for train, test in (
        (Xtr0_mat100, Xte0_mat100),
        (Xtr1_mat100, Xte1_mat100),
        (Xtr2_mat100, Xte2_mat100),
    )
)

# 2 - Compute Kernel Matrix

In [None]:
# Name of the kernel to use. It selects the keyword arguments (`args`) that
# are forwarded to compute_kernel_matrix as **args.
kernel = 'mismatch'

# Gaussian kernel - on the matrix representation of the sequences
if kernel == 'exp':
    args = {'sigma': 1}
# Smith-Waterman local alignment score
elif kernel == 'sw':
    args = {'sw': sm.SmithWaterman()}
# Spectrum kernel
elif kernel == 'spect':
    args = {'k': 5}
# Mismatch kernel
elif kernel == 'mismatch':
    args = {'k': 8, 'm': 2}
# Local alignment (LA) kernel
elif kernel == 'LA':
    args = {'beta': 0.5, 'd': 11, 'e': 1}
# LA kernel, 'LA_gpu' variant (presumably a GPU implementation - confirm)
elif kernel == 'LA_gpu':
    args = {'beta': 0.5, 'd': 1, 'e': 0.5}
else:
    # Fail fast: without this branch a typo in `kernel` would leave `args`
    # undefined on a fresh kernel, or silently reuse the `args` of a
    # previous run of this cell.
    raise ValueError(
        f"Unknown kernel {kernel!r}; expected one of "
        "'exp', 'sw', 'spect', 'mismatch', 'LA', 'LA_gpu'"
    )

In [4]:
# Gram matrix of each stacked train+test set against itself, for the kernel
# selected in `kernel` / `args`; 1 is added to every entry of the result.
# (A previous, disabled variant computed K_exp_0..2 from the Xtr*_mat100
# arrays with the same compute_kernel_matrix call.)
K_0, K_1, K_2 = (
    compute_kernel_matrix(sequences, sequences, kernel, **args) + 1
    for sequences in (Xtr0_Xte0, Xtr1_Xte1, Xtr2_Xte2)
)

Computing feature vectors: 100%|██████████| 3000/3000 [00:32<00:00, 92.54it/s] 
Collecting k-mers: 100%|██████████| 3000/3000 [01:49<00:00, 27.35it/s] 
Building sparse matrix entries: 100%|██████████| 3000/3000 [01:43<00:00, 28.85it/s]
Computing feature vectors: 100%|██████████| 3000/3000 [00:19<00:00, 150.07it/s]
Collecting k-mers: 100%|██████████| 3000/3000 [00:44<00:00, 66.89it/s] 
Building sparse matrix entries: 100%|██████████| 3000/3000 [01:46<00:00, 28.25it/s]
Computing feature vectors: 100%|██████████| 3000/3000 [00:17<00:00, 166.76it/s]
Collecting k-mers: 100%|██████████| 3000/3000 [00:41<00:00, 73.00it/s] 
Building sparse matrix entries: 100%|██████████| 3000/3000 [01:41<00:00, 29.70it/s]


In [17]:
# Load kernel matrices stored on disk; this replaces any K_0 / K_1 / K_2
# already present in the namespace.
# NOTE(review): the file names are hard-coded to "mismatch_8-2" and do not
# follow the `kernel` / `args` selection made earlier - confirm they match.
K_0, K_1, K_2 = (
    np.load(path)
    for path in (
        'features/K_0_mismatch_8-2.npy',
        'features/K_1_mismatch_8-2.npy',
        'features/K_2_mismatch_8-2.npy',
    )
)

In [None]:
# Apply the project-local gaussian_kernel helper to K_0 with sigma=1.
# NOTE(review): K_0 is loaded from a file named like a precomputed mismatch
# kernel matrix - confirm that using it as the input of gaussian_kernel is
# intended.
K_gaussian = gaussian_kernel(K_0, sigma=1)

### 2.3 - Local Alignment Kernel  : <span style="color:green">TODO / Time Complexity too high + value too high</span>

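The Local Alignment Kernel, defined as
$$K_{LA}^{(\beta)}(x,y) = \sum_{\pi\in\Pi(x,y)} \exp\left(\beta\, s_{S,g}(x,y,\pi)\right),$$

where $\Pi(x,y)$ is the set of local alignments of $x$ and $y$ and $s_{S,g}(x,y,\pi)$ is the score of the alignment $\pi$ under the substitution matrix $S$ and the gap penalty $g$, is symmetric positive definite.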

We assume an affine gap penalty:
$$\left\{\begin{aligned}
&g(0) = 0 \\
&g(n) = d + e(n-1) \quad \text{for } n>0
\end{aligned}\right.$$

where $n$ is the length of a gap and $l(\pi)$ is the length of the alignment $\pi$.

We use the formula for the Local Alignment Kernel:
$$K_{LA}^{(\beta)}(x,y) = 1 + X_2(|x|,|y|)+ Y_2(|x|,|y|) + M(|x|,|y|)$$ 
where $X_2$, $Y_2$ and $M$ are defined recursively.

# 3 - Run Kernel Method on Train Data

In [90]:
# Eigenvalues of K_gaussian (treated as symmetric). Only the eigenvalues are
# needed, so use eigvalsh instead of eigh to avoid computing eigenvectors
# that would be thrown away.
eigenvalues = np.linalg.eigvalsh(K_gaussian)

# Smallest eigenvalue: negative means the matrix is not positive semi-definite.
min_eigenvalue = np.min(eigenvalues)

# If the smallest eigenvalue is negative, add |min_eigenvalue| to the diagonal,
# which shifts every eigenvalue up by that amount. K_gaussian is modified
# in place.
if min_eigenvalue < 0:
    K_gaussian += np.eye(K_gaussian.shape[0]) * (-min_eigenvalue)

In [20]:
# Parameters: optimizer handed to KernelMethod through `solver=solver`
# (used by the dataset-0 cell).
solver=SVM_solver # alternative noted by the author: quad_solver

In [33]:
# Dataset 0: fit a kernel method on the training part of K_0.
lambd = 1e-4

# Rows/columns 0..1999 of K_0 are used as the train-vs-train block; the rows
# from 2000 on are used for the test predictions further down.
method_0 = KernelMethod(K_0[:2000, :2000], Ytr0, solver=solver)
method_0.lambd = lambd  # regularisation set as an attribute after construction
method_0.train_test_split(test_size=0.1)
method_0.fit()

# Last expression of the cell, so its return value is displayed
# (presumably train / held-out accuracy - confirm in KernelMethod.evaluate).
method_0.evaluate()

(0.9611111111111111, 0.695)

In [32]:
# Grid search on dataset 0: 10 log-spaced lambda values from 1e-4 to 1e-3,
# with test_size=0.1.
method_0.grid_search(np.logspace(start=-4, stop=-3, num=10), test_size=0.1)

Lambda: 0.0001, Accuracy: 0.64
Lambda: 0.0001291549665014884, Accuracy: 0.61
Lambda: 0.0001668100537200059, Accuracy: 0.62
Lambda: 0.00021544346900318845, Accuracy: 0.61
Lambda: 0.0002782559402207126, Accuracy: 0.63
Lambda: 0.00035938136638046257, Accuracy: 0.63
Lambda: 0.00046415888336127773, Accuracy: 0.60
Lambda: 0.0005994842503189409, Accuracy: 0.62
Lambda: 0.000774263682681127, Accuracy: 0.59
Lambda: 0.001, Accuracy: 0.59


(0.0001, 63.800000000000004)

In [34]:
# Validate the dataset-0 model with test_size=0.1 and n_splits=10; the call
# prints average / min / max accuracy (presumably over the 10 splits).
# NOTE(review): `average_accuracy` is reused for every dataset, so only the
# most recently executed validation result is kept.
average_accuracy = method_0.validate(test_size=0.1, n_splits=10)

Average Accuracy: 0.6265
Min Accuracy: 0.57 Max Accuracy: 0.7


In [28]:
# Dataset 1: fit a kernel method on the training part of K_1.
lambd_1 = 1e-4

# Rows/columns 0..1999 of K_1 are used as the train-vs-train block.
method_1 = KernelMethod(
    K_1[:2000, :2000],
    Ytr1,
    lambd=lambd_1,
    solver=SVM_solver,
)
method_1.train_test_split(test_size=0.1)
method_1.fit()

# Last expression of the cell, so its return value is displayed
# (presumably train / held-out accuracy - confirm in KernelMethod.evaluate).
method_1.evaluate()

(0.98, 0.805)

In [None]:
# Grid search over 10 log-spaced lambda values from 1e-5 to 1e-4.
# NOTE(review): this cell sits between the dataset-1 cells but runs on
# method_0, and unlike the other grid searches it does not pass test_size -
# confirm whether method_1 / test_size=0.1 was intended.
method_0.grid_search(np.logspace(start=-5, stop=-4, num=10))

In [29]:
# Validate the dataset-1 model with test_size=0.1 and n_splits=10; the call
# prints average / min / max accuracy (presumably over the 10 splits).
# NOTE(review): `average_accuracy` is reused for every dataset, so only the
# most recently executed validation result is kept.
average_accuracy = method_1.validate(test_size=0.1, n_splits=10)

Average Accuracy: 0.7335
Min Accuracy: 0.69 Max Accuracy: 0.765


In [41]:
# Dataset 2: fit a kernel method on the training part of K_2.
lambd_2 = 1.2e-4

# Rows/columns 0..1999 of K_2 are used as the train-vs-train block.
method_2 = KernelMethod(
    K_2[:2000, :2000],
    Ytr2,
    lambd=lambd_2,
    solver=SVM_solver,
)
method_2.train_test_split(test_size=0.1)
method_2.fit()

# Last expression of the cell, so its return value is displayed
# (presumably train / held-out accuracy - confirm in KernelMethod.evaluate).
method_2.evaluate()

(0.9544444444444444, 0.725)

In [36]:
# Grid search on dataset 2: 10 log-spaced lambda values from 1e-5 to 1e-3,
# with test_size=0.1.
method_2.grid_search(np.logspace(start=-5, stop=-3, num=10), test_size=0.1)

Lambda: 1e-05, Accuracy: 0.66
Lambda: 1.6681005372000593e-05, Accuracy: 0.65
Lambda: 2.782559402207126e-05, Accuracy: 0.65
Lambda: 4.641588833612782e-05, Accuracy: 0.67
Lambda: 7.742636826811278e-05, Accuracy: 0.66
Lambda: 0.0001291549665014884, Accuracy: 0.68
Lambda: 0.00021544346900318823, Accuracy: 0.65
Lambda: 0.00035938136638046257, Accuracy: 0.66
Lambda: 0.0005994842503189409, Accuracy: 0.64
Lambda: 0.001, Accuracy: 0.63


(0.0001291549665014884, 67.60000000000001)

In [42]:
# Validate the dataset-2 model with test_size=0.1 and n_splits=10; the call
# prints average / min / max accuracy (presumably over the 10 splits).
# NOTE(review): `average_accuracy` is reused for every dataset, so only the
# most recently executed validation result is kept.
average_accuracy = method_2.validate(test_size=0.1, n_splits=10)

Average Accuracy: 0.6575000000000001
Min Accuracy: 0.61 Max Accuracy: 0.71


# 4 - Apply Kernel Predictor on Test Data

In [55]:
def predict_test_labels(K, method):
    """Predict -1 / +1 labels from a kernel block and a fitted method.

    Parameters
    ----------
    K : array-like
        Kernel values between the samples to predict (rows) and the samples
        the dual coefficients refer to (columns); it must be compatible with
        ``K @ method.alpha``.
    method : object
        Fitted model exposing the dual coefficients as ``method.alpha``.

    Returns
    -------
    numpy.ndarray
        The sign of the decision values ``K @ method.alpha``. A decision
        value of exactly 0 is mapped to -1, so the result never contains the
        label 0 that ``np.sign`` alone would produce. -1 is chosen because
        the submission step ``(y + 1) // 2`` already sends 0 to class 0.
    """
    scores = K @ method.alpha
    labels = np.sign(scores)
    # Only exact zeros are rewritten; NaN scores stay NaN so that a broken
    # model still fails loudly downstream instead of being silently labelled.
    return np.where(labels == 0, -1, labels)

In [56]:
# Output file for the submission.
Yte_file_name = 'Yte_mismatch_8-2.csv'

# For each dataset: take the test rows of the kernel (index 2000 onwards) and
# the columns listed in the method's train_indices, then predict the labels.
# NOTE(review): method.alpha / method.train_indices reflect whatever call last
# updated the method object - confirm they come from the intended fit.
Yte0, Yte1, Yte2 = (
    predict_test_labels(kernel_matrix[2000:, fitted.train_indices], fitted)
    for kernel_matrix, fitted in (
        (K_0, method_0),
        (K_1, method_1),
        (K_2, method_2),
    )
)

In [57]:
# Stack the predictions of the three datasets, map the labels back with
# y -> (y + 1) // 2 (so -1 becomes 0 and 1 stays 1), and expose the running
# row number as the leading 'Id' column.
Yte = (
    pd.DataFrame(
        data=(np.concatenate([Yte0, Yte1, Yte2]) + 1) // 2,
        columns=['Bound'],
        dtype='int64',
    )
    .rename_axis('Id')
    .reset_index()
)

# Write the submission file (columns Id, Bound; no extra index column).
Yte.to_csv(Yte_file_name, index=False)