In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import py_stringmatching as sm
import osqp
from collections import Counter
from scipy.sparse import csc_matrix

from Local_packages.kernels import compute_kernel_matrix, gaussian_kernel, normalize
from Local_packages.run import KernelMethod
from Local_packages.optimizer import KLR_solver, SVM_solver
from scipy.optimize import minimize

# 1 - Load Data

In [2]:
# Load the dataset: one training and one test table per dataset (0, 1, 2),
# each read with its first CSV column as the index.
Xtr0, Xtr1, Xtr2 = (
    pd.read_csv(path, index_col=0)
    for path in ('data/Xtr0.csv', 'data/Xtr1.csv', 'data/Xtr2.csv')
)
Xte0, Xte1, Xte2 = (
    pd.read_csv(path, index_col=0)
    for path in ('data/Xte0.csv', 'data/Xte1.csv', 'data/Xte2.csv')
)

# Stack every training table on top of its test table, renumbering the rows from 0
Xtr0_Xte0, Xtr1_Xte1, Xtr2_Xte2 = (
    pd.concat(pair, ignore_index=True)
    for pair in ([Xtr0, Xte0], [Xtr1, Xte1], [Xtr2, Xte2])
)

# Load the labels and rescale the 'Bound' column with y -> 2*y - 1
# (maps 0 to -1 and 1 to 1)
Ytr0, Ytr1, Ytr2 = (
    2 * pd.read_csv(path, index_col=0)['Bound'].values - 1
    for path in ('data/Ytr0.csv', 'data/Ytr1.csv', 'data/Ytr2.csv')
)

In [None]:
# Load the matrix representation of the sequences
# (space-separated files without a header row, converted to NumPy arrays)
Xtr0_mat100, Xtr1_mat100, Xtr2_mat100 = (
    pd.read_csv(path, header=None, sep=' ').values
    for path in ('data/Xtr0_mat100.csv', 'data/Xtr1_mat100.csv', 'data/Xtr2_mat100.csv')
)
Xte0_mat100, Xte1_mat100, Xte2_mat100 = (
    pd.read_csv(path, header=None, sep=' ').values
    for path in ('data/Xte0_mat100.csv', 'data/Xte1_mat100.csv', 'data/Xte2_mat100.csv')
)

# Training rows first, test rows appended below (np.concatenate defaults to axis 0)
Xtr0_Xte0_mat100 = np.concatenate((Xtr0_mat100, Xte0_mat100))
Xtr1_Xte1_mat100 = np.concatenate((Xtr1_mat100, Xte1_mat100))
Xtr2_Xte2_mat100 = np.concatenate((Xtr2_mat100, Xte2_mat100))

# 2 - Compute Kernel Matrix

In [3]:
# Name of the kernel to compute; it is forwarded to compute_kernel_matrix
# together with the keyword arguments selected below.
kernel = 'mis_sub'

#Gaussian Kernel - On the matrix representation of the sequences
if kernel == 'exp':
    args = {'sigma': 1}
#Smith-Waterman Local Alignment Score
elif kernel == 'sw':
    args = {'sw': sm.SmithWaterman()}
#Spectrum Kernel
elif kernel == 'spect':
    args = {'k': 5}
#Mismatch Kernel
elif kernel == 'mismatch':
    args = {'k': 1, 'm': 0}
# 'mis_sub': takes the same k / m arguments as the mismatch kernel
# (presumably a variant of it - TODO confirm in Local_packages.kernels)
elif kernel == 'mis_sub':
    args = {'k': 9, 'm': 2}
#LA Kernel
elif kernel == 'LA':
    args = {'beta': 0.5, 'd': 11, 'e': 1}
# 'LA_gpu': same argument names as 'LA' (presumably a GPU implementation - TODO confirm)
elif kernel == 'LA_gpu':
    args = {'beta': 0.5, 'd': 1, 'e': 0.5}
else:
    # Fail loudly: without this branch `args` would stay undefined, or keep a
    # stale value from an earlier run of this cell, when the name is misspelled.
    raise ValueError(
        f"Unknown kernel {kernel!r}; expected one of "
        "'exp', 'sw', 'spect', 'mismatch', 'mis_sub', 'LA', 'LA_gpu'"
    )

In [None]:
# Disabled alternative kept for reference: kernels on the mat100 features.
#K_exp_0 = compute_kernel_matrix(Xtr0_mat100, Xtr0_mat100, kernel, **args)
#K_exp_1 = compute_kernel_matrix(Xtr1_mat100, Xtr1_mat100, kernel, **args)
#K_exp_2 = compute_kernel_matrix(Xtr2_mat100, Xtr2_mat100, kernel, **args)

# Kernel between the test table (first argument) and the training table
# (second argument) of each dataset, with the kernel / args chosen above.
K_0, K_1, K_2 = (
    compute_kernel_matrix(X_test, X_train, kernel, **args)
    for X_test, X_train in ((Xte0, Xtr0), (Xte1, Xtr1), (Xte2, Xtr2))
)

In [None]:
def compute_kernel_for_row(i, Xte, kernel, args):
    """Return the kernel of row ``i`` of ``Xte`` with itself.

    Parameters
    ----------
    i : int
        Positional index of the row (used with ``.iloc``).
    Xte : pandas.DataFrame
        Table the row is taken from.
    kernel : str
        Kernel name forwarded to ``compute_kernel_matrix``.
    args : dict
        Extra keyword arguments forwarded to ``compute_kernel_matrix``.

    Returns
    -------
    Whatever ``compute_kernel_matrix`` returns for the one-row table against
    itself.
    """
    # Double brackets keep the selection a one-row DataFrame rather than a Series.
    single_row = Xte.iloc[[i]]
    return compute_kernel_matrix(single_row, single_row, kernel, **args)

# Self-kernel of every test row, one joblib task per row on all available
# cores (n_jobs=-1), then stacked along axis 0 in row order.
rows_0 = Parallel(n_jobs=-1)(
    delayed(compute_kernel_for_row)(i, Xte0, kernel, args) for i in tqdm(range(len(Xte0)))
)
K_te_0 = np.concatenate(rows_0, axis=0)

rows_1 = Parallel(n_jobs=-1)(
    delayed(compute_kernel_for_row)(i, Xte1, kernel, args) for i in tqdm(range(len(Xte1)))
)
K_te_1 = np.concatenate(rows_1, axis=0)

rows_2 = Parallel(n_jobs=-1)(
    delayed(compute_kernel_for_row)(i, Xte2, kernel, args) for i in tqdm(range(len(Xte2)))
)
K_te_2 = np.concatenate(rows_2, axis=0)

In [None]:
# Cached train-vs-train kernel matrices
K_tr_0, K_tr_1, K_tr_2 = (
    np.load(path)
    for path in (
        'features/K_0_tr_mismatch_9-2.npy',
        'features/K_1_tr_mismatch_9-2.npy',
        'features/K_2_tr_mismatch_9-2.npy',
    )
)

# Cached self-kernel values of the test rows (the "diag" files)
K_te_0, K_te_1, K_te_2 = (
    np.load(path)
    for path in (
        'features/K_0_te_diag_mismatch_9-2.npy',
        'features/K_1_te_diag_mismatch_9-2.npy',
        'features/K_2_te_diag_mismatch_9-2.npy',
    )
)

# Cached test-vs-train kernel blocks
K_tr_te_0, K_tr_te_1, K_tr_te_2 = (
    np.load(path)
    for path in (
        'features/K_0_te_mismatch_9-2.npy',
        'features/K_1_te_mismatch_9-2.npy',
        'features/K_2_te_mismatch_9-2.npy',
    )
)

# Self-similarities of all rows: training diagonal first, test values after
K_diag_0 = np.concatenate((np.diag(K_tr_0), K_te_0.flatten()))
K_diag_1 = np.concatenate((np.diag(K_tr_1), K_te_1.flatten()))
K_diag_2 = np.concatenate((np.diag(K_tr_2), K_te_2.flatten()))

# Stack the training block on top of the test block (axis 0 is the default)
K_0 = np.concatenate((K_tr_0, K_tr_te_0))
K_1 = np.concatenate((K_tr_1, K_tr_te_1))
K_2 = np.concatenate((K_tr_2, K_tr_te_2))

# Normalise each stacked kernel by the self-similarities and shift it by 1:
#     K[i, j]  ->  K[i, j] / sqrt(K_diag[i] * K_diag[j]) + 1
# Rows of K_k cover train + test, columns cover the training rows only, so the
# column scaling uses the first n_tr_k entries of the diagonal.
#
# Two changes compared with the previous dense-matrix version:
#   * every dataset uses its OWN training size (datasets 1 and 2 used to be
#     sliced with K_tr_0.shape[0], which is only right if all sizes are equal);
#   * the diagonal scaling is done by broadcasting instead of building dense
#     N x N diagonal matrices and multiplying them, giving the same values
#     without the extra memory and matrix products.
inv_sqrt_diag_0 = 1 / np.sqrt(K_diag_0)
inv_sqrt_diag_1 = 1 / np.sqrt(K_diag_1)
inv_sqrt_diag_2 = 1 / np.sqrt(K_diag_2)

n_tr_0, n_tr_1, n_tr_2 = K_tr_0.shape[0], K_tr_1.shape[0], K_tr_2.shape[0]

K_0 = inv_sqrt_diag_0[:, None] * K_0 * inv_sqrt_diag_0[None, :n_tr_0] + 1
K_1 = inv_sqrt_diag_1[:, None] * K_1 * inv_sqrt_diag_1[None, :n_tr_1] + 1
K_2 = inv_sqrt_diag_2[:, None] * K_2 * inv_sqrt_diag_2[None, :n_tr_2] + 1

In [57]:
# Persist the normalised kernels so later sessions can reload them
np.save(file='features/K_0_mis_sub_9-2.npy', arr=K_0)
np.save(file='features/K_1_mis_sub_9-2.npy', arr=K_1)
np.save(file='features/K_2_mis_sub_9-2.npy', arr=K_2)

In [211]:
# Apply gaussian_kernel (sigma=1) to the top-left 2000 x 2000 block of each
# kernel matrix (the train-vs-train part, assuming 2000 training rows per
# dataset - TODO confirm).
K_gaussian_0, K_gaussian_1, K_gaussian_2 = (
    gaussian_kernel(K_full[:2000, :2000], sigma=1)
    for K_full in (K_0, K_1, K_2)
)

In [212]:
# Spread of the entries of each Gaussian kernel matrix, printed on one line
print(*(K_g.std() for K_g in (K_gaussian_0, K_gaussian_1, K_gaussian_2)))

0.023471652 0.022064814 0.027451018


### 2.3 - Local Alignment Kernel  : <span style="color:green">TODO / Time Complexity too high + value too high</span>

The Local Alignment Kernel defined as:
$$K_{LA}^{(\beta)}(x,y) = \sum_{\pi\in\Pi(x,y)} \exp\left(\beta\, s_{S,g}(x,y,\pi)\right)$$

is symmetric positive definite.

We assume an affine gap penalty:
$$\left\{\begin{aligned}
&g(0) = 0 \\
&g(n) = d + e(n-1) \quad \text{for } n>0
\end{aligned}\right.$$

where $l(\pi)$ is the length of the alignment $\pi$.

We use the formula for the Local Alignment Kernel:
$$K_{LA}^{(\beta)}(x,y) = 1 + X_2(|x|,|y|)+ Y_2(|x|,|y|) + M(|x|,|y|)$$ 
where $X_2$, $Y_2$ and $M$ are defined recursively.

# 3 - Run Kernel Method on Train Data

In [90]:
# Make every Gaussian kernel matrix positive semi-definite before training.
# This cell used to operate on `K_gaussian`, a name no cell of the notebook
# defines; it now processes the three per-dataset matrices explicitly.
for K_gaussian in (K_gaussian_0, K_gaussian_1, K_gaussian_2):
    # Compute the eigenvalues of the symmetric matrix
    # (eigvalsh skips the eigenvectors, which were discarded anyway)
    eigenvalues = np.linalg.eigvalsh(K_gaussian)

    # Find the smallest eigenvalue
    min_eigenvalue = np.min(eigenvalues)

    # If the smallest eigenvalue is negative, adjust the diagonal.
    # `+=` updates the array in place, so K_gaussian_0 / _1 / _2 themselves
    # are shifted; adding |min_eigenvalue| * I moves every eigenvalue up by
    # that amount.
    if min_eigenvalue < 0:
        K_gaussian += np.eye(K_gaussian.shape[0]) * (-min_eigenvalue)

In [58]:
#Parameters
# Solver passed to KernelMethod via `solver=solver` in the Dataset 0 cell.
# The trailing comment records an alternative (quad_solver) that is not
# imported in this notebook.
solver=SVM_solver #quad_solver

## 3.1 - Dataset 0

In [None]:
# Dataset 0: build the kernel method on the 2000 x 2000 Gaussian kernel block,
# set the regularisation strength, then split, fit and evaluate.
lambd = 2e-4
method_0 = KernelMethod(K_gaussian_0[:2000, :2000], Ytr0, solver=solver)
method_0.lambd = lambd  # assigned after construction, not passed to the constructor
method_0.train_test_split(test_size=0.1)
method_0.fit()
# Last expression of the cell: its return value is what the cell displays
method_0.evaluate()

(0.9694444444444444, 0.64)

In [205]:
# Grid search over lambda: 10 log-spaced values from 1e-5 to 1e-3
method_0.grid_search(np.logspace(start=-5, stop=-3, num=10), test_size=0.1)

Lambda: 1e-05, Accuracy: 0.62
Lambda: 1.6681005372000593e-05, Accuracy: 0.62
Lambda: 2.782559402207126e-05, Accuracy: 0.64
Lambda: 4.641588833612782e-05, Accuracy: 0.64
Lambda: 7.742636826811278e-05, Accuracy: 0.62
Lambda: 0.0001291549665014884, Accuracy: 0.62
Lambda: 0.00021544346900318823, Accuracy: 0.65
Lambda: 0.00035938136638046257, Accuracy: 0.59
Lambda: 0.0005994842503189409, Accuracy: 0.60
Lambda: 0.001, Accuracy: 0.51


(0.00021544346900318823, 64.5)

In [None]:
# Sweep the test_size argument of method_0.validate over 10 evenly spaced
# values and remember the one with the highest returned score
# (the first such value wins on ties).
test_size_values = np.linspace(start=0.01, stop=0.5, num=10)
best_test_size, best_accuracy = None, -np.inf

for test_size in test_size_values:
    accuracy = method_0.validate(test_size=test_size, n_splits=10)
    if not accuracy > best_accuracy:
        continue
    best_accuracy, best_test_size = accuracy, test_size

print(f"Best test size: {best_test_size}")
print(f"Best average accuracy: {best_accuracy}")

In [219]:
# Validate method_0 with test_size=0.1 and n_splits=10; the recorded output
# below reports average / min / max accuracy (the exact meaning of both
# arguments is defined in KernelMethod.validate, not shown in this notebook).
average_accuracy = method_0.validate(test_size=0.1, n_splits=10)

Average Accuracy: 0.619
Min Accuracy: 0.555 Max Accuracy: 0.66


## 3.2 - Dataset 1

In [None]:
# Dataset 1: regularisation strength used for the fit below
lambd_1 = 7e-5

# Uses the notebook-wide `solver` from the Parameters cell (as the Dataset 0
# cell does) instead of hard-coding SVM_solver, so the solver is switched in
# one place for all datasets.
method_1 = KernelMethod(K_gaussian_1[:2000, :2000], Ytr1, lambd=lambd_1, solver=solver)
method_1.train_test_split(test_size=0.2)
method_1.fit()
# Last expression of the cell: its return value is what the cell displays
method_1.evaluate()

(1.0, 0.755)

In [None]:
# Grid search over lambda: 10 log-spaced values from 1e-5 to 1e-3
method_1.grid_search(np.logspace(start=-5, stop=-3, num=10), test_size=0.1)

Lambda: 1e-05, Accuracy: 0.75
Lambda: 1.6681005372000593e-05, Accuracy: 0.76
Lambda: 2.782559402207126e-05, Accuracy: 0.76
Lambda: 4.641588833612782e-05, Accuracy: 0.76
Lambda: 7.742636826811278e-05, Accuracy: 0.77
Lambda: 0.0001291549665014884, Accuracy: 0.76
Lambda: 0.00021544346900318823, Accuracy: 0.75
Lambda: 0.00035938136638046257, Accuracy: 0.74
Lambda: 0.0005994842503189409, Accuracy: 0.71
Lambda: 0.001, Accuracy: 0.67


(7.742636826811278e-05, 76.55)

In [None]:
# Sweep the test_size argument of method_1.validate over 10 evenly spaced
# values and remember the one with the highest returned score
# (the first such value wins on ties).
test_size_values = np.linspace(start=0.01, stop=0.5, num=10)
best_test_size, best_accuracy = None, -np.inf

for test_size in test_size_values:
    accuracy = method_1.validate(test_size=test_size, n_splits=10)
    if not accuracy > best_accuracy:
        continue
    best_accuracy, best_test_size = accuracy, test_size

print(f"Best test size: {best_test_size}")
print(f"Best average accuracy: {best_accuracy}")

In [217]:
# Validate method_1 with test_size=0.2 and n_splits=20; the recorded output
# below reports average / min / max accuracy (the exact meaning of both
# arguments is defined in KernelMethod.validate, not shown in this notebook).
average_accuracy = method_1.validate(test_size=0.2, n_splits=20)

Average Accuracy: 0.7545
Min Accuracy: 0.72 Max Accuracy: 0.78


## 3.3 - Dataset 2

In [224]:
# Dataset 2: regularisation strength used for the fit below
lambd_2 = 1.67e-5

# Uses the notebook-wide `solver` from the Parameters cell (as the Dataset 0
# cell does) instead of hard-coding SVM_solver, so the solver is switched in
# one place for all datasets.
method_2 = KernelMethod(K_gaussian_2[:2000, :2000], Ytr2, lambd=lambd_2, solver=solver)
method_2.train_test_split(test_size=0.17)
method_2.fit()
# Last expression of the cell: its return value is what the cell displays
method_2.evaluate()

(1.0, 0.638235294117647)

In [223]:
# Grid search over lambda: 10 log-spaced values from 1e-5 to 1e-3
method_2.grid_search(np.logspace(start=-5, stop=-3, num=10), test_size=0.17)

Lambda: 1e-05, Accuracy: 0.66
Lambda: 1.6681005372000593e-05, Accuracy: 0.67
Lambda: 2.782559402207126e-05, Accuracy: 0.67
Lambda: 4.641588833612782e-05, Accuracy: 0.65
Lambda: 7.742636826811278e-05, Accuracy: 0.67
Lambda: 0.0001291549665014884, Accuracy: 0.66
Lambda: 0.00021544346900318823, Accuracy: 0.65
Lambda: 0.00035938136638046257, Accuracy: 0.65
Lambda: 0.0005994842503189409, Accuracy: 0.62
Lambda: 0.001, Accuracy: 0.60


(1.6681005372000593e-05, 66.88)

In [None]:
# Sweep the test_size argument of method_2.validate over 10 evenly spaced
# values and remember the one with the highest returned score
# (the first such value wins on ties).
test_size_values = np.linspace(start=0.01, stop=0.5, num=10)
best_test_size, best_accuracy = None, -np.inf

for test_size in test_size_values:
    accuracy = method_2.validate(test_size=test_size, n_splits=10)
    if not accuracy > best_accuracy:
        continue
    best_accuracy, best_test_size = accuracy, test_size

print(f"Best test size: {best_test_size}")
print(f"Best average accuracy: {best_accuracy}")

In [225]:
# Validate method_2 with test_size=0.17 and n_splits=20; the recorded output
# below reports average / min / max accuracy (the exact meaning of both
# arguments is defined in KernelMethod.validate, not shown in this notebook).
average_accuracy = method_2.validate(test_size=0.17, n_splits=20)

Average Accuracy: 0.6622058823529412
Min Accuracy: 0.6029411764705882 Max Accuracy: 0.7058823529411765


# 4 - Apply Kernel Predictor on Test Data

In [101]:
def predict_test_labels(K, method):
    """Predict labels from a kernel block and a fitted method.

    Parameters
    ----------
    K : array-like
        Kernel block that is matrix-multiplied with ``method.alpha``; its
        number of columns must match the length of ``method.alpha``.
    method : object
        Fitted model exposing the coefficient vector as ``method.alpha``.

    Returns
    -------
    numpy.ndarray
        ``np.sign`` of the decision scores: 1 for positive scores, -1 for
        negative scores and 0 for scores that are exactly zero.
    """
    decision_scores = K @ method.alpha
    return np.sign(decision_scores)

In [153]:
# Name of the submission file written by the cell below
Yte_file_name = 'Yte_mismatch_9-2_17_perc_val.csv'

# Rows from 2000 onwards of each stacked kernel are the test rows; the columns
# are restricted to the training rows each method was fitted on.
# NOTE(review): the methods above are fitted on K_gaussian_* while these blocks
# come from K_* (no gaussian_kernel applied) - confirm that the test-time
# kernel matches the kernel used for training.
Yte0 = predict_test_labels(K_0[2000:, method_0.train_indices], method_0)
Yte1 = predict_test_labels(K_1[2000:, method_1.train_indices], method_1)
Yte2 = predict_test_labels(K_2[2000:, method_2.train_indices], method_2)

In [154]:
# Concatenate the three prediction vectors (dataset 0, then 1, then 2)
Yte = np.concatenate((Yte0, Yte1, Yte2))

# (y + 1) // 2 maps -1 to 0 and 1 to 1; store the result as an integer 'Bound' column
Yte = pd.DataFrame(data=(Yte + 1) // 2, columns=['Bound'], dtype='int64')

# Add the running row number as the first column, 'Id'
Yte.insert(loc=0, column='Id', value=Yte.index)

# Save the predictions (the DataFrame index itself is not written)
Yte.to_csv(Yte_file_name, index=False)