In [1]:
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

# Kernel Methods for Machine Learning

## Data Loading and exploration

In [2]:
# Three datasets (suffixes 0, 1, 2) are read from the relative "Data/" folder.
# For each one we load train features (Xtr), train labels (Ytr) and test
# features (Xte) into separate DataFrames.

# Train features (later cells read their 'seq' column)
df_Xtr0 = pd.read_csv("Data/Xtr0.csv")
df_Xtr1 = pd.read_csv("Data/Xtr1.csv")
df_Xtr2 = pd.read_csv("Data/Xtr2.csv")

# Train labels (a later cell reads the 'Bound' column)
df_Ytr0 = pd.read_csv("Data/Ytr0.csv")
df_Ytr1 = pd.read_csv("Data/Ytr1.csv")
df_Ytr2 = pd.read_csv("Data/Ytr2.csv")

# Test features (later cells read their 'seq' column)
df_Xte0 = pd.read_csv("Data/Xte0.csv")
df_Xte1 = pd.read_csv("Data/Xte1.csv")
df_Xte2 = pd.read_csv("Data/Xte2.csv")

## 1. Spectrum Kernel

### 1.1. Define the kernel

In [3]:
def getSubString(mString, spectrum):
    """Return every contiguous substring of length `spectrum` of `mString`.

    Substrings are listed in order of their start position, duplicates
    included, so the result has ``len(mString) - spectrum + 1`` entries.

    Parameters
    ----------
    mString : str
        Sequence to split.
    spectrum : int
        Length of the substrings (k-mers). Must be non-negative.

    Returns
    -------
    list of str
        ``['']`` when `spectrum` is 0, an empty list when `spectrum` is
        larger than ``len(mString)``, the k-mers otherwise.

    Raises
    ------
    ValueError
        If `spectrum` is negative (no substring has a negative length).
    """
    if spectrum < 0:
        raise ValueError("spectrum must be non-negative")
    if spectrum == 0:
        return ['']
    # range() is empty when spectrum > len(mString), giving an empty list.
    return [mString[i:i + spectrum] for i in range(len(mString) - spectrum + 1)]

def SpectrumKernelFunction(mString1, mString2, spectrum):
    """Spectrum kernel between two strings.

    Counts the pairs (p, q) such that the k-mer starting at position p of
    `mString1` equals the k-mer starting at position q of `mString2`, with
    k = `spectrum`. This equals the sum, over distinct k-mers, of the product
    of their occurrence counts in each string, which is how it is computed
    here (linear in the number of k-mers instead of comparing every pair).

    Parameters
    ----------
    mString1, mString2 : str
        Sequences to compare.
    spectrum : int
        Length of the k-mers.

    Returns
    -------
    int
        Number of matching k-mer pairs (0 when no k-mer is shared).
    """
    counts1 = Counter(getSubString(mString1, spectrum))
    counts2 = Counter(getSubString(mString2, spectrum))
    # Iterate over the smaller table; a Counter returns 0 for a missing key
    # without inserting it.
    if len(counts1) > len(counts2):
        counts1, counts2 = counts2, counts1
    return sum(count * counts2[kmer] for kmer, count in counts1.items())

# TODO: this still performs n*(n+1)/2 Python-level kernel evaluations; it could
# be sped up further (e.g. by caching the k-mer counts of each sequence).
def SpectrumKernelMatrix_train(serie,spectrum):
    """Build the symmetric spectrum-kernel Gram matrix of a set of sequences.

    Parameters
    ----------
    serie : pandas.Series or 1-D array of str
        The n sequences; must expose ``shape[0]`` and be iterable.
    spectrum : int
        Length of the k-mers passed to SpectrumKernelFunction.

    Returns
    -------
    numpy.ndarray of shape (n, n)
        ``K[i, j]`` is the kernel between sequence i and sequence j.
    """
    n = serie.shape[0]
    K = np.zeros((n,n))
    sequences = list(serie)
    for i, seq1 in enumerate(tqdm(sequences)):
        # Only the upper triangle is evaluated; each value is mirrored so the
        # kernel is computed once per unordered pair.
        for j in range(i, n):
            value = SpectrumKernelFunction(seq1, sequences[j], spectrum)
            K[i,j] = value
            K[j,i] = value
    return K

def SpectrumKernelMatrix_test(serie_train, serie_test, spectrum):
    """Build the spectrum-kernel matrix between training and test sequences.

    Parameters
    ----------
    serie_train : pandas.Series or 1-D array of str
        The n training sequences; must expose ``shape[0]`` and be iterable.
    serie_test : pandas.Series or 1-D array of str
        The m test sequences; must expose ``shape[0]`` and be iterable.
    spectrum : int
        Length of the k-mers passed to SpectrumKernelFunction.

    Returns
    -------
    numpy.ndarray of shape (n, m)
        ``K[j, i]`` is the kernel between training sequence j and test
        sequence i (rows index the training set, columns the test set).
    """
    n = serie_train.shape[0]
    m = serie_test.shape[0]
    K = np.zeros((n,m))
    for i,seq1 in enumerate(tqdm(serie_test)):
        for j,seq2 in enumerate(serie_train):
            K[j,i] = SpectrumKernelFunction(seq1, seq2, spectrum)
    # The matrix was previously built but never handed back to the caller.
    return K
    
    

We compute the kernel matrix for each of the three training sets and save them in the *Kernel_Matrix* directory.

In [None]:
# We should parallelize this computation
# For each of the three training sets: build the Gram matrix of its 'seq'
# column with k-mers of length 3, then persist it as a .npy file so later
# cells can reload it with np.load instead of recomputing it.
# The "Kernel_Matrix/" directory is relative to the working directory.
K_Xtr0 = SpectrumKernelMatrix_train(df_Xtr0['seq'],spectrum=3)
np.save("Kernel_Matrix/spectrum_kernel_Xtr0.npy",K_Xtr0)

K_Xtr1 = SpectrumKernelMatrix_train(df_Xtr1['seq'],spectrum=3)
np.save("Kernel_Matrix/spectrum_kernel_Xtr1.npy",K_Xtr1)

K_Xtr2 = SpectrumKernelMatrix_train(df_Xtr2['seq'],spectrum=3)
np.save("Kernel_Matrix/spectrum_kernel_Xtr2.npy",K_Xtr2)


We compute the kernel matrix for each of the three test sets (against the corresponding training set) and save them in the *Kernel_Matrix* directory.

In [None]:
# We should parallelize this computation
# Train-vs-test kernels take two sets of sequences, so they must go through
# SpectrumKernelMatrix_test(serie_train, serie_test, spectrum);
# SpectrumKernelMatrix_train only accepts (serie, spectrum) and raises a
# TypeError when given a second positional argument together with spectrum=.
K_Xte0 = SpectrumKernelMatrix_test(df_Xtr0['seq'],df_Xte0['seq'],spectrum=3)
np.save("Kernel_Matrix/spectrum_kernel_Xte0.npy",K_Xte0)

K_Xte1 = SpectrumKernelMatrix_test(df_Xtr1['seq'],df_Xte1['seq'],spectrum=3)
np.save("Kernel_Matrix/spectrum_kernel_Xte1.npy",K_Xte1)

K_Xte2 = SpectrumKernelMatrix_test(df_Xtr2['seq'],df_Xte2['seq'],spectrum=3)
np.save("Kernel_Matrix/spectrum_kernel_Xte2.npy",K_Xte2)

### 1.2. Solve the standard weighted kernel ridge regression (WKRR) problem

In [4]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    Parameters
    ----------
    x : float or array-like
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid of `x`, in [0, 1].
    """
    # 1/(1+exp(-x)) == exp(-log(1+exp(-x))) == exp(-logaddexp(0, -x)).
    # logaddexp never forms exp(-x) directly, so a large negative x no longer
    # overflows (and no longer emits a RuntimeWarning).
    return np.exp(-np.logaddexp(0, -x))

def sqrtMatrix(W):
    """Square root of a real symmetric positive semi-definite matrix.

    Uses the symmetric eigendecomposition ``W = V diag(D) V^T`` and returns
    ``V diag(sqrt(D)) V^T``. Compared with a general eigen-solver this keeps
    the result real and needs no explicit inverse, since V is orthogonal.

    Parameters
    ----------
    W : numpy.ndarray of shape (n, n)
        Real symmetric matrix. Only one triangle is read by the solver, so a
        non-symmetric input is silently treated as symmetric.

    Returns
    -------
    numpy.ndarray of shape (n, n)
        Real symmetric matrix S such that ``S @ S`` equals `W` up to rounding.
    """
    D, V = np.linalg.eigh(W)
    # Eigenvalues of a PSD matrix can come out as tiny negatives because of
    # rounding; clip them so the square root does not produce NaN.
    D = np.clip(D, 0.0, None)
    # V * sqrt(D) scales column k of V by sqrt(D[k]), i.e. V @ diag(sqrt(D)).
    return np.dot(V * np.sqrt(D), V.T)
    


def solveWKRR(K,W,z,lambda_):
    """Solve the weighted kernel ridge regression (WKRR) problem.

    Minimises ``(1/n) (K a - z)^T W (K a - z) + lambda_ * a^T K a`` over a,
    whose solution is::

        a = W^{1/2} (W^{1/2} K W^{1/2} + n * lambda_ * I)^{-1} W^{1/2} z

    Parameters
    ----------
    K : numpy.ndarray of shape (n, n)
        Kernel (Gram) matrix.
    W : numpy.ndarray of shape (n, n)
        Symmetric positive semi-definite weight matrix.
    z : numpy.ndarray of shape (n,) or (n, 1)
        Regression targets.
    lambda_ : float
        Regularisation strength.

    Returns
    -------
    numpy.ndarray
        Coefficient vector, with the same shape as `z`.
    """
    n = K.shape[0]
    W_sqrt = np.real(sqrtMatrix(W))

    # Symmetric system matrix W^{1/2} K W^{1/2} + n*lambda*I.
    system = np.dot(np.dot(W_sqrt,K),W_sqrt) +  n*lambda_*np.eye(n)
    # The outer factor is W^{1/2}: multiplying by W itself would scale the
    # solution by an extra W^{1/2} and no longer solve (W K + n*lambda*I) a = W z.
    return  np.dot(W_sqrt,np.linalg.solve(system,np.dot(W_sqrt,z)))

def solveKLR(K,y,alpha0,lambda_ = 1,itermax = 10, eps =1e-3):
    """Fit kernel logistic regression by iteratively re-weighted WKRR solves.

    Each iteration computes the current scores ``m = K alpha``, derives
    per-sample weights and working targets from them, and replaces alpha by
    the solution of the corresponding weighted kernel ridge regression.

    Parameters
    ----------
    K : numpy.ndarray of shape (n, n)
        Kernel (Gram) matrix of the training set.
    y : array-like of length n
        Training labels. The updates multiply labels by scores, which
        presumably expects labels in {-1, +1}; confirm against the data.
    alpha0 : array-like of length n
        Initial coefficients.
    lambda_ : float, default 1
        Regularisation strength forwarded to solveWKRR.
    itermax : int, default 10
        Maximum number of iterations.
    eps : float, default 1e-3
        Stop early once the largest absolute change of any coefficient
        between two consecutive iterations falls below this value.

    Returns
    -------
    numpy.ndarray of shape (n, 1)
        Fitted coefficients as a column vector.
    """
    n = K.shape[0]
    # Work with column vectors throughout so every iteration sees the same
    # shapes; positional conversion also avoids label-based Series indexing.
    y_col = np.asarray(y, dtype=float).reshape(n, 1)
    alpha = np.asarray(alpha0, dtype=float).reshape(n, 1)

    for iter_ in range(itermax):
        print(iter_)
        m = np.dot(K,alpha)
        P = -sigmoid(-y_col*m)
        # Diagonal of the weight matrix. It tends to 0 as |m| grows, in which
        # case the division below yields inf/NaN.
        w = sigmoid(m)*sigmoid(-m)
        z = m - (P*y_col)/w
        W = np.diag(w.ravel())
        alpha_new = solveWKRR(K,W,z,lambda_)
        step = np.max(np.abs(alpha_new - alpha))
        alpha = alpha_new
        if step < eps:
            break

    return alpha
            
    


In [5]:
# Reload the Gram matrix of training set 0 from the file written by the
# kernel-computation cell above, instead of recomputing it.
K = np.load("Kernel_Matrix/spectrum_kernel_Xtr0.npy")
# NOTE(review): solveKLR multiplies labels by scores (y[i]*m[i]), which
# presumably expects labels in {-1, +1} -- confirm the encoding of 'Bound'.
y = df_Ytr0['Bound']
n = y.shape[0]
# Start the iterations from the all-zero coefficient vector.
alpha0 = np.zeros(n) 

In [6]:
# Fit on training set 0 with solveKLR's default lambda_, itermax and eps.
# solveKLR prints the iteration index at each step (the numbers shown below).
alpha = solveKLR(K,y,alpha0) 

0
1
2
3
4
5
6
7
8
9


### 1.3 Results