In [1]:
import numpy as np
import pandas as pd 
#from tqdm import tqdm
import os 

# Kernel Methods for Machine Learning

## Data Loading and exploration

In [9]:
# Train features
df_Xtr0 = pd.read_csv("Data/Xtr0.csv")
df_Xtr1 = pd.read_csv("Data/Xtr1.csv")
df_Xtr2 = pd.read_csv("Data/Xtr2.csv")

df_Xtr0_mat100 = pd.read_csv("Data/Xtr0_mat100.csv", header=None, sep=' ')
df_Xtr1_mat100 = pd.read_csv("Data/Xtr1_mat100.csv", header=None, sep=' ')
df_Xtr2_mat100 = pd.read_csv("Data/Xtr2_mat100.csv", header=None, sep=' ')

# Train labels
df_Ytr0 = pd.read_csv("Data/Ytr0.csv")
df_Ytr1 = pd.read_csv("Data/Ytr1.csv")
df_Ytr2 = pd.read_csv("Data/Ytr2.csv")

# Test features
df_Xte0 = pd.read_csv("Data/Xte0.csv")
df_Xte1 = pd.read_csv("Data/Xte1.csv")
df_Xte2 = pd.read_csv("Data/Xte2.csv")

df_Xte0_mat100 = pd.read_csv("Data/Xte0_mat100.csv", header=None, sep=' ')
df_Xte1_mat100 = pd.read_csv("Data/Xte1_mat100.csv", header=None, sep=' ')
df_Xte2_mat100 = pd.read_csv("Data/Xte2_mat100.csv", header=None, sep=' ')

## 2. Gaussian Kernel

### 2.1 Define the kernel

In [74]:
## We need to parallize its computation
def GaussKernel(X1, X2, sigma = 1):
    n, m = X1.shape[0], X2.shape[0]
    K = np.zeros((n,m))
    
    for i in range(n):
        for j in range(m):
            K[i,j] = np.sqrt((1/(2*np.pi*sigma**2)))*np.exp(-((np.linalg.norm(X1[i]-X2[j]))**2)/(2*sigma**2))
    return K

We compute the Kernel matrix for each of the tree train sets and we save them in *Kernel_Matrix* directory

In [11]:
# Tranforming into numpy.arrays -- train
Xtr0_mat100 = np.array(df_Xtr0_mat100)
Xtr1_mat100 = np.array(df_Xtr1_mat100)
Xtr2_mat100 = np.array(df_Xtr2_mat100)

# Tranforming into numpy.arrays -- test
Xte0_mat100 = np.array(df_Xte0_mat100)
Xte1_mat100 = np.array(df_Xte1_mat100)
Xte2_mat100 = np.array(df_Xte2_mat100)

# Transforming the labels into numpy.arrays 
y0 = np.array(df_Ytr0)[:,1]
y1 = np.array(df_Ytr1)[:,1]
y2 = np.array(df_Ytr2)[:,1]

In [13]:
# We should parallelize this computation
K_Xtr0 = GaussKernel(Xtr0_mat100, Xtr0_mat100)
np.save("Kernel_Matrix/gaussian_kernel_Xtr0.npy",K_Xtr0)

K_Xtr1 = GaussKernel(Xtr1_mat100, Xtr1_mat100)
np.save("Kernel_Matrix/gaussian_kernel_Xtr1.npy",K_Xtr1)

K_Xtr2 = GaussKernel(Xtr2_mat100, Xtr2_mat100)
np.save("Kernel_Matrix/gaussian_kernel_Xtr2.npy",K_Xtr2)

### 2.2. Implement SVM with gaussian kernel

We solve the optimization problem $$\left\{\begin{matrix}
\underset{\alpha \in \mathbb{R}^{n}}{\text{max}} \hspace{0.1cm} 2\alpha^{T}y - \alpha^{T}K\alpha \\ 0 \leq y_i\alpha_i \leq \frac{1}{2\lambda n}, \hspace{0.5cm} \text{for} \hspace{0.3cm} i = 0...n
\end{matrix}\right. \Leftrightarrow \left\{\begin{matrix}
\underset{\alpha \in \mathbb{R}^{n}}{\text{min}} \hspace{0.1cm} \frac{1}{2}\alpha^{T}P\alpha + q^{t}\alpha  \\ G\alpha \leq h
\end{matrix}\right.   $$
Where $\tilde{P} = K$, $q = -y$, $G =\binom{\text{Diag}(y)}{-\text{Diag}(y)} $ and $h=\binom{\frac{1}{2\lambda n}\mathcal{1}}{0}$

In [14]:
import cvxopt
from cvxopt import matrix

def solve_dual_SVM(K,y, lambda_ = 1):
    n = K.shape[0] 
    G = np.vstack((np.diag(y),-np.diag(y)))
    h = np.vstack(((1/(2*lambda_*n))*np.ones((n,1)),np.zeros((n,1))))

    P = K
    q = -y.reshape(-1,1)
    #P = .5 * (P + P.T)  # make sure P is symmetric
    args = [matrix(P), matrix(q)]

    args.extend([matrix(G), matrix(h)])

    sol = cvxopt.solvers.qp(*args) 

    return np.array(sol['x']).reshape((P.shape[1],))
    


In [None]:
alpha_star0 = solve_dual_SVM(K_Xtr0,2*y0-1., lambda_= 0.000001)
alpha_star1 = solve_dual_SVM(K_Xtr1,2*y1-1., lambda_= 0.000001)
alpha_star2 = solve_dual_SVM(K_Xtr2,2*y2-1., lambda_= 0.000001)

### 2.3. Predictions on test set  

In [None]:
# We should parallelize this computation
K_Xte0 = GaussKernel(Xtr0_mat100, Xte0_mat100)
np.save("Kernel_Matrix/gaussian_kernel_Xte0.npy",K_Xte0)

K_Xte1 = GaussKernel(Xtr1_mat100, Xte1_mat100)
np.save("Kernel_Matrix/gaussian_kernel_Xte1.npy",K_Xte1)

K_Xte2 = GaussKernel(Xtr2_mat100, Xte2_mat100)
np.save("Kernel_Matrix/gaussian_kernel_Xte2.npy",K_Xte2)

In [None]:
prediction0 = alpha_star0.reshape(-1,1).T.dot(K_Xte0)
prediction0[prediction0>0] = 1
prediction0[prediction0 <0] = 0

prediction1 = alpha_star1.reshape(-1,1).T.dot(K_Xte1)
prediction1[prediction1>0] = 1
prediction1[prediction1 <0] = 0

prediction2 = alpha_star2.reshape(-1,1).T.dot(K_Xte2)
prediction2[prediction2>0] = 1
prediction2[prediction2 <0] = 0

### 2.4. Evaluation

In [None]:
train_prediction0 = (np.sign(alpha_star0.reshape(-1,1).T.dot(K_Xtr0))+1)/2
print('Train Accuracy 0:',1- np.abs(train_prediction0 - y0).sum()/y0.shape[0])

train_prediction1 = (np.sign(alpha_star1.reshape(-1,1).T.dot(K_Xtr1))+1)/2
print('Train Accuracy 1:',1- np.abs(train_prediction1 - y1).sum()/y1.shape[0])

train_prediction2 = (np.sign(alpha_star2.reshape(-1,1).T.dot(K_Xtr2))+1)/2
print('Train Accuracy 2:',1 - np.abs(train_prediction2 - y2).sum()/y2.shape[0])

### 2.5. Writting the results

In [None]:
predictions = np.squeeze(np.hstack((prediction0, prediction1, prediction2))).astype(int)
df = pd.DataFrame({'Bound': predictions,
                   'Id': np.arange(3000)})
df = df[['Id','Bound']]

df.to_csv("Predictions/gaussian_SVM.csv",index = False)

### 1.4 Sickit-learn

In [None]:
from sklearn.svm import SVC

clf = SVC(gamma='auto')
clf.fit(Xtr0_mat100, y0)
predciton0_sk = clf.predict(Xte0_mat100)

clf = SVC(gamma='auto')
clf.fit(Xtr1_mat100, y1)
predciton1_sk = clf.predict(Xte1_mat100)

clf = SVC(gamma='auto')
clf.fit(Xtr2_mat100, y2)
predciton2_sk = clf.predict(Xte2_mat100)

## 1. Spectrum Kernel

### 1.1. Define the kernel

In [None]:
def getSubString(mString, spectrum):
    
    dictionnary = {}
    for i in range(len(mString)-spectrum+1):
        if mString[i:i+spectrum] in dictionnary:
            dictionnary[mString[i:i+spectrum]] += 1
        else:
            dictionnary[mString[i:i+spectrum]] = 1
    return dictionnary

def SpectrumKernelFunction(mString1, mString2, spectrum):
    dictionnary1 = getSubString(mString1, spectrum)
    dictionnary2 = getSubString(mString2, spectrum)
    
    kernel = 0
    for i in dictionnary1:
        if i in dictionnary2:
            kernel += dictionnary1[i] * dictionnary2[i]
    return kernel

## We should improve this function to take less time
def SpectrumKernelMatrix_train(serie,spectrum):
    n = serie.shape[0]
    K = np.zeros((n,n))
    for i,seq1 in enumerate(serie):
        for j,seq2 in enumerate(serie):
            if i <= j :
                K[i,j] = SpectrumKernelFunction(seq1, seq2, spectrum)
                K[j,i] = K[i,j]
    return(K)

def SpectrumKernelMatrix_test(serie_train, serie_test, spectrum):
    n = serie_train.shape[0]
    m = serie_test.shape[0]
    K = np.zeros((n,m))
    for i,seq1 in enumerate(serie_test):
        for j,seq2 in enumerate(serie_train):
            K[j,i] = SpectrumKernelFunction(seq1, seq2, spectrum)
    return(K)
    

We compute the Kernel matrix for each of the tree train sets and we save them in *Kernel_Matrix* directory

In [None]:
# We should parallelize this computation

if os.path.isfile("Kernel_Matrix/spectrum_kernel_Xtr0.npy"):
    K_Xtr0 = np.load("Kernel_Matrix/spectrum_kernel_Xtr0.npy")
else:
    K_Xtr0 = SpectrumKernelMatrix_train(df_Xtr0['seq'],spectrum=3)
    np.save("Kernel_Matrix/spectrum_kernel_Xtr0.npy",K_Xtr0)

if os.path.isfile("Kernel_Matrix/spectrum_kernel_Xtr1.npy"):
    K_Xtr1 = np.load("Kernel_Matrix/spectrum_kernel_Xtr1.npy")
else:
    K_Xtr1 = SpectrumKernelMatrix_train(df_Xtr0['seq'],spectrum=3)
    np.save("Kernel_Matrix/spectrum_kernel_Xtr1.npy",K_Xtr1)

if os.path.isfile("Kernel_Matrix/spectrum_kernel_Xtr2.npy"):
    K_Xtr2 = np.load("Kernel_Matrix/spectrum_kernel_Xtr2.npy")
else:
    K_Xtr2 = SpectrumKernelMatrix_train(df_Xtr0['seq'],spectrum=3)
    np.save("Kernel_Matrix/spectrum_kernel_Xtr2.npy",K_Xtr2)

We compute the Kernel matrix for each of the tree test sets and we save them in *Kernel_Matrix* directory

In [None]:
# We should parallelize this computation
if os.path.isfile("Kernel_Matrix/spectrum_kernel_Xte0.npy"):
    K_Xte0 = np.load("Kernel_Matrix/spectrum_kernel_Xte0.npy")
else:
    K_Xte0 = SpectrumKernelMatrix_test(df_Xtr0['seq'],df_Xte0['seq'],spectrum=3)
    np.save("Kernel_Matrix/spectrum_kernel_Xte0.npy",K_Xtr0)

if os.path.isfile("Kernel_Matrix/spectrum_kernel_Xte1.npy"):
    K_Xte1 = np.load("Kernel_Matrix/spectrum_kernel_Xte1.npy")
else:
    K_Xte1 = SpectrumKernelMatrix_test(df_Xtr1['seq'],df_Xte1['seq'],spectrum=3)
    np.save("Kernel_Matrix/spectrum_kernel_Xte1.npy",K_Xtr1)

if os.path.isfile("Kernel_Matrix/spectrum_kernel_Xte2.npy"):
    K_Xte2 = np.load("Kernel_Matrix/spectrum_kernel_Xte2.npy")
else:
    K_Xte2 = SpectrumKernelMatrix_test(df_Xtr2['seq'],df_Xte2['seq'],spectrum=3)
    np.save("Kernel_Matrix/spectrum_kernel_Xte2.npy",K_Xtr2)

### 1.2. Solve the standard weighted kernel logisitc regression (WKLR) problem

In [None]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

### We need to improve this ####
def sqrtMatrix(W):
    # To compute the square root of a symetric positive matrix
    D,V = np.linalg.eig(W)
    return np.dot(np.dot(V,np.diag(np.sqrt(D))),np.linalg.inv(V))

def solveWKRR(K,W,z,lambda_):
    n = K.shape[0]
    W_sqrt = np.real(sqrtMatrix(W))
    
    temp = np.dot(np.dot(W_sqrt,K),W_sqrt) +  n*lambda_*np.eye(n)
    return  np.dot(W_sqrt,np.linalg.solve(temp,np.dot(W_sqrt,z)))

def solveKLR(K,y,alpha0,lambda_ = 1,itermax = 30, eps =1e-6):
    n = K.shape[0]
    
    iter_ = 0
    last_alpha = 10*alpha0 + np.ones(alpha0.shape)
    alpha = alpha0
    
    while (iter_< itermax) and (np.linalg.norm(last_alpha-alpha)>eps) :         
        print(iter_,np.linalg.norm(last_alpha-alpha))
        last_alpha = alpha
        m = np.dot(K,alpha)
        P = np.zeros((n,1))
        W = np.zeros((n,n))
        z = np.zeros((n,1))
        for i in range(n):
            P[i,0] = -sigmoid(-y[i]*m[i])
            W[i,i] = sigmoid(m[i])*sigmoid(-m[i])
            z[i,0] = m[i] - (P[i,0]*y[i])/W[i,i]
        alpha = solveWKRR(K,W,z,lambda_)
        iter_ = iter_ +1
        
      
    return alpha        

In [None]:
K0 = K_Xtr0
y_0 = y0.reshape((y0.shape[0],1))
y_0 = 2*y_0-1
n = y_0.shape[0]
alpha0 = np.zeros((n,1))
alpha_0 = solveKLR(K0,y_0,alpha0,10) 

K1 = K_Xtr1
y_1 = y1.reshape((y1.shape[0],1))
y_1 = 2*y_1-1
n = y_1.shape[0]
alpha0 = np.zeros((n,1))
alpha_1 = solveKLR(K1,y_1,alpha0,10) 

K2 = K_Xtr2
y_2 = y2.reshape((y2.shape[0],1))
y_2 = 2*y_2-1
n = y_2.shape[0]
alpha0 = np.zeros((n,1))
alpha_2 = solveKLR(K2,y_2,alpha0,10) 

In [None]:
def sign(x):
    y = x
    n = x.shape[0]
    for i in range(n):
        if x[i,0] > 0:
            y[i,0] = 1
        else:
            y[i,0] = 0
    return y

print('Accuracy:',np.linalg.norm(1-sign(np.dot(K0,alpha_0))+y_0,1)/y_0.shape[0])
print('Accuracy:',np.linalg.norm(1-sign(np.dot(K1,alpha_1))+y_1,1)/y_0.shape[0])
print('Accuracy:',np.linalg.norm(1-sign(np.dot(K2,alpha_2))+y_2,1)/y_0.shape[0])

### 1.3 Results

## 3. Mismatch Kernel 

In [3]:
import numpy as np
from multiprocessing import Pool
from itertools import product

ngrams = lambda a, n: list(zip(*[a[i:] for i in range(n)])) #function that extract all the n grams in a given sequence

def AllPossibleCombinationlist(char_list,n):
    '''
    Compute all the possible ngrams that we can obtain from a list of char 
    This function will allow us to have a correspondance between all our histograms representing
    each sequences because the bin i will represent the same n gram (given by the i-th value of the list
    that we are returning) in all our histograms
    Param: char_list: (list) list of possible char
    n: (int) n in ngram - length of the subsequences considered
    '''
    #n corresponds to n in n gram
    return list(product(char_list,repeat=n))

def CreateHistogramSeq(Seq,AllCombinList,n):
    '''
    Create the embedding that allows to compute the spectrum kernel: histogram of all the subsequences of length n
    in the sequence
    Param: Seq: (str) DNA sequence containing only the letter A,C,G,T
    n: (int) length of the subsequences considered
    AllCombinList: (list) a list containing all the possible combination of length n that we can compute using the letters
    A C G T
    Return: value : np.array contains the representation of the sequence as an array
    '''
    decompose_seq= ngrams(Seq,n)
    value = np.zeros([len(AllCombinList),])
    for ngram in decompose_seq:
        index_ngram = AllCombinList.index(ngram)
        value[index_ngram] = value[index_ngram]+1
    return value

def CreateHistogramMismatchSeq(Seq,AllCombinList,n):
    '''
    Create the embedding that allows to compute the mismatch kernel: histogram of all the subsequences of length n
    in the sequence. This time allows one mismatch.
    Param: @Seq: (str) DNA sequence containing only the letter A,C,G,T
    @n: (int) length of the subsequences considered
    @AllCombinList: (list) a list containing all the possible combination of length n that we can compute using the letters
    A C G T
    Return: value : np.array contains the representation of the sequence as an array
    '''
    letters = ['A','C','G','T']
    decompose_seq= ngrams(Seq,n)
    value = np.zeros([len(AllCombinList),])
    for ngram in decompose_seq:
        index_ngram = AllCombinList.index(ngram)
        value[index_ngram] = value[index_ngram]+1
        copy_ngram = list(ngram)
        for ind,cur_letter in enumerate(copy_ngram):
            for letter in letters:
                if letter!=cur_letter:
                    new_ngram = list(copy_ngram)
                    new_ngram[ind]= letter
                    mismatch_ngram = tuple(new_ngram)
                    index_ngram = AllCombinList.index(mismatch_ngram)
                    value[index_ngram] = value[index_ngram]+0.1
    return value



def compute_idf(list_histograms):
    '''
    Compute the idf score for all the subsequences that appears in our computation.
    If a sequences appears rarely it will have a higher score than if it appears really frequently
    Param: @list_histograms: list of histograms that as been computed
    Return: (np.array) compute the idf score for all the bins in the histogram
    '''
    idf = 0.000001*np.ones((list_histograms.shape[1]))
    for sent in list_histograms:
        idf += np.array(sent)
    
    idf= np.maximum(1, np.log10(len(list_histograms) / (idf)))
    
    return idf


def compute_kernel_histogram(x1,x2):
    """ 
    Compute the scalar product between x1 and x2 (linear kernel in the embedding given in the Spectrum Kernel space)
    Param: @x1: (np.array) data 1 to use to feed the linear kernel computation
    @x2: (np.array) data 2 to use to feed the linear kernel computation
    """
    value= np.vdot(x1,x2)
    return value




'''
In order to allow the use of parallelization and the 
multiprocessing library we have computed some really basic functions using classes
The next few functions must be easy to understand
'''
compute_diag = lambda X,i: compute_kernel_histogram(X[i], X[i])
compute_element_kernel_square = lambda X1,sim_docs_kernel_value,i,j: compute_kernel_histogram(X1[i], X1[j])/(sim_docs_kernel_value[i] *sim_docs_kernel_value[j])**0.5
compute_element_kernel = lambda X1,X2,sim_docs_kernel_value,i,j: compute_kernel_histogram(X1[i], X2[j])/(sim_docs_kernel_value[1][i] *sim_docs_kernel_value[2][j])**0.5


class compute_diag_copy(object):
    def __init__(self, X):
        self.X = X
    def __call__(self, i):
        return compute_diag(self.X,i)

class compute_element_i(object):
    def __init__(self, X,sim_docs_kernel_value,i):
        self.X = X
        self.sim_docs_kernel_value = sim_docs_kernel_value
        self.i = i
    def __call__(self, j):
        return compute_element_kernel_square(self.X,self.sim_docs_kernel_value,self.i,j)

class compute_element_i_general(object):
    def __init__(self, X,X_p,sim_docs_kernel_value,i):
        self.X = X
        self.X_p = X_p
        self.sim_docs_kernel_value = sim_docs_kernel_value
        self.i = i
    def __call__(self, j):
        return compute_element_kernel(self.X,self.X_p,self.sim_docs_kernel_value,self.i,j)
    
    




def histogram_kernel(X1,X2=[]):
    '''
    This function computes the spectrum kernel gram matrix. (Because we assume that X1 and X2 are given
    in the rkhs space this kernel is equivalent to a basic linear kernel)
    Param: @X1: (np.array)(nb_sample,nb_features) Training data
    @X2: (np.array)(nb_sample,nb_features) Testing data (if empty compute the gram matrix for training else compute
    the gram matrix for testing)
    @n_proc: (int) allows to use more processor in order to compute the gram matrix quickly
    Return: Spectrum Kernel Gram matrix
    '''

    len_X2= len(X2)
    len_X1 = len(X1)
    sim_docs_kernel_value = {}
    if len_X2 ==0:
        gram_matrix = np.zeros((len_X1, len_X1), dtype=np.float32)
        for i in range(len_X1):
            sim_docs_kernel_value[i] = compute_diag_copy(X1)(i)
                            
        for i in range(len_X1):
            for j in range(i,len_X1):    
                gram_matrix[i, j]= compute_element_i(X1,sim_docs_kernel_value,i)(j)
                gram_matrix[j, i] = gram_matrix[i, j]
        #calculate Gram matrix
        return gram_matrix
    
    else:
        gram_matrix = np.zeros((len_X1, len_X2), dtype=np.float32)
    
        sim_docs_kernel_value[1] = {}
        sim_docs_kernel_value[2] = {}
        for i in range(len_X1):
            sim_docs_kernel_value[1][i] = compute_diag_copy(X1)(i)
        for j in range(len_X2):
            sim_docs_kernel_value[2][j] = compute_diag_copy(X2)(j)
    
        for i in range(len_X1):
            for j in range(len_X2):
                gram_matrix[i, j] = compute_element_i_general(X1,X2,sim_docs_kernel_value,i)(j)    
        return gram_matrix

In [4]:
# Tranforming into numpy.arrays -- train
Xtr0 = list(df_Xtr0['seq'])
Xtr1 = list(df_Xtr1['seq'])
Xtr2 = list(df_Xtr2['seq'])

# Tranforming into numpy.arrays -- test
Xte0 = list(df_Xte0['seq'])
Xte1 = list(df_Xte1['seq'])
Xte2 = list(df_Xte2['seq'])

# Transforming the labels into numpy.arrays 
y0 = np.array(df_Ytr0)[:,1]
y1 = np.array(df_Ytr1)[:,1]
y2 = np.array(df_Ytr2)[:,1]


In [5]:
nngram = 7 #param
list_all_combin_DNA = AllPossibleCombinationlist(['A','C','G','T'],nngram)
X_train_histo_0 = np.empty([len(Xtr0),len(list_all_combin_DNA)])
X_test_histo_0 = np.empty([len(Xte0),len(list_all_combin_DNA)])
X_train_histo_1 = np.empty([len(Xtr1),len(list_all_combin_DNA)])
X_test_histo_1 = np.empty([len(Xte1),len(list_all_combin_DNA)])
X_train_histo_2 = np.empty([len(Xtr2),len(list_all_combin_DNA)])
X_test_histo_2 = np.empty([len(Xte2),len(list_all_combin_DNA)])

for i in range(len(Xtr0)):
    X_train_histo_0[i,:] = CreateHistogramMismatchSeq(Xtr0[i],list_all_combin_DNA,nngram)
    
for j in range(len(Xte0)):
    X_test_histo_0[j,:] = CreateHistogramMismatchSeq(Xte0[j],list_all_combin_DNA,nngram)

    
for i in range(len(Xtr1)):
    X_train_histo_1[i,:] = CreateHistogramMismatchSeq(Xtr1[i],list_all_combin_DNA,nngram)
    
for j in range(len(Xte1)):
    X_test_histo_1[j,:] = CreateHistogramMismatchSeq(Xte1[j],list_all_combin_DNA,nngram)


for i in range(len(Xtr2)):
    X_train_histo_2[i,:] = CreateHistogramMismatchSeq(Xtr2[i],list_all_combin_DNA,nngram)
    
for j in range(len(Xte2)):
    X_test_histo_2[j,:] = CreateHistogramMismatchSeq(Xte2[j],list_all_combin_DNA,nngram)

In [37]:
X_train_split_0 = X_train_histo_0
y_train_split_0 = y0
y_train_split_0[y_train_split_0==0]=-1

X_train_split_1 = X_train_histo_1
y_train_split_1 = y1
y_train_split_1[y_train_split_1==0]=-1 

X_train_split_2 = X_train_histo_2
y_train_split_2 = y2
y_train_split_2[y_train_split_2==0]=-1

In [38]:
#################Compute the train gram matrices#######################
gram_train_multi_proc = histogram_kernel(X_train_split_0)          
gram_train_multi_proc_1 = histogram_kernel(X_train_split_1)
gram_train_multi_proc_2 = histogram_kernel(X_train_split_2)

#################Compute test gram matrices#######################
gram_test_final_0 =  histogram_kernel(X_train_split_0,X_test_histo_0)
gram_test_final_1 =  histogram_kernel(X_train_split_1,X_test_histo_1)
gram_test_final_2 =  histogram_kernel(X_train_split_2,X_test_histo_2)


In [43]:
gram_train_multi_proc.shape

(2000, 2000)

### 3.1. Define Kernel 

In [None]:
from strkernel.mismatch_kernel import MismatchKernel, preprocess

after_process = preprocess(list(df_Xtr0['seq']) + list(df_Xte0['seq']))
mismatch_kernel = MismatchKernel(l=4, k=5, m=1).get_kernel(after_process)
K_Xtr0 = mismatch_kernel.kernel
np.save("Kernel_Matrix/mismatch_kernel_Xtr0.npy",K_Xtr0)

after_process = preprocess(list(df_Xtr1['seq']))
mismatch_kernel = MismatchKernel(l=4, k=5, m=1).get_kernel(after_process)
K_Xtr1 = mismatch_kernel.kernel
np.save("Kernel_Matrix/mismatch_kernel_Xtr1.npy",K_Xtr1)

after_process = preprocess(list(df_Xtr2['seq']))
mismatch_kernel = MismatchKernel(l=4, k=5, m=1).get_kernel(after_process)
K_Xtr2 = mismatch_kernel.kernel
np.save("Kernel_Matrix/mismatch_kernel_Xtr2.npy",K_Xtr2)

In [None]:
after_process = preprocess(list(df_Xtr0['seq']) + list(df_Xte0['seq']))
mismatch_kernel = MismatchKernel(l=4, k=4, m=1).get_kernel(after_process)

In [None]:
mismatch_kernel = 0

## 4.tf_idf

### 4.1.spectrum_embedding

In [65]:
import itertools

def k_mers(Alphabet, k):
    'Compute all possible words of size k from Alphabet'
    'Store the result as a dictionnary where the keys are the words and the values ar integer Ids'
    all_kmers_tuple = list(itertools.product(Alphabet, repeat=k))
    all_kmers = list(map(lambda tup: ''.join(tup), all_kmers_tuple))
    return dict(zip(all_kmers, range(len(all_kmers))))

    
def spectrum_embedding(sequence, all_kmers_dict, k):
    'Compute the k-sepctrum embedding of sequence'
    'The result is a vector of size vocabulary'
    embedding = np.zeros(len(all_kmers_dict))
    for idx in range(len(sequence)-k+1): # slidding window of size k on the sequence
        word_id = all_kmers_dict[sequence[idx:idx+k]]
        embedding[word_id] += 1  
    return(embedding)

def data_embedding(df, all_kmers_dict, k):
    nb_sequences = df.shape[0]
    embedding_dict = dict.fromkeys(range(nb_sequences))
    for idx,sequence in enumerate(list(df['seq'])):
        embedding = spectrum_embedding(sequence, all_kmers_dict, k)
        embedding_dict[idx] = embedding  
    return embedding_dict
    
        

Compute the embeddings for all data sets 

In [66]:
k = 4
Alphabet = ['G', 'C', 'A', 'T']

all_kmers_dict = k_mers(Alphabet, k)


Xtr0_embedding = data_embedding(df_Xtr0,all_kmers_dict,k)
Xtr1_embedding = data_embedding(df_Xtr1,all_kmers_dict,k)
Xtr2_embedding = data_embedding(df_Xtr2,all_kmers_dict,k)

Xte0_embedding = data_embedding(df_Xte0,all_kmers_dict,k)
Xte1_embedding = data_embedding(df_Xte1,all_kmers_dict,k)
Xte2_embedding = data_embedding(df_Xte2,all_kmers_dict,k)

### 4.2. Computing tf_idf scores 

In [67]:
def counting_array(embedding_dict, all_kmers_dict):
    'Compute the count matrix of kmer by sequence'
    'Output is an array of size nb_seq*vocab_size'
    D = len(embedding_dict) # Number of documents
    T = len(all_kmers_dict) # Vocabulary size
    
    output = np.zeros((D,T))
    for document in embedding_dict.keys():
        output[document] = embedding_dict[document]
    return output

def tf_idf(D):
    'Input : D is a conting matrix (nb_seq*vocab_size)'
    'Ouptut: array of tf_idf socres'
    N = D.shape[0] # number of documents
    idf = np.log(N/np.count_nonzero(D,axis=1)).reshape(-1,1)
    tf = np.log(1+D/np.sum(D,axis = 1).reshape(-1,1))
    return tf*idf

Compute the tf_idf for each dataset

In [68]:
# Tf-Idf numpy.arrays -- train
D_tr0 = counting_array(Xtr0_embedding, all_kmers_dict)
Xtr0 = tf_idf(D_tr0)

D_tr1 = counting_array(Xtr1_embedding, all_kmers_dict)
Xtr1 = tf_idf(D_tr1)

D_tr2 = counting_array(Xtr2_embedding, all_kmers_dict)
Xtr2 = tf_idf(D_tr2)


# Tf-Idf numpy.arrays -- test
D_te0 = counting_array(Xte0_embedding, all_kmers_dict)
Xte0 = tf_idf(D_te0)

D_te1 = counting_array(Xte1_embedding, all_kmers_dict)
Xte1 = tf_idf(D_te1)

D_te2 = counting_array(Xte2_embedding, all_kmers_dict)
Xte2 = tf_idf(D_te2)

# Transforming the labels into numpy.arrays 
y0 = 2*np.array(df_Ytr0)[:,1] - 1
y1 = 2*np.array(df_Ytr1)[:,1] - 1
y2 = 2*np.array(df_Ytr2)[:,1] - 1

### 4.3.  SVM + Guaussian on tf_idf embeddings

Compute guassian kernel based on this embeddings

In [69]:
# We should parallelize this computation
if os.path.isfile("Kernel_Matrix/tf_idf_kernel_Xtr0.npy"):
    K_Xtr0 = np.load("Kernel_Matrix/tf_idf_kernel_Xtr0.npy")
else:
    K_Xtr0 = GaussKernel(Xtr0, Xtr0, sigma = 0.1)
    np.save("Kernel_Matrix/tf_idf_kernel_Xtr0.npy",K_Xtr0)
    
if os.path.isfile("Kernel_Matrix/tf_idf_kernel_Xtr1.npy"):
    K_Xtr1 = np.load("Kernel_Matrix/tf_idf_kernel_Xtr1.npy")
else:
    K_Xtr1 = GaussKernel(Xtr1, Xtr1, sigma = 0.1)
    np.save("Kernel_Matrix/tf_idf_kernel_Xtr1.npy",K_Xtr1)
    
if os.path.isfile("Kernel_Matrix/tf_idf_kernel_Xtr2.npy"):
    K_Xtr2 = np.load("Kernel_Matrix/tf_idf_kernel_Xtr2.npy")
else:
    K_Xtr2 = GaussKernel(Xtr2, Xtr2, sigma = 0.1)
    np.save("Kernel_Matrix/tf_idf_kernel_Xtr1.npy",K_Xtr2)
    
if os.path.isfile("Kernel_Matrix/tf_idf_kernel_Xte0.npy"):
    K_Xte0 = np.load("Kernel_Matrix/tf_idf_kernel_Xte0.npy")
else:
    K_Xte0 = GaussKernel(Xtr0, Xte0, sigma = 0.1)
    np.save("Kernel_Matrix/tf_idf_kernel_Xte1.npy",K_Xte0)

if os.path.isfile("Kernel_Matrix/tf_idf_kernel_Xte1.npy"):
    K_Xte1 = np.load("Kernel_Matrix/tf_idf_kernel_Xte1.npy")
else:
    K_Xte1 = GaussKernel(Xtr1, Xte1, sigma = 0.1)
    np.save("Kernel_Matrix/tf_idf_kernel_Xte1.npy",K_Xte1)
    
if os.path.isfile("Kernel_Matrix/tf_idf_kernel_Xte2.npy"):
    K_Xte2 = np.load("Kernel_Matrix/tf_idf_kernel_Xte2.npy")
else:
    K_Xte2 = GaussKernel(Xtr2, Xte2, sigma = 0.1)
    np.save("Kernel_Matrix/tf_idf_kernel_Xte1.npy",K_Xte2)
 

# Transforming the labels into numpy.arrays 
y0 = 2*np.array(df_Ytr0)[:,1] - 1.
y1 = 2*np.array(df_Ytr1)[:,1] - 1.
y2 = 2*np.array(df_Ytr2)[:,1] - 1.

Solve SVM like in part I

In [70]:
alpha_star0 = solve_dual_SVM(K_Xtr0,y0, lambda_= 1e-8)
alpha_star1 = solve_dual_SVM(K_Xtr1,y1, lambda_= 1e-8)
alpha_star2 = solve_dual_SVM(K_Xtr2,y2, lambda_= 1e-8)

     pcost       dcost       gap    pres   dres
 0:  6.9775e+10 -2.1049e+11  3e+11  5e-17  3e-11
 1:  1.6532e+10 -2.4954e+10  4e+10  2e-16  2e-11
 2:  2.4917e+09 -2.9045e+09  5e+09  2e-16  9e-12
 3:  3.5990e+08 -4.0697e+08  8e+08  2e-16  3e-12
 4:  5.1575e+07 -5.7631e+07  1e+08  2e-16  1e-12
 5:  7.3441e+06 -8.2776e+06  2e+07  2e-16  5e-13
 6:  1.0322e+06 -1.2046e+06  2e+06  2e-16  2e-13
 7:  1.3983e+05 -1.8047e+05  3e+05  2e-16  8e-14
 8:  1.6649e+04 -2.9087e+04  5e+04  2e-16  2e-14
 9:  7.6100e+02 -5.6383e+03  6e+03  2e-16  1e-14
10: -8.9775e+02 -1.6757e+03  8e+02  2e-16  3e-15
11: -9.8192e+02 -1.0202e+03  4e+01  2e-16  2e-15
12: -9.8227e+02 -9.8293e+02  7e-01  2e-16  1e-15
13: -9.8227e+02 -9.8228e+02  7e-03  2e-16  1e-15
14: -9.8227e+02 -9.8227e+02  7e-05  2e-16  1e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0:  7.1272e+10 -2.2178e+11  3e+11  5e-17  3e-11
 1:  1.5578e+10 -1.7887e+10  3e+10  2e-16  2e-11
 2:  2.2706e+09 -2.6716e+09  5e+09  2e-16  9e-1

In [71]:
prediction0 = alpha_star0.reshape(-1,1).T.dot(K_Xte0)
prediction0[prediction0>0] = 1
prediction0[prediction0 <0] = 0

prediction1 = alpha_star1.reshape(-1,1).T.dot(K_Xte1)
prediction1[prediction1>0] = 1
prediction1[prediction1 <0] = 0

prediction2 = alpha_star2.reshape(-1,1).T.dot(K_Xte2)
prediction2[prediction2>0] = 1
prediction2[prediction2 <0] = 0

### 4.4. Evaluation

In [72]:
train_prediction0 = (np.sign(alpha_star0.reshape(-1,1).T.dot(K_Xtr0))+1)/2
print('Train Accuracy 0:',1- np.abs(train_prediction0 - y0).sum()/y0.shape[0])

train_prediction1 = (np.sign(alpha_star1.reshape(-1,1).T.dot(K_Xtr1))+1)/2
print('Train Accuracy 1:',1- np.abs(train_prediction1 - y1).sum()/y1.shape[0])

train_prediction2 = (np.sign(alpha_star2.reshape(-1,1).T.dot(K_Xtr2))+1)/2
print('Train Accuracy 2:',1 - np.abs(train_prediction2 - y2).sum()/y2.shape[0])

Train Accuracy 0: 0.48850000000000005
Train Accuracy 1: 0.499
Train Accuracy 2: 0.49950000000000006


### 4.5. Writting the results

In [59]:
predictions = np.squeeze(np.hstack((prediction0, prediction1, prediction2))).astype(int)
df = pd.DataFrame({'Bound': predictions,
                   'Id': np.arange(3000)})
df = df[['Id','Bound']]

df.to_csv("Predictions/tf_idf_SVM.csv",index = False)