In [2]:
import csv
import numpy as np
!pip install multiprocessing
import multiprocessing

def get_data(name, label = False):
    
    with open(name, newline='') as myFile: 
        reader = csv.reader(myFile)
        
        if label:
            return np.array([ [int(row[1])] for row in reader if row != ['Id', 'Bound']])
        else:
            return np.array([ row[1] for row in reader ])[1:]
    
def encode(text):
    
    index = ['A','C','G','T']
    X = [ index.index(text[i]) for i in range(len(text)) ]
    
    return X

def get_mat100(name):

    with open(name, newline='') as myFile: 
        reader = csv.reader(myFile)
        
        return np.array([ [float(val) for val in row[0].split(' ')] for row in reader ])
    
def split_data(X,Y,proportion):
    
    X, Y = np.array(X), np.array(Y)
    n = int(Y.shape[0]*proportion)
    
    return X[n:,:], Y[n:,:], X[:n,:], Y[:n,:]

Collecting multiprocessing
  Using cached https://files.pythonhosted.org/packages/b8/8a/38187040f36cec8f98968502992dca9b00cc5e88553e01884ba29cbe6aac/multiprocessing-2.6.2.1.tar.gz
[31mCommand "python setup.py egg_info" failed with error code 1 in /tmp/pip-install-fffcr279/multiprocessing/[0m


In [0]:
import string

def embed(x,n):
    return [x[i:i+n] for i in range(len(x) - n + 1)]

def embed_number(x,n, letter_encoder = False):  #for each n_sequence of x, return a number corresponding to that 3-sequence
    
    x_embed = [] # list of embedings
    n_basis = np.array([4**i for i in range(n)]) # we count in base 4 because there are 4 letters
    x = np.array(x)
    
    if not letter_encoder: # return int encoding
        return [ np.dot(x[i:i+n],n_basis) for i in range(len(x) - n + 1)]
    
    else: # return letter encoding with dictionnary letter_encoder to convert ints to letters
        return [ letter_encoder[np.dot(x[i:i+n],n_basis)] for i in range(len(x) - n + 1)]
    
def nb_letters(x):
    x = list(x)
    return np.array([x.count(i) for i in range(4)])

In [0]:
# define kernels 

def K1(u,v):
    return np.dot(u,v)

def K2(u,v):
    k = K1(u,v)
    return np.array(k + k**2 + k**3)


def K3(u,v):
    k = K1(u,v)
    kk = K2(u,v)
    return np.array(2*k + 3*kk)
   
    
def RBfF_kernel(u,v, gamma = 1.):
    n, p = u.shape[0], v.shape[0]
    result = np.zeros((n,p))
    for i in range(n):
        for j in range(p):
            result[i,j] = np.exp(-0.5*(np.linalg.norm(u[i,:] - v[j,:]))*gamma)
    return result

def RBF_kernel(u,v, gamma = 1.):
    return  np.exp(-0.5*(np.linalg.norm(u - v))*gamma)
  
def embed(x,n=2):
    return [x[i:i+n] for i in range(len(x) - n + 1)]

def K_substring(u,v,gamma = 1):
    s1, s2 = list(embed(u)), list(embed(v))
    score = 0
    for i in range(len(s1)):
        score += (s1[i] == s2[i]) # This is an unoptimized way to do this. 
    try:
      return score[0]
    except:
      return score

In [0]:
# get data 0
X0, Y0 = get_data('Xtr0.csv'), get_data('Ytr0.csv', label = True)
X0_encoded = np.array([encode(sent) for sent in X0])
X0_mat100 = get_mat100('Xtr0_mat100.csv')

# get data 1
X1, Y1 = get_data('Xtr1.csv'), get_data('Ytr1.csv', label = True)
X1_encoded = [encode(sent) for sent in X1]
X1_mat100 = get_mat100('Xtr1_mat100.csv')

# get data 2
X2, Y2 = get_data('Xtr2.csv'), get_data('Ytr2.csv', label = True)
X2_encoded = np.array([encode(sent) for sent in X2])
X2_mat100 = get_mat100('Xtr2_mat100.csv')

In [0]:
from cvxopt import solvers, matrix
opts = {'maxiters' : 70, 'abstol' : 1e-8,  'reltol': 1e-8, 'feastol' : 1e-8, 'show_progress': True }

class SVM:
    
    def __init__(self, kernel = RBF_kernel, gamma = 1):
        self.kernel = kernel
        self.gamma = gamma
        
    def compute_K(self,X):
        n = X.shape[0]
        K = np.zeros((n,n))
        for i in range(n):
            if (i%10==0):
              print(i,'/',n)
            for j in range(i+1):
                K[i,j] = self.kernel(X[i,:],X[j,:], gamma = self.gamma)
                K[j,i] = K[i,j]
        return K
    
    def compute_K_pred(self,X_test,X):
        n= X.shape[0]
        return np.array([[self.kernel(X[i,:],X_test, gamma = self.gamma) for i in range(n)]])
        
    def fit(self,X,Y):
        
        n=Y.shape[0]
        Y[Y == 0] = -1
        
        # compute K
        print('Computing K matrix optim problem..', end = '\r')
        #multip = multiprocessing.Pool(2)
        #K = multip.map(self.compute_K, (X,))
        self.K = np.array(self.compute_K(X))
        
    # solve dual problem...
        
        D = np.diag(Y.flatten()) # diagonal matrix with D(i,i) = y(i)
        p = matrix( np.array(np.dot(D, np.dot(self.K, D ))) +1e-30*np.eye(n) , tc='d')
        q = matrix(-1*np.ones((n, 1)), tc='d')
        G = matrix(-1*np.eye(n),tc='d')
        h = matrix(np.zeros((n, 1)), tc='d')
        A = matrix(Y.flatten().reshape((1, n)), tc='d')
        b = matrix(np.array([0]), tc = 'd')

    #warning off; % suppress ’Warning: Larg-scale method ...’µ

        print('Solving dual problem..', end = '\r')
        alpha = solvers.qp(p, q, G, h, A, b, options = opts)
        
        th_alpha=alpha['x']
        S = np.where(np.array(th_alpha).flatten() > np.finfo(float).eps)[0].tolist()
        YS = np.vstack(Y[S])
        self.beta = np.vstack([th_alpha[i]*Y[i,0] for i in S])
        self.XS =  np.vstack([ X[i,:] for i in S])

    # also, compute/estimate w0 (bias parameter) ...
        K_XS = self.K[S,S] #replaces self.compute_K(self.XS)
        self.w0 = np.mean(YS - K_XS@self.beta)
        print('Done !', end = '\r')
        
    def predict(self,Xtest):
        
        print('Computing prediction..', end = '\r')
        Ypred = np.sign([sum(self.compute_K_pred(Xtest[i].reshape(1, -1), self.XS)@self.beta)+self.w0 for i in range(Xtest.shape[0])])
        Ypred[Ypred == -1.] = 0
        print('Prediction done !', end = '\r')
        
        return Ypred

## 0

In [7]:
from sklearn import svm

#encode, embed
#X_3_embed = np.array([ embed_number(X0_encoded[i],2) for i in range(len(X0))])
#X_0 = np.concatenate((X_3_embed,X0_mat100),axis=1)

print(X0)
X_t, Y_t, X_v, Y_v = split_data(X0.reshape(2000,1),Y0,0.2)

clf0 = svm.SVC(gamma='scale')
clf0.fit(X_t, Y_t.flatten())
Y_pred = clf0.predict(X_v)
print(' Prediction error is: %s' %np.mean(np.abs(Y_v.flatten() - Y_pred)))

['GGAGAATCATTTGAACCCGGGAGGTGGAGGTTGCCGTGAGCTGAGATTGCGCCATTGCACTCCAGCCTGGGCAACAAGAGCAAAACTCTGTCTCACAAAAC'
 'ACCCTGCCTACACCGCGGCGGGGACAGGTGGAGGTTTCAACCCCTGTTTGGCAACCTCGGGCGCAGCCAGGCCCCGCCCAGAAATTTCCGGGACACGCCCC'
 'TGCAAATCTGTAAGCATTTCTCAGGCAATGAATTATGTCAACACAATTGCACCATCATTGATGGACTTGGAAATGCAGACAGAACTGAAGAGGAGCGTCTC'
 ...
 'TGGCCGGCCCGGCGCCGTCACCGCCCTCAAAAGACATGGCGGCGCCTTGCGTCACGTCCGCGCAGTTGCCCCGCCTCCTCTCCGCACACTCCGCCTCCCTT'
 'GCTTCATCATCACAGTGCAGCAGCTGACTACTTGCTAACTAACGTGTTATGCCATTGATAACCAGGAACTTCCTGACTGACGTTGAACGGGAAAAGGATGG'
 'GCAGAATGAAGCTCTTGTTGCTTTGGCATTAATAGCAGCTTTAGAATTGGGTAAGTACCCCAGTGACAAACTTATTTTCTTCTATTTTTATCTTGGATGAA']


ValueError: ignored

In [0]:
#encode, embed

svm_0 = SVM(kernel = K_substring )
svm_0.fit(X_t,Y_t)
Y_pred = svm_0.predict(X_v)

print(' Prediction error is: %s ' %np.mean(np.abs(Y_v - Y_pred)))

## 1

In [8]:
#encode, embed
import numpy as np
from sklearn import svm

X_3_embed = np.array([ embed_number(X1_encoded[i],2) for i in range(len(X1))])

#embed
X_nb_letters = np.array([nb_letters(sent) for sent in X1_encoded])
X_1 = np.concatenate((X_nb_letters,X_3_embed),axis=1)

#split data
X_t1,Y_t1, X_ver1, Y_ver1 = split_data(X1_encoded,Y1,0.2)

#predict
clf1 = svm.SVC(gamma='scale')
clf1.fit(X_t1, Y_t1.flatten())
Y_pred = clf1.predict(X_ver1)
print('Prediction error is:%s' %np.mean(np.abs(Y_ver1.flatten() - Y_pred)))

Prediction error is:0.4275


In [21]:
def Kernel_1(u,v,gamma=1.):

    #print(u,v)
    # get 2 spectrum
    u_2_spectrum = np.array(embed_number(u,2))  
    v_2_spectrum = np.array(embed_number(v,2))
    # get number of letters
    u_nb_letters = nb_letters(u)
    v_nb_letters = nb_letters(v)
    # concatenate the 2 (amounts to summing the kernels)
    #print(u_2_spectrum,u_nb_letters)
    u = np.concatenate((u_2_spectrum,u_nb_letters),axis=0)
    v = np.concatenate((v_2_spectrum,v_nb_letters),axis=0)
    return RBF_kernel(u,v)

#encode, embed
svm_1 = SVM(Kernel_1)
svm_1.fit(X_t1,Y_t1)
Y_pred = svm_1.predict(X_ver1)

print(' Prediction error is: %s ' %np.mean(np.abs(Y_ver1 - Y_pred)))

Computing K matrix optim problem..0 / 1600
10 / 1600
20 / 1600
30 / 1600
40 / 1600
50 / 1600
60 / 1600
70 / 1600
80 / 1600
90 / 1600
100 / 1600
110 / 1600
120 / 1600
130 / 1600
140 / 1600
150 / 1600
160 / 1600
170 / 1600
180 / 1600
190 / 1600
200 / 1600
210 / 1600
220 / 1600
230 / 1600
240 / 1600
250 / 1600
260 / 1600
270 / 1600
280 / 1600
290 / 1600
300 / 1600
310 / 1600
320 / 1600
330 / 1600
340 / 1600
350 / 1600
360 / 1600
370 / 1600
380 / 1600
390 / 1600
400 / 1600
410 / 1600
420 / 1600
430 / 1600
440 / 1600
450 / 1600
460 / 1600
470 / 1600
480 / 1600
490 / 1600
500 / 1600
510 / 1600
520 / 1600
530 / 1600
540 / 1600
550 / 1600
560 / 1600
570 / 1600
580 / 1600
590 / 1600
600 / 1600
610 / 1600
620 / 1600
630 / 1600
640 / 1600
650 / 1600
660 / 1600
670 / 1600
680 / 1600
690 / 1600
700 / 1600
710 / 1600
720 / 1600
730 / 1600
740 / 1600
750 / 1600
760 / 1600
770 / 1600
780 / 1600
790 / 1600
800 / 1600
810 / 1600
820 / 1600
830 / 1600
840 / 1600
850 / 1600
860 / 1600
870 / 1600
880 / 16

ValueError: ignored

## 2

In [0]:
from sklearn import svm
X_2 = X2_mat100
X_t2,Y_t2, X_ver2, Y_ver2 = split_data(X_2,Y2,0.2)

clf2 = svm.SVC(gamma='scale')
clf2.fit(X_t2, Y_t2.flatten())
Y_pred = clf2.predict(X_ver2)
print(' Prediction error is: %s' %np.mean(np.abs(Y_ver2.flatten() - Y_pred)))

clf2 = svm.SVC(gamma='scale')
clf2.fit(X2_mat100, Y2.flatten())

In [0]:
#encode, embed
#encode, embed
svm_2 = SVM()
svm_2.fit(X_t2,Y_t2)
Y_pred = svm_2.predict(X_ver2)

print(' Prediction error is: %s ' %np.mean(np.abs(Y_ver2 - Y_pred)))

In [0]:
# Train for the 3 on all the data :

# Predict for the 3 :
svm_0 = SVM(gamma = 0.1)
svm_0.fit(X_0,Y0)

svm_1 = SVM()
svm_1.fit(X_1,Y1)

svm_2 = SVM()
svm_2.fit(X_2,Y2)

# get data 0
X0 = get_data('Xte0.csv')
X0_encoded = np.array([encode(sent) for sent in X0])
X0_mat100 = get_mat100('Xte0_mat100.csv')

# get data 1
X1 = get_data('Xte1.csv')
X1_encoded = [encode(sent) for sent in X1]
X1_mat100 = get_mat100('Xte1_mat100.csv')

# get data 2
X2 = get_data('Xte2.csv')
X2_encoded = np.array([encode(sent) for sent in X2])
X2_mat100 = get_mat100('Xte2_mat100.csv')

In [0]:
## 0

#encode, embed
X_3_embed = np.array([ embed_number(X0_encoded[i],2) for i in range(len(X0))])
X_0_te = np.concatenate((X_3_embed,X0_mat100),axis=1)

Y_pred0 = svm_0.predict(X_0_te)

## 1

#encode, embed
X_3_embed = np.array([ embed_number(X1_encoded[i],2) for i in range(len(X1))])
X_nb_letters = np.array([nb_letters(sent) for sent in X1_encoded])
X_1_te = np.concatenate((X_nb_letters,X_3_embed),axis=1)

Y_pred1 = svm_1.predict(X_1_te)

## 2

X_2_te = X2_mat100

Y_pred2 = svm_2.predict(X_2_te)


result = list(Y_pred0) + list(Y_pred1) + list(Y_pred2)
print(len(result))

In [0]:
def write_data(Y):
    
    with open('results.csv', mode='w') as csv_file: 
        fieldnames = ['Id', 'Bound']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, lineterminator = '\n')
        
        writer.writeheader()
        for i in range(len(Y)):
            writer.writerow({'Id': str(i), 'Bound': str(int(Y[i][0]))})
            
    csv_file.close()
        
write_data(result)

### Mismatch Kernel

In [0]:
import os
import matplotlib.pyplot as plt
import itertools
from collections import Counter
import numpy as np
import copy
import pandas as pd 
from SVM import *

class mismatch_Kernel(object):
    def __init__(self,k,m):
        self.k=k
        self.m=m
        
    def get_ksequence(self,string):
        k_sequence=[]
        for i in range(len(string)-self.k):
            k_sequence.append(string[i:(i+self.k)])
        return(Counter(k_sequence))
        
    def get_mismatches(self,string):
        k_sequence=self.get_ksequence(string)
        mismatches=dict(zip(k_sequence.keys(),np.zeros(len(k_sequence))))
        leaves={}
        self.aux_mismatches(k_sequence,leaves,mismatches,[],i=0)
        return(leaves)
        
    def aux_mismatches(self,k_sequence,leaves,mismatches,alpha,i=0):
        if i==self.k:
            mismatches={key:v for key,v in mismatches.items() if v<=self.m}
            if len(mismatches)>0:
                leaves["".join(alpha)]=sum([k_sequence[v] for v in mismatches.keys()])
        else:
            for w in ["A","C","G","T"]:
                aux_mismatch={key:v if key[i]==w else v+1 for key,v in mismatches.items() if v<=self.m}
                if len(aux_mismatch)==0:
                    break
                self.aux_mismatches(k_sequence,leaves,aux_mismatch,alpha+[w],i+1)
        
        
        
    def K(self,data,normalize=True):
        n=len(data)
        K_=np.zeros((n,n))
        data_sequences=[self.get_mismatches(item) for item in data]
        for i in range(n):
            print(i)
            for j in range(i,n):
                K_[i,j]=sum([data_sequences[j][x]*data_sequences[i][x] for x in set(data_sequences[i].keys()).intersection(set(data_sequences[j].keys()))])
                K_[j,i]=1.*K_[i,j]
        self.K_nonormalize=K_       
        if normalize:
        
            denominator=1.0*np.eye(n)/(1.*np.sqrt(np.diag(K_)))
            K_normalized=denominator.dot(K_.dot(denominator))
            K_=K_normalized
            
        
        self.train_sequences=data_sequences
        return(K_)
        
    def K_x(self,x,data,normalize=True):       
        
        data_sequences=self.train_sequences
            
        m=len(x)
        n=len(data_sequences)
        K_x_=np.zeros((m,n))
        x_sequences=[self.get_mismatches(item) for item in x]
        for i in range(m):
            print(i)
            for j in range(n):
                K_x_[i,j]=sum([x_sequences[i][x]*data_sequences[j][x] for x in set(x_sequences[i].keys()).intersection(set(data_sequences[j].keys()))])
                
        if normalize:
            denominator_data=np.sqrt(np.diag(self.K_nonormalize))
            for i in range(m):
                K_xx=np.sqrt(sum([x_sequences[i][x]**2 for x in x_sequences[i].keys()]))
                K_x_[i,:]=1.0*K_x_[i,:]/(1.0*denominator_data*K_xx)                   
        self.K_x_=K_x_
        return(K_x_)
    
K = mismatch_Kernel(3,1)