# Notebook dédié au test du noyau gaussien avec les différents classifieurs





## 0. Import des fonctions, création des fonctions utiles et des classifieurs

In [1]:
import numpy as np 
import pandas as pd 
import scipy
import numexpr as ne
from scipy.stats import uniform,rankdata
from scipy.linalg import solve,lstsq
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,StratifiedKFold
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from cvxopt import matrix
from cvxopt import solvers
from tqdm import tqdm
from itertools import product,compress,combinations
import pickle
import time
import matplotlib.pyplot as plt 


### 0.1 Kernel Ridge Regression

In [2]:
def Kernel_Ridge_Regression(X_train,y_train,lbd,weight,gamma,degree,c0,k,biais,kernel):
    """Solve a (weighted) kernel ridge regression and return its dual coefficients.

    Parameters
    ----------
    X_train : training inputs, or the Gram matrix itself when
        ``kernel == "precomputed"``.
    y_train : 1-D array of regression targets.
    lbd : regularisation strength (the penalty is scaled by ``n * lbd``).
    weight : per-sample weighting.
        * any ``bool``: no weighting;
        * any ``str``: each sample is weighted by the frequency of its own
          class (``y == 1`` versus the rest);
        * otherwise: an array (or scalar) of per-sample weights.
    gamma, degree, c0, k : kernel hyper-parameters, forwarded to the kernel
        helpers (``rbf_kernel``, ``poly_kernel``, ``spectrum_kernel``).
    biais : when true an intercept is fitted and returned as the LAST
        coefficient.
    kernel : ``"rbf"``, ``"poly"``, ``"spectrum"`` or ``"precomputed"``.

    Returns
    -------
    ndarray of length ``n`` (``n + 1`` when ``biais`` is true).

    Raises
    ------
    ValueError
        If ``kernel`` is not one of the supported names.
    """
    if kernel=="rbf":
        K=rbf_kernel(X_train,gamma)
    elif kernel=="poly":
        K=poly_kernel(X_train,X_train,degree,c0)
    elif kernel=="spectrum":
        K=spectrum_kernel(X_train,k)
    elif kernel=="precomputed":
        K=X_train
    else:
        raise ValueError("unknown kernel: %r" % (kernel,))
    n=K.shape[0]

    # Resolve the per-sample weights once; None means "unweighted".
    if isinstance(weight,bool):
        w=None
    elif isinstance(weight,str):
        w1=(y_train==1).mean()
        w=np.where(y_train==1,w1,1-w1)
    else:
        w=np.broadcast_to(np.asarray(weight,dtype=float),(n,))

    if not biais:
        # (K + n*lbd*W^-1) alpha = y, with W = I in the unweighted case.
        if w is None:
            ridge=n*lbd*np.eye(n)
        else:
            ridge=n*lbd*np.diag(1/w)
        return solve(K+ridge,y_train,assume_a="sym")

    # Intercept: append a column of ones to K and pad the penalty matrix with
    # a zero row/column so that the intercept is not regularised (same
    # construction as the addbiais / addzeros helpers of this notebook).
    Kb=np.hstack((K,np.ones((n,1))))
    K0=np.zeros((n+1,n+1))
    K0[:n,:n]=K

    if w is None:
        weighted_Kb,weighted_y=Kb,y_train
    else:
        # Row scaling is W.dot(Kb) without materialising the n x n matrix W.
        weighted_Kb=w[:,None]*Kb
        weighted_y=w*y_train

    A=Kb.T.dot(weighted_Kb)+lbd*n*K0
    B=Kb.T.dot(weighted_y)
    # assume_a belongs to solve(); it was previously passed to ndarray.dot,
    # which raised a TypeError on this path.
    return solve(A,B,assume_a="sym")
        

In [3]:
class KernelRR(BaseEstimator,ClassifierMixin):
    """Binary classifier obtained by thresholding a kernel ridge regression.

    The regression is fitted on the raw label values by
    ``Kernel_Ridge_Regression``; ``predict`` returns ``classes_[1]`` where the
    decision value is positive and ``classes_[0]`` elsewhere.

    Parameters
    ----------
    lbd : regularisation strength.
    weight : sample weighting, see ``Kernel_Ridge_Regression``.
    gamma : RBF parameter; any string means ``1 / n_features``.
    degree, c0 : polynomial kernel parameters.
    k : spectrum kernel word length.
    biais : fit an intercept (stored as the last coefficient of ``alpha``).
    kernel : ``"rbf"``, ``"poly"``, ``"spectrum"`` or ``"precomputed"``.
        With ``"precomputed"``, ``fit`` receives the train/train Gram matrix
        and ``decision_function`` the test/train Gram matrix.
    """
    def __init__(self,lbd=1,weight=False,gamma="auto",degree=2,c0=1,k=3,biais=False,kernel="rbf"):
        self.lbd=lbd
        self.weight=weight
        self.gamma=gamma
        self.degree=degree
        self.c0=c0
        self.k=k
        self.biais=biais
        self.kernel=kernel

    def fit(self,X,y):
        """Fit the dual coefficients ``alpha`` on ``(X, y)`` and return ``self``."""
        self.classes_ = np.unique(y)
        self.Xtr=X
        # The resolved value goes to gamma_ so that the constructor parameter
        # ("auto") is not overwritten by fitting.
        if isinstance(self.gamma,str) and self.kernel=="rbf":
            self.gamma_=1/self.Xtr.shape[1]
        else:
            self.gamma_=self.gamma
        self.alpha=Kernel_Ridge_Regression(X,y,self.lbd,self.weight,self.gamma_,self.degree,self.c0,self.k,self.biais,self.kernel)
        return self

    def _test_kernel(self,X):
        """Return the kernel matrix between ``X`` and the training inputs."""
        if self.kernel=="precomputed":
            return X
        if self.kernel=="rbf":
            return K_rbf_kernel(X,self.Xtr,self.gamma_)
        if self.kernel=="poly":
            return poly_kernel(X,self.Xtr,self.degree,self.c0)
        if self.kernel=="spectrum":
            return K_spectrum_kernel(X,self.Xtr,self.k)
        raise ValueError("unknown kernel: %r" % (self.kernel,))

    def decision_function(self,X):
        """Return the real-valued regression output for every row of ``X``."""
        K=self._test_kernel(X)
        if self.biais:
            # alpha holds the intercept as an extra last entry; this also has
            # to be applied to precomputed kernels.
            K=addbiais(K)
        return K.dot(self.alpha)

    def predict(self,X,y=None):
        """Return the predicted class label for every row of ``X``."""
        scores=self.decision_function(X)
        if len(scores.shape) == 1:
            # np.int was removed from NumPy; the builtin int is equivalent.
            indices = (scores > 0).astype(int)
        else:
            indices = scores.argmax(axis=1)
        return self.classes_[indices]

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict."""
        return {"lbd": self.lbd,"weight":self.weight,"gamma":self.gamma,"degree":self.degree,"c0":self.c0,"k":self.k,
                "biais":self.biais,"kernel":self.kernel}

    def set_params(self, **parameters):
        """Set the given attributes on the estimator and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

### 0.2 Kernel Logistic Regression

In [4]:
def sigmoid(v):
    """Return the logistic function 1 / (1 + exp(-v)), element-wise.

    Computed as exp(-log(1 + exp(-v))) through ``np.logaddexp`` so that very
    negative inputs do not overflow in ``exp``.
    """
    return np.exp(-np.logaddexp(0,np.negative(v)))
def log_loss(v):
    """Return the logistic loss log(1 + exp(-v)), element-wise.

    ``np.logaddexp`` keeps the result finite for very negative ``v``, where
    the naive formula overflows to ``inf``.
    """
    return np.logaddexp(0,np.negative(v))


In [5]:
def IRLS(X_train,y_train,lbd,ga,degree,c0,k,bs,ker,n_iter,eps=10**-6,method='slow'):
    """Fit kernel logistic regression by iteratively reweighted least squares.

    Parameters
    ----------
    X_train : training inputs, or the Gram matrix when ``ker == "precomputed"``.
    y_train : 1-D label array (presumably coded as -1 / +1 -- confirm with callers).
    lbd : regularisation strength.
    ga, degree, c0, k : kernel hyper-parameters forwarded to the kernel helpers.
    bs : when true an intercept is fitted as an extra last coefficient.
    ker : ``"rbf"``, ``"poly"``, ``"spectrum"`` or ``"precomputed"``.
    n_iter : maximum number of reweighting iterations.
    eps : stop as soon as two successive iterates are closer than this (L2 norm).
    method : only used with an intercept; ``"slow"`` solves the system with
        ``lstsq``, anything else with ``solve``.

    Returns
    -------
    ndarray of dual coefficients, length n (n + 1 with an intercept).
    """
    n_samples=y_train.shape[0]

    # Gram matrix of the training set.
    if ker=="rbf":
        gram=rbf_kernel(X_train,ga)
    elif ker=="poly":
        gram=poly_kernel(X_train,X_train,degree,c0)
    elif ker=="spectrum":
        gram=spectrum_kernel(X_train,k)
    elif ker=="precomputed":
        gram=X_train

    if bs:
        gram_b=addbiais(gram)
        penalty=addzeros(gram)
        coef=np.zeros(n_samples+1)
    else:
        coef=np.zeros(n_samples)

    floor=0.000001  # lower clip applied to the weights and to sigmoid(y*m)

    for _ in range(n_iter):
        previous=coef
        margin=gram_b.dot(coef) if bs else gram.dot(coef)

        prob=sigmoid(margin)
        curvature=prob*(1-prob)
        curvature=np.where(curvature<floor,floor,curvature)

        sig_ym=sigmoid(y_train*margin)
        clipped=np.where(sig_ym<floor,floor,sig_ym)
        target=margin+y_train/clipped

        if bs:
            # Weighted least squares on the bias-augmented Gram matrix.
            W=np.diag(curvature)
            lhs=gram_b.T.dot(W.dot(gram_b))+2*lbd*n_samples*penalty
            rhs=gram_b.T.dot(W.dot(target))
            if method=="slow":
                coef=lstsq(lhs,rhs)[0]
            else:
                coef=solve(lhs,rhs,assume_a="sym")
        else:
            W_inv=np.diag(curvature**-1)
            lhs=(gram+2*lbd*n_samples*W_inv)
            coef=solve(lhs,target,assume_a="sym")

        if np.linalg.norm(previous-coef)<eps:
            break

    return coef
        

In [6]:
class KernelLR(BaseEstimator,ClassifierMixin):
    """Binary kernel logistic regression fitted by ``IRLS``.

    Parameters
    ----------
    lbd : regularisation strength.
    gamma : RBF parameter; any string means ``1 / n_features``.
    degree, c0 : polynomial kernel parameters.
    k : spectrum kernel word length.
    biais : fit an intercept (stored as the last coefficient of ``alpha``).
    kernel : ``"rbf"``, ``"poly"``, ``"spectrum"`` or ``"precomputed"``.
        With ``"precomputed"``, ``fit`` receives the train/train Gram matrix
        and ``decision_function`` the test/train Gram matrix.
    n_iter : maximum number of IRLS iterations.
    method : linear solver selector forwarded to ``IRLS``.
    """
    def __init__(self,lbd=1,gamma='auto',degree=2,c0=1,k=3,biais=False,kernel="rbf",n_iter=15,method="slow"):
        self.lbd=lbd
        self.gamma=gamma
        self.degree=degree
        self.c0=c0
        self.k=k
        self.biais=biais
        self.kernel=kernel
        self.n_iter=n_iter
        self.method=method

    def fit(self,X,y):
        """Fit the dual coefficients ``alpha`` on ``(X, y)`` and return ``self``."""
        self.classes_ = np.unique(y)
        self.Xtr=X
        # The resolved value goes to gamma_ so that the constructor parameter
        # ("auto") is not overwritten by fitting.
        if isinstance(self.gamma,str) and self.kernel=="rbf":
            self.gamma_=1/self.Xtr.shape[1]
        else:
            self.gamma_=self.gamma
        self.alpha=IRLS(X,y,self.lbd,self.gamma_,self.degree,self.c0,self.k,self.biais,self.kernel,self.n_iter,method=self.method)
        return self

    def _test_kernel(self,X):
        """Return the kernel matrix between ``X`` and the training inputs."""
        if self.kernel=="precomputed":
            return X
        if self.kernel=="rbf":
            return K_rbf_kernel(X,self.Xtr,self.gamma_)
        if self.kernel=="poly":
            return poly_kernel(X,self.Xtr,self.degree,self.c0)
        if self.kernel=="spectrum":
            return K_spectrum_kernel(X,self.Xtr,self.k)
        raise ValueError("unknown kernel: %r" % (self.kernel,))

    def decision_function(self,X):
        """Return the linear score (log-odds) for every row of ``X``."""
        K=self._test_kernel(X)
        if self.biais:
            # Previously a precomputed kernel combined with an intercept fell
            # through every branch and returned None.
            K=addbiais(K)
        return K.dot(self.alpha)

    def predict(self,X,y=None):
        """Return the predicted class label for every row of ``X``."""
        scores=self.decision_function(X)
        if len(scores.shape) == 1:
            # np.int was removed from NumPy; the builtin int is equivalent.
            indices = (scores > 0).astype(int)
        else:
            indices = scores.argmax(axis=1)
        return self.classes_[indices]

    def predict_proba_(self,X,y=None):
        """Return an (n, 2) array ``[p, 1 - p]`` with ``p = sigmoid(score)``.

        Note the column order: the FIRST column goes with a positive score,
        i.e. with ``classes_[1]`` as returned by ``predict``.
        """
        p=sigmoid(self.decision_function(X)).reshape(-1,1)
        # hstack was referenced without the np. prefix and raised a NameError.
        return np.hstack((p,1-p))

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict."""
        return {"lbd": self.lbd,"gamma":self.gamma,"degree":self.degree,"c0":self.c0,"k":self.k,"biais":self.biais,
                "kernel":self.kernel,"n_iter":self.n_iter,"method":self.method}

    def set_params(self, **parameters):
        """Set the given attributes on the estimator and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

### 0.3 Kernel SVM

In [7]:
def SVM(X_train,y_train,C,gamma,degree,c0,k,kernel,MKL=False):
    """Solve the kernel SVM quadratic program with cvxopt.

    The program is: minimise 0.5 * x'Kx - y'x subject to 0 <= y_i * x_i <= C.

    Parameters
    ----------
    X_train : training inputs, or the Gram matrix when ``kernel == "precomputed"``.
    y_train : 1-D label array (presumably -1 / +1 -- confirm with callers).
    C : upper bound of the box constraint.
    gamma, degree, c0, k : kernel hyper-parameters forwarded to the kernel helpers.
    kernel : ``"rbf"``, ``"poly"``, ``"spectrum"`` or ``"precomputed"``.
    MKL : when true, also return the solver's ``'dual objective'`` value.

    Returns
    -------
    The 1-D solution vector, or ``(solution, dual_objective)`` when ``MKL``.

    Side effect: disables cvxopt's progress output globally.
    """
    n_samples=y_train.shape[0]

    if kernel=="rbf":
        gram=rbf_kernel(X_train,gamma)
    elif kernel=="poly":
        gram=poly_kernel(X_train,X_train,degree,c0)
    elif kernel=="spectrum":
        gram=spectrum_kernel(X_train,k)
    elif kernel=="precomputed":
        gram=X_train

    # Objective terms.
    quad_term=matrix(gram,tc='d')
    lin_term=matrix(-y_train,tc='d')

    # Box constraint written as two stacked inequality blocks:
    #   diag(y) x <= C   and   -diag(y) x <= 0.
    label_diag=np.diag(y_train)
    ineq_lhs=matrix(np.vstack((label_diag,-label_diag)),tc='d')
    upper=np.repeat(C,n_samples)
    lower=np.zeros(n_samples)
    ineq_rhs=matrix(np.hstack((upper,lower)),tc='d')

    solvers.options['show_progress'] = False
    solution=solvers.qp(quad_term,lin_term,ineq_lhs,ineq_rhs)
    coefficients=np.array(solution['x']).reshape(-1,)

    if MKL:
        return coefficients,solution['dual objective']
    return coefficients

In [8]:
class KernelSVM(BaseEstimator,ClassifierMixin):
    """Binary kernel SVM whose dual problem is solved by ``SVM`` (cvxopt).

    Parameters
    ----------
    C : upper bound of the box constraint.
    gamma : RBF parameter; any string means ``1 / n_features``.
    degree, c0 : polynomial kernel parameters.
    k : spectrum kernel word length.
    kernel : ``"rbf"``, ``"poly"``, ``"spectrum"`` or ``"precomputed"``.
        With ``"precomputed"``, ``fit`` receives the train/train Gram matrix
        and ``decision_function`` the test/train Gram matrix.
    """
    # degree used to default to the string "2", unlike the other estimators of
    # this notebook; it is now the integer 2.
    def __init__(self,C=1,gamma='auto',degree=2,c0=1,k=3,kernel="rbf"):
        self.C=C
        self.gamma=gamma
        self.degree=degree
        self.c0=c0
        self.k=k
        self.kernel=kernel

    def fit(self,X,y):
        """Fit the dual coefficients ``alpha`` on ``(X, y)`` and return ``self``."""
        self.classes_ = np.unique(y)
        self.Xtr=X
        # The resolved value goes to gamma_ so that the constructor parameter
        # ("auto") is not overwritten by fitting.
        if isinstance(self.gamma,str) and self.kernel=="rbf":
            self.gamma_=1/self.Xtr.shape[1]
        else:
            self.gamma_=self.gamma
        self.alpha=SVM(X,y,self.C,self.gamma_,self.degree,self.c0,self.k,self.kernel)
        return self

    def decision_function(self,X):
        """Return the SVM score for every row of ``X`` (no intercept term)."""
        if self.kernel=="precomputed":
            K=X
        elif self.kernel=="rbf":
            K=K_rbf_kernel(X,self.Xtr,self.gamma_)
        elif self.kernel=="poly":
            K=poly_kernel(X,self.Xtr,self.degree,self.c0)
        elif self.kernel=="spectrum":
            K=K_spectrum_kernel(X,self.Xtr,self.k)
        else:
            raise ValueError("unknown kernel: %r" % (self.kernel,))
        return K.dot(self.alpha)

    def predict(self,X,y=None):
        """Return the predicted class label for every row of ``X``."""
        scores=self.decision_function(X)
        if len(scores.shape) == 1:
            # np.int was removed from NumPy; the builtin int is equivalent.
            indices = (scores > 0).astype(int)
        else:
            indices = scores.argmax(axis=1)
        return self.classes_[indices]

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict."""
        return {"C": self.C,"gamma":self.gamma,"degree":self.degree,"c0":self.c0,"k":self.k,"kernel":self.kernel}

    def set_params(self, **parameters):
        """Set the given attributes on the estimator and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [9]:
def proj(a, y):
    """Project ``y`` onto the set {x >= 0, a.x = 1} (Euclidean projection).

    With ``a`` equal to a vector of ones this is the projection onto the
    probability simplex.

    Parameters
    ----------
    a : 1-D array of constraint coefficients (assumed strictly positive).
    y : 1-D array, the point to project.

    Returns
    -------
    ndarray ``x = max(0, y - lam * a)`` where ``lam`` is chosen so that
    ``a.x = 1``.
    """
    a = np.asarray(a, dtype=float)
    y = np.asarray(y, dtype=float)

    ratio = y/a
    order = np.argsort(ratio)
    a_sorted = a[order]
    y_sorted = y[order]
    d = len(ratio)

    def constraint_gap(k):
        # Value of a.x - 1 when lam equals the k-th smallest ratio, i.e. when
        # only the coordinates order[k:] are still positive.
        tail_a = a_sorted[k:]
        return np.sum(tail_a*(y_sorted[k:] - ratio[order[k]]*tail_a)) - 1

    def first_nonpositive():
        # constraint_gap decreases with k: bisect for the first k at which it
        # is no longer positive.
        low, high = 0, d-1
        if constraint_gap(low) < 0:
            return low
        while (high-low) > 1:
            mid = (low+high)//2
            if constraint_gap(mid) > 0:
                low = mid
            else:
                high = mid
        return high

    k = first_nonpositive()
    tail_a = a_sorted[k:]
    # Solve sum(a_j * (y_j - lam*a_j)) = 1 over the active coordinates: the
    # denominator is sum(a_j**2). It used to be sum(a_j), which is only
    # correct when every a_j equals 1.
    lam = (np.sum(tail_a*y_sorted[k:])-1)/np.sum(tail_a**2)

    return np.maximum(0, y-lam*a)


In [10]:
# Scratch check: a weighted sum of stacked matrices can be written with broadcasting.
A=np.array([[1,2,5],[1,3,4]])
B=np.array([[2,-3,7],[0,1,4]])
alpha=np.array([0.5,2,2])
s=np.array([A,B,2*B])
# Reference value written out term by term (the third matrix of s is 2*B, hence 4*B).
print(0.5*A + 2*B+4*B)
# Same sum: alpha reshaped to (3,1,1) scales each matrix of s before summing over axis 0.
print((alpha.reshape(-1,1,1)*s).sum(axis=0))
# Same again, building the (3,1,1) shape with expand_dims followed by a transpose.
(np.expand_dims(alpha,(0,1)).T*np.array(s)).sum(axis=0)

[[ 12.5 -17.   44.5]
 [  0.5   7.5  26. ]]
[[ 12.5 -17.   44.5]
 [  0.5   7.5  26. ]]


array([[ 12.5, -17. ,  44.5],
       [  0.5,   7.5,  26. ]])

In [11]:
# Scratch check: quadratic forms alpha' M alpha for a stack of matrices M in one expression.
A=np.array([[1,2],[1,3]])
B=np.array([[2,-3],[0,1]])
alpha=np.array([1,2])
u=A.dot(alpha)
v=B.dot(alpha)
print(u,v)
s=np.array([A,B])
print(s,s.shape)
# s.dot(alpha) stacks A.dot(alpha) and B.dot(alpha) as rows.
print(s.dot(alpha))
# Reference values of the two quadratic forms.
print(alpha.dot(u),alpha.dot(v))
# Both quadratic forms at once.
alpha.dot((s.dot(alpha)).T)

[5 7] [-4  2]
[[[ 1  2]
  [ 1  3]]

 [[ 2 -3]
  [ 0  1]]] (2, 2, 2)
[[ 5  7]
 [-4  2]]
19 0


array([19,  0])

### 0.4 MKL

In [135]:
def MKL_SVM(X_train,Y_train,list_kernel,C,tau,n_iter,eps=10**-8,verbose=True):
    """Learn the weights of a convex combination of kernels (reduced gradient).

    At each iteration an SVM is solved on the current weighted sum of Gram
    matrices, then the weights take a fixed-size step along a reduced-gradient
    descent direction that keeps their sum unchanged.

    Parameters
    ----------
    X_train : unused; kept so that existing calls keep working.
    Y_train : 1-D label array passed to ``SVM``.
    list_kernel : list of precomputed Gram matrices (all of the same shape).
    C : SVM box-constraint bound.
    tau : step size applied to the descent direction.
    n_iter : maximum number of iterations.
    eps : stop when the weights move by less than this (L2 norm).
    verbose : print the reference index, the weights and the step norm at
        every iteration (default True, as before).

    Returns
    -------
    (weight, list_weight, list_obj): the final weights, every weight vector
    visited (including the initial one) and the SVM dual objective of each
    iteration.
    """
    m=len(list_kernel)
    weight=np.ones(m)/m
    list_obj=[]
    list_weight=[weight]

    for _ in range(n_iter):
        K=0
        for j in range(m):
            K+=weight[j]*list_kernel[j]
        weight_old=weight.copy()
        # Reference coordinate: the largest weight absorbs the opposite of
        # every other move so that the weights keep summing to the same value.
        idx=np.argmax(weight)
        if verbose:
            print(idx)

        alpha,objective=SVM(K,Y_train,C,1,1,1,1,"precomputed",MKL=True)
        # list_obj was returned but never filled.
        list_obj.append(objective)

        grad_weight=np.array([-0.5*alpha.dot(Kj.dot(alpha)) for Kj in list_kernel])
        diff=grad_weight-grad_weight[idx]

        # Descent direction for every coordinate other than idx ...
        reduced_grad=-diff
        # ... except that a weight already at zero may not become negative.
        reduced_grad[(weight==0)&(diff>0)]=0
        # The reference coordinate compensates all the other moves. The former
        # code indexed the filtered array grad_weight[weight>0] with the
        # full-array index idx, and also applied this formula to zero-weight
        # coordinates other than idx.
        reduced_grad[idx]=0
        reduced_grad[idx]=-reduced_grad.sum()

        weight=weight+tau*reduced_grad

        # Small weights are set to zero, then the vector is projected back
        # onto the simplex.
        too_small=weight<10**-2
        if too_small.any():
            weight[too_small]=0
            weight=proj(np.ones(m),weight)

        if verbose:
            print(weight,weight.sum())
        list_weight.append(weight)
        step=np.linalg.norm(weight-weight_old)
        if verbose:
            print(step)
        if step<eps:
            break
    return weight,list_weight,list_obj

In [137]:
# Run the MKL weight search on data set 0 with six of the Gram matrices stored in dic_K0
# (keys 4, 7, 9, 8, 5, 6), C=1, step size tau=0.0001 and at most 100 iterations.
list_kernel=[dic_K0[k] for k in [4,7,9,8,5,6]]
w,lw,lo=MKL_SVM(X_0train,Y_0train,list_kernel,1,0.0001,100)

0
[0.15859808 0.16847808 0.16920183 0.16894291 0.16704523 0.16773388] 1.0
0.009015207087027693
2
[0.15612903 0.16777198 0.17616212 0.16869001 0.16494396 0.16630289] 1.0
0.007846494612920625
2
[0.15371547 0.16708589 0.18294949 0.16844416 0.16289468 0.16491032] 1.0
0.007652681408420263
2
[0.15135426 0.16641856 0.18957406 0.16820489 0.16089437 0.16355386] 1.0
0.007470218168882739
2
[0.14904255 0.16576888 0.1960449  0.16797183 0.15894037 0.16223147] 1.0
0.0072978745302523665
2
[0.14677776 0.16513585 0.20237025 0.16774462 0.15703024 0.16094128] 0.9999999999999999
0.007134763941408126
2
[0.14455752 0.16451855 0.20855772 0.16752292 0.15516173 0.15968156] 0.9999999999999998
0.006980160684109752
2
[0.14237963 0.16391613 0.21461431 0.16730644 0.15333275 0.15845074] 0.9999999999999999
0.006833400642552724
2
[0.14024209 0.1633278  0.22054649 0.16709491 0.15154138 0.15724733] 0.9999999999999998
0.0066938646572726425
2
[0.13814303 0.16275286 0.22636021 0.16688806 0.14978586 0.15606999] 0.99999999999

KeyboardInterrupt: 

In [62]:
 # NOTE(review): leftover scratch cell. The quoted text is an earlier draft of the
 # reduced-gradient loop of MKL_SVM and the two indented assignments below it are not
 # valid at top level as written; this cell looks safe to delete -- confirm with the author.
 '''for j in range(m):
            
            grad_weight[j]=-0.5*gamma.dot(list_kernel[j].dot(gamma))
        for j in range(m):
            if weight[j]==0 and grad_weight[j]-grad_weight[idx]>0:
                reduced_grad[j]=0
            elif j!=idx:
                reduced_grad[j]=-grad_weight[j]-grad_weight[idx]
            else:
 
 grad_weight[idx]='''
    
     grad_weight=np.zeros(m)
        reduced_grad=np.zeros(m)

[-1.7144022114303221, -1.7243372089271167]

### 0.5 Fonctions utiles 

In [157]:
def Cross_val_spectrum(dic_K,X_train,Y_train,model,hps,cv=5):
    """Exhaustive grid search with stratified k-fold CV on precomputed Gram matrices.

    Parameters
    ----------
    dic_K : dict mapping a value of ``k`` to the train/train Gram matrix.
    X_train : training inputs, only used to generate the fold indices.
    Y_train : label array.
    model : estimator exposing ``set_params``, ``fit`` and ``predict``; it is
        fitted on sub-blocks of the Gram matrix.
    hps : dict of hyper-parameter lists; must contain the key ``"k"``.
    cv : number of folds.

    Returns
    -------
    dict with keys ``"params"`` (list of tried settings), ``"mean_test_score"``
    (mean validation accuracy per setting) and ``"rank_test_score"``
    (1 = best, ties share the smallest rank).
    """
    folds=StratifiedKFold(cv)
    grams={k:dic_K[k] for k in hps["k"]}
    names=list(hps.keys())
    k_position=names.index("k")

    tried_settings=[]
    mean_accuracies=[]
    for combination in tqdm(product(*hps.values())):
        setting=dict(zip(names,combination))
        tried_settings.append(setting)
        model.set_params(**setting)

        gram=grams[combination[k_position]]
        accuracy_sum=0
        for train_idx,val_idx in folds.split(X_train,Y_train):
            # Rows select the evaluated samples, columns always the training fold.
            model.fit(gram[train_idx][:,train_idx],Y_train[train_idx])
            predicted=model.predict(gram[val_idx][:,train_idx])
            accuracy_sum+=accuracy_score(Y_train[val_idx],predicted)
        mean_accuracies.append(accuracy_sum/cv)

    ranks=rankdata([-score for score in mean_accuracies],method='min')
    return {"params":tried_settings,"mean_test_score":np.array(mean_accuracies),"rank_test_score":ranks}
            

In [156]:
def Randomized_Cross_val_spectrum(dic_K,X_train,Y_train,model,hps,n_iter,cv=5):
    """Randomised hyper-parameter search with stratified k-fold CV on precomputed Gram matrices.

    Parameters
    ----------
    dic_K : dict mapping a value of ``k`` to the train/train Gram matrix.
    X_train : training inputs, only used to generate the fold indices.
    Y_train : label array.
    model : estimator exposing ``set_params``, ``fit`` and ``predict``.
    hps : dict whose values are either objects with an ``rvs`` method (e.g.
        frozen ``scipy.stats`` distributions) or sequences to draw from
        uniformly. Must contain the key ``"k"``. The dict is left untouched.
    n_iter : number of settings to sample.
    cv : number of folds.

    Returns
    -------
    dict with keys ``"params"``, ``"mean_test_score"`` and
    ``"rank_test_score"`` (1 = best, ties share the smallest rank).
    """
    CV=StratifiedKFold(cv)
    dic_K_prime={k:dic_K[k] for k in hps["k"]}
    list_hp=[]
    list_val_score=[]
    names=list(hps.keys())
    idx_k=names.index("k")

    # Draw into a separate dict: the caller's hps used to be overwritten with
    # the sampled arrays, so that reusing it silently changed the search space.
    sampled={}
    for name,values in hps.items():
        # Duck-typing on rvs avoids importing a private scipy module.
        if hasattr(values,"rvs"):
            sampled[name]=values.rvs(size=n_iter)
        else:
            sampled[name]=np.random.choice(values,n_iter)

    for draw in tqdm(zip(*sampled.values())):
        dic_hp=dict(zip(names,draw))
        list_hp.append(dic_hp)
        model.set_params(**dic_hp)

        K=dic_K_prime[draw[idx_k]]
        acc_mean=0
        for train_idx,val_idx in CV.split(X_train,Y_train):
            model.fit(K[train_idx][:,train_idx],Y_train[train_idx])
            Y_pred=model.predict(K[val_idx][:,train_idx])
            acc_mean+=accuracy_score(Y_train[val_idx],Y_pred)
        list_val_score.append(acc_mean/cv)
    return {"params":list_hp,"mean_test_score":np.array(list_val_score),"rank_test_score":rankdata([-s for s in list_val_score],method='min')}
            

In [154]:
def présentation_résultat2(search,n):
    """Return the settings ranked ``n`` or better as a DataFrame, best first.

    Parameters
    ----------
    search : dict with keys ``"params"`` (list of dicts), ``"mean_test_score"``
        and ``"rank_test_score"`` (arrays), as returned by the cross-validation
        helpers of this notebook.
    n : keep the settings whose rank is at most ``n``.

    Returns
    -------
    DataFrame with one column per hyper-parameter (values rendered with
    ``str``, as before) plus ``mean_test_score``, sorted by decreasing score.
    Settings with identical hyper-parameter values are merged (the last score
    wins). The frame is empty when no setting is selected.
    """
    mask=search['rank_test_score']<=n
    params=list(compress(search['params'], list(mask)))
    mean_test_score=search['mean_test_score'][mask]

    # Column names: taken from the selection, or from any setting when the
    # selection is empty (this used to raise).
    if params:
        head=list(params[0].keys())
    elif len(search['params'])>0:
        head=list(search['params'][0].keys())
    else:
        head=[]

    # Key on the tuple of stringified values instead of a space-joined string
    # that had to be split again afterwards.
    best={}
    for setting,score in zip(params,mean_test_score):
        best[tuple(str(value) for value in setting.values())]=score

    # Sort once, after the loop (stable for equal scores).
    ordered=sorted(best.items(), key=lambda item: item[1],reverse=True)
    rows=[list(values)+[score] for values,score in ordered]

    return(pd.DataFrame(rows,columns=head+["mean_test_score"]))
        

In [153]:
def addbiais(X):
    """Return a copy of the 2-D array ``X`` with a column of ones appended on the right."""
    ones_column=np.ones((X.shape[0],1))
    return np.concatenate((X,ones_column),axis=1)
def addzeros(X):
    """Return ``X`` padded with one extra row and one extra column of zeros.

    The result is always a float array of shape ``(n + 1, n + 1)`` where ``n``
    is the number of rows of ``X``.
    """
    n_rows,_=X.shape
    size=n_rows+1
    padded=np.zeros((size,size))
    padded[:-1,:-1]=X
    return padded

In [152]:
def csv_file_string_kernel(models,filename): #models is a list of 3 models
    """Fit one model per data set and write the stacked predictions to a CSV file.

    Reads the notebook globals ``list_K_train``, ``list_K_train_test`` and
    ``Y_0train`` / ``Y_1train`` / ``Y_2train``. Predictions equal to -1 are
    written as 0. The file ``Predictions/<filename>`` has two integer columns,
    ``Id`` (0, 1, 2, ...) and ``Bound``.

    Parameters
    ----------
    models : list of three estimators, one per data set, in order.
    filename : name of the CSV file created inside ``Predictions/``.

    Returns
    -------
    Whatever ``DataFrame.to_csv`` returns for a path argument (``None``).
    """
    import os

    predictions=[]
    for K_train, Y_train, K_test_train, model  in zip(list_K_train, [Y_0train,Y_1train,Y_2train], list_K_train_test, models):
        model.fit(K_train, Y_train)
        Y_pred=model.predict(K_test_train)
        predictions.append(np.where(Y_pred==-1,0,Y_pred))

    if predictions:
        # dtype=np.int no longer exists in NumPy; cast with the builtin int.
        Y_test=np.concatenate(predictions, axis=0).astype(int)
    else:
        Y_test=np.empty(0,dtype=int)

    df=pd.DataFrame({'Id':np.arange(Y_test.shape[0]),'Bound':Y_test})

    # Create the output directory on first use instead of failing in to_csv.
    os.makedirs('Predictions',exist_ok=True)
    return df.to_csv('Predictions/'+filename, index = False, header=True)

### 0.6 Kernel spectrum 

In [49]:
def _spectrum_features(X,k,alphabet):
    """Return the (len(X), len(alphabet)**k) matrix of k-mer counts of the sequences in X."""
    # Column of each k-mer, in itertools.product order.
    column={''.join(letters):j for j,letters in enumerate(product(alphabet, repeat=k))}
    features=np.zeros((len(X),len(column)),dtype=int)
    for row,sequence in enumerate(X):
        for start in range(len(sequence)-k+1):
            j=column.get(sequence[start:start+k])
            # k-mers containing a letter outside the alphabet are ignored.
            if j is not None:
                features[row,j]+=1
    return features

def K_spectrum_kernel(X,Y,k,alphabet="ACGT"):
    """Return the spectrum kernel matrix between the sequences of ``X`` and ``Y``.

    Entry (i, j) is the dot product of the k-mer count vectors of ``X[i]`` and
    ``Y[j]``. ``X`` and ``Y`` are iterables of strings (e.g. pandas Series).
    One pass over each sequence replaces the former ``list.count`` call per
    vocabulary word.
    """
    return _spectrum_features(X,k,alphabet).dot(_spectrum_features(Y,k,alphabet).T)

def spectrum_kernel(X,k,alphabet="ACGT"):
    """Return the spectrum kernel Gram matrix of the sequences of ``X`` (see K_spectrum_kernel)."""
    features=_spectrum_features(X,k,alphabet)
    return features.dot(features.T)


def build_dic_voc(k):
    """Map every word of length ``k`` over "ACGT" to its rank in ``itertools.product`` order."""
    vocabulary={}
    for rank,letters in enumerate(product("ACGT",repeat=k)):
        vocabulary[''.join(letters)]=rank
    return vocabulary
    

def phi_X(X,voc):
    """Count the words of ``X`` into a float vector indexed by ``voc``.

    ``X`` is an iterable of words and ``voc`` maps each word to its position;
    a word missing from ``voc`` raises ``KeyError``.
    """
    positions=np.asarray([voc[word] for word in X],dtype=np.intp)
    counts=np.zeros(len(voc))
    # Unbuffered add, so that repeated positions are all counted.
    np.add.at(counts,positions,1)
    return counts
def spectrum_kernel2(X,k):
    """Return the spectrum kernel Gram matrix of the Series ``X``, using the dict-based feature map.

    Every sequence is cut into its overlapping k-mers, counted with ``phi_X``
    against the "ACGT" vocabulary from ``build_dic_voc``.
    """
    vocabulary=build_dic_voc(k)

    def kmers(sequence):
        return [sequence[start:start+k] for start in range(0, len(sequence)-k+1)]

    counts=X.apply(kmers).apply(phi_X,args=(vocabulary,))
    features=np.vstack(counts.to_numpy())
    return (features.dot(features.T))


def K_spectrum_kernel2(X,Y,k):
    """Return the spectrum kernel matrix between the Series ``X`` and ``Y`` (dict-based feature map).

    Entry (i, j) is the dot product of the k-mer count vectors of ``X[i]`` and
    ``Y[j]``, computed with ``phi_X`` against the "ACGT" vocabulary.
    """
    vocabulary=build_dic_voc(k)

    def kmers(sequence):
        return [sequence[start:start+k] for start in range(0, len(sequence)-k+1)]

    features_x=np.vstack(X.apply(kmers).apply(phi_X,args=(vocabulary,)).to_numpy())
    features_y=np.vstack(Y.apply(kmers).apply(phi_X,args=(vocabulary,)).to_numpy())
    return features_x.dot(features_y.T)
    
 
def phi_X_mismatch(x,voc,k,m,rate):
    """Return a weighted k-mer feature vector of the sequence ``x``.

    For every start position ``i``, each way of choosing ``k-1`` characters
    (order preserved) among the characters following ``x[i]`` in the window
    ``x[i+1:i+k+m]`` forms, together with ``x[i]``, a word of length ``k``;
    the feature of that word is increased by ``rate**power``.

    Parameters
    ----------
    x : sequence (string).
    voc : dict mapping each word of length ``k`` to its feature index.
    k : word length.
    m : number of extra characters allowed in the window.
    rate : base of the weight applied to each generated word.

    NOTE(review): the local array is named ``phi_X`` like the module-level
    function ``phi_X``; inside this function the name refers to the array.
    """
    phi_X=np.zeros(len(voc))
    for i in range(len(x)-k+1):
        # bo = largest value <= m such that a window of length k+bo starting at i fits in x.
        bo=m
        while bo>=0:
            if i+k+bo<=len(x):
                
                # power: exponent applied to the current combination; suite: value it is
                # reset to once it has reached bo (suite grows by one at every reset).
                power=0
                suite=1
                # NOTE(review): the slice uses m rather than bo; near the end of x it is
                # simply truncated by Python slicing -- confirm this is intended.
                for sequence in combinations(x[i+1:i+k+m],k-1):
                    u=''.join(sequence)
                    
                    phi_X[voc[x[i]+u]]+=rate**power
                    if power<bo:
                        power+=1
                    else:
                        power=suite
                        suite+=1
                # Leave the while loop: position i has been processed.
                bo=-1
            else:
                bo-=1
                             
    return phi_X
            
def mismatch_kernel(X,voc,k,m,rate):
    """Return the Gram matrix of the Series ``X`` under the ``phi_X_mismatch`` feature map."""
    per_sequence=X.apply(phi_X_mismatch,args=(voc,k,m,rate))
    features=np.vstack(per_sequence.to_numpy())
    return features.dot(features.T)

def K_mismatch_kernel(X,voc,k,m,rate,Y=None):
    """Return the kernel matrix between ``X`` and ``Y`` under the ``phi_X_mismatch`` feature map.

    Parameters
    ----------
    X : pandas Series of sequences (rows of the result).
    voc, k, m, rate : forwarded to ``phi_X_mismatch``.
    Y : optional pandas Series of sequences (columns of the result). When
        omitted the Gram matrix of ``X`` with itself is returned, which is
        what this function always did before ``Y`` was added.
    """
    features_X=np.vstack(X.apply(phi_X_mismatch,args=(voc,k,m,rate)).to_numpy())
    if Y is None:
        return features_X.dot(features_X.T)
    features_Y=np.vstack(Y.apply(phi_X_mismatch,args=(voc,k,m,rate)).to_numpy())
    return features_X.dot(features_Y.T)
    

## 1.Test des différents classifieurs

In [6]:
# Scratch check of the order in which itertools.combinations yields pairs.
print(list(combinations("ABCD",2)))

[('A', 'B'), ('A', 'C'), ('A', 'D'), ('B', 'C'), ('B', 'D'), ('C', 'D')]


### 1.0 Import des données et précalcul du kernel !!! (étape hyper importante) 

In [41]:
# Training sequences: the 'seq' column of each of the three training files.
X_0train=pd.read_csv('Data/Xtr0.csv', sep=',')['seq']
X_1train=pd.read_csv('Data/Xtr1.csv', sep=',')['seq']
X_2train=pd.read_csv('Data/Xtr2.csv', sep=',')['seq']

# Test sequences, same layout.
X_0test=pd.read_csv('Data/Xte0.csv', sep=',')['seq']
X_1test=pd.read_csv('Data/Xte1.csv', sep=',')['seq']
X_2test=pd.read_csv('Data/Xte2.csv', sep=',')['seq']

# Label tables (column 'Bound').
Y_0=pd.read_csv("Data/Ytr0.csv",sep=',')
Y_1=pd.read_csv("Data/Ytr1.csv",sep=',')
Y_2=pd.read_csv("Data/Ytr2.csv",sep=',')

# Recode the label 0 as -1; every other value of 'Bound' is kept as is.
Y_0train=np.where(Y_0["Bound"]==0,-1,Y_0["Bound"])
Y_1train=np.where(Y_1["Bound"]==0,-1,Y_1["Bound"])
Y_2train=np.where(Y_2["Bound"]==0,-1,Y_2["Bound"])

### J'ai stocké les matrices de kernel pour k allant de 2 à 9 dans un dictionnaire ; pour vous éviter de perdre du temps à les recalculer, je les ai mises dans des fichiers pickle et vous n'avez plus qu'à lancer la cellule suivante pour récupérer ce que j'ai fait entre les paires de 3 apostrophes (ce qu'il y a en rouge)

In [15]:
'''list_k=[2,3,4,5,6,7,8,9]
dic_K0={k:spectrum_kernel(X_0train,k) for k in tqdm(list_k)}
dic_K1={k:spectrum_kernel(X_1train,k) for k in tqdm(list_k)}
dic_K2={k:spectrum_kernel(X_2train,k) for k in tqdm(list_k)} '''

# Load the precomputed Gram matrices (one dict per data set, keyed by k).
# NOTE: pickle.load can execute arbitrary code; only load files you produced
# yourself or that come from a trusted source.
# The with-blocks close each file even if loading fails.
with open("stockage_dic_K0.pickle","rb") as fichier_stockage0:
    dic_K0=pickle.load(fichier_stockage0)

with open("stockage_dic_K1.pickle","rb") as fichier_stockage1:
    dic_K1=pickle.load(fichier_stockage1)

with open("stockage_dic_K2.pickle","rb") as fichier_stockage2:
    dic_K2=pickle.load(fichier_stockage2)



### 1.1 Tests de Kernel Ridge Regression

In [69]:
# Grid search for kernel ridge regression on the three data sets, over the
# regularisation lbd and the spectrum word length k (precomputed Gram matrices).
KRR=KernelRR(kernel="precomputed")
print(KRR)

hps0={'lbd':[10**-6,10**-5,10**-4,10**-3,10**-2,10**-1,1,10],'k':[2,3,4,5,6,7,8,9]}
searchKRR0= Cross_val_spectrum(dic_K0,X_0train,Y_0train,KRR,hps0)


hps1={'lbd':[10**-6,10**-5,10**-4,10**-3,10**-2,10**-1,1,10],'k':[2,3,4,5,6,7,8,9]}
searchKRR1= Cross_val_spectrum(dic_K1,X_1train,Y_1train,KRR,hps1)



hps2={'lbd':[10**-6,10**-5,10**-4,10**-3,10**-2,10**-1,1,10],'k':[2,3,4,5,6,7,8,9]}
searchKRR2= Cross_val_spectrum(dic_K2,X_2train,Y_2train,KRR,hps2)

0it [00:00, ?it/s]

KernelRR(kernel='precomputed')


64it [00:51,  1.24it/s]
64it [00:52,  1.22it/s]
64it [00:52,  1.22it/s]


In [70]:
# Settings ranked in the top 10 of the KRR grid search, data set 0.
présentation_résultat2(searchKRR0,10)

Unnamed: 0,lbd,k,mean_test_score
0,0.1,7,0.654
1,0.1,8,0.647
2,1.0,7,0.647
3,0.01,8,0.6435
4,0.0001,8,0.641
5,0.001,8,0.6405
6,1e-06,8,0.64
7,1e-05,8,0.64
8,0.1,9,0.64
9,0.01,9,0.6385


In [71]:
# Settings ranked in the top 10 of the KRR grid search, data set 1.
présentation_résultat2(searchKRR1,10)

Unnamed: 0,lbd,k,mean_test_score
0,1.0,6,0.658
1,0.1,6,0.649
2,1.0,5,0.6465
3,10.0,6,0.6405
4,1.0,7,0.6395
5,0.1,7,0.638
6,0.1,5,0.6365
7,10.0,5,0.6355
8,10.0,7,0.632
9,1.0,8,0.631


In [72]:
# Settings ranked in the top 10 of the KRR grid search, data set 2.
présentation_résultat2(searchKRR2,10)

Unnamed: 0,lbd,k,mean_test_score
0,0.1,7,0.737
1,0.01,7,0.734
2,0.1,6,0.7315
3,0.001,7,0.7275
4,0.0001,8,0.726
5,1e-06,8,0.7255
6,1e-05,8,0.7255
7,0.01,8,0.7255
8,0.0001,7,0.725
9,0.01,9,0.724


In [73]:
# Randomised refinement for KRR: k is fixed per data set and lbd is drawn
# uniformly from [loc, loc + scale], 200 draws per data set.
hps0={'lbd':uniform(loc=0.05,scale=1.2),'k':[7]}
searchKRR0= Randomized_Cross_val_spectrum(dic_K0,X_0train,Y_0train,KRR,hps0,200)

hps1={'lbd':uniform(loc=0.05,scale=1.5),'k':[6]}
searchKRR1= Randomized_Cross_val_spectrum(dic_K1,X_1train,Y_1train,KRR,hps1,200)


hps2={'lbd':uniform(loc=0.001,scale=0.5),'k':[7]}
searchKRR2= Randomized_Cross_val_spectrum(dic_K2,X_2train,Y_2train,KRR,hps2,200)

200it [02:31,  1.32it/s]
200it [02:28,  1.35it/s]
200it [02:28,  1.34it/s]


In [75]:
# Settings ranked in the top 10 of the randomised KRR search, data set 0.
présentation_résultat2(searchKRR0,10)

Unnamed: 0,lbd,k,mean_test_score
0,0.1111601564080122,7,0.6555
1,0.2887381428183806,7,0.6545
2,0.2886288046086186,7,0.6545
3,0.1878456240507801,7,0.6545
4,0.1893052077085477,7,0.6545
5,0.1768149826361781,7,0.6545
6,0.0990668607091159,7,0.6545
7,0.3035454384450984,7,0.654
8,0.1728668179491131,7,0.654
9,0.1022838990352923,7,0.654


In [76]:
# Settings ranked in the top 10 of the randomised KRR search, data set 1.
présentation_résultat2(searchKRR1,10)

Unnamed: 0,lbd,k,mean_test_score
0,1.0046766993880858,6,0.6585
1,1.0022300439630745,6,0.6585
2,1.128847105906943,6,0.658
3,1.1624823373575035,6,0.658
4,1.118720665169339,6,0.658
5,1.10641010633656,6,0.658
6,1.128599813229448,6,0.658
7,1.1198298035153174,6,0.658
8,1.113855088697709,6,0.658
9,1.1050015050740216,6,0.658


In [77]:
# Settings ranked in the top 10 of the randomised KRR search, data set 2.
présentation_résultat2(searchKRR2,10)

Unnamed: 0,lbd,k,mean_test_score
0,0.0592248691525737,7,0.743
1,0.0594232475683195,7,0.7425
2,0.0696619022501242,7,0.741
3,0.0503092626686699,7,0.7405
4,0.0494028323702597,7,0.7405
5,0.0457293649258126,7,0.74
6,0.0488049618294712,7,0.74
7,0.047938666765173,7,0.7395
8,0.0896499868333213,7,0.7395
9,0.074805082320527,7,0.7395


### 1.2 Tests de Kernel Logistic Regression

In [80]:
# Grid search for kernel logistic regression on the three data sets, over the
# regularisation lbd and the spectrum word length k (precomputed Gram matrices).
KLR=KernelLR(kernel="precomputed")
print(KLR)

hps0={'lbd':[10**-6,10**-5,10**-4,10**-3,10**-2,10**-1,1,10],'k':[2,3,4,5,6,7,8,9]}
searchKLR0= Cross_val_spectrum(dic_K0,X_0train,Y_0train,KLR,hps0)


hps1={'lbd':[10**-6,10**-5,10**-4,10**-3,10**-2,10**-1,1,10],'k':[2,3,4,5,6,7,8,9]}
searchKLR1= Cross_val_spectrum(dic_K1,X_1train,Y_1train,KLR,hps1)



hps2={'lbd':[10**-6,10**-5,10**-4,10**-3,10**-2,10**-1,1,10],'k':[2,3,4,5,6,7,8,9]}
searchKLR2= Cross_val_spectrum(dic_K2,X_2train,Y_2train,KLR,hps2)

0it [00:00, ?it/s]

KernelLR(kernel='precomputed')


64it [04:33,  4.28s/it]
64it [04:36,  4.31s/it]
64it [04:41,  4.39s/it]


In [81]:
# Settings ranked in the top 10 of the KLR grid search, data set 0.
présentation_résultat2(searchKLR0,10)

Unnamed: 0,lbd,k,mean_test_score
0,0.01,7,0.6525
1,0.1,7,0.648
2,1e-05,8,0.646
3,0.0001,8,0.646
4,0.001,7,0.6455
5,1e-06,8,0.644
6,0.01,8,0.6435
7,0.001,8,0.6425
8,0.0001,7,0.6405
9,0.1,8,0.6405


In [82]:
# Settings ranked in the top 10 of the KLR grid search, data set 1.
présentation_résultat2(searchKLR1,10)

Unnamed: 0,lbd,k,mean_test_score
0,0.1,6,0.6565
1,0.01,6,0.6495
2,0.1,5,0.6475
3,1.0,6,0.643
4,0.01,7,0.6405
5,0.1,7,0.64
6,10.0,6,0.6395
7,0.01,5,0.6375
8,1.0,5,0.6375
9,1.0,7,0.6325


In [83]:
# Settings ranked in the top 10 of the KLR grid search, data set 2.
présentation_résultat2(searchKLR2,10)

Unnamed: 0,lbd,k,mean_test_score
0,0.01,7,0.7385
1,0.001,7,0.738
2,1e-05,7,0.7365
3,0.0001,7,0.7355
4,1e-06,7,0.734
5,0.01,6,0.731
6,0.001,6,0.729
7,0.001,8,0.7265
8,1e-05,8,0.726
9,1e-06,8,0.7255


In [84]:
# Randomised refinement for KLR: k is fixed per data set and lbd is drawn
# uniformly from [loc, loc + scale], 100 draws per data set.
hps0={'lbd':uniform(loc=0.005,scale=0.5),'k':[7]}
searchKLR0= Randomized_Cross_val_spectrum(dic_K0,X_0train,Y_0train,KLR,hps0,100)

hps1={'lbd':uniform(loc=0.005,scale=0.5),'k':[6]}
searchKLR1= Randomized_Cross_val_spectrum(dic_K1,X_1train,Y_1train,KLR,hps1,100)


hps2={'lbd':uniform(loc=0.0005,scale=0.05),'k':[7]}
searchKLR2= Randomized_Cross_val_spectrum(dic_K2,X_2train,Y_2train,KLR,hps2,100)

100it [04:13,  2.54s/it]
100it [04:08,  2.49s/it]
100it [06:16,  3.77s/it]


In [85]:
# Settings ranked in the top 10 of the randomised KLR search, data set 0.
présentation_résultat2(searchKLR0,10)

Unnamed: 0,lbd,k,mean_test_score
0,0.0264942796243052,7,0.6555
1,0.0278311569144027,7,0.655
2,0.0320142215221474,7,0.6535
3,0.0152184261970016,7,0.6535
4,0.0406577133447053,7,0.6535
5,0.0206221047921693,7,0.6535
6,0.0332748188490289,7,0.653
7,0.0506578983917141,7,0.653
8,0.0350124981365166,7,0.6525
9,0.0108990586044125,7,0.6525


In [86]:
# Settings ranked in the top 10 of the randomised KLR search, data set 1.
présentation_résultat2(searchKLR1,10)

Unnamed: 0,lbd,k,mean_test_score
0,0.1250796395660564,6,0.6585
1,0.122085729251496,6,0.658
2,0.1419119916841419,6,0.6575
3,0.0904901133639264,6,0.6575
4,0.1184867036147113,6,0.6575
5,0.0880563965178216,6,0.657
6,0.1155073587474562,6,0.657
7,0.1573736352302941,6,0.656
8,0.1048262360321177,6,0.656
9,0.2342571316387444,6,0.656


In [87]:
# Settings ranked in the top 10 of the randomised KLR search, data set 2.
présentation_résultat2(searchKLR2,10)

Unnamed: 0,lbd,k,mean_test_score
0,0.0039989262731859,7,0.7455
1,0.004848356729876,7,0.7445
2,0.004674408398636,7,0.7445
3,0.0034464359758682,7,0.7445
4,0.0048536617516602,7,0.7445
5,0.0036187451393856,7,0.7445
6,0.003244202565624,7,0.7435
7,0.0029095758548116,7,0.7425
8,0.0028558892146783,7,0.7425
9,0.0064638326934942,7,0.742


### 1.3 Test de Kernel SVM

In [92]:
# Grid search for the kernel SVM on the three data sets, over the box bound C
# and the spectrum word length k (precomputed Gram matrices).
# NOTE(review): the recorded run of this cell was interrupted (KeyboardInterrupt)
# during the first search, so none of the three results was produced in that run.
KSVM=KernelSVM(kernel="precomputed")
print(KSVM)

hps0={'C':[10**-2,10**-1,1,10,50,100,500,1000],'k':[2,3,4,5,6,7,8,9]}
searchKSVM0= Cross_val_spectrum(dic_K0,X_0train,Y_0train,KSVM,hps0)


hps1={'C':[10**-2,10**-1,1,10,50,100,500,1000],'k':[2,3,4,5,6,7,8,9]}
searchKSVM1= Cross_val_spectrum(dic_K1,X_1train,Y_1train,KSVM,hps1)



hps2={'C':[10**-2,10**-1,1,10,50,100,500,1000],'k':[2,3,4,5,6,7,8,9]}
searchKSVM2= Cross_val_spectrum(dic_K2,X_2train,Y_2train,KSVM,hps2)

0it [00:00, ?it/s]

KernelSVM(kernel='precomputed')


41it [18:42, 27.39s/it]


KeyboardInterrupt: 

In [None]:
# Settings ranked in the top 10 of the SVM grid search, data set 0 (cell not executed in the recorded run).
présentation_résultat2(searchKSVM0,10)

In [None]:
# Settings ranked in the top 10 of the SVM grid search, data set 1 (cell not executed in the recorded run).
présentation_résultat2(searchKSVM1,10)

In [None]:
# Settings ranked in the top 10 of the SVM grid search, data set 2 (cell not executed in the recorded run).
présentation_résultat2(searchKSVM2,10)

In [None]:
# Randomised refinement for the SVM: k is fixed per data set and C is drawn
# uniformly from [0.005, 0.505], 50 draws per data set.
# The training inputs (X_?train) were missing from the three calls, which
# shifted every positional argument and raised a TypeError.
hps0={'C':uniform(loc=0.005,scale=0.5),'k':[8]}
searchKSVM0= Randomized_Cross_val_spectrum(dic_K0,X_0train,Y_0train,KSVM,hps0,50)


hps1={'C':uniform(loc=0.005,scale=0.5),'k':[5]}
searchKSVM1= Randomized_Cross_val_spectrum(dic_K1,X_1train,Y_1train,KSVM,hps1,50)



hps2={'C':uniform(loc=0.005,scale=0.5),'k':[7]}
searchKSVM2= Randomized_Cross_val_spectrum(dic_K2,X_2train,Y_2train,KSVM,hps2,50)


In [None]:
# Settings ranked in the top 10 of the randomised SVM search, data set 0.
présentation_résultat2(searchKSVM0,10)

In [None]:
# Settings ranked in the top 10 of the randomised SVM search, data set 1.
présentation_résultat2(searchKSVM1,10)

In [None]:
# Settings ranked in the top 10 of the randomised SVM search, data set 2.
présentation_résultat2(searchKSVM2,10)

# 2 Élaboration du modèle final et création du fichier csv

### 2.1 Précalcul du kernel test_train selon le modèle choisi

In [88]:
# Final models: one kernel logistic regression per data set, with a fixed lbd
# and spectrum word length k for each.
list_k_chosen=[7,6,7]
model0=KernelLR(lbd=0.02649427962430528,k=list_k_chosen[0],kernel="precomputed")
model1=KernelLR(lbd=0.12507963956605644,k=list_k_chosen[1],kernel="precomputed")
model2=KernelLR(lbd=0.003998926273185998,k=list_k_chosen[2],kernel="precomputed")
# Train/train Gram matrices, taken from the precomputed dictionaries.
list_K_train=[dic_K0[list_k_chosen[0]],dic_K1[list_k_chosen[1]],dic_K2[list_k_chosen[2]]]
# Test/train kernel matrices (rows: test sequences, columns: training sequences), computed here.
list_K_train_test=[K_spectrum_kernel(X_test,X_train,k) for X_test,X_train,k in tqdm(zip([X_0test,X_1test,X_2test],[X_0train,X_1train,X_2train],list_k_chosen))]




3it [03:04, 61.51s/it]


In [90]:
# Sanity check: print the shape of each test/train kernel matrix.
for k in list_K_train_test:
    print(k.shape)

(1000, 2000)
(1000, 2000)
(1000, 2000)


In [91]:
# Fit the three final models and write their stacked predictions to Predictions/kernel_spectrum_try3.csv.
csv_file_string_kernel([model0,model1,model2],"kernel_spectrum_try3.csv")