# With encoded sequences 

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from models import kernel_ridge_regression, kernel_spectral_clustering, kernel_logistic_regression
from preprocess import preprocess 


In [24]:
df_train, labels_train, labels__train_onehot = preprocess("data/Xtr0_mat100.csv", "data/Xtr1_mat100.csv", "data/Xtr1_mat100.csv", labels_1="data/Ytr0.csv", labels_2="data/Ytr1.csv", labels_3="data/Ytr2.csv", numeric_data=True)

df_test = preprocess("data/Xte0_mat100.csv", "data/Xte1_mat100.csv", "data/Xte1_mat100.csv", train=False, numeric_data=True)

In [101]:
# On télécharge les données d'entrainements
df_1 = pd.read_csv("data/Xtr0_mat100.csv", sep=" ", header=None)
df_2 = pd.read_csv("Xtr1_mat100.csv", sep=" ", header=None)
df_3 = pd.read_csv("Xtr2_mat100.csv", sep=" ", header=None)

labels_1 = pd.read_csv("Ytr0.csv")
labels_1_train = labels_1.loc[:, 'Bound'].to_numpy()

labels_2 = pd.read_csv("Ytr1.csv")
labels_2_train = labels_2.loc[:, 'Bound'].to_numpy()

labels_3 = pd.read_csv("Ytr2.csv")
labels_3_train = labels_3.loc[:, 'Bound'].to_numpy()

# On encode en faisant un one hot encoding 
labels_train = np.concatenate((labels_1_train, labels_2_train, labels_3_train))
encoder = OneHotEncoder(sparse_output=False)
labels__train_onehot = encoder.fit_transform(labels_train.reshape(-1, 1))

# On télécharge les données de tests
df_1_test = pd.read_csv("Xte0_mat100.csv", sep=" ", header=None)
df_2_test = pd.read_csv("Xte1_mat100.csv", sep=" ", header=None)
df_3_test = pd.read_csv("Xte2_mat100.csv", sep=" ", header=None)

# On concatène les données d'entrainement 
df_train = pd.concat([df_1, df_2, df_3], axis=0)

# On concatène les données de tests
df_test = pd.concat([df_1_test, df_2_test, df_3_test], axis=0)

df_2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.000000,0.010870,0.010870,0.043478,0.032609,0.000000,0.000000,0.010870,0.000000,0.054348,...,0.000000,0.000000,0.010870,0.000000,0.032609,0.000000,0.000000,0.000000,0.021739,0.010870
1,0.010870,0.010870,0.000000,0.010870,0.000000,0.010870,0.021739,0.010870,0.010870,0.010870,...,0.043478,0.021739,0.021739,0.021739,0.000000,0.021739,0.065217,0.010870,0.000000,0.000000
2,0.010870,0.010870,0.000000,0.000000,0.000000,0.000000,0.010870,0.000000,0.021739,0.000000,...,0.000000,0.010870,0.000000,0.000000,0.000000,0.010870,0.000000,0.000000,0.021739,0.021739
3,0.021739,0.021739,0.000000,0.021739,0.021739,0.010870,0.000000,0.000000,0.000000,0.010870,...,0.000000,0.021739,0.000000,0.021739,0.000000,0.021739,0.010870,0.000000,0.010870,0.010870
4,0.000000,0.000000,0.021739,0.000000,0.000000,0.000000,0.010870,0.000000,0.010870,0.000000,...,0.000000,0.010870,0.021739,0.010870,0.000000,0.000000,0.282609,0.010870,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.010870,0.010870,0.010870,0.000000,0.010870,0.021739,0.000000,0.010870,0.000000,0.021739,...,0.000000,0.000000,0.010870,0.000000,0.010870,0.010870,0.000000,0.021739,0.032609,0.021739
1996,0.000000,0.000000,0.032609,0.010870,0.032609,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.010870,0.010870,0.000000,0.000000,0.000000,0.032609,0.000000
1997,0.000000,0.000000,0.010870,0.000000,0.000000,0.021739,0.010870,0.000000,0.000000,0.010870,...,0.000000,0.010870,0.000000,0.021739,0.010870,0.010870,0.000000,0.010870,0.010870,0.043478
1998,0.000000,0.000000,0.021739,0.010870,0.000000,0.021739,0.000000,0.021739,0.010870,0.021739,...,0.000000,0.010870,0.000000,0.000000,0.010870,0.000000,0.032609,0.010870,0.032609,0.010870


### Kernel ridge regression

In [25]:
# Training data
X = df_train

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, labels__train_onehot, test_size=0.2, random_state=42)

In [44]:
model = kernel_ridge_regression(lambda_=0.15, kernel='RBF', num_classes=2, sigma=0.1)

model.fit(X_train, y_train)

In [45]:
y_pred = model.predict(X_test)

print('the accuracy score for a kernel ridge regression with gaussian kernel is {}'.format(accuracy_score(np.argmax(y_test, axis=1), y_pred)))

the accuracy score for a kernel ridge regression with gaussian kernel is 0.535


#### cross-validation for the Ridge regression

In [9]:
ridge = kernel_ridge_regression(kernel='RGB', num_classes=2)

param_grid = {
    'lambda_': [0.1, 0.15, 0.3, 1],
    'sigma' : [0.05, 0.1, 0.15],
    'kernel': ['RBF']
}

def accuracy_ridge(y_true, y_pred):
    y_final = np.argmax(y_true, axis=1)
    return accuracy_score(y_final, y_pred)

ridge_score = make_scorer(accuracy_ridge, greater_is_better=True)

grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring=ridge_score, n_jobs=-1)

In [None]:
grid_search.fit(X_train, y_train)

In [34]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print('the best accuracy reaches {} and is computed with these parameters : {}'.format(best_score, best_params))

{'lambda_': 0.15, 'sigma': 0.1}

### Kernel logistic regression 

In [46]:
# Training data
X = df_train

# Split train/test (classes are -1 or 1 here)
X_train, X_test, y_train, y_test = train_test_split(X, labels_train, test_size=0.2, random_state=42)

# classes are -1 or +1
y_train[y_train == 0] = - 1
y_test[y_test == 0] = - 1

In [61]:
model = kernel_logistic_regression(alpha0_coeff = 0, lambda_=10, kernel='RBF', n_iter=50, sigma=0.1)

model.fit(X_train, y_train)

In [62]:
y_pred = model.predict(X_test)

print('the accuracy score here is {}'.format(accuracy_score(y_test, y_pred)))

the accuracy score here is 0.5325


#### Cross-validation for the kernel Logistic regression

In [157]:
logistic_gaussian= kernel_logistic_regression(num_classes=2)

param_grid = {
    'alpha0_coeff' : [0.1, 0.5, 1, 10],
    'lambda_': [0.1, 0.5, 1, 10],
    'sigma' : [0.05, 0.1, 0.15, 0.2, 0.3],
    'kernel':['RBF']
}
logistic_gaussian.set_params(kernel='RBF')

def accuracy_logistic(y_true, y_pred):
    
    y_final = y_true
    y_final[y_true == 0] = -1
    y_pred[y_pred > 0] = 1
    y_pred[y_pred < 0] = -1
    return accuracy_score(y_final, y_pred)

logistic_score = make_scorer(accuracy_logistic, greater_is_better=True)

grid_search = GridSearchCV(logistic_gaussian, param_grid, cv=5, scoring=logistic_score, n_jobs=-1) #au dessus de 0.57?

In [158]:
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_params

{'alpha0_coeff': 0.1, 'kernel': 'RBF', 'lambda_': 0.5, 'sigma': 0.1}

In [159]:
best_score = grid_search.best_score_
print(best_score)
best_params = grid_search.best_params_
best_params 

0.5377083333333333


{'alpha0_coeff': 0.1, 'kernel': 'RBF', 'lambda_': 0.5, 'sigma': 0.1}

### KMeans avec deux centres suivis de deux modèles de ridge regression

In [3]:
import numpy as np

In [4]:
# Données d'entrainement
X = df_train

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, labels_train, test_size=0.2, random_state=42)

In [14]:
kmeans = kernel_spectral_clustering(kernel='RGB', k_neighbors = 3, sigma=0.1)

train_classes = kmeans.classification_train_set(X_train)


(4800, 3)


In [15]:
test_classes = kmeans.predict_class_test(X_test)

In [126]:
model_cluster_1 = kernel_ridge_regression(lambda_=0.05, kernel='RGB', num_classes=2, sigma=0.1) 
model_cluster_1.fit(X_train.loc[(train_classes == 0), :], y_train[train_classes == 0])

In [16]:
model = kernel_logistic_regression()
param_grid = {
    'n_iter' : [8],
    'alpha0_coeff' : [10, 1],
    'lambda_': [0.05, 0.1, 0.5, 1, 10],
    'sigma' : [0.01, 0.07, 0.1, 0.5, 1]
}

def accuracy_ridge(y_true, y_pred):
    y_final = np.argmax(y_true, axis=1)
    return accuracy_score(y_final, np.argmax(y_pred, axis=1))

def accuracy_logistic(y_true, y_pred):
    y_true[y_true == 0] = - 1

    y_pred[y_pred > 0] = 1
    y_pred[y_pred < 0] = -1

    return accuracy_score(y_true, y_pred)

logistic_score = make_scorer(accuracy_logistic, greater_is_better=True)

grid_search = GridSearchCV(model, param_grid, cv=5, scoring=logistic_score, n_jobs=-1)

In [19]:
X_train_1 = X_train.loc[(train_classes == 0) | (train_classes == 2), :]
y_train_1 = y_train[(train_classes == 0)| (train_classes == 2)]

X_train_2 = X_train.loc[(train_classes == 1), :]
y_train_2 = y_train[(train_classes == 1)]



print(len(y_train_1), len(y_train_2))

2371 2429


In [20]:
grid_search.fit(X_train_1, y_train_1) #0.662
best_params = grid_search.best_params_
print(best_params)
best_score1 = grid_search.best_score_
print(best_score1)

{'alpha0_coeff': 10, 'lambda_': 0.5, 'n_iter': 8, 'sigma': 0.01}
0.6157637130801688


In [21]:
grid_search.fit(X_train_2, y_train_2) #0.617
best_params = grid_search.best_params_
print(best_params)
best_score2 = grid_search.best_score_
print(best_score2)

{'alpha0_coeff': 1, 'lambda_': 0.05, 'n_iter': 8, 'sigma': 0.01}
0.5928361121717365


In [217]:
#0.643077456075229
grid_search.fit(X_train_3, y_train_3)
best_params = grid_search.best_params_
print(best_params)
best_score3 = grid_search.best_score_
print(best_score3)

{'lambda_': 0.2, 'sigma': 0.1}
0.643077456075229


In [22]:
#0.6304514875397231 pour ridge 
print('the average accuracy is {}'.format((best_score1 * len(y_train_1) + best_score2 * len(y_train_2)) / len(y_train)))

the average accuracy is 0.6041613917037975


### Calcul sur les données de tests

In [224]:
X_test_1 = X_test.loc[(test_classes == 0), :]
y_test_1 = y_test[(test_classes == 0)]

X_test_2 = X_test.loc[(test_classes == 1), :]
y_test_2 = y_test[(test_classes == 1)]

X_test_3 = X_test.loc[(test_classes == 2), :]
y_test_3 = y_test[(test_classes == 2)]

print(len(y_test_1))

0


In [221]:
model_cluster_1 = kernel_ridge_regression(lambda_=0.8, kernel='RGB', num_classes=2, sigma=0.1) 
model_cluster_1.fit(X_train_1, y_train_1)

model_cluster_2 = kernel_ridge_regression(lambda_=1, kernel='RGB', num_classes=2, sigma=0.07) 
model_cluster_2.fit(X_train_2, y_train_2)

model_cluster_3 = kernel_ridge_regression(lambda_=0.2, kernel='RGB', num_classes=2, sigma=0.1) 
model_cluster_3.fit(X_train_3, y_train_3)

In [None]:
#y_pred_1 = model_cluster_1.predict(X_test_1)
y_pred_2 = model_cluster_2.predict(X_test_2)
y_pred_3 = model_cluster_3.predict(X_test_3)

#y_pred_1 = np.argmax(y_pred_1, axis=1)
y_pred_2 = np.argmax(y_pred_2, axis=1)
y_pred_3 = np.argmax(y_pred_3, axis=1)

#print(accuracy_score(np.argmax(y_test_1, axis=1), y_pred_1))
print(accuracy_score(np.argmax(y_test_2, axis=1), y_pred_2))
print(accuracy_score(np.argmax(y_test_3, axis=1), y_pred_3))

print((accuracy_score(np.argmax(y_test_1, axis=1), y_pred_1) * len(y_pred_1) + accuracy_score(np.argmax(y_test_2, axis=1), y_pred_2) * len(y_pred_2) + accuracy_score(np.argmax(y_test_3, axis=1), y_pred_3) * len(y_pred_3)) / len(y_test))

0.5918057663125948
0.6303142329020333
0.6091666666666666


In [None]:
class_0 = np.where(test_classes == 0)[0]
class_1 = np.where(test_classes == 1)[0]

y_pred[class_0] = y_pred_1
y_pred[class_1] = y_pred_2
print(accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))

### Sauvegarde des données de tests en fichiers csv

In [110]:
model_cluster_2 = kernel_ridge_regression(lambda_=10, kernel='RGB', num_classes=2, sigma=0.1)

model_cluster_2.fit(X_train.loc[(train_classes == 1), :], y_train[(train_classes == 1)])

In [125]:
y_pred_1 = model_cluster_1.predict(X_test.loc[(test_classes == 0), :])
y_pred_1= np.argmax(y_pred_1, axis=1)
y_test_1 = y_test[test_classes == 0]

Acc1 = accuracy_score(np.argmax(y_test_1, axis=1), y_pred_1) * len(y_pred_1)

print(Acc1 / len(y_pred_1))

0.5935727788279773


In [None]:
y_pred_2 = model_cluster_2.predict(X_test.loc[(test_classes == 1), :])
y_pred_2 = np.argmax(y_pred_2, axis=1)
y_test_2 = y_test[(test_classes == 1)]

Acc2 = accuracy_score(np.argmax(y_test_2, axis=1), y_pred_2) * len(y_pred_2)

print(Acc2 / len(y_pred_2))

print('The final accuracy is {}'.format((Acc1 + Acc2) / test_classes.shape[0]))

0.6214605067064084
The final accuracy is 0.6166666666666667


# With string sequences 

### Kernel ridge regression

In [2]:
df_train, labels_train, labels__train_onehot = preprocess("data/Xtr0.csv", "data/Xtr1.csv", "data/Xtr2.csv", labels_1="data/Ytr0.csv", labels_2="data/Ytr1.csv", labels_3="data/Ytr2.csv")

df_test = preprocess("data/Xte0.csv", "data/Xte1.csv", "data/Xte2.csv", train=False)

In [66]:
# Données d'entrainement
X = df_train
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, labels__train_onehot, test_size=0.2, random_state=42)

In [67]:
model2 = kernel_ridge_regression(lambda_=0.4, kernel='spectrum', num_classes=2, k=4)

model2.fit(X_train, y_train)

0.60916 avec k = 7 et lambda=0.1

0.60083 avec k = 5 et lambda = 0.5

0.56 avec k = 3 et lambda = 5 

0.57416 avec k = 3 et lambda = 0.1

In [69]:
y_pred = model2.predict(X_test)

print(accuracy_score(np.argmax(y_test, axis=1), y_pred)) 

0.6025


#### Cross-validation for the kernel ridge regression with DNA sequences

In [10]:
ridge_spectrum = kernel_ridge_regression(num_classes=2)

param_grid = {
    'lambda_': [0.3, 0.4, 0.5, 0.6],
    'k' : [7],
    'kernel': ['spectrum']
}
ridge_spectrum.set_params(kernel='spectrum')

def accuracy_ridge(y_true, y_pred):
    y_final = np.argmax(y_true, axis=1)
    return accuracy_score(y_final, np.argmax(y_pred, axis=1))

ridge_score = make_scorer(accuracy_ridge, greater_is_better=True)

grid_search = GridSearchCV(ridge_spectrum, param_grid, cv=5, scoring=ridge_score, n_jobs=-1)

In [11]:
grid_search.fit(X_train, y_train)

In [12]:
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print('the best accuracy reaches {} and it is obtain with these parameters {}'.format(best_score, best_params))

0.6314583333333333


{'k': 7, 'kernel': 'spectrum', 'lambda_': 0.3}

*Ridge regression obtain 0.6302 with k=5, lambda=0.4 for param_grid = {
    'lambda_': [0.3, 0.4, 0.5, 0.6],
    'k' : [3, 4, 5, 6],
}*

#### Save the labels of the test set

In [None]:
# Entraienement du modele
model = kernel_ridge_regression(lambda_=0.3, kernel='spectrum', num_classes=2, k=7)
model.fit(X_train, y_train)

y_pred = model.predict(df_test)

In [None]:
# Prediction
data = y_pred 

# Creation of DataFrame 
df = pd.DataFrame({
    "Id": np.arange(0, 3000, 1),
    "Bound": data
})
#df.loc[:, 'Id'] = df_test.loc[:, 'Id']

# Save in a CSV file
df.to_csv("output_ridge_spectrum.csv", index=False)

### Kernel Logistic regression

In [3]:
# Données d'entrainement
X = df_train
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, labels_train, test_size=0.2, random_state=42)

# Classes here are -1 or 1
y_train[y_train == 0] = - 1
y_test[y_test == 0] = - 1

In [30]:
y_train[y_train == 0] = - 1
y_test[y_test == 0] = - 1
                                                                #ou 0.01 pour 0.5833333334 /0.5775 pour n_iter=500
model = kernel_logistic_regression(alpha0_coeff = 0, lambda_ = 0.01, k=3, kernel='spectrum', n_iter=100, threshold=0)

model.fit(X_train, y_train)

In [31]:
y_pred = model.predict(X_test)

print(accuracy_score(y_test, y_pred)) #0.5258 auparavant avec n_iter=8

0.58


#### Cross-validation for the kernel Logistic regression with DNA sequences

In [7]:
logistic_spectrum = kernel_logistic_regression(num_classes=2)

param_grid = {
    'alpha_coeff' : [0],
    'lambda_': [0.01, 0.05, 0.1, 0.15],
    'k' : [3],
    'kernel' : ['spectrum'],
    'n_iter': [100]
}

def accuracy_logistic(y_true, y_pred):

    y_true[y_true == 0] = -1

    return accuracy_score(y_true, y_pred)

ridge_score = make_scorer(accuracy_logistic, greater_is_better=True)

grid_search = GridSearchCV(logistic_spectrum, param_grid, cv=4, scoring=ridge_score, n_jobs=-1)

In [8]:
grid_search.fit(X_train, y_train)

In [9]:
best_score = grid_search.best_score_
print(best_score)

best_params = grid_search.best_params_
best_params
#0.5614 pour {'alpha_coeff': 0.1, 'k': 3, 'kernel': 'spectrum', 'lambda_': 1.5}
#0.5664583333333333 pour {'alpha_coeff': 0.1, 'k': 3, 'kernel': 'spectrum', 'lambda_': 5}

#0.5664583333333333
#{'alpha_coeff': 0.005, 'k': 3, 'kernel': 'spectrum', 'lambda_': 5}

0.5608333333333333


{'alpha_coeff': 0, 'k': 3, 'kernel': 'spectrum', 'lambda_': 0.1, 'n_iter': 100}

### Staking model to compute both models

In [33]:
from staking_model import StackingModel

In [34]:
# Données d'entrainement
X = df_train

# Split train/test
X_train, X_test, y_train_onehot, y_test_onehot = train_test_split(X, labels__train_onehot, test_size=0.2, random_state=42)

_, _, y_train, y_test = train_test_split(X, labels_train, test_size=0.2, random_state=42)

In [None]:
base_models = [kernel_ridge_regression(lambda_=0.3, kernel='spectrum', num_classes=2, k=7),
               kernel_logistic_regression(alpha0_coeff = 0, lambda_ = 10, k=3, kernel='spectrum', n_iter=100)]
meta_model = kernel_ridge_regression(lambda_=0.1, kernel='linear', num_classes=2)

stak_model = StackingModel(base_models, meta_model)

stak_model.fit(X_train, y_train, y_train_onehot)

y_pred = stak_model.predict(X_test)

In [36]:
print('the accuracy score obtain is {}'.format(accuracy_score(y_pred, y_test)))

the accuracy score obtain is 0.605, which is less than the kernel ridge regression model
