In [None]:
import os
from tqdm.notebook import tqdm as tqdm
import numpy as np
import pandas as pd
from kernel import *
from classifiers import *
from utils.utils import return_training_datasets, split_train_val

In [None]:
def return_inference_datasets(model_dir):
    """Read the three test CSV files stored in ``model_dir``.

    Args:
        model_dir: directory containing ``Xte0.csv``, ``Xte1.csv`` and ``Xte2.csv``.

    Returns:
        A list ``[Xte0, Xte1, Xte2]`` of pandas DataFrames, in that order.
    """
    # Files are read in index order so that position i in the list is dataset i.
    return [
        pd.read_csv(os.path.join(model_dir, f"Xte{idx}.csv"))
        for idx in range(3)
    ]

In [None]:
# Load the three test DataFrames (Xte0, Xte1, Xte2) as a list indexed by dataset number.
all_x_test = return_inference_datasets('./kernel_data')

In [None]:
# Load the training inputs and labels with the project helper from utils.utils;
# both results are indexed by dataset number (0, 1, 2) in the sections below.
all_x, all_y = return_training_datasets('./kernel_data')

# ----XTR0-----

In [None]:
# Work on dataset 0: its training inputs/labels and the matching test frame.
X, y = all_x[0], all_y[0]
X_infer = all_x_test[0]
# Hold out a validation part; the split rule lives in utils.utils (not shown here).
x_train, x_val, y_train, y_val = split_train_val(X,y)

In [None]:
# Configurations to refit for dataset 0 (presumably picked by an earlier search — not shown here).
# SVM tuples are unpacked as (C, k) and LR tuples as (lambda_, k); k is passed to SpectrumKernel.
best_params_SVM = [(0.1, 10), (0.01, 11)]
best_params_LR = [(0.005, 11), (0.01, 7)]

In [None]:
# Refit one SVM per selected (C, k) pair on dataset 0, reusing kernels saved on disk.
models_SVM = []
for C, k in best_params_SVM:
  Kernel = SpectrumKernel(k=k, add_inverse=True)
  clf = SVM(Kernel, C=C, save_dir='./kernel_data/', version="Xtr0")
  # Saved kernels are named "<kernel name>_<version>.pkl" inside the classifier's save_dir.
  path_2_load = os.path.join(clf.save_dir, f"{clf.kernel.name}_{clf.version}.pkl")
  # Swap the freshly built kernel for the one loaded from disk before fitting.
  clf.kernel = Kernel.load_kernel(path_2_load)
  # NOTE(review): _fit receives only the labels — presumably it relies on the
  # loaded kernel instead of recomputing it from the inputs; confirm in classifiers.
  clf._fit(y_train.Bound)
  models_SVM.append(clf)

 1: -1.6107e+01 -1.5883e+02  2e+02  3e-01  3e-16
 2: -1.5481e+01 -5.8772e+01  5e+01  5e-02  3e-16
 3: -1.6122e+01 -1.9004e+01  3e+00  1e-03  4e-16
 4: -1.6192e+01 -1.6379e+01  2e-01  7e-05  3e-16
 5: -1.6197e+01 -1.6212e+01  1e-02  5e-06  2e-16
 6: -1.6198e+01 -1.6198e+01  5e-04  1e-07  2e-16
 7: -1.6198e+01 -1.6198e+01  1e-05  2e-09  2e-16
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -1.6437e+01 -3.2960e+01  3e+03  6e+01  7e-16
 1: -1.6429e+01 -3.2569e+01  2e+02  2e+00  2e-16
 2: -1.5781e+01 -3.0203e+01  4e+01  5e-01  2e-16
 3: -1.4823e+01 -2.1500e+01  7e+00  1e-02  5e-16
 4: -1.5422e+01 -1.6151e+01  7e-01  1e-03  2e-16
 5: -1.5675e+01 -1.5756e+01  8e-02  2e-05  2e-16
 6: -1.5720e+01 -1.5725e+01  5e-03  3e-07  2e-16
 7: -1.5723e+01 -1.5724e+01  2e-04  5e-09  2e-16
 8: -1.5724e+01 -1.5724e+01  1e-05  1e-10  1e-16
Optimal solution found.


In [None]:
# Refit one kernel logistic regression per selected (lambda_, k) pair on dataset 0,
# reusing kernels saved on disk.
models_LR = []
for lmbd, k in best_params_LR:
  Kernel = SpectrumKernel(k=k, add_inverse=True)
  # Use the same relative data directory as the CSV loading and the dataset-0 SVM cell,
  # instead of an absolute Google Drive path that only exists on a mounted Colab session.
  clf = KernelLogisticRegression(Kernel, lambda_=lmbd, save_dir='./kernel_data/', max_iter=10000, version="Xtr0")
  # Saved kernels are named "<kernel name>_<version>.pkl" inside the classifier's save_dir.
  path_2_load = os.path.join(clf.save_dir, f"{clf.kernel.name}_{clf.version}.pkl")
  # Swap the freshly built kernel for the one loaded from disk before fitting.
  clf.kernel = Kernel.load_kernel(path_2_load)
  # NOTE(review): _fit receives only the labels — presumably it relies on the
  # loaded kernel instead of recomputing it from the inputs; confirm in classifiers.
  clf._fit(y_train.Bound)
  models_LR.append(clf)

Minimum tolerance between two iterations reached
Minimum tolerance between two iterations reached


In [None]:
# Collect the validation-set outputs of every refitted model (SVMs first, then LRs).
# predict_proba returns a pair; its first element is stored in predictions_proba,
# its second in predictions_brut, and predict() output in predictions_net.
predictions_proba, predictions_brut, predictions_net = [], [], []
for fitted in models_SVM + models_LR:
  probs, brut = fitted.predict_proba(x_val)
  predictions_proba.append(probs)
  predictions_brut.append(brut)
  predictions_net.append(fitted.predict(x_val))

## Ensemble

We will just take the best model.

In [None]:
# Dataset 0 uses a single model: the first refitted logistic regression.
best_model = models_LR[0]
# Only the first element returned by predict_proba is kept; the second is discarded.
y_proba, _ = best_model.predict_proba(X_infer)
# Threshold at 0.5 to obtain 0/1 labels.
y_infer = np.where(y_proba >= 0.5, 1, 0)

In [None]:
# Preview the first rows of the dataset-0 test frame.
X_infer.head()

Unnamed: 0,Id,seq
0,0,AAGGCCGAGCCCGGCGCGGACGCAGGCGGCTCCGGGCGGGCTCAGC...
1,1,TCTGGGCTCTTAATGTAAAGGTTGCCACTGATGCTGTGTCACCAGC...
2,2,GCCCGCACCGCTGGGCTATTTTTAGCGTCCACTAAACTTAGCCGAC...
3,3,GAGGCGCTGGCAATGGACTAGGAAGCTCGGCTGCCGCTGCTACTGC...
4,4,GGCGAGACTCCATCTCTACAGAAAATTTTTTTAAAAATTAGCTGGA...


In [None]:
# Submission rows for dataset 0: test Ids paired with the predicted 0/1 labels.
xte0 = pd.DataFrame({'Id': X_infer.Id, 'Bound': y_infer})
xte0.head()

Unnamed: 0,Id,Bound
0,0,1
1,1,1
2,2,0
3,3,0
4,4,0


# ----XTR1-----

In [None]:
# Work on dataset 1: its training inputs/labels and the matching test frame.
X, y = all_x[1], all_y[1]
X_infer = all_x_test[1]
# Hold out a validation part; the split rule lives in utils.utils (not shown here).
x_train, x_val, y_train, y_val = split_train_val(X,y)

In [None]:
# Configurations to refit for dataset 1 (presumably picked by an earlier search — not shown here).
# SVM tuples are unpacked as (C, k) and LR tuples as (lambda_, k); k is passed to SpectrumKernel.
best_params_SVM = [(0.1, 7), (1, 7)]
best_params_LR = [(0.01, 7), (0.005, 7)]

In [None]:
# Refit one SVM per selected (C, k) pair on dataset 1, reusing kernels saved on disk.
models_SVM = []
for C, k in best_params_SVM:
  Kernel = SpectrumKernel(k=k, add_inverse=True)
  # Use the same relative data directory as the CSV loading cells, instead of an
  # absolute Google Drive path that only exists on a mounted Colab session.
  clf = SVM(Kernel, C=C, save_dir='./kernel_data/', version="Xtr1")
  # Saved kernels are named "<kernel name>_<version>.pkl" inside the classifier's save_dir.
  path_2_load = os.path.join(clf.save_dir, f"{clf.kernel.name}_{clf.version}.pkl")
  # Swap the freshly built kernel for the one loaded from disk before fitting.
  clf.kernel = Kernel.load_kernel(path_2_load)
  # NOTE(review): _fit receives only the labels — presumably it relies on the
  # loaded kernel instead of recomputing it from the inputs; confirm in classifiers.
  clf._fit(y_train.Bound)
  models_SVM.append(clf)

     pcost       dcost       gap    pres   dres
 0: -1.8979e+01 -1.8666e+02  4e+03  1e+01  1e-15
 1: -1.8847e+01 -1.6468e+02  3e+02  5e-01  9e-16
 2: -1.7935e+01 -7.1511e+01  6e+01  8e-02  1e-15
 3: -1.8532e+01 -2.4177e+01  6e+00  5e-05  2e-15
 4: -1.8794e+01 -1.9251e+01  5e-01  4e-06  8e-16
 5: -1.8824e+01 -1.8852e+01  3e-02  2e-07  8e-16
 6: -1.8827e+01 -1.8832e+01  5e-03  3e-08  8e-16
 7: -1.8827e+01 -1.8828e+01  2e-04  2e-10  9e-16
 8: -1.8827e+01 -1.8827e+01  3e-06  3e-12  9e-16
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -1.4718e+01 -1.7325e+03  7e+03  1e+00  2e-15
 1: -7.0043e+00 -5.7828e+02  8e+02  7e-02  2e-15
 2: -1.2260e+01 -1.3338e+02  1e+02  1e-02  1e-15
 3: -1.8105e+01 -2.6507e+01  8e+00  2e-16  2e-15
 4: -1.8771e+01 -1.9386e+01  6e-01  2e-16  1e-15
 5: -1.8820e+01 -1.8875e+01  6e-02  2e-16  8e-16
 6: -1.8827e+01 -1.8831e+01  4e-03  2e-16  9e-16
 7: -1.8827e+01 -1.8828e+01  1e-04  2e-16  9e-16
 8: -1.8827e+01 -1.8827e+01  3e-06  2e-16  8e-1

In [None]:
# Refit one kernel logistic regression per selected (lambda_, k) pair on dataset 1,
# reusing kernels saved on disk.
models_LR = []
for lmbd, k in best_params_LR:
  Kernel = SpectrumKernel(k=k, add_inverse=True)
  # Use the same relative data directory as the CSV loading cells, instead of an
  # absolute Google Drive path that only exists on a mounted Colab session.
  clf = KernelLogisticRegression(Kernel, lambda_=lmbd, save_dir='./kernel_data/', max_iter=10000, version="Xtr1")
  # Saved kernels are named "<kernel name>_<version>.pkl" inside the classifier's save_dir.
  path_2_load = os.path.join(clf.save_dir, f"{clf.kernel.name}_{clf.version}.pkl")
  # Swap the freshly built kernel for the one loaded from disk before fitting.
  clf.kernel = Kernel.load_kernel(path_2_load)
  # NOTE(review): _fit receives only the labels — presumably it relies on the
  # loaded kernel instead of recomputing it from the inputs; confirm in classifiers.
  clf._fit(y_train.Bound)
  models_LR.append(clf)

Minimum tolerance between two iterations reached
Minimum tolerance between two iterations reached


In [None]:
# Collect the validation-set outputs of every refitted model (SVMs first, then LRs).
# predict_proba returns a pair; its first element is stored in predictions_proba
# and its second in predictions_brut.
predictions_proba, predictions_brut = [], []
for fitted in models_SVM + models_LR:
  probs, brut = fitted.predict_proba(x_val)
  predictions_proba.append(probs)
  predictions_brut.append(brut)

## Ensemble

We take both SVM models and the first Logistic Regression model.

In [None]:
# Ensemble for dataset 1: both SVMs plus the first logistic regression.
predictions_proba = []
for member in (models_SVM[0], models_SVM[1], models_LR[0]):
  probs, brut = member.predict_proba(X_infer)
  predictions_proba.append(probs)
# One column per model, then average across models for each test row.
y_proba = np.hstack([p.reshape(-1, 1) for p in predictions_proba])
y_proba = y_proba.mean(axis=1)
# Threshold the averaged score at 0.5 to obtain 0/1 labels.
y_infer = np.where(y_proba >= 0.5, 1, 0)

In [None]:
# Submission rows for dataset 1: test Ids paired with the predicted 0/1 labels.
xte1 = pd.DataFrame({'Id': X_infer.Id, 'Bound': y_infer})
xte1.head()

Unnamed: 0,Id,Bound
0,1000,1
1,1001,1
2,1002,1
3,1003,0
4,1004,0


# ----XTR2-----

In [None]:
# Work on dataset 2: its training inputs/labels and the matching test frame.
X, y = all_x[2], all_y[2]
X_infer = all_x_test[2]
# Hold out a validation part; the split rule lives in utils.utils (not shown here).
x_train, x_val, y_train, y_val = split_train_val(X,y)

In [None]:
# Configurations to refit for dataset 2 (presumably picked by an earlier search — not shown here).
# SVM tuples are unpacked as (C, k) and LR tuples as (lambda_, k); k is passed to SpectrumKernel.
best_params_SVM = [(0.1, 8), (1, 7)]
best_params_LR = [(0.001, 9), (0.005, 7)]

In [None]:
# Refit one SVM per selected (C, k) pair on dataset 2, reusing kernels saved on disk.
models_SVM = []
for C, k in best_params_SVM:
  Kernel = SpectrumKernel(k=k, add_inverse=True)
  # Use the same relative data directory as the CSV loading cells, instead of an
  # absolute Google Drive path that only exists on a mounted Colab session.
  clf = SVM(Kernel, C=C, save_dir='./kernel_data/', version="Xtr2")
  # Saved kernels are named "<kernel name>_<version>.pkl" inside the classifier's save_dir.
  path_2_load = os.path.join(clf.save_dir, f"{clf.kernel.name}_{clf.version}.pkl")
  # Swap the freshly built kernel for the one loaded from disk before fitting.
  clf.kernel = Kernel.load_kernel(path_2_load)
  # NOTE(review): _fit receives only the labels — presumably it relies on the
  # loaded kernel instead of recomputing it from the inputs; confirm in classifiers.
  clf._fit(y_train.Bound)
  models_SVM.append(clf)

In [None]:
# Refit one kernel logistic regression per selected (lambda_, k) pair on dataset 2,
# reusing kernels saved on disk.
models_LR = []
for lmbd, k in best_params_LR:
  Kernel = SpectrumKernel(k=k, add_inverse=True)
  # Use the same relative data directory as the CSV loading cells, instead of an
  # absolute Google Drive path that only exists on a mounted Colab session.
  clf = KernelLogisticRegression(Kernel, lambda_=lmbd, save_dir='./kernel_data/', max_iter=10000, version="Xtr2")
  # Saved kernels are named "<kernel name>_<version>.pkl" inside the classifier's save_dir.
  path_2_load = os.path.join(clf.save_dir, f"{clf.kernel.name}_{clf.version}.pkl")
  # Swap the freshly built kernel for the one loaded from disk before fitting.
  clf.kernel = Kernel.load_kernel(path_2_load)
  # NOTE(review): _fit receives only the labels — presumably it relies on the
  # loaded kernel instead of recomputing it from the inputs; confirm in classifiers.
  clf._fit(y_train.Bound)
  models_LR.append(clf)

In [None]:
# Collect the validation-set outputs of every refitted model (SVMs first, then LRs).
# predict_proba returns a pair; its first element is stored in predictions_proba
# and its second in predictions_brut.
predictions_proba, predictions_brut = [], []
for fitted in models_SVM + models_LR:
  probs, brut = fitted.predict_proba(x_val)
  predictions_proba.append(probs)
  predictions_brut.append(brut)

## Ensemble

We take the first SVM model and the first LR model

In [None]:
# Ensemble for dataset 2: the first SVM plus the first logistic regression.
predictions_proba = []
for member in (models_SVM[0], models_LR[0]):
  probs, brut = member.predict_proba(X_infer)
  predictions_proba.append(probs)
# One column per model, then average across models for each test row.
y_proba = np.hstack([p.reshape(-1, 1) for p in predictions_proba])
y_proba = y_proba.mean(axis=1)
# Threshold the averaged score at 0.5 to obtain 0/1 labels.
y_infer = np.where(y_proba >= 0.5, 1, 0)

In [None]:
# Submission rows for dataset 2: test Ids paired with the predicted 0/1 labels.
xte2 = pd.DataFrame({'Id': X_infer.Id, 'Bound': y_infer})
xte2.head()

Unnamed: 0,Id,Bound
0,2000,0
1,2001,0
2,2002,1
3,2003,1
4,2004,0


# Concatenating all the predictions

In [None]:
# Stack the per-dataset predictions in dataset order. ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating each part's own labels;
# the CSV written later is unaffected because it is saved with index=False.
total_submissions = pd.concat([xte0, xte1, xte2], ignore_index=True)

In [None]:
# Preview the first rows of the combined submission frame.
total_submissions.head()

Unnamed: 0,Id,Bound
0,0,1
1,1,1
2,2,0
3,3,0
4,4,0


In [None]:
# Write the submission file; index=False keeps only the Id and Bound columns.
total_submissions.to_csv('./submissions.csv', index=False)