In [6]:
# imports:
from sklearnex import patch_sklearn
patch_sklearn()


import numpy as np
import pandas as pd
import os
import sklearn
import csv
import pickle
import time
import matplotlib.pyplot as plt

%matplotlib notebook

from tsfresh import extract_features
from tsfresh import extract_relevant_features
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold 
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import Normalizer, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold, StratifiedGroupKFold, cross_validate
from sklearn.model_selection import learning_curve
from sklearn.compose import make_column_transformer

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [11]:
extracted_features =  pd.read_csv('raw_data.csv', index_col=0)
data = extracted_features.dropna(axis=1)
df_meta = pd.read_csv("train_meta.csv")
y = df_meta["label"]
y = y.to_numpy()

In [12]:
n_folds =  15# Max 15 folds
cv = KFold(n_splits=n_folds, shuffle=True,random_state=1000)

In [21]:
preprocessing = Pipeline([
                        ('VT1', VarianceThreshold(threshold=0)),
                        ('QT', QuantileTransformer()),
                        ('VT2', VarianceThreshold(threshold=0))
                         ])  

#  Feature selection, you can also try others
feature_selection = Pipeline([('selectkbest', SelectKBest())])  

classifier = Pipeline([('clf', KernelRidge())])

pipe = Pipeline([
    ('preprocessing', preprocessing),
    ('feature_selection', feature_selection),
    ('classifier', classifier)])

# You may need to specify other hyperparameters here if you investigate other blocks
k_grid = [50,40,30,20]
alpha_grid = [0.001,0.01,0.05]
gamma_grid = [0.05,0.1,0.15,0.2]
kernel_grid = ['laplacian']

# And make sure to add them to your param_grid

param_grid = {
    'feature_selection__selectkbest__k' : k_grid,
    'classifier__clf__alpha' : alpha_grid,
    'classifier__clf__kernel' : kernel_grid,
    'classifier__clf__gamma' : gamma_grid
}

# define GridSearchCV object

gridsearch = GridSearchCV(pipe, param_grid, n_jobs=4, cv=cv, verbose=3, 
                          return_train_score=True,scoring='neg_mean_absolute_error')

# Train the pipeline

gridsearch.fit(data,y)

Fitting 15 folds for each of 48 candidates, totalling 720 fits


GridSearchCV(cv=KFold(n_splits=15, random_state=1000, shuffle=True),
             estimator=Pipeline(steps=[('preprocessing',
                                        Pipeline(steps=[('VT1',
                                                         VarianceThreshold(threshold=0)),
                                                        ('QT',
                                                         QuantileTransformer()),
                                                        ('VT2',
                                                         VarianceThreshold(threshold=0))])),
                                       ('feature_selection',
                                        Pipeline(steps=[('selectkbest',
                                                         SelectKBest())])),
                                       ('classifier',
                                        Pipeline(steps=[('clf',
                                                         KernelRidge())]))]),
             n

In [22]:
results = gridsearch.cv_results_
train_score = results['mean_train_score'][gridsearch.best_index_]
validation_score = results['mean_test_score'][gridsearch.best_index_]

print('Average training accuracy across folds: {:.3}'.format(train_score))
print('Average validation accuracy across folds: {:.3}'.format(validation_score))

print(gridsearch.best_params_)

Average training accuracy across folds: -0.00448
Average validation accuracy across folds: -0.121
{'classifier__clf__alpha': 0.01, 'classifier__clf__gamma': 0.15, 'classifier__clf__kernel': 'laplacian', 'feature_selection__selectkbest__k': 40}


In [23]:
with open('kernelridge_v1','wb') as file:
    pickle.dump(gridsearch, file)