In [1]:
# imports:
from sklearnex import patch_sklearn
patch_sklearn()


import numpy as np
import pandas as pd
import os
import sklearn
import csv
import pickle
import time
import matplotlib.pyplot as plt

%matplotlib notebook

from tsfresh import extract_features
from tsfresh import extract_relevant_features
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold, SelectFromModel 
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import Normalizer, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold, StratifiedGroupKFold, cross_validate
from sklearn.model_selection import learning_curve
from sklearn.compose import make_column_transformer
from sklearn.decomposition import KernelPCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, Exponentiation, ConstantKernel

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Quick look at the first two training files: read each CSV with its first
# column as the row index, copy that row index into a `time` column, tag the
# rows with the file number in an `index` column, then stack both frames.
frames = []
for idx in (0, 1):
    filename = os.path.join('train', 'train_' + str(idx) + '.csv')
    frames.append(
        pd.read_csv(filename, index_col=0).assign(time=lambda d: d.index, index=idx)
    )
df0, df1 = frames
# The stacked frame keeps each file's own row labels, so labels repeat.
df0 = pd.concat([df0, df1])
df0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4000 non-null   float64
 1   1       4000 non-null   float64
 2   2       4000 non-null   float64
 3   time    4000 non-null   int64  
 4   index   4000 non-null   int64  
dtypes: float64(3), int64(2)
memory usage: 187.5 KB


In [3]:
# Load every training file into one long DataFrame.
#
# Each file <DATAPATH>/<DATASET><i>.csv is read with its first column as the
# row index; that row index is copied into a `time` column and the file number
# is stored in an `index` column so rows can be traced back to their file.
DATAPATH = 'train'
DATASET = 'train_'
N_FILES = 1100  # number of train_<i>.csv files to read

# Collect the per-file frames and concatenate once at the end. Calling
# pd.concat inside the loop re-copies the accumulated frame on every
# iteration (quadratic work), and seeding it with an empty float DataFrame
# can interfere with dtype inference of the result.
frames = []
for i in range(N_FILES):
    filename = os.path.join(DATAPATH, DATASET + str(i) + '.csv')
    df_ = pd.read_csv(filename, index_col=0)
    df_['time'] = df_.index
    df_['index'] = i
    frames.append(df_)

# Row labels are kept as read from each file, so they repeat across files.
df = pd.concat(frames)

In [4]:
# Load the feature table stored in raw_data.csv (first CSV column becomes the
# row index) and print a summary of its shape, dtypes and memory footprint.
features_csv = 'raw_data.csv'
extracted_features = pd.read_csv(features_csv, index_col=0)
extracted_features.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 1100 entries, 0.0 to 0.0
Columns: 2366 entries, 0__has_duplicate_max to 2__mean_n_absolute_max__number_of_maxima_7
dtypes: float64(2366)
memory usage: 19.9 MB


In [5]:
# Keep only the feature columns that have no missing values at all.
data = extracted_features.dropna(axis='columns')

In [6]:
# Load the metadata table and derive the target vector and a per-row group key.
df_meta = pd.read_csv("train_meta.csv")

# Target: the `label` column as a NumPy array.
y = df_meta["label"].to_numpy()

# Group key per row: `direction` followed by the text form of `label` and
# `speed`. Built column-wise instead of looking up df_meta.loc[i][...] three
# times per row in a Python loop; str() is still applied element by element.
# Assumes `direction` holds strings (it is concatenated with str values).
group_keys = df_meta["direction"] + df_meta["label"].map(str) + df_meta["speed"].map(str)

# Single-column DataFrame (column 0) with a fresh 0..n-1 row index.
groups = pd.DataFrame(group_keys.tolist())
#groups = df_meta["group"]

In [7]:
# Cross-validation scheme: shuffled K-fold with a fixed seed so the fold
# assignment is the same on every run.
n_folds = 15  # Max 15 folds
cv = KFold(n_folds, shuffle=True, random_state=1000)

In [8]:
# Model definition: preprocessing -> univariate feature selection -> Gaussian
# process regressor, tuned and evaluated with a cross-validated grid search.

# Preprocessing: drop zero-variance columns, apply a quantile transform with
# default settings, then drop any column that has zero variance afterwards.
preprocessing = Pipeline(steps=[
    ('VT1', VarianceThreshold(threshold=0)),
    ('QT', QuantileTransformer()),
    ('VT2', VarianceThreshold(threshold=0)),
])

# Feature selection (other selectors can be swapped in here).
# NOTE(review): SelectKBest() is left on its default score function, which the
# scikit-learn docs list as f_classif (a classification criterion), while the
# final estimator is a regressor scored with MAE -- confirm this is intended.
feature_selection = Pipeline(steps=[('selectkbest', SelectKBest())])

# Final estimator. The step is named 'classifier'/'clf' although it wraps a
# regressor; the names are kept because the parameter-grid keys depend on them.
classifier = Pipeline(steps=[('clf', GaussianProcessRegressor())])

pipe = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('feature_selection', feature_selection),
    ('classifier', classifier),
])

# Hyperparameter candidates. Add grids for other blocks here and register
# them in param_grid below.
k_grid = [80]  # all = 2186
alpha_grid = [0.1]

param_grid = dict(
    feature_selection__selectkbest__k=k_grid,
    classifier__clf__alpha=alpha_grid,
)

# Grid search over the pipeline using the `cv` splitter; scored with negated
# mean absolute error, so values closer to zero are better.
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    n_jobs=4,
    cv=cv,
    verbose=3,
    return_train_score=True,
)

# Train the pipeline
gridsearch.fit(data, y)

Fitting 15 folds for each of 1 candidates, totalling 15 fits


In [9]:
# Report the cross-validation scores of the best parameter combination.
# The grid search was configured with scoring='neg_mean_absolute_error', so
# these numbers are negated mean absolute errors (closer to zero is better),
# not accuracies -- the printed labels say so explicitly.
results = gridsearch.cv_results_
train_score = results['mean_train_score'][gridsearch.best_index_]
validation_score = results['mean_test_score'][gridsearch.best_index_]

print('Average training score (neg. MAE) across folds: {:.3}'.format(train_score))
print('Average validation score (neg. MAE) across folds: {:.3}'.format(validation_score))

print(gridsearch.best_params_)

Average training accuracy across folds: -0.0497
Average validation accuracy across folds: -0.13
{'classifier__clf__alpha': 0.1, 'feature_selection__selectkbest__k': 80}


In [10]:
# Persist the whole GridSearchCV object to disk with pickle.
# NOTE: only unpickle this file from a trusted source; pickle can execute
# arbitrary code on load.
with open('Guassian_0.1_80', mode='wb') as out_handle:
    pickle.dump(gridsearch, out_handle)