## Version 5: use all the eyesclosed Microstates features.

In [20]:
cd ../pipelines

/Users/pldelacour/Documents/PL_Ecole/data_sc_lab/data_sc_lab/task_01/pipelines


In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipynb.fs.full.tool_functions import *
from yellowbrick.regressor import AlphaSelection
from sklearn.metrics import mean_squared_error

In [22]:
# Behavioural table (patient id, age, diagnosis category, ...).
df_beha = pd.read_csv('../../../data/Behavioral/AllData.csv')

# Resting-state EEG feature tables, read one after the other in the listed order.
(df_psd_cluster,
 df_psd_channel,
 df_spectro_cluster,
 df_spectro_channel,
 df_microstate) = [
    pd.read_csv(eeg_csv)
    for eeg_csv in (
        "../../../data/EEG/RestingEEG_PSD_Cluster.csv",
        "../../../data/EEG/RestingEEG_PSD_Channel.csv",
        "../../../data/EEG/RestingEEG_Spectro_Cluster.csv",
        "../../../data/EEG/RestingEEG_Spectro_Channel.csv",
        "../../../data/EEG/RestingEEG_Microstates.csv",
    )
]

  interactivity=interactivity, compiler=compiler, result=result)


In [23]:
# Build the regression targets from the behavioural table.
# A subject counts as healthy when the diagnosis category contains
# "No Diagnosis Given"; rows with a missing category are treated as not healthy.
is_healthy = df_beha["DX_01_Cat"].str.contains("No Diagnosis Given", na=False)

# Only the subject key and the age are needed downstream.
y_labels = df_beha[["Patient_ID", "Age"]]
y_labels_healthy = y_labels[is_healthy]

In [24]:
# The EEG tables name the subject key 'id'; rename it (in place) to 'Patient_ID'
# so it matches the label table used in the merges below.
for eeg_frame in (df_psd_cluster, df_spectro_cluster, df_microstate):
    eeg_frame.rename(columns={'id': 'Patient_ID'}, inplace=True)
print("Original lengths: " + str(len(df_psd_cluster)) + "(psd), "
      + str(len(df_spectro_cluster)) + "(spectro), " + str(len(df_microstate)) + "(micro)")

# Attach the age label; the merge keeps only subjects present in both tables,
# i.e. healthy subjects that also have EEG features.
df_label_psd_cluster = y_labels_healthy.merge(df_psd_cluster, on='Patient_ID')
df_label_spectro_cluster = y_labels_healthy.merge(df_spectro_cluster, on='Patient_ID')
df_label_microstate = y_labels_healthy.merge(df_microstate, on='Patient_ID')
print("Only healthy people, resulting lengths: " + str(len(df_label_psd_cluster)) + "(psd), "
      + str(len(df_label_spectro_cluster)) + "(spectro), " + str(len(df_label_microstate)) + "(micro)")

# Impute missing values with fill_with_median (helper pulled in from tool_functions;
# presumably column-wise medians -- TODO confirm). Rows are filled, not dropped.
df_label_psd_cluster = fill_with_median(df_label_psd_cluster)
df_label_spectro_cluster = fill_with_median(df_label_spectro_cluster)
df_label_microstate = fill_with_median(df_label_microstate)
print("Replace NaN by median, resulting lenghts: " + str(len(df_label_psd_cluster)) + "(psd), "
      + str(len(df_label_spectro_cluster)) + "(spectro), " + str(len(df_label_microstate)) + "(micro)")

Original lengths: 1485(psd), 1485(spectro), 1108(micro)
Only healthy people, resulting lengths: 174(psd), 174(spectro), 128(micro)
Replace NaN by median, resulting lenghts: 174(psd), 174(spectro), 128(micro)


## Selecting the features for version 5

Version 5: use all the eyesclosed Microstates features.

In [25]:
# Version 5 feature set: every microstate column whose name contains 'eyesclosed'.
col = df_label_microstate.columns.str.contains('eyesclosed')
print("# of features = " , col.sum())
x_spev5 = df_label_microstate.loc[:, col]

# Regression target (age) for *all* labelled subjects; despite the name this is
# the full target vector, the train/test split happens later.
y_train_df = df_label_microstate['Age']

# of features =  25


## Train Test Split

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Fraction of the samples held out as the test set by train_test_split below.
test_size = 0.2

In [28]:
# Hold out `test_size` of the subjects for testing; the fixed seed keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x_spev5, y_train_df, test_size=test_size, random_state=10
)
for split_part in (x_train, x_test, y_train):
    print(split_part.shape)

(102, 25)
(26, 25)
(102,)


## Remove Outliers 

In [29]:
# Detect outliers on the training features only, so the test split stays untouched.
# IsolationForest is randomised: seed it so the same rows are kept on every run.
forest = IsolationForest(random_state=10)
forest.fit(x_train)

# predict() returns +1 for inliers and -1 for outliers.
outliers_training = forest.predict(x_train)
inlier_mask = outliers_training == 1
# NOTE: despite its name, this array holds the positions of the *inliers* (label +1).
outliers_training_indices = np.argwhere(inlier_mask).flatten()

# Keep only the inlier rows of the training data.
# This overwrites x_train / y_train, so the cell is not idempotent: re-run the
# train/test split cell before running it a second time.
x_train = x_train[inlier_mask]
y_train = y_train[inlier_mask]

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

(87, 25)
(87,)
(26, 25)
(26,)


## SVR

In [30]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [31]:
# Support-vector regression of age on the selected microstate features.
# SVMs are not scale invariant and unscaled inputs can make some kernel / C
# combinations extremely slow to converge, so the features are standardised
# inside the pipeline (the scaler is re-fitted on the training part of each fold).
svr = make_pipeline(StandardScaler(), SVR())

# make_pipeline names the SVR step 'svr', hence the 'svr__' parameter prefix.
# The gamma grid no longer lists 1.0 twice (10e-1 and 1 are the same value);
# 0.1 is the value previously written as 10e-2.
parameters = {'svr__kernel': ('rbf', 'poly', 'linear', 'sigmoid'),
              'svr__C': [0.1, 1, 10, 100],
              'svr__gamma': ['auto', 0.1, 1, 2]}
clf = GridSearchCV(svr, parameters, cv=3)
clf.fit(x_train, y_train)

KeyboardInterrupt: 

In [None]:
# Predicted (x) versus true (y) age for the tuned SVR.
ax = plt.gca()
ax.plot(clf.predict(x_train), y_train, 'o')  # training subjects
ax.plot(clf.predict(x_test), y_test, 'o')    # held-out subjects
ax.plot(y_train, y_train)                    # identity line: perfect prediction

## XGBoost

In [None]:
from sklearn.model_selection import GridSearchCV
import sklearn
from xgboost import plot_importance
from matplotlib import pyplot

In [None]:
# Hyper-parameter search for XGBoost on the (outlier-filtered) training split.
# fit_xgboost_with_grid_search is not defined in this notebook -- presumably it
# comes from the tool_functions star import; the returned object is later read
# through its `best_params_` attribute.
xg_boost = fit_xgboost_with_grid_search(x_train, y_train)

In [None]:
# Refit one XGBoost regressor on the training split using the hyper-parameters
# selected by the search above.
best_param = xg_boost.best_params_
tuned_keys = ('max_depth', 'min_child_weight', 'reg_lambda')
xgbr = XGBRegressor(verbosity=1, **{key: best_param[key] for key in tuned_keys})
xgbr.fit(x_train, y_train)

In [None]:
# Mean squared error of the refitted XGBoost model on the held-out split.
xgb_test_pred = xgbr.predict(x_test)
test_mse_xg = mean_squared_error(y_test, xgb_test_pred)
print('TEST MSE = ', test_mse_xg)

In [None]:
# Predicted (x) versus true (y) age for the refitted XGBoost model.
ax = plt.gca()
ax.plot(xgbr.predict(x_train), y_train, 'o')  # training subjects
ax.plot(xgbr.predict(x_test), y_test, 'o')    # held-out subjects
ax.plot(y_train, y_train)                     # identity line: perfect prediction

In [None]:
# Feature-importance chart of the refitted model, limited to ten features.
plot_importance(xgbr, max_num_features=10)
plt.show()

## Gaussian Process


In [None]:
# Gaussian-process regression of age on the selected microstate features.
# The kernel hyper-parameter optimiser is restarted 5 times from random points,
# so it is seeded to make the fitted model reproducible.
gpr = GaussianProcessRegressor(n_restarts_optimizer=5, random_state=10)

# Grid search over the kernel family and the noise level `alpha`.
parameters = {
    'kernel': [RationalQuadratic(), RBF(), Matern(length_scale=1, nu=1.5), DotProduct()],
    'alpha': [1e-10, 1e-5, 1e-3, 1e-1, 1., 1.5, 2.],
}
# `iid=False` is no longer passed: the argument was deprecated in scikit-learn 0.22
# (where unweighted fold averaging became the default) and removed in 0.24,
# where passing it raises a TypeError.
clf_gp = GridSearchCV(gpr, parameters, scoring='r2', n_jobs=4, cv=5)
clf_gp.fit(x_train, y_train)
clf_gp.best_params_

In [None]:
# Evaluate the tuned Gaussian process on the held-out split.
y_pred_gp = clf_gp.predict(x_test)
# Score the GP predictions themselves; scoring `y_pred_svr` here would either
# raise a NameError or report another model's error under the GP label.
test_mse_gp = mean_squared_error(y_test, y_pred_gp)
print('Test MSE GP = ', test_mse_gp)

In [None]:
# Predicted (x) versus true (y) age for the tuned Gaussian process.
ax = plt.gca()
ax.plot(clf_gp.predict(x_train), y_train, 'o')  # training subjects
ax.plot(clf_gp.predict(x_test), y_test, 'o')    # held-out subjects
ax.plot(y_train, y_train)                       # identity line: perfect prediction