In [14]:
from glob import glob
import os
import mne
import numpy as np
import pandas
import matplotlib.pyplot as plt
from scipy import stats
from features import concatenate_features
from read_data import read_data


In [15]:
%%capture
all_fp=glob('all_SP_edf/*.edf') #the path of all files is in all_fp

healthy_fp= [ i for i in all_fp if 'h' in i.split('\\')[1]] #List of path of all healthy subjects
patient_fp= [ i for i in all_fp if 's' in i.split('\\')[1]]#List path of all sick subjects

healthy_epochs_array=[read_data(i) for i in healthy_fp] # (no of healthy subjects) * (No.of epochs/trials, channels, length of signal)
patient_epochs_array=[read_data(i) for i in patient_fp] # # (no of Sick subjects) * (No.of epochs/trials, channels, length of signal)

data_list=healthy_epochs_array+patient_epochs_array
data_array = np.vstack(data_list)#converting to numpy array, (#total trails , #no of channels, #length of signal) 


In [16]:
print(len(healthy_epochs_array),len(patient_epochs_array), len(data_list), len(data_array),data_array.shape)

14 14 28 9605 (9605, 19, 1000)


In [17]:
features=[] #stores the extracted features list, features are extracted channel wise and concatenated together

for data in data_array:
    features.append(concatenate_features(data)) #calculates all features for each trial 

In [18]:
features_array = np.array(features)
features_array.shape # (no of epochs/trials, #number of channels*number of features) #pehley channel key 13 uskey bad doesrey channel key 13 and so on

(9605, 247)

In [19]:
#We have features array till now time for target/label array

#whats tricky about splitting the data

#i guess label array shouldnt be split for training and testing since the model would have already seen the data from the same patient  as it
# would be splitting based on epochs and epochs can be from the same patient

#creating lables 0 for healthy and 1 for pd
healthy_epochs_array_lable= [[0]*len(i) for i in healthy_epochs_array ]# create arrays with 0's, where number of zeros in each entry is equal to the number of trials in that file, for eg first entry will have 47 0's
patient_epochs_array_lable= [[1]*len(i) for i in patient_epochs_array ] # 1 instead of 0

lable_list= healthy_epochs_array_lable+ patient_epochs_array_lable

lable_array= np.hstack(lable_list) #stacking each entry 

print(len(lable_list),len(lable_array),lable_array.shape)

28 9605 (9605,)


In [20]:
lable_array

array([0, 0, 0, ..., 1, 1, 1])

In [21]:
##Time to make groups so epochs of the same subjct have a particular same number/tag used for GroupKfold test train split later
group_list=[[i]*len(j) for i,j in enumerate(data_list)]
group_array=np.hstack(group_list) #stacking 
print(len(group_list), len(group_array))

28 9605


In [22]:
group_array

array([ 0,  0,  0, ..., 27, 27, 27])

In [23]:
##ML Model Invoker with scaling 
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
#from sklearn.svm import SVC
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from sklearn.utils import shuffle

def get_score_4_model_Scaling(model, X_train, X_test, y_train, y_test):
    pipe =make_pipeline(StandardScaler(), model)
    pipe.fit(X_train,y_train)
    return pipe.score(X_test,y_test)

In [24]:
##Spliting data in train and test based on groups and lables??
from sklearn.model_selection import GroupKFold
from sklearn.utils import shuffle

X=features_array
y=lable_array
groups=group_array


In [16]:
#Spliting data into test and train kfold and getting scores for each model for every iteration and seeing their avg
scores_LR=[]
scores_SVM=[]
scores_RF=[]
scores_NB=[]

#X, y, groups = shuffle(X, y, groups, random_state = 0)
group_kfold = GroupKFold(n_splits=5)# split 3 means (2 block) for train (1 block) for test and so on
group_kfold.get_n_splits(X, y, groups)

for train_index, test_index in group_kfold.split(X, y, groups=groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scores_LR.append(get_score_4_model_Scaling(LogisticRegression(multi_class='auto', C=10, n_jobs=22), X_train, X_test, y_train, y_test)) 
    scores_SVM.append(get_score_4_model_Scaling(svm.SVC(), X_train, X_test, y_train, y_test))                 
    scores_RF.append(get_score_4_model_Scaling(RandomForestClassifier(n_estimators=60), X_train, X_test, y_train, y_test))
    scores_NB.append(get_score_4_model_Scaling(GaussianNB(), X_train, X_test, y_train, y_test))
    
print("Score for NB : ", scores_NB)
#print("AVG Score for NB : ", round(mean(scores_NB),4))
print("Scores for LR : ",scores_LR)
#print("AVG Scores for LR : ",round(mean(scores_LR),4))
print("Score for SVM : ",scores_SVM )
#print("AVG Score for SVM : ",round(mean(scores_SVM),4))
print("Score for Rf : ", scores_RF)
#print("AVG Score for Rf : ", round(mean(scores_RF),4)) 

Score for NB :  [0.6459083890890376, 0.5628803245436106, 0.3939745075318656, 0.46797781139687344, 0.47400302877334677]
AVG Score for NB :  0.5089
Scores for LR :  [0.7267112712300566, 0.43356997971602435, 0.668018539976825, 0.827029752899647, 0.7809187279151943]
AVG Scores for LR :  0.6872
Score for SVM :  [0.7205352547606794, 0.3960446247464503, 0.694090382387022, 0.6893595562279374, 0.7062089853609288]
AVG Score for SVM :  0.6412
Score for Rf :  [0.6953165208440556, 0.4193711967545639, 0.7294322132097335, 0.6822995461422088, 0.7778899545683998]
AVG Score for Rf :  0.6609


In [30]:
#hyper parameter tuning for LR 
#Grid Search does permutation of all parameters (for automatic parameter tuning on scaled model ) #this also perfors iterative kfold cross validation on its own
#multi_class='auto',
model = LogisticRegression(multi_class='auto')

pipe=Pipeline([('scaler',StandardScaler()),('classifier',model)])

#param_grid = {'classifier__C': [200,100, 10]}

# param_grid = [    
#     {'classifier__penalty' : ['l1', 'l2', 'elasticnet', 'none'],
#     'classifier__C' : np.logspace(-4, 4, 20),
#     'classifier__solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
#     'classifier__max_iter' : [100,1000,2500, 5000]
#     }
# ]

param_grid = [    
    {'classifier__penalty' : ['l2'],
    'classifier__C' : [200, 100, 10, 1],
    'classifier__solver' : ['lbfgs','newton-cg','liblinear'],
    'classifier__max_iter' : [100,1000,2500]
    }
]

gscvLr=GridSearchCV(pipe,param_grid,cv=5) # 1st parameter model(in our case scaled model), second diffrent parameters we wana tweak of model
                                                       #third number of splits in k fold             
gscvLr.fit(X,y,groups=groups)

print(gscvLr.best_params_ , gscvLr.best_score_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'classifier__C': 200, 'classifier__max_iter': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'} 0.7082769390942217


#### hyper parameter tuning for RandomForest
#Grid Search does permutation of all parameters (for automatic parameter tuning on scaled model ) #this also perfors iterative kfold cross validation on its own

model=RandomForestClassifier()
pipe=Pipeline([('scaler',StandardScaler()),('classifier',model)])
param_grid = {'classifier__n_estimators': [60]} ##best is 60

gscvRF =  GridSearchCV(pipe,param_grid, cv=5, return_train_score=False)             
gscvRF.fit(X,y,groups=groups)

print(gscvRF.best_params_ , gscvRF.best_score_)

In [18]:
#hyper parameter tuning for SVC
#Grid Search does permutation of all parameters (for automatic parameter tuning on scaled model ) #this also perfors iterative kfold cross validation on its own

model = svm.SVC()

pipe=Pipeline([('scaler',StandardScaler()),('classifier',model)])

param_grid = {'classifier__C': [1,10,20],'classifier__kernel': ['rbf','linear']}

gscvSVC =  GridSearchCV(pipe,param_grid, cv=3, return_train_score=False)             
gscvSVC.fit(X,y,groups=groups)

print(gscvSVC.best_params_ , gscvSVC.best_score_)

{'classifier__C': 20, 'classifier__kernel': 'rbf'} 0.7022329907704384
