# Initialize libraries

In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt 
from matplotlib.pyplot import imshow

from sklearn import decomposition, linear_model,metrics
from sklearn.decomposition import KernelPCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
class_labels = LabelEncoder()
from sklearn.model_selection import cross_val_score,GridSearchCV,StratifiedKFold,KFold,train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report, confusion_matrix,mean_squared_error,r2_score
from sklearn.metrics import auc, RocCurveDisplay, roc_curve, f1_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import RFE, SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.manifold import TSNE
from sklearn.naive_bayes import GaussianNB

## Additional imports from DWI code
import math
from itertools import product
from contextlib import redirect_stdout
import pandas as pd
import time
import scipy
from scipy import io, stats
#from astropy.stats import jackknife_resampling, jackknife_stats, binom_conf_interval
#import xgboost as xgb

# Reproducibility: pin every random source used by this notebook to one seed.
import os
import random

seed_value = 42

# NOTE: PYTHONHASHSEED is read by the interpreter at start-up, so assigning it
# here does not change hashing in the already-running kernel.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# Seed Python's built-in generator and NumPy's legacy global generator.
random.seed(seed_value)
np.random.seed(seed_value)

# Load features

In [7]:
# Root folder of the processed data, relative to this notebook.
processed_data_path="../../data/processed"

# Load the AAL-atlas fMRI feature table; the first CSV column becomes the index.
fMRI_features=pd.read_csv(f"{processed_data_path}/fMRI/fMRI_features_AAL.csv",index_col=0)
## Please take a look at the 'print' below.
## It either prints 11/14 usable subjects with 'Lesion_Overlap_AAL_Subjects_14'
## or the 14 usable subjects with a feature length of 19 with 'Lesion_Overlap_AAL_Subjects'.
## Currently set to the former of the aforementioned choices.
## NOTE(review): neither of those keys is referenced anywhere in this cell, so
## the remark above may be stale -- TODO confirm or remove.
print('fMRI Subject IDs')
print(fMRI_features["Subject"])

# Here are the fMRI features:
#   pos  stands for positive strength
#   neg  stands for negative strength
#   over stands for lesion overlap
# AAL and SCH are the two atlases, so x_pos_aal and x_pos_sch are the same
# quantity calculated on a slightly different network.
# All features are in Subjects by Num Sub-features (166 for AAL, 100 for SCH).
#
# NOTE(review): `labels`, `label_ind`, `aal_ind`, `pos_str_*`, `neg_str_*` and
# `lesion_overlap_*` are not defined in any cell visible in this notebook, so
# the lines below only work with leftover kernel state -- TODO load them
# explicitly (presumably from the processed fMRI files; verify).
# NOTE(review): the SCH arrays are also indexed with `aal_ind`; confirm that a
# shared subject index is intended rather than a separate SCH index.
y=labels[label_ind,1]
x_pos_aal=pos_str_aal[:,aal_ind].transpose()
x_neg_aal=neg_str_aal[:,aal_ind].transpose()
x_over_aal=lesion_overlap_aal[:,aal_ind].transpose()
x_pos_sch=pos_str_sch[:,aal_ind].transpose()
x_neg_sch=neg_str_sch[:,aal_ind].transpose()
x_over_sch=lesion_overlap_sch[:,aal_ind].transpose()

0     1.0
1     0.0
2     1.0
3     0.0
4     1.0
5     1.0
6     0.0
7     1.0
8     1.0
9     0.0
10    0.0
11    0.0
12    1.0
13    1.0
14    0.0
15    1.0
16    1.0
17    0.0
18    0.0
Name: Late Seizure Label, dtype: float64
fMRI Subject IDs
0     3_13_0063
1     3_13_0068
2     3_16_0013
3     3_16_0016
4     3_16_0023
5     3_16_0033
6     3_16_0036
7     3_17_0001
8     3_17_0004
9     3_17_0007
10    3_17_0009
11    3_17_0012
12    3_17_0019
13    3_17_0048
14    3_19_0050
15    3_21_0040
16    3_21_0061
17    3_26_0080
18    3_26_0092
Name: Subject, dtype: object


In [None]:
# TODO: load the EEG and DWI features and determine programmatically which subjects to use


# Baseline fMRI classifier function

In [134]:
def fmri_classifier(x,y,n_subs,cv_inner,cv_outer,score_string,feature_string):
    ''' Nested-CV baseline for fMRI features: StandardScaler -> KernelPCA -> SVC.

    For every outer fold a GridSearchCV (inner CV) is fitted on the training
    subjects, and the class labels predicted for the training and test
    subjects are stored for the later fusion step.

    Parameters
    ----------
    x : array indexable as ``x[rows, :]`` (subjects x features).
    y : array indexable as ``y[rows]`` holding one class label per subject.
    n_subs : int, number of columns of the returned array; must be at least
        ``len(x)``.
    cv_inner : CV splitter (or int) handed to GridSearchCV.
    cv_outer : int, number of outer StratifiedKFold splits.
    score_string : scoring name used by GridSearchCV for model selection.
    feature_string : label printed in front of the score summary.

    Returns
    -------
    np.ndarray of shape (cv_outer, n_subs), dtype float32. Row ``j`` holds the
    predictions of fold ``j``: first those for the fold's training subjects
    (in ``train_idx`` order), then those for its test subjects. Columns are
    therefore NOT in original subject order; unused trailing columns stay 0.

    Raises
    ------
    ValueError if ``x`` has more rows than ``n_subs``.

    Notes
    -----
    * Re-seeds ``random`` and NumPy's global generator on every call.
    * ``predict()`` is used, so the stored values are hard class labels.
      NOTE(review): the pipeline sets ``probability=True``; switch to
      ``predict_proba`` if genuinely soft scores are wanted for fusion.
    * The printed summary is always the weighted F1 on the outer test folds,
      whatever ``score_string`` says; ``score_string`` only drives the inner
      grid search.
    '''
    seed_value = 42
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)

    # Fail early (before any grid search) if the output array would be too small.
    if len(x) > n_subs:
        raise ValueError(
            f"n_subs={n_subs} is smaller than the number of subjects in x ({len(x)})"
        )

    pipe_svc = Pipeline([("scale", StandardScaler()), ("pca", KernelPCA()), ("svm", SVC(probability=True))])
    param_grid_svc = {"pca__n_components": [2, 3, 4, 5, 6], "pca__gamma": [.01, .05, .1], "pca__kernel": ["sigmoid", "rbf"],
                      "svm__C": [1, 10, 100], "svm__gamma": [.01, .1]}

    search_svc = GridSearchCV(estimator=pipe_svc, scoring=score_string, param_grid=param_grid_svc, cv=cv_inner, refit=True)

    fold_no = cv_outer
    data_type = np.float32
    X_fmri = np.zeros((fold_no, n_subs), dtype=data_type)
    f1_scores = []

    folds = StratifiedKFold(n_splits=fold_no, shuffle=True,
                            random_state=seed_value).split(x, y)

    for j, (train_idx, test_idx) in enumerate(folds):
        X_train_CV = x[train_idx, :]
        Y_train_CV = y[train_idx]
        X_test_CV = x[test_idx, :]
        Y_test_CV = y[test_idx]

        # The same GridSearchCV object is re-fitted from scratch on each fold.
        model = search_svc
        model.fit(X_train_CV, Y_train_CV)

        y_train_pred = model.predict(X_train_CV)
        y_test_pred = model.predict(X_test_CV)

        # Row layout: [train predictions | test predictions]. Explicit slices
        # replace the previous element-wise loops, whose test offset relied on
        # the loop variable leaking out of the training loop.
        n_train = len(y_train_pred)
        n_test = len(y_test_pred)
        X_fmri[j, :n_train] = y_train_pred
        X_fmri[j, n_train:n_train + n_test] = y_test_pred

        f1_scores.append(f1_score(Y_test_CV, y_test_pred, average='weighted'))

    f1_scores = np.array(f1_scores)
    print(feature_string,'\n',score_string,'Score:')
    print(f"Mean {f1_scores.mean()} and STD {f1_scores.std()}")

    ## Returns the per-fold prediction array, which is [fold_no * no_of_subjects]
    return X_fmri

# Call to the baseline fMRI classifier

In [135]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts anything, emits nothing."""
    return None


# Rebind the module attribute so later ``warnings.warn(...)`` calls are silenced.
# NOTE(review): this is a process-wide monkey-patch; ``warnings.filterwarnings``
# or a ``warnings.catch_warnings()`` block would be the less invasive tool.
warnings.warn = warn

# Cross-validation set-up for the fMRI baseline: 5 outer stratified folds,
# 3 shuffled inner folds for the hyper-parameter search.
cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)
cv_outer = 5
n_subs = 14  # No. of subjects

# One feature set is evaluated per run; the other candidates are kept below.
# X_fmri = fmri_classifier(x_pos_aal,y,n_subs,cv_inner,cv_outer,"f1","Positive Strength AAL")
# X_fmri = fmri_classifier(x_neg_aal,y,n_subs,cv_inner,cv_outer,"f1","Negative Strength AAL")
# X_fmri = fmri_classifier(x_over_aal,y,n_subs,cv_inner,cv_outer,"f1","Overall Strength AAL")
# X_fmri = fmri_classifier(x_pos_sch,y,n_subs,cv_inner,cv_outer,"f1","Positive Strength SCH")
# X_fmri = fmri_classifier(x_over_sch,y,n_subs,cv_inner,cv_outer,"f1","Overall Strength SCH")
X_fmri = fmri_classifier(
    x=x_neg_sch,
    y=y,
    n_subs=n_subs,
    cv_inner=cv_inner,
    cv_outer=cv_outer,
    score_string="f1",
    feature_string="Negative Strength SCH",
)

Negative Strength SCH 
 f1 Score:
Mean 0.7066666666666667 and STD 0.3968766950969924


# Baseline DWI classifier function

In [146]:
def dwi_classifier(X,Y,n_subs,n_feats,cv_outer,score_string,feature_string):
    ''' CV baseline for DWI features: SelectKBest(chi2) -> AdaBoostClassifier.

    For every outer fold the ``n_feats`` best features are selected on the
    training subjects only, an AdaBoost model (100 estimators) is fitted, and
    the class labels predicted for the training and test subjects are stored
    for the later fusion step.

    Parameters
    ----------
    X : pandas object supporting ``X.iloc[rows, :]`` (subjects x features).
    Y : pandas object supporting ``Y.iloc[rows]`` with one label per subject.
    n_subs : int, number of columns of the returned array; must be at least
        ``len(X)``.
    n_feats : int, ``k`` passed to SelectKBest.
    cv_outer : int, number of StratifiedKFold splits.
    score_string : text printed in the summary line only; it does not select
        the metric.
    feature_string : label printed in front of the score summary.

    Returns
    -------
    np.ndarray of shape (cv_outer, n_subs), dtype float32. Row ``j`` holds the
    predictions of fold ``j``: first those for the fold's training subjects
    (in ``train_idx`` order), then those for its test subjects. Columns are
    therefore NOT in original subject order; unused trailing columns stay 0.

    Raises
    ------
    ValueError if ``X`` has more rows than ``n_subs``.

    Notes
    -----
    * Re-seeds ``random`` and NumPy's global generator on every call.
    * ``predict()`` is used, so the stored values are hard class labels.
    * The printed summary is the weighted F1 on the outer test folds.
    * NOTE(review): chi2 expects non-negative feature values -- confirm the
      DWI features satisfy this.
    '''
    seed_value = 42
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)

    # Fail early if the output array would be too small for all subjects.
    if len(X) > n_subs:
        raise ValueError(
            f"n_subs={n_subs} is smaller than the number of subjects in X ({len(X)})"
        )

    fold_no = cv_outer
    data_type = np.float32
    X_dwi = np.zeros((fold_no, n_subs), dtype=data_type)

    f1_scores = []
    folds = StratifiedKFold(n_splits=fold_no, shuffle=True,
                            random_state=seed_value).split(X, Y)

    for j, (train_idx, test_idx) in enumerate(folds):
        X_train_CV = X.iloc[train_idx, :]
        Y_train_CV = np.ravel(Y.iloc[train_idx])
        X_test_CV = X.iloc[test_idx, :]
        Y_test_CV = np.ravel(Y.iloc[test_idx])

        ##--- Univariate feature selection, fitted on the training fold only ---##
        ## (mutual_info_classif and f_classif were tried as alternative scorers)
        sel_mutual = SelectKBest(chi2, k=n_feats)
        X_train_CV = sel_mutual.fit_transform(X_train_CV, Y_train_CV)
        X_test_CV = sel_mutual.transform(X_test_CV)

        ## A fresh model per fold (LinearDiscriminantAnalysis was the earlier choice)
        model = AdaBoostClassifier(n_estimators=100)
        model.fit(X_train_CV, Y_train_CV)

        ## Train-fold predictions feed the fusion step; test-fold predictions
        ## are stored for reference.
        y_train_pred = model.predict(X_train_CV)
        y_test_pred = model.predict(X_test_CV)

        # Row layout: [train predictions | test predictions]. Explicit slices
        # replace the previous element-wise loops, whose test offset relied on
        # the loop variable leaking out of the training loop.
        n_train = len(y_train_pred)
        n_test = len(y_test_pred)
        X_dwi[j, :n_train] = y_train_pred
        X_dwi[j, n_train:n_train + n_test] = y_test_pred

        f1_scores.append(f1_score(Y_test_CV, y_test_pred, average='weighted'))

    f1_scores = np.array(f1_scores)
    print('\n',feature_string,'Classifier,',score_string,'Score:')
    print(f"Mean {f1_scores.mean()} and STD {f1_scores.std()}")

    ## Returns the per-fold prediction array, which is [fold_no * no_of_subjects]
    return X_dwi

# Call to the baseline DWI classifier

In [147]:
# DWI table layout: column 0 = subject ID, column 1 = label, remaining columns = features.
dwi_ip = pd.read_csv("../../data/processed/DWI/IDs+Labels+Features.csv")

id_subs = dwi_ip.iloc[:, 0]
Y = dwi_ip.iloc[:, 1]
X = dwi_ip.iloc[:, 2:]

print('Subject IDs (DWI): ','\n',id_subs)

# Derive the subject count from the table instead of hard-coding it, so the
# prediction array in dwi_classifier always matches the data that was loaded.
n_subs = len(dwi_ip)
cv_outer = 5
n_feats = 7

X_dwi = dwi_classifier(X,Y,n_subs,n_feats,cv_outer,"f1","chi2-AdaBoost")

# Optional sweep over the number of selected features:
# n_feats = [i+1 for i in range(10)]
# for i in n_feats:
#     X_dwi = dwi_classifier(X,Y,n_subs,i,cv_outer,"f1","chi2-LDA")

Subject IDs (DWI):  
 0     3_13_0063_2018-08-16
1     3_13_0068_2018-09-25
2     3_16_0013_2017-12-14
3                3_16_0033
4                3_16_0036
5     3_17_0001_2017-04-06
6                3_17_0004
7     3_17_0007_2017-09-08
8     3_17_0009_2017-09-18
9     3_17_0012_2017-11-18
10    3_17_0019_2018-02-15
11    3_17_0048_2018-06-10
12    3_21_0040_2018-05-05
13    3_21_0061_2018-07-18
Name: ID, dtype: object

 chi2-AdaBoost Classifier, f1 Score:
Mean 0.9333333333333332 and STD 0.13333333333333336


# Baseline EEG classifier function

In [None]:
## def eeg_classifier():
##     return X_eeg

# Call to the baseline EEG classifier

In [None]:
## X_eeg = eeg_classifier()

# Baseline fusion classifier 

In [148]:
# Late fusion: a logistic regression is trained per outer fold on the class
# labels predicted by the single-modality baselines (X_fmri, X_dwi).
seed_value = 42
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

n_modalities = 2 # No. of modalities for fusion
cv_outer = 5
n_subs = 14
data_type = np.float32
f1_scores = []
X_fusion = np.zeros((cv_outer, n_subs, n_modalities), dtype=data_type)

## Meta features: per-fold predictions of each modality, shape (cv_outer, n_subs)
X_fusion[:, :, 0] = X_fmri
X_fusion[:, :, 1] = X_dwi
# X_fusion[:,:,2] = X_eeg

## Due to small size of each fold,
## certain evaluation metrics and plots are currently omitted

## Same splitter settings as in the baseline classifiers, so fold j here is
## meant to match fold j there.
## NOTE(review): the split is driven by the DWI table (X, Y); confirm that the
## fMRI baseline was run on the same subjects in the same order, otherwise the
## rows of X_fmri do not line up with these labels.
folds = StratifiedKFold(n_splits=cv_outer, shuffle=True,
                        random_state=seed_value).split(X, Y)

for j, (train_idx, test_idx) in enumerate(folds):
    # Each row of X_fusion is laid out as [train predictions | test predictions].
    n_train = len(train_idx)
    X_train_CV = X_fusion[j, :n_train, :]
    Y_train_CV = np.ravel(Y.iloc[train_idx])
    X_test_CV = X_fusion[j, n_train:, :]
    Y_test_CV = np.ravel(Y.iloc[test_idx])

    model_1 = LogisticRegression(random_state=seed_value)
    # The fusion model is the logistic regression. (A soft VotingClassifier over
    # LR / GaussianNB / SVC / AdaBoost was sketched earlier; assign it to
    # model_fuse here to try it.)
    model_fuse = model_1

    # Fit and predict with the SAME estimator. Previously model_1 was fitted
    # but the undefined/stale name model_fuse was used for prediction.
    model_fuse.fit(X_train_CV, Y_train_CV)
    y_test_pred = model_fuse.predict(X_test_CV)
    f1_scores.append(f1_score(Y_test_CV, y_test_pred, average='weighted'))

f1_scores = np.array(f1_scores)
print('Fusion classifier \n F1 Score:')
print(f"Mean {f1_scores.mean()} and STD {f1_scores.std()}")

Fusion classifier 
 F1 Score:
Mean 0.9333333333333332 and STD 0.13333333333333336
