# Initialize libraries

In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt 
from matplotlib.pyplot import imshow

from sklearn import decomposition, linear_model,metrics
from sklearn.base import BaseEstimator,ClassifierMixin
from sklearn.decomposition import KernelPCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
class_labels = LabelEncoder()
from sklearn.model_selection import cross_val_score,GridSearchCV,StratifiedKFold,KFold,train_test_split,LeaveOneOut
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report, confusion_matrix,mean_squared_error,r2_score
from sklearn.metrics import auc, RocCurveDisplay, roc_curve, f1_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import RFE, SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.manifold import TSNE
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.compose import ColumnTransformer
## Additional imports from DWI code
import math
from itertools import product
from contextlib import redirect_stdout
import pandas as pd
import time
import scipy
from scipy import io, stats
#from astropy.stats import jackknife_resampling, jackknife_stats, binom_conf_interval
#import xgboost as xgb

seed_value= 42
import os
os.environ['PYTHONHASHSEED']=str(seed_value)
import random
random.seed(seed_value)
np.random.seed(seed_value)

ModuleNotFoundError: No module named 'numpy'

# Load features

In [2]:
processed_data_path="../../data/processed"

fmri_features=pd.read_csv(f"{processed_data_path}/fMRI/fMRI_features_AAL.csv",index_col=0)

print('fMRI Subject IDs')
print(fmri_features["Subject"])

# dwi_features = pd.read_csv(f"{processed_data_path}/DWI/IDs+Labels+Features.csv")

## Old sub_list (14) above is kept for sample results later,
## new sub_list (22) is loaded below
dwi_features = pd.read_csv(f"{processed_data_path}/DWI/IDs+Labels+Features_AllSubs.csv")
dwi_features["Subject"]=dwi_features["ID"].str[:9]
dwi_features["Late Seizure Label"]=dwi_features["Label"]
dwi_features=dwi_features.drop("Label",axis=1)

print("DWI Subject IDs")
print(dwi_features["Subject"])

eeg_features=pd.read_csv(f"{processed_data_path}/EEG/EEG_features_v0.csv",index_col=0)
print("EEG Subject IDs")
print(eeg_features["Subject"])

fMRI Subject IDs
0     3_13_0063
1     3_13_0068
2     3_16_0013
3     3_16_0016
4     3_16_0023
5     3_16_0033
6     3_16_0036
7     3_17_0001
8     3_17_0004
9     3_17_0007
10    3_17_0009
11    3_17_0012
12    3_17_0019
13    3_17_0048
14    3_19_0050
15    3_21_0040
16    3_21_0061
17    3_26_0080
18    3_26_0092
Name: Subject, dtype: object
DWI Subject IDs
0     3_13_0063
1     3_13_0068
2     3_16_0013
3     3_16_0021
4     3_16_0027
5     3_16_0033
6     3_16_0036
7     3_16_0038
8     3_17_0001
9     3_17_0003
10    3_17_0004
11    3_17_0005
12    3_17_0007
13    3_17_0009
14    3_17_0012
15    3_17_0019
16    3_17_0030
17    3_17_0048
18    3_21_0040
19    3_21_0061
20    3_24_0035
21    3_26_0100
Name: Subject, dtype: object
EEG Subject IDs
0    3_17_0001
1    3_17_0003
2    3_17_0004
3    3_17_0007
4    3_17_0009
5    3_17_0012
6    3_17_0019
7    3_17_0031
8    3_17_0048
9    3_21_0076
Name: Subject, dtype: object


In [3]:
# basic check for correctness
#fMRI, EEG use "Subject" , "Late Seizure Label"

for row_id,row in fmri_features.iterrows():
    fmri_label=row["Late Seizure Label"]
    eeg_label=[]
    if any(eeg_features["Subject"]==row["Subject"]):
        eeg_label=int(eeg_features["Late Seizure Label"].loc[eeg_features["Subject"]==row["Subject"]].to_numpy()[0])
        if fmri_label!=eeg_label:
            print(f'fMRI EEG mismatch subject {row["Subject"]}')

    if any(dwi_features["Subject"]==row["Subject"]):
        dwi_label=int(dwi_features["Late Seizure Label"].loc[dwi_features["Subject"]==row["Subject"]].to_numpy()[0])
        if fmri_label!=dwi_label:
            print(f'fMRI DWI mismatch subject {row["Subject"]}')
    


In [4]:
# need to load EEG and DWI features and sort out which subjects to use programatically
all_features_df=fmri_features.set_index("Subject").join(dwi_features.set_index("Subject"),how="outer",lsuffix=" fMRI",rsuffix=" DWI").reset_index()
all_features_df=all_features_df.set_index("Subject").join(eeg_features.set_index("Subject"),how="outer",lsuffix=" Mix",rsuffix=" EEG").reset_index()
all_features_df["Late Seizure Label EEG"]=all_features_df["Late Seizure Label"]


all_features_df["Late Seizure Label"]=(all_features_df["Late Seizure Label fMRI"].fillna(0)+all_features_df["Late Seizure Label DWI"].fillna(0)+all_features_df["Late Seizure Label EEG"].fillna(0))>0


In [5]:
def remove_non_features(column_list):
    '''Removes column names that aren't features from column list'''
    for to_remove in ["ID","Late Seizure Label","Subject","Subject Number"]:
        if to_remove in column_list:
            column_list.remove(to_remove)
    return column_list

In [6]:

# make np array features for classification

dwi_columns=remove_non_features([*dwi_features])
eeg_columns=remove_non_features([*eeg_features])
fmri_columns=remove_non_features([*fmri_features])

#dwi 
y = all_features_df["Late Seizure Label"].to_numpy()
X_dwi = all_features_df[dwi_columns].dropna().to_numpy()

# fMRI

overlap_columns=[]
for col in fmri_columns:
    if "Overlap AAL" in col:
        overlap_columns.append(col)

X_over_aal=all_features_df[overlap_columns].dropna().to_numpy()

#eeg 
X_eeg=all_features_df[eeg_columns].dropna().to_numpy()

#all_features=np.concatenate([X_over_aal,X_dwi,X_eeg],axis=1)
fmri_len=X_over_aal.shape[1]
dwi_len=X_dwi.shape[1]
eeg_len=X_eeg.shape[1]

fmri_ind=[*range(0,fmri_len)]
dwi_ind=[*range(fmri_len,fmri_len+dwi_len)]
eeg_ind=[*range(fmri_len+dwi_len,fmri_len+dwi_len+eeg_len)]

select_fmri=ColumnTransformer([("fMRI",'passthrough',overlap_columns)])
select_eeg=ColumnTransformer([("EEG",'passthrough',eeg_columns)])
select_dwi=ColumnTransformer([("DWI",'passthrough',dwi_columns)])
select_all=ColumnTransformer([("ALL",'passthrough',[*overlap_columns,*eeg_columns,*dwi_columns])])

class_features_df=all_features_df
class_features_df=class_features_df.drop(["ID","Subject","Subject Number"],axis=1)

# Multimodal Classifier via pipelines


In [7]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

outer_splits=5
score_string="f1"
cv_outer=StratifiedKFold(n_splits=outer_splits, shuffle=True, 
                            random_state=seed_value)
cv_inner=LeaveOneOut() #KFold(n_splits=3,shuffle=True,random_state=42)
# tried using "roc_auc" as a score but it didn't work because there are sub-folds with all of one or the other class



imputers=[SimpleImputer(strategy="constant",fill_value=0),SimpleImputer(strategy="mean"),SimpleImputer(strategy="median"),KNNImputer()]

imputer_strs=["Fill Zero","Mean","Median","KNN"]
results=pd.DataFrame()
#dwi_classifier = fit_tree_classifier(X_dwi,y,cv_inner,cv_outer,score_string,"Diffusion")
#eeg_classifier = fit_linear_classifier(X_eeg,y,cv_inner,cv_outer,score_string,"EEG")

In [8]:
def modality_svm(selecter,imputer,x_df,y):
    ''' Fits and scores a kPCA+SVM classifier for each modality, based on an imputer'''

    svm_classifier = Pipeline([("pca",KernelPCA()), ("svm",SVC(probability=True))])
    param_grid_svm={"clf__pca__n_components":[2,3,4,5,None],"clf__pca__gamma":[.01,.05,.1],"clf__pca__kernel":["linear","rbf"],
    "clf__svm__C": [1, 10, 100], "clf__svm__gamma": [.01, .1]}
    
    pipe=Pipeline([("select",selecter),("scale",StandardScaler()), ("impute",imputer),("clf",svm_classifier)])

    search=GridSearchCV(estimator=pipe,scoring=score_string,param_grid=param_grid_svm,cv=cv_inner,refit=True).fit(x_df,y)

    scores= cross_val_score(search, x_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)

    return search,scores



In [9]:
def modality_tree(selecter,imputer,x_df,y):
    ''' Fits and scores a tree-based classifier for each modality, based on an imputer'''

    tree_classifier= Pipeline([("kbest",SelectKBest(chi2)), ("tree",AdaBoostClassifier())])
    param_grid_tree={"clf__kbest__k":[2,3,5,7,"all"],"clf__tree__n_estimators":[10,50,100]}
    
    pipe=Pipeline([("select",selecter), ("impute",imputer),("clf",tree_classifier)])

    search=GridSearchCV(estimator=pipe,scoring=score_string,param_grid=param_grid_tree,cv=cv_inner,refit=True).fit(x_df,y)

    scores= cross_val_score(search, x_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)

    return search,scores


In [12]:
for imputer,imputer_str in zip(imputers,imputer_strs):
    fmri_svm,fmri_svm_scores=modality_svm(selecter=select_fmri,imputer=imputer,x_df=class_features_df,y=y)

    fmri_tree,fmri_tree_scores=modality_tree(selecter=select_fmri,imputer=imputer,x_df=class_features_df,y=y)

    results_svm={'Modality':'fMRI','Imputer':imputer_str,'Classifier':'SVM','Mean':fmri_svm_scores.mean(),'STD':fmri_svm_scores.std()}
    results=results.append(results_svm,ignore_index=True)
    results_tree={'Modality':'fMRI','Imputer':imputer_str,'Classifier':'Tree','Mean':fmri_tree_scores.mean(),'STD':fmri_tree_scores.std()}
    results.append(results_tree,ignore_index=True)



In [18]:
selecters=[select_fmri,select_dwi,select_eeg,select_all]
selecter_strs=['fMRI','DWI','EEG','All']
svm_save={}
tree_save={}
results=pd.DataFrame()
for selecter, selecter_str in zip(selecters,selecter_strs):
    for imputer,imputer_str in zip(imputers,imputer_strs):
        svm,svm_scores=modality_svm(selecter=selecter,imputer=imputer,x_df=class_features_df,y=y)
        svm_save[f'{selecter_str} {imputer_str}']=svm
        tree,tree_scores=modality_tree(selecter=selecter,imputer=imputer,x_df=class_features_df,y=y)
        tree_save[f'{selecter_str} {imputer_str}']=tree

        results_svm={'Modality':selecter_str,'Imputer':imputer_str,'Classifier':'SVM','Mean':svm_scores.mean(),'STD':svm_scores.std()}
        results=results.append(results_svm,ignore_index=True)
        results_tree={'Modality':selecter_str,'Imputer':imputer_str,'Classifier':'Tree','Mean':tree_scores.mean(),'STD':tree_scores.std()}
        results=results.append(results_tree,ignore_index=True)

KeyboardInterrupt: 

In [13]:
dwi_pipe_linear=Pipeline([("select",select_dwi),("scale",StandardScaler()), ("impute",imputer),("clf",linear_classifier)])
dwi_pipe_tree=Pipeline([("select",select_dwi), ("impute",imputer),("clf",tree_classifier)])

search_dwi_linear=GridSearchCV(estimator=dwi_pipe_linear,scoring=score_string,param_grid=param_grid_linear,cv=cv_inner,refit=True).fit(class_features_df,y)
search_dwi_tree=GridSearchCV(estimator=dwi_pipe_tree,scoring=score_string,param_grid=param_grid_tree,cv=cv_inner,refit=True).fit(class_features_df,y)

scores_dwi_linear= cross_val_score(search_dwi_linear, class_features_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)
scores_dwi_tree= cross_val_score(search_dwi_tree, class_features_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)

print(f"Score for DWI linear with {imputer_str} Imputer")
print(f"Mean {scores_dwi_linear.mean()} and STD {scores_dwi_linear.std()}")

print(f"Score for DWI tree with {imputer_str} Imputer")
print(f"Mean {scores_dwi_tree.mean()} and STD {scores_dwi_tree.std()}")

Score for DWI linear with Fill Zero Imputer
Mean 0.71 and STD 0.055377492419453854
Score for DWI tree with Fill Zero Imputer
Mean 0.5109523809523809 and STD 0.14476190476190473


In [14]:
eeg_pipe_linear=Pipeline([("select",select_eeg),("scale",StandardScaler()), ("impute",imputer),("clf",linear_classifier)])
eeg_pipe_tree=Pipeline([("select",select_eeg), ("impute",imputer),("clf",tree_classifier)])

search_eeg_linear=GridSearchCV(estimator=eeg_pipe_linear,scoring=score_string,param_grid=param_grid_linear,cv=cv_inner,refit=True).fit(class_features_df,y)
search_eeg_tree=GridSearchCV(estimator=eeg_pipe_tree,scoring=score_string,param_grid=param_grid_tree,cv=cv_inner,refit=True).fit(class_features_df,y)

scores_eeg_linear= cross_val_score(search_eeg_linear, class_features_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)
scores_eeg_tree= cross_val_score(search_eeg_tree, class_features_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)

print(f"Score for EEG linear with {imputer_str} Imputer")
print(f"Mean {scores_eeg_linear.mean()} and STD {scores_eeg_linear.std()}")

print(f"Score for EEG tree with {imputer_str} Imputer")
print(f"Mean {scores_eeg_tree.mean()} and STD {scores_eeg_tree.std()}")

Score for EEG linear with Fill Zero Imputer
Mean 0.71 and STD 0.055377492419453854
Score for EEG tree with Fill Zero Imputer
Mean 0.5266666666666666 and STD 0.27999999999999997


In [15]:
all_pipe_linear=Pipeline([("select",select_all),("scale",StandardScaler()), ("impute",imputer),("clf",linear_classifier)])
all_pipe_tree=Pipeline([("select",select_all), ("impute",imputer),("clf",tree_classifier)])

search_all_linear=GridSearchCV(estimator=all_pipe_linear,scoring=score_string,param_grid=param_grid_linear,cv=cv_inner,refit=True).fit(class_features_df,y)
search_all_tree=GridSearchCV(estimator=all_pipe_tree,scoring=score_string,param_grid=param_grid_tree,cv=cv_inner,refit=True).fit(class_features_df,y)

scores_all_linear= cross_val_score(search_all_linear, class_features_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)
scores_all_tree= cross_val_score(search_all_tree, class_features_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)

print(f"Score for ALL linear with {imputer_str} Imputer")
print(f"Mean {scores_eeg_linear.mean()} and STD {scores_eeg_linear.std()}")

print(f"Score for ALL tree with {imputer_str} Imputer")
print(f"Mean {scores_eeg_tree.mean()} and STD {scores_eeg_tree.std()}")

Score for ALL linear with Fill Zero Imputer
Mean 0.71 and STD 0.055377492419453854
Score for ALL tree with Fill Zero Imputer
Mean 0.5266666666666666 and STD 0.27999999999999997


In [20]:
multimodal_fusion=VotingClassifier(estimators=[('fmri',search_fmri_linear),('dwi',search_dwi_linear),('eeg',search_eeg_linear)])
scores_multi= cross_val_score(multimodal_fusion, class_features_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)

print(f"Score for Multimodal Pipe with {imputer_str} Imputer")
print(f"Mean {scores_multi.mean()} and STD {scores_multi.std()}")

Score for Multimodal Pipe with Fill Zero Imputer
Mean 0.71 and STD 0.055377492419453854


In [None]:
# "Naive Bayes" Implementation

p_true=sum(y)/len(y)

fmri_svm=nb_svm(X_over_aal,y)

X_fusion = np.zeros((cv_outer,n_subs,n_modalities), dtype = data_type) 
#need all features including nan,


for j, (train_idx, test_idx) in enumerate(cv_outer.split(X_fusion,y)):
    X_train_CV = X_fusion[j,0:len(train_idx),:]
    Y_train_CV = Y.iloc[train_idx]   
    Y_train_CV = np.ravel(Y_train_CV)
    X_test_CV = X_fusion[j,len(train_idx):,:]   
    Y_test_CV = Y.iloc[test_idx]     
    Y_test_CV = np.ravel(Y_test_CV)

    #check for nan, if so drop that classifier

    # calculate p(false|x) as p(l) \PI p(false|x_i)/p(l)
    #calculate p(true|x) as p(l) \PI p(false|x_i)/p(l) 
    # select class with largest p 

    #y_test_pred=selected class
    y_test_pred = model_fuse.predict(X_test_CV)                  
    f1_scores.append(f1_score(Y_test_CV, y_test_pred, average='weighted'))

f1_scores = np.array(f1_scores)
print('Fusion classifier \n F1 Score:')
print(f"Mean {f1_scores.mean()} and STD {f1_scores.std()}")




In [None]:
def naive_bayes_multimodal(X_fMRI,X_dwi,X_eeg):

    

In [None]:
def nb_svm(x,y):
    ''' Fits and scores a kPCA+SVM classifier for use in a naive bayes classifier'''

    svm_classifier = Pipeline([("pca",KernelPCA()), ("svm",SVC(probability=True))])
    param_grid_svm={"clf__pca__n_components":[2,3,4,5,None],"clf__pca__gamma":[.01,.05,.1],"clf__pca__kernel":["linear","rbf"],
    "clf__svm__C": [1, 10, 100], "clf__svm__gamma": [.01, .1]}
    
    pipe=Pipeline([("scale",StandardScaler()),("clf",svm_classifier)])

    search=GridSearchCV(estimator=pipe,scoring=score_string,param_grid=param_grid_svm,cv=cv_inner,refit=True).fit(x,y)

    scores= cross_val_score(search, x, y, scoring=score_string, cv=cv_outer, n_jobs=-1)

    return search,scores

def modality_tree(selecter,imputer,x,y):
    ''' Fits and scores a tree-based classifier for each modality, based on an imputer'''

    tree_classifier= Pipeline([("kbest",SelectKBest(chi2)), ("tree",AdaBoostClassifier())])
    param_grid_tree={"clf__kbest__k":[2,3,5,7,"all"],"clf__tree__n_estimators":[10,50,100]}
    
    pipe=Pipeline([("clf",tree_classifier)])

    search=GridSearchCV(estimator=pipe,scoring=score_string,param_grid=param_grid_tree,cv=cv_inner,refit=True).fit(x,y)

    scores= cross_val_score(search, x, y, scoring=score_string, cv=cv_outer, n_jobs=-1)

    return search,scores