# Initialize libraries

In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt 
from matplotlib.pyplot import imshow
from matplotlib.lines import Line2D

from sklearn import decomposition, linear_model,metrics
from sklearn.base import BaseEstimator,ClassifierMixin
from sklearn.decomposition import KernelPCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder,FunctionTransformer
class_labels = LabelEncoder()
from sklearn.model_selection import cross_val_score,GridSearchCV,StratifiedKFold,KFold,train_test_split,LeaveOneOut
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report, confusion_matrix,mean_squared_error,r2_score
from sklearn.metrics import auc, RocCurveDisplay, roc_curve, f1_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import RFE, SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.manifold import TSNE
from sklearn.naive_bayes import GaussianNB
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.cross_decomposition import CCA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.ensemble import RandomForestRegressor

## Additional imports from DWI code
import math
from itertools import product
from contextlib import redirect_stdout
import pandas as pd
import time
import scipy
from scipy import io, stats
from statistics import mean
#from astropy.stats import jackknife_resampling, jackknife_stats, binom_conf_interval
# from MMIDimReduction import MMINet

# from cluster.selfrepresentation import ElasticNetSubspaceClustering, SparseSubspaceClusteringOMP

seed_value= 42
import os
os.environ['PYTHONHASHSEED']=str(seed_value)
import random
random.seed(seed_value)
np.random.seed(seed_value)

In [None]:
X_t = [[1, 2, 4], [3, 4, 3], [10,10,11], [8, 8, 7], [4, 6, 5], [9, np.nan, 11]]
imputer = KNNImputer(n_neighbors=2)
Y_t=imputer.fit_transform(X_t)
# , [6, 9, 8]

# Load features

In [4]:
processed_data_path="../_data"

fmri_features=pd.read_csv(f"{processed_data_path}/fMRI_features_AAL.csv",index_col=0)

# print('fMRI Subject IDs')
# print(fmri_features["Subject"])

# dwi_features = pd.read_csv(f"{processed_data_path}/DWI/IDs+Labels+Features_AllSubs.csv")
dwi_features = pd.read_csv(f"{processed_data_path}/subs_jan_2022.csv")

dwi_features["Subject"]=dwi_features["ID"].str[:9]
dwi_features["Late Seizure Label"]=dwi_features["Label"]
dwi_features=dwi_features.drop("Label",axis=1)

# print("DWI Subject IDs")
# print(dwi_features["Subject"])

eeg_features=pd.read_csv(f"{processed_data_path}/EEG_features_v0.csv",index_col=0)
# print("EEG Subject IDs")
# print(eeg_features["Subject"])

In [5]:
# basic check for correctness
#fMRI, EEG use "Subject" , "Late Seizure Label"

for row_id,row in fmri_features.iterrows():
    fmri_label=row["Late Seizure Label"]
    eeg_label=[]
    if any(eeg_features["Subject"]==row["Subject"]):
        eeg_label=int(eeg_features["Late Seizure Label"].loc[eeg_features["Subject"]==row["Subject"]].to_numpy()[0])
        if fmri_label!=eeg_label:
            print(f'fMRI EEG mismatch subject {row["Subject"]}')

    if any(dwi_features["Subject"]==row["Subject"]):
        dwi_label=int(dwi_features["Late Seizure Label"].loc[dwi_features["Subject"]==row["Subject"]].to_numpy()[0])
        if fmri_label!=dwi_label:
            print(f'fMRI DWI mismatch subject {row["Subject"]}')
    


In [6]:
# need to load EEG and DWI features and sort out which subjects to use programatically
all_features_df=fmri_features.set_index("Subject").join(dwi_features.set_index("Subject"),how="outer",lsuffix=" fMRI",rsuffix=" DWI").reset_index()
all_features_df=all_features_df.set_index("Subject").join(eeg_features.set_index("Subject"),how="outer",lsuffix=" Mix",rsuffix=" EEG").reset_index()
all_features_df["Late Seizure Label EEG"]=all_features_df["Late Seizure Label"]


all_features_df["Late Seizure Label"]=(all_features_df["Late Seizure Label fMRI"].fillna(0)+all_features_df["Late Seizure Label DWI"].fillna(0)+all_features_df["Late Seizure Label EEG"].fillna(0))>0

In [7]:
def remove_non_features(column_list):
    '''Removes column names that aren't features from column list'''
    for to_remove in ["ID","Late Seizure Label","Subject","Subject Number"]:
        if to_remove in column_list:
            column_list.remove(to_remove)
    return column_list

In [8]:
# make np array features for classification

dwi_columns=remove_non_features([*dwi_features])
eeg_columns=remove_non_features([*eeg_features])
fmri_columns=remove_non_features([*fmri_features])

#dwi 
y = all_features_df["Late Seizure Label"].to_numpy()
X_dwi = all_features_df[dwi_columns].to_numpy()

# fMRI

overlap_columns=[]
mean_str_pos_columns=[]
mean_str_neg_columns=[]

for col in fmri_columns:
    if "Overlap AAL" in col:
        overlap_columns.append(col)
    elif "Pos AAL" in col:
        mean_str_pos_columns.append(col)
    elif "Neg AAL" in col:
        mean_str_neg_columns.append(col)

X_over_aal=all_features_df[overlap_columns].to_numpy()
X_pos_str_aal=all_features_df[mean_str_pos_columns].to_numpy()

all_features_df[mean_str_neg_columns]=-1*all_features_df[mean_str_neg_columns]
X_neg_str_aal=all_features_df[mean_str_neg_columns].to_numpy()
#eeg 
X_eeg=all_features_df[eeg_columns].to_numpy()

#all_features=np.concatenate([X_over_aal,X_dwi,X_eeg],axis=1)
fmri_len=X_over_aal.shape[1]
dwi_len=X_dwi.shape[1]
eeg_len=X_eeg.shape[1]

fmri_ind=[*range(0,fmri_len)]
dwi_ind=[*range(fmri_len,fmri_len+dwi_len)]
eeg_ind=[*range(fmri_len+dwi_len,fmri_len+dwi_len+eeg_len)]

select_fmri_ov=ColumnTransformer([("fMRI ov",'passthrough',overlap_columns)])
select_fmri_pos=ColumnTransformer([("fMRI pos",'passthrough',mean_str_pos_columns)])
select_fmri_neg=ColumnTransformer([("fMRI neg",'passthrough',mean_str_neg_columns)])
select_fmri=ColumnTransformer([("fMRI",'passthrough',[*mean_str_pos_columns,*mean_str_neg_columns,*overlap_columns])])

select_eeg=ColumnTransformer([("EEG",'passthrough',eeg_columns)])
select_dwi=ColumnTransformer([("DWI",'passthrough',dwi_columns)])

select_all_pos=ColumnTransformer([("ALL",'passthrough',[*mean_str_pos_columns,*eeg_columns,*dwi_columns])])
select_all_neg=ColumnTransformer([("ALL",'passthrough',[*mean_str_neg_columns,*eeg_columns,*dwi_columns])])
select_all_ov=ColumnTransformer([("ALL",'passthrough',[*overlap_columns,*eeg_columns,*dwi_columns])])

# Fusion excluding EEG
dwi_fmri_pos=ColumnTransformer([("ALL",'passthrough',[*mean_str_pos_columns,*dwi_columns])])
dwi_fmri_neg=ColumnTransformer([("ALL",'passthrough',[*mean_str_neg_columns,*dwi_columns])])
dwi_fmri_ov=ColumnTransformer([("ALL",'passthrough',[*overlap_columns,*dwi_columns])])

select_all=ColumnTransformer([("ALL",'passthrough',[*mean_str_pos_columns,*mean_str_neg_columns,*overlap_columns,*eeg_columns,*dwi_columns])])
class_features_df=all_features_df
class_features_df=class_features_df.drop(["ID","Subject","Subject Number"],axis=1)


In [None]:
# X_dwi_np=X_dwi.to_numpy()
from scipy.stats.stats import pearsonr
from sklearn.metrics import mutual_info_score
# a,b=[1,2,3],[2,4,6]
# print(a,b)
# print(pearsonr(a,b)[0])

imputer_mode=KNNImputer(n_neighbors=1)
X_imputed=imputer_mode.fit_transform(X_dwi)
print(X_imputed.shape[1])

keep_col=[]
# keep_col=[i for i in range(7)]
for j in range(X_imputed.shape[1]):
    if j==0:
        continue
    else:
        mi_coef=mutual_info_score(X_imputed[:,j-1],X_imputed[:,j])
        print(mi_coef)

    # keep_col.append(j)
    # if j==0:
    #     continue
    # else:
    #     for k in range(j):
    #         # r_coef=pearsonr(X_imputed[:,k],X_imputed[:,j])[0]
    #         mi_coef=mutual_info_score(X_imputed[:,k],X_imputed[:,j])
    #         # if r_coef>0.85:    
    #         if mi_coef>0.85:    
    #             keep_col.remove(j)
    #             break


# # print(drop_col)
# # print(keep_col)
# print('feat=',keep_col,'keep_col=',len(keep_col))

# Feature Union

In [9]:
seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

outer_splits=5
inner_splits=5
# score_string="f1"
score_string="roc_auc"
# keep a single CV object for the inner and the outer loops to ensure comparability between selected classifiers

cv_outer=StratifiedKFold(n_splits=outer_splits, shuffle=True, random_state=seed_value)
# cv_inner=LeaveOneOut() #KFold(n_splits=3,shuffle=True,random_state=42)
cv_inner=StratifiedKFold(n_splits=inner_splits, shuffle=True, random_state=seed_value)
# tried using "roc_auc" as a score but it didn't work because there are sub-folds with all of one or the other class


#set up imputers for missing data
# iter_estimator=RandomForestRegressor(
#     n_estimators=4,
#     max_depth=10,
#     bootstrap=True,
#     max_samples=0.5,
#     n_jobs=2,
#     random_state=seed_value)
imputers=[KNNImputer(n_neighbors=1)]
# imputers=[SimpleImputer(strategy="constant",fill_value=0),SimpleImputer(strategy="mean"),SimpleImputer(strategy="median"),KNNImputer(),IterativeImputer(random_state=seed_value,estimator=iter_estimator,sample_posterior=False,max_iter=2)]

imputer_strs=["KNN"]
# imputer_strs=["Fill Zero","Mean","Median","KNN","Iterative"]


In [20]:
def modality_svm(selecter,imputer,x_df,y):
    ''' Fits and scores a kPCA+SVM classifier for each modality, based on an imputer'''

    svm_classifier = Pipeline([("pca",KernelPCA()), ("svm",SVC(probability=True))])
    param_grid_svm={"clf__pca__n_components":[2,3,4,5,None],"clf__pca__gamma":[.01,.05,.1],"clf__pca__kernel":["linear","rbf"],
    "clf__svm__C": [1, 10, 100], "clf__svm__gamma": [.01, .1]}
    
    pipe=Pipeline([("select",selecter),("scale",StandardScaler()), ("impute",imputer),("clf",svm_classifier)])

    search=GridSearchCV(estimator=pipe,scoring=score_string,param_grid=param_grid_svm,cv=cv_inner,refit=True).fit(x_df,y)

    scores= cross_val_score(search, x_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)

    return search,scores, pipe



In [14]:
def modality_tree(selecter,imputer,x_df,y):
    ''' Fits and scores a tree-based classifier for each modality, based on an imputer'''

    tree_classifier= Pipeline([("kbest",SelectKBest(f_classif)), ("tree",AdaBoostClassifier())])
    param_grid_tree={"clf__kbest__k":[2,3,5,7],"clf__tree__n_estimators":[50]}
    

    pipe=Pipeline([("select",selecter), ("impute",imputer),("clf",tree_classifier)])

    search=GridSearchCV(estimator=pipe,scoring=score_string,param_grid=param_grid_tree,cv=cv_inner,refit=True).fit(x_df,y)

    scores= cross_val_score(search, x_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)

    return search,scores, pipe


In [30]:
#run single modality classifiers with an imputer 
#also includes a "naive" multimodal classifier where all features are concatenated and then passed to the tree and SVM based classifiers
from sklearn.model_selection import permutation_test_score

seed_value=42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

svm_save={}
tree_save={}
results=pd.DataFrame()

# selecters=[select_fmri_ov,select_fmri_pos,select_fmri_neg,select_fmri,select_dwi,select_eeg,select_all_ov,select_all_pos,select_all_neg,select_all]
# selecter_strs=['fMRI ov','fMRI pos','fMRI neg','DWI','EEG','All ov','All pos','All neg','All']

selecters=[select_fmri_ov,select_dwi,select_eeg]
selecter_strs=['fMRI ov','DWI','EEG']

# Start measuring time
start_time = time.time()

# for selecter, selecter_str,imputer,imputer_str in zip(selecters,selecter_strs,imputers,imputer_strs):
for selecter, selecter_str in zip(selecters,selecter_strs):
    for imputer,imputer_str in zip(imputers,imputer_strs):

        tree,tree_scores,clf=modality_tree(selecter=selecter,imputer=imputer,x_df=class_features_df,y=y)
        tree_save[f'{selecter_str} {imputer_str}']=tree    
        tree_grid = pd.DataFrame(tree.cv_results_)

        _, _, pvalue = permutation_test_score(clf, class_features_df, y, random_state=seed_value)     
        
        results_tree={'Modality':selecter_str,'Imputer':imputer_str,'Classifier':'Tree','Mean':tree_scores.mean(),'STD':tree_scores.std(),'p-value':pvalue}
        results=results.append(results_tree,ignore_index=True)

        svm,svm_scores,clf=modality_svm(selecter=selecter,imputer=imputer,x_df=class_features_df,y=y)
        svm_save[f'{selecter_str} {imputer_str}']=svm
        svm_grid = pd.DataFrame(svm.cv_results_)

        _, _, pvalue = permutation_test_score(clf, class_features_df, y, random_state=seed_value)     

        results_svm={'Modality':selecter_str,'Imputer':imputer_str,'Classifier':'SVM','Mean':svm_scores.mean(),'STD':svm_scores.std(),'p-value':pvalue}
        results=results.append(results_svm,ignore_index=True)
        classifier = clf.named_steps['clf']
        # Get estimator parameters
        params = classifier.get_params()
        # Count the number of parameters
        num_parameters = len(params)
        print("Parameters in SVC:", num_parameters)
   
# End measuring time
end_time = time.time()

# Print the time complexity
print(f'Time complexity: ', end_time-start_time)
# results.to_csv('_feats/All ov.csv')
tree_grid.to_csv('_feats/soft-grid.csv')
svm_grid.to_csv('_feats/soft-grid.csv')

 115 116 118 126 127 128 129 130 132 133 142 143 145 154 155 156 160 161
 162 163 164 165] are constant.
  f = msb / msw
 115 133 136 138 142 143 145 154 155 161 162 163 164 165] are constant.
  f = msb / msw
 114 115 133 142 143 145 154 155 161 162 163 164 165] are constant.
  f = msb / msw
 115 119 123 131 133 137 142 143 145 154 155 157 161 162 163 164 165] are constant.
  f = msb / msw
 133 142 143 145 154 155 161 162 163 164 165] are constant.
  f = msb / msw
 115 116 118 126 127 128 129 130 132 133 142 143 145 154 155 156 160 161
 162 163 164 165] are constant.
  f = msb / msw
 115 133 136 138 142 143 145 154 155 161 162 163 164 165] are constant.
  f = msb / msw
 114 115 133 142 143 145 154 155 161 162 163 164 165] are constant.
  f = msb / msw
 115 119 123 131 133 137 142 143 145 154 155 157 161 162 163 164 165] are constant.
  f = msb / msw
 133 142 143 145 154 155 161 162 163 164 165] are constant.
  f = msb / msw
 115 116 118 126 127 128 129 130 132 133 142 143 145 154 155 1

Parameters in SVC: 12
Parameters in SVC: 12


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\MEA11104\AppData\Local\anaconda3\envs\epilep\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\MEA11104\AppData\Local\anaconda3\envs\epilep\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\MEA11104\AppData\Local\anaconda3\envs\epilep\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\MEA11104\AppData\Local\anac

ValueError: k should be >=0, <= n_features = 3; got 10. Use k='all' to return all features.

# Ensemble Classifier

In [None]:
#for the voting classifiers I used the best performing individual modality classifiers with Voting classifier to combine them. 
#It might be worth verifying that the classifier trained from gridsearchcv isn't just returning something trained on the last CV iteration

seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# multimodal_fusion=VotingClassifier(estimators=[('fmri',svm_save['fMRI ov Median']),('dwi',tree_save['DWI Median']),('eeg',tree_save['EEG Mean'])])

# Skipping EEG
# multimodal_fusion=VotingClassifier(estimators=[('fmri',svm_save['fmri_pos Median']),('dwi',tree_save['dwi KNN'])])
multimodal_fusion=VotingClassifier(estimators=[('fmri',svm_save['fMRI pos KNN']),('dwi',tree_save['DWI KNN']),('eeg',tree_save['EEG KNN'])])
scores_multi=cross_val_score(multimodal_fusion, class_features_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)
print(f"Mean {scores_multi.mean()} and SEM {1.96*stats.sem(scores_multi,ddof=0)}")
# print(multimodal_fusion.cv_results_)
# print(cross_val_score.cv_results_)

In [31]:
#soft label fusion
# Could try to tune relative weights of the estimators to improve performance

seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# multimodal_fusion=VotingClassifier(estimators=[('fmri',svm_save['fmri_pos Median']),('dwi',tree_save['dwi KNN'])],voting='soft',weights=[2,1],weights=[3,2,1])
multimodal_fusion=VotingClassifier(estimators=[('fmri',svm_save['fMRI ov KNN']),('dwi',tree_save['DWI KNN']),('eeg',tree_save['EEG KNN'])],voting='soft')
scores_multi=cross_val_score(multimodal_fusion, class_features_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)
print(f"Mean {scores_multi.mean()} and SEM {1.96*stats.sem(scores_multi,ddof=0)}")

Mean 0.65 and SEM 0.14718136959396577


# Bayesian Fusion

In [None]:
def drop_nan_index(X,y,idx):
    ''' Selects a subset based on idx and then returns the subset of X,y that correspond to rows without any nans. 
    For use in test train split for individual modality classifiers'''

    X_sub=X[idx,:]
    y_sub=y[idx]
    drop_rows=~np.isnan(X_sub).any(axis=1)
    X_sub=X_sub[drop_rows]
    y_sub=y_sub[drop_rows]

    return X_sub,y_sub


In [None]:
def nb_svm(x,y):
    ''' Fits and scores a kPCA+SVM classifier for use in a naive bayes classifier'''

    svm_classifier = Pipeline([("pca",KernelPCA()), ("svm",SVC(probability=True))])
    param_grid_svm={"clf__pca__n_components":[2,3,4,5,None],"clf__pca__gamma":[.01,.05,.1],"clf__pca__kernel":["linear","rbf"],
    "clf__svm__C": [1, 10, 100], "clf__svm__gamma": [.01, .1]}
    
    pipe=Pipeline([("scale",StandardScaler()),("clf",svm_classifier)])

    search=GridSearchCV(estimator=pipe,scoring=score_string,param_grid=param_grid_svm,cv=cv_inner,refit=True).fit(x,y)

    return search

In [None]:
def nb_tree(x,y):
    ''' Fits and scores a tree-based classifier for use in a naive bayes classifier'''

    tree_classifier= Pipeline([("kbest",SelectKBest(chi2)), ("tree",AdaBoostClassifier())])
    param_grid_tree={"clf__kbest__k":[2,3,5,7,10,15],"clf__tree__n_estimators":[10,50,100]}
    
    pipe=Pipeline([("scale",StandardScaler()),("clf",tree_classifier)])

    search=GridSearchCV(estimator=pipe,scoring=score_string,param_grid=param_grid_tree,cv=cv_inner,refit=True).fit(x,y)

    # scores= cross_val_score(search, x_df, y, scoring=score_string, cv=cv_outer, n_jobs=-1)

    return search


In [None]:
def naive_bayes_multimodal(fmri_class,X_fmri,dwi_class,X_dwi,y_test,y_train,eeg_class=np.nan,X_eeg=np.nan):
    '''Makes a prediction based on a naive bayes multimodal fusion using a conditional independence assumption, which ignores modalities that don't have features for a given subject'''
    p_true=sum(y_test)/len(y_test)
    # p_true=sum(y_train)/len(y_train)
    p_false=1-p_true

    n_subs=X_fmri.shape[0]
    #The following two variable will not actually be probabilities (they shouldn't sum to 1). Essentially this function uses the approximation 
    # p(x|l) \approx p(l|x)/p(l). To get a real generative model, I'd suggest just using a MLE for a gaussian model for the fMRI and dwi, and a poisson model for EEG
    y_prob_false=[]
    y_prob_true=[]
    predict=[]

    for row in range(n_subs):
        if np.isnan(X_fmri[row,:]).any(): #check if there's fMRI data, if not set the relative prob to 1
            fmri_prob_true=1
            fmri_prob_false=1
        else: 
            fmri_prob_false=fmri_class.predict_proba(X_fmri[row,:].reshape(1, -1))[0][0]/p_false 
            fmri_prob_true=fmri_class.predict_proba(X_fmri[row,:].reshape(1, -1))[0][1]/p_true

        if np.isnan(X_dwi[row,:]).any():
            dwi_prob_true=1
            dwi_prob_false=1
        else: 
            dwi_prob_false=dwi_class.predict_proba(X_dwi[row,:].reshape(1, -1))[0][0]/p_false
            dwi_prob_true=dwi_class.predict_proba(X_dwi[row,:].reshape(1, -1))[0][1]/p_true

        # if np.isnan(X_eeg):
        #     eeg_prob_true=1
        #     eeg_prob_false=1            
        if np.isnan(X_eeg[row,:]).any():
            eeg_prob_true=1
            eeg_prob_false=1
        else: 
            eeg_prob_false=eeg_class.predict_proba(X_eeg[row,:].reshape(1, -1))[0][0]/p_false
            eeg_prob_true=eeg_class.predict_proba(X_eeg[row,:].reshape(1, -1))[0][1]/p_true
        
        prob_false=fmri_prob_false*dwi_prob_false*eeg_prob_false*p_false
        y_prob_false.append(prob_false)
        prob_true=fmri_prob_true*dwi_prob_true*eeg_prob_true*p_true
        y_prob_true.append(prob_true)
        predict.append(prob_true>=prob_false) #check which "probability" is higher. Could test whether taking the tie break the other direction (i.e. setting the prediciton to prob_true>=prob_false) changes the results

    return predict,y_prob_true,y_prob_false
    

In [None]:
# "Naive Bayes" Implementation

seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

fmri_feat={'Ov':X_over_aal,'Pos':X_pos_str_aal,'Neg':X_neg_str_aal}
nb_results_bayes=pd.DataFrame()
j = 0

for key,val in fmri_feat.items():
    X_fmri=val
    f1_scores=[]
    i = key

    for train_idx, test_idx in cv_outer.split(X_over_aal,y):

        j += 1
        X_train_fmri,y_train_fmri=drop_nan_index(X_fmri,y,train_idx)
        X_train_dwi,y_train_dwi=drop_nan_index(X_dwi,y,train_idx)
        X_train_eeg,y_train_eeg=drop_nan_index(X_eeg,y,train_idx)

        X_test_fmri=X_over_aal[test_idx,:]
        X_test_dwi=X_dwi[test_idx,:]
        X_test_eeg=X_eeg[test_idx,:]

        y_train=y[train_idx]
        y_test=y[test_idx]

        # SVM Classifiers
        fmri_class=nb_svm(X_train_fmri,y_train_fmri)
        dwi_class=nb_svm(X_train_dwi,y_train_dwi)
        eeg_class=nb_svm(X_train_eeg,y_train_eeg)

        fmri_grid = pd.DataFrame(fmri_class.cv_results_)
        dwi_grid = pd.DataFrame(fmri_class.cv_results_)
        # eeg_grid = pd.DataFrame(fmri_class.cv_results_)
        grid_search = fmri_grid.append(dwi_grid)
        grid_search.to_csv('grid_search_tree_fd_.csv')

        # # Tree Classifiers
        # fmri_class=nb_tree(X_train_fmri,y_train_fmri)
        # dwi_class=nb_tree(X_train_dwi,y_train_dwi)
        # eeg_class=nb_tree(X_train_eeg,y_train_eeg)

        y_pred,y_prob_true,y_prob_false=naive_bayes_multimodal(fmri_class,X_test_fmri,dwi_class,X_test_dwi,y_test,y_train,eeg_class,X_test_eeg)

        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    f1_scores = np.array(f1_scores)
    # nb_results=nb_results.append({'fMRI':key,'Class':'SVM','Mean f1':f1_scores.mean(),'STD f1':f1_scores.std()},ignore_index=True)
    nb_results_bayes=nb_results_bayes.append({'fMRI':key,'Class':'SVM','Mean f1':f1_scores.mean(),'SEM f1':1.96*stats.sem(f1_scores,ddof=0)},ignore_index=True)

In [None]:
# Naive Bayes Fusion
print(nb_results_bayes)

# Imputer Setup

In [None]:
seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# # Replacing missing values with Iterative imputer
# imputer_iterative=IterativeImputer(random_state=42)
# iterative_features_df=imputer_iterative.fit_transform(class_features_df)
# iter_feats_fmri_ov=iterative_features_df[:,333:499]
# iter_feats_fmri_pos=iterative_features_df[:,167:333]
# iter_feats_fmri_neg=iterative_features_df[:,1:167]
# iter_feats_dwi=iterative_features_df[:,499:562]
# iter_feats_eeg=iterative_features_df[:,564:567]
# iter_feats_fmri=np.append(iter_feats_fmri_ov,iter_feats_fmri_pos,axis=1)
# iter_feats_fmri=np.append(iter_feats_fmri,iter_feats_fmri_neg,axis=1)

# Replacing missing values with KNN imputer
imputer_knn=KNNImputer()
knn_features_df=imputer_knn.fit_transform(class_features_df)
knn_feats_fmri_ov=knn_features_df[:,333:499]
knn_feats_fmri_pos=knn_features_df[:,167:333]
knn_feats_fmri_neg=knn_features_df[:,1:167]
knn_feats_dwi=knn_features_df[:,499:562]
knn_feats_eeg=knn_features_df[:,564:567]
knn_feats_fmri=np.append(knn_feats_fmri_ov,knn_feats_fmri_pos,axis=1)
knn_feats_fmri=np.append(knn_feats_fmri,knn_feats_fmri_neg,axis=1)

# Canonical Correlation

In [None]:
def nb_cca_svm(x,y):
    ''' Fits and scores a CCA+SVM classifier for use in a Bayes fusion classifier'''

    svm_classifier = Pipeline([("svm",SVC(probability=True))])
    param_grid_svm={"clf__svm__gamma": ['auto']}
    # param_grid_svm={"clf__svm__C": [10], "clf__svm__gamma": [.01]}
    
    pipe=Pipeline([("scale",StandardScaler()),("clf",svm_classifier)])

    search=GridSearchCV(estimator=pipe,scoring=score_string,param_grid=param_grid_svm,cv=cv_inner,refit=True).fit(x,y)

    return search

In [None]:
# CCA with fMRI and dMRI

seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

fmri_feat={'Ov':X_over_aal,'Pos':X_pos_str_aal,'Neg':X_neg_str_aal}
nb_cca_results=pd.DataFrame()
j = 0

for key,val in fmri_feat.items():
    X_fmri=val
    j+=1

    # Using feature matrices with KNN imputed values 
    X_fmri_imp = knn_feats_fmri[:,(j-1)*166:j*166]
    X_dwi_imp = knn_feats_dwi

    # Varying no. of CCA components
    # for i in [7]:
    for i in range(10):
        f1_scores=[]

        for train_idx, test_idx in cv_outer.split(X_over_aal,y):

            X_train_f = X_fmri_imp[train_idx,:]
            X_train_d = X_dwi_imp[train_idx,:]
            X_test_f = X_fmri_imp[test_idx,:]
            X_test_d = X_dwi_imp[test_idx,:]
            y_train = y[train_idx]
            y_test = y[test_idx]

            # CCA decomposition into i+1 components 
            cca=None
            cca = CCA(n_components=i+1)

            cca.fit(X_train_f, X_train_d)
            X_fmri_cca, X_dwi_cca = cca.transform(X_train_f, X_train_d)
            X_train_cca = np.concatenate((X_fmri_cca, X_dwi_cca), axis=1)

            X_fmri_cca, X_dwi_cca = cca.transform(X_test_f, X_test_d)
            X_test_cca = np.concatenate((X_fmri_cca, X_dwi_cca), axis=1)

            # SVM Classifier
            clf = make_pipeline(StandardScaler(), SVC(C=10,gamma=0.01))
            clf.fit(X_train_cca, y_train)
            y_pred = clf.predict(X_test_cca)

            f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

        nb_cca_results = nb_cca_results.append({'fMRI':key,'CCA #Comp':i+1,'Class':'SVM','Mean f1':mean(f1_scores),'SEM f1':1.96*stats.sem(f1_scores,ddof=0)},ignore_index=True)

In [None]:
# KNN imputer
print(nb_cca_results)

In [None]:
# Iterative imputer
print(nb_cca_results)

In [None]:
print(nb_cca_results)

# Sequential Feature Selector

In [None]:
# Sequential Selection

seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

fmri_feat={'Ov':X_over_aal,'Pos':X_pos_str_aal,'Neg':X_neg_str_aal}
nb_seq_results=pd.DataFrame()
# X_seq_tsne=None 
# X_seq_tsne=np.zeros((1, 9), dtype = np.float32) 
j = 0

for key,val in fmri_feat.items():
    X_fmri=val
    j+=1

    # Using feature matrices with KNN imputed values 
    X_fmri_imp = knn_feats_fmri[:,(j-1)*166:j*166]
    X_dwi_imp = knn_feats_dwi
    X_eeg_imp = knn_feats_eeg

    X_imputed_combined=np.append(X_fmri_imp,X_dwi_imp,axis=1)
    X_imputed_combined=np.append(X_imputed_combined,X_eeg_imp,axis=1)

    # Varying no. of greedy features
    # for i in [8]:
    for i in range(10):
        f1_scores=[]
        direction='forward'
        # direction='backward'

        for train_idx, test_idx in cv_outer.split(X_over_aal,y):

            X_train = X_imputed_combined[train_idx,:]
            X_test = X_imputed_combined[test_idx,:]
            y_train = y[train_idx]
            y_test = y[test_idx]

            # Sequential feature selector
            sfs=None
            sfs = SequentialFeatureSelector(clf, direction=direction, scoring='f1', n_features_to_select=i+1)

            sfs.fit(X_train, y_train)
            X_train_seq = sfs.transform(X_train)
            X_test_seq = sfs.transform(X_test)
            # X_seq_tsne=np.append(X_seq_tsne,X_test_seq,axis=0)

            # SVM Classifier
            clf=None
            # clf = make_pipeline(StandardScaler(), SVC(C=10,gamma=0.01))
            clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
            clf.fit(X_train_seq, y_train)
            y_pred = clf.predict(X_test_seq)

            f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

        nb_seq_results = nb_seq_results.append({'fMRI':key,'Feats.':i+1,'Direction':direction,'Mean f1':mean(f1_scores),'SEM f1':1.96*stats.sem(f1_scores,ddof=0)},ignore_index=True)

In [None]:
# Sequential Selection with CCA

seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# fmri_feat={'Ov':X_over_aal,'Pos':X_pos_str_aal,'Neg':X_neg_str_aal}
fmri_feat={'Neg':X_neg_str_aal}
nb_seq_cca_results=pd.DataFrame()
j = 2

tprs = []
aucs = []
auc_est = []
# FPR becomes the independent ordinate
mean_fpr = np.linspace(0, 1, 40)
fig, ax = plt.subplots()

for key,val in fmri_feat.items():
    X_fmri=val
    j+=1

    # Using feature matrices with KNN imputed values 
    X_fmri_imp = knn_feats_fmri[:,(j-1)*166:j*166]
    X_dwi_imp = knn_feats_dwi
    X_eeg_imp = knn_feats_eeg

    X_imputed_combined=np.append(X_fmri_imp,X_dwi_imp,axis=1)
    X_imputed_combined=np.append(X_imputed_combined,X_eeg_imp,axis=1)

    # Varying no. of greedy features
    for i in [8]:
    # for i in range(10):
        f1_scores=[]
        direction='forward'
        # direction='backward'

        for train_idx, test_idx in cv_outer.split(X_over_aal,y):

            X_train = X_imputed_combined[train_idx,:]
            X_test = X_imputed_combined[test_idx,:]
            X_train_f = X_fmri_imp[train_idx,:]
            X_train_d = X_dwi_imp[train_idx,:]
            X_test_f = X_fmri_imp[test_idx,:]
            X_test_d = X_dwi_imp[test_idx,:]
            y_train = y[train_idx]
            y_test = y[test_idx]

            # Sequential feature selector into i+1 features 
            # sfs = SequentialFeatureSelector(clf, direction=direction, scoring='f1', n_features_to_select=i+1)
            clf=None
            clf = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
            sfs=None
            sfs = SequentialFeatureSelector(clf, direction=direction, scoring='f1', n_features_to_select=i+1)

            sfs.fit(X_train, y_train)
            X_train_seq = sfs.transform(X_train)
            X_test_seq = sfs.transform(X_test)

            # CCA decomposition into 2 components 
            cca=None
            cca = CCA(n_components=2)

            cca.fit(X_train_f, X_train_d)
            X_fmri_cca, X_dwi_cca = cca.transform(X_train_f, X_train_d)
            X_train_cca = np.concatenate((X_fmri_cca, X_dwi_cca), axis=1)
            X_fmri_cca, X_dwi_cca = cca.transform(X_test_f, X_test_d)
            X_test_cca = np.concatenate((X_fmri_cca, X_dwi_cca), axis=1)

            X_train_concat=np.concatenate((X_train_seq, X_train_cca), axis=1)
            X_test_concat=np.concatenate((X_test_seq, X_test_cca), axis=1)

            # SVM Classifier
            # clf = make_pipeline(StandardScaler(), SVC(C=10,gamma=0.01))
            clf.fit(X_train_concat, y_train)
            y_pred = clf.predict(X_test_concat)
            y_prob=clf.predict_proba(X_test_concat)

            fpr, tpr, _ = roc_curve(y_test, y_prob[:,1])
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0
            tprs.append(interp_tpr)
            roc_auc = auc(fpr, tpr)
            aucs.append(roc_auc)
            
        test_statistic = lambda x: (np.mean(x), np.var(x)) 
        ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
                label='Chance', alpha=.8)

        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)

        # Using Binomial conf intervals, as laid out in Sourati 2015
        [tprs_upper, tprs_lower] = binom_conf_interval(mean_tpr*17, 17, confidence_level=0.95, interval='wilson')  

        upper_lim = auc(mean_fpr, tprs_upper)
        auc_est.append(upper_lim)
        lower_lim = auc(mean_fpr, tprs_lower)
        auc_est.append(lower_lim)
        std_auc = abs(upper_lim-lower_lim)/2

        ax.plot(mean_fpr, mean_tpr, color='b',
                label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
                lw=2, alpha=.8)

        ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                        label=r'95% level of confidence')

        ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
        ax.legend(loc="lower right")
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.grid('True')    
        plt.tight_layout()   
        plt.savefig('plot/ROC-AUC-SFS-CCA.png')    

        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

        nb_seq_cca_results = nb_seq_cca_results.append({'fMRI':key,'Feats.':i+1,'Direction':direction,'Mean f1':mean(f1_scores),'SEM f1':1.96*stats.sem(f1_scores,ddof=0)},ignore_index=True)

In [None]:
print(y_prob[:,1])

In [None]:
# Forward KNN imputer
print(nb_seq_cca_results)

In [None]:
# GFS with CCA for Clustering Viz

seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# fmri_feat={'Ov':X_over_aal,'Pos':X_pos_str_aal,'Neg':X_neg_str_aal}
fmri_feat={'Neg':X_neg_str_aal}
nb_seq_cca_results=pd.DataFrame()
j = 0

for key,val in fmri_feat.items():
    X_fmri=val
    j+=1

    # Using feature matrices with KNN imputed values 
    X_fmri_imp = knn_feats_fmri[:,(j-1)*166:j*166]
    X_dwi_imp = knn_feats_dwi
    X_eeg_imp = knn_feats_eeg

    X_imputed_combined=np.append(X_fmri_imp,X_dwi_imp,axis=1)
    X_imputed_combined=np.append(X_imputed_combined,X_eeg_imp,axis=1)

    # Varying no. of greedy features
    for i in [8]:
        f1_scores=[]
        direction='forward'

        # Sequential feature selector into i+1 features 
        clf=None
        clf = make_pipeline(StandardScaler(), SVC(C=10,gamma=0.01))
        sfs=None
        sfs = SequentialFeatureSelector(clf, direction=direction, scoring='f1', n_features_to_select=i+1)
        X_train_seq=sfs.fit_transform(X_imputed_combined, y)

        # CCA decomposition into 2 components 
        cca=None
        cca = CCA(n_components=2)
        X_fmri_cca,X_dwi_cca=cca.fit_transform(X_fmri_imp, X_dwi_imp)
        X_train_cca=np.concatenate((X_fmri_cca, X_dwi_cca), axis=1)

        X_concat_viz=np.concatenate((X_train_seq, X_train_cca), axis=1)

In [None]:
# Forward KNN imputer [selected features]
print(nb_seq_results)

In [None]:
# Forward KNN imputer
print(nb_seq_results)

In [None]:
# Forward KNN imputer (gamma=auto)
print(nb_seq_results)

In [None]:
# Forward KNN imputer [Sequential+CCA]
print(nb_seq_cca_results)

In [None]:
# Forward KNN imputer [Sequential+CCA(fixed)]
print(nb_seq_cca_results)

In [None]:
# Backward KNN imputer
print(nb_seq_results)

In [None]:
# Iterative imputer
print(nb_seq_results)

In [None]:
# t-SNE (2D)

seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# --- No. indicated #subs ---
k = 48
tsne = TSNE(n_components=2, verbose=0, perplexity=10, n_iter=300)
# X=X_imputed_combined
X=X_concat_viz

# tsne_results = tsne.fit_transform(X_seq_tsne[97:])
tsne_results = tsne.fit_transform(X)

fig, ax = plt.subplots()
# ax = plt.figure(figsize=(10,8))
cm = plt.cm.viridis
scat = ax.scatter(
    x=tsne_results[:,0], 
    y=tsne_results[:,1], 
    c=y,
    cmap=cm)
legend_elem = [Line2D([0], [0], marker='o', color=cm(0.),lw=0,label='No Seizure'),
                   Line2D([0], [0], marker='o', color=cm(1.),lw=0,label='Late Seizure')]

legend1 = ax.legend(handles=legend_elem,
                    loc="upper right")

# ax.add_artist(legend1)
# ax.set(title="t-SNE plot: [%2.0f features, %1.0f axes]" % (63,2))
ax.set_xlabel('Component-1')
ax.set_ylabel('Component-2')

# handles, labels = scatter.legend_elements(prop="sizes", alpha=0.6)
# legend2 = ax.legend(handles, labels, loc="upper right", title="Sizes")
plt.tight_layout()     
# plt.savefig('plot/X-imputed-t-SNE-2D.png')       
plt.savefig('plot/SFS-CCA-t-SNE-2D.png')     

In [None]:
# t-SNE (3D)

seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# --- No. indicated #subs ---
k = 48
X=X_imputed_combined
# X=X_concat_viz

tsne = TSNE(n_components=3, verbose=1, perplexity=10, n_iter=300)
tsne_results = tsne.fit_transform(X)

ax = plt.figure(figsize=(8,8)).gca(projection='3d')
cm = plt.cm.viridis
scat = ax.scatter(
    xs=tsne_results[:,0], 
    ys=tsne_results[:,1], 
    zs=tsne_results[:,2], 
    c=y,
    s=100,
    cmap=cm)
legend_elem = [Line2D([0], [0], marker='o', color=cm(0.),lw=0,label='No Seizure'),
                   Line2D([0], [0], marker='o', color=cm(1.),lw=0,label='Late Seizure')]

legend1 = ax.legend(handles=legend_elem,
                    loc="upper right")

ax.set_xlabel('t-SNE Component-1')
ax.set_ylabel('t-SNE Component-2')
ax.set_zlabel('t-SNE Component-3')

# handles, labels = scatter.legend_elements(prop="sizes", alpha=0.6)
# legend2 = ax.legend(handles, labels, loc="upper right", title="Sizes")
# plt.tight_layout()   
plt.savefig('plot/t-SNE-3D-X-imputed.png')   
# plt.savefig('plot/t-SNE-3D-X-SFS-CCA.png')   

In [None]:
# PCA (2D) & k-means
seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

from sklearn.decomposition import PCA
n_clusters=2
colors = ["#4EACC5","#FF9C34"]
cm = plt.cm.viridis

data=X_concat_viz
# data=X_imputed_combined

k_means = KMeans(init="k-means++",n_clusters=n_clusters,n_init=10,random_state=42)
X = PCA(n_components=3).fit_transform(data)
k_means.fit(X)
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)

fig = plt.figure()
# fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
ax = plt.figure(figsize=(8,8)).gca(projection='3d')
for k in range(n_clusters):
# for k, col in zip(range(n_clusters), colors):
    my_members = (k_means_labels==k)
    labels=(y==k)
    print(labels)
    # cluster_center = k_means_cluster_centers[k]
    # ax.plot(X[my_members, 0], X[my_members, 1], "w", markerfacecolor=col, marker="o",markersize=10)
    # ax.plot(X[labels, 0], X[labels, 1], "w", marker="o",markersize=10)
    # ax.plot(
    #     cluster_center[0],
    #     cluster_center[1],
    #     "o",
    #     markerfacecolor=col,
    #     markeredgecolor="k",
    #     markersize=10,
    # )
    ax.scatter(
    xs=X[labels, 0], 
    ys=X[labels, 1], 
    zs=X[labels, 2], 
    s=100,
    # markersize=10,
    # c=k,
    cmap=cm)

# ax.set_title("KMeans")
# ax.set_xticks(())
# ax.set_yticks(())
ax.set_xlabel('Principal component-1')
ax.set_ylabel('Principal component-2')
ax.set_ylabel('Principal component-3')
plt.tight_layout()  
 
# plt.savefig('plot/k-means-X-SFS-CCA.png')    
# plt.savefig('plot/k-means-X-imputed.png')   
# plt.savefig('plot/PCA-X-SFS-CCA.png')     
# plt.savefig('plot/PCA-3D-X-imputed.png')   
plt.savefig('plot/PCA-3D-X-SFS-CCA.png')   

In [None]:
# PCA (3D) & k-means
seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

from sklearn.decomposition import PCA
# %matplotlib notebook

# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import axes3d  

n_clusters=2
colors = ["#4EACC5","#FF9C34"]
cm = plt.cm.viridis

data=X_concat_viz
# data=X_imputed_combined

X = PCA(n_components=3).fit_transform(data)

ax = plt.figure(figsize=(8,8)).gca(projection='3d')
for k in range(n_clusters):
    my_members = (k_means_labels==k)
    labels=(y==k)
    ax.scatter(
    xs=X[labels, 0], 
    ys=X[labels, 1], 
    zs=X[labels, 2], 
    s=100,
    cmap=cm)

ax.set_xlabel('Principal component-1')
ax.set_ylabel('Principal component-2')
ax.set_ylabel('Principal component-3')
ax.view_init(60, 30)
plt.show()
 
# plt.savefig('plot/PCA-3D-X-SFS-CCA.png')   
# plt.savefig('plot/PCA-3D-X-imputed.png')   

In [None]:
# Spectral Clustering
seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

from sklearn.cluster import SpectralClustering
n_clusters=2
colors = ["#4EACC5","#FF9C34"]
X=X_imputed_combined
# X=X_concat_viz

clustering = SpectralClustering(n_clusters=n_clusters,
        assign_labels='kmeans',
        random_state=42).fit(X)
# k_means_cluster_centers = clustering.cluster_centers_
# k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
k_means_labels=clustering.labels_
print(k_means_labels)

fig = plt.figure()
# fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
ax = fig.add_subplot(1, 1, 1)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    # cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], "w", markerfacecolor=col, marker=".",markersize=10)
    # ax.plot(
    #     cluster_center[0],
    #     cluster_center[1],
    #     "o",
    #     markerfacecolor=col,
    #     markeredgecolor="k",
    #     markersize=10,
    # )
ax.set_title("KMeans")
ax.set_xticks(())
ax.set_yticks(())
# plt.text(-3.5, 1.8, "train time: %.2fs\ninertia: %f" % (t_batch, k_means.inertia_))
# plt.grid('True')    
plt.tight_layout()  
# plt.show()
 
plt.savefig('plot/SFS-CCA-spectral-k-means.png')    

In [None]:
from sklearn.datasets import make_blobs

np.random.seed(0)

batch_size = 5
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
Q, labels_true = make_blobs(n_samples=50, centers=centers, cluster_std=0.7)

# MMINet SMIG Feature Reduction

In [None]:
# MMINet SMIG with all

seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

fmri_feat={'Ov':X_over_aal,'Pos':X_pos_str_aal,'Neg':X_neg_str_aal}
nb_smig_results=pd.DataFrame()
j = 0

for key,val in fmri_feat.items():
    X_fmri=val
    j+=1

    # # Using feature matrices with KNN imputed values 
    X_fmri_imp = knn_feats_fmri[:,(j-1)*166:j*166]
    X_dwi_imp = knn_feats_dwi
    X_eeg_imp = knn_feats_eeg

    X_imputed_combined=np.append(X_fmri_imp,X_dwi_imp,axis=1)
    X_imputed_combined=np.append(X_imputed_combined,X_eeg_imp,axis=1)

    # Varying no. of projected dimensions
    # for i in [2]:
    for i in range(10):
        f1_scores=[]

        for train_idx, test_idx in cv_outer.split(X_over_aal,y):

            X_train = X_imputed_combined[train_idx,:]
            X_test = X_imputed_combined[test_idx,:]
            y_train = y[train_idx]
            y_test = y[test_idx]

            # Project n-dimensional input features into i+1 dimensions
            model=None
            model=MMINet(input_dim=232, output_dim=i+1, net='nonlinear')

            model.learn(X_train, y_train, num_epochs=30)
            z_train = model.reduce(X_train)
            z_test = model.reduce(X_test)

            # SVM Classifier
            # clf = make_pipeline(StandardScaler(), SVC(C=10,gamma=0.01))
            clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
            clf.fit(z_train, y_train)
            y_pred = clf.predict(z_test)

            f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

        nb_smig_results = nb_smig_results.append({'fMRI':key,'Feats.':i+1,'Mean f1':mean(f1_scores),'SEM f1':1.96*stats.sem(f1_scores,ddof=0)},ignore_index=True)

In [None]:
# MMINet SMIG with CCA

seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

fmri_feat={'Ov':X_over_aal,'Pos':X_pos_str_aal,'Neg':X_neg_str_aal}
nb_smig_cca_results=pd.DataFrame()
j = 0

for key,val in fmri_feat.items():
    X_fmri=val
    j+=1

    # # Using feature matrices with KNN imputed values 
    X_fmri_imp = knn_feats_fmri[:,(j-1)*166:j*166]
    X_dwi_imp = knn_feats_dwi
    X_eeg_imp = knn_feats_eeg

    X_imputed_combined=np.append(X_fmri_imp,X_dwi_imp,axis=1)
    X_imputed_combined=np.append(X_imputed_combined,X_eeg_imp,axis=1)

    # Varying no. of projected dimensions
    # for i in [2]:
    for i in range(10):
        f1_scores=[]

        for train_idx, test_idx in cv_outer.split(X_over_aal,y):

            X_train = X_imputed_combined[train_idx,:]
            X_test = X_imputed_combined[test_idx,:]
            y_train = y[train_idx]
            y_test = y[test_idx]

            X_train_f = X_fmri_imp[train_idx,:]
            X_train_d = X_dwi_imp[train_idx,:]
            X_test_f = X_fmri_imp[test_idx,:]
            X_test_d = X_dwi_imp[test_idx,:]

            # Project n-dimensional input features into i+1 dimensions
            model=None
            model=MMINet(input_dim=232, output_dim=6, net='linear')

            model.learn(X_train, y_train, num_epochs=10)
            z_train = model.reduce(X_train)
            z_test = model.reduce(X_test)

            # CCA decomposition into i+1 components 
            cca=None
            cca = CCA(n_components=i+1)

            cca.fit(X_train_f, X_train_d)
            X_fmri_cca, X_dwi_cca = cca.transform(X_train_f, X_train_d)
            X_train_cca = np.concatenate((X_fmri_cca, X_dwi_cca), axis=1)
            X_fmri_cca, X_dwi_cca = cca.transform(X_test_f, X_test_d)
            X_test_cca = np.concatenate((X_fmri_cca, X_dwi_cca), axis=1)

            X_train_concat=np.concatenate((z_train, X_train_cca), axis=1)
            X_test_concat=np.concatenate((z_test, X_test_cca), axis=1)

            # SVM Classifier
            clf=None
            clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
            clf.fit(X_train_concat, y_train)
            y_pred = clf.predict(X_test_concat)

            f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

        nb_smig_cca_results = nb_smig_cca_results.append({'fMRI':key,'Feats.':i+1,'Mean f1':mean(f1_scores),'SEM f1':1.96*stats.sem(f1_scores,ddof=0)},ignore_index=True)

In [None]:
print(nb_smig_results)

In [None]:
# KNN imputer (Gamma 'auto')
print(nb_smig_results)

In [None]:
# KNN imputer (Non-linear, Gamma 'auto')
print(nb_smig_results)

In [None]:
# KNN imputer (Gamma & C set)
print(nb_smig_results)

In [None]:
# Iterative imputer
print(nb_smig_results)

In [None]:
# KNN imputer
print(nb_smig_cca_results)

In [None]:
print(nb_results.to_latex(index=False))  

In [None]:
# Subspace Clustering

# model_sub = ElasticNetSubspaceClustering(n_clusters=2,algorithm='lasso_lars',gamma=50).fit(X_seq_tsne[97:])
model_sub = ElasticNetSubspaceClustering(n_clusters=2,algorithm='lasso_lars',gamma=50).fit(knn_features_df)

# model = SparseSubspaceClusteringOMP(n_clusters=3).fit(X_sub.T)
print(model_sub.labels_)

In [None]:
# Subspace Clustering

X=X_concat_viz
# model_sub = ElasticNetSubspaceClustering(n_clusters=2,algorithm='lasso_lars',gamma=50).fit(X_seq_tsne[97:])
model_sub = ElasticNetSubspaceClustering(n_clusters=2,algorithm='lasso_lars',gamma=50).fit(X)

# model = SparseSubspaceClusteringOMP(n_clusters=3).fit(X_sub.T)
print(model_sub.labels_)

In [None]:
# Imputed Features
from sklearn.metrics import accuracy_score
y_inv=1-y
# accuracy_score(y_inv, model_sub.labels_)
accuracy_score(y, model_sub.labels_)

In [None]:
# from metrics.cluster.accuracy import clustering_accuracy
from sklearn.metrics import accuracy_score
# acc = clustering_accuracy(y, model_sub.labels_)
accuracy_score(y, model_sub.labels_)

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
# enc.fit(y)
y_new=y.reshape(len(y), 1)
q=enc.fit_transform(y_new)

# Current Progress (6/10, SFR):

There are currently three multimodal classifiers up and running. One which uses the conditional independence assumption (or naive bayes approach) to deal with missing values and two which use imputers to deal with missing values. So far, none of the multimodal classifiers outperform the single modality classifiers and only the feature union (just lump them all together) and hard label voting classifier perform at the same level. 

There might be some things that are dependent on implementation or that require hyper parameter tuning as I've noted through the notebook. We may also see some improvements as we get more subjects. 