# Using an XGBoost machine-learning classifier to look for different patterns between the Ketamine and Midazolam groups

In [None]:
# import relevant packages
import glob
import numpy as np
import scipy
import nilearn
import nilearn.plotting
import nilearn.input_data
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## No apparent contribution to before/after treatment in general.
- Let's look at group differences in ROIs $\rightarrow$
    * vmPFC
    * Hippocampus
    
- We compare the pattern of ROI activation in the trauma > relax contrast on the 2nd day

In [None]:
# Split the subjects into the two medication groups using the condition table
import pandas as pd

medication_cond = pd.read_csv('kpe_sub_condition.csv')


def _subjects_for(med_code):
    """Return (screening ids, bare subject numbers) for one `med_cond` value.

    The screening ids are the `scr_id` entries of the rows whose `med_cond`
    equals `med_code`; the subject number is whatever follows 'KPE' in each id.
    """
    in_group = medication_cond['med_cond'] == med_code
    screening_ids = list(medication_cond.loc[in_group, 'scr_id'])
    numbers = [scr_id.split('KPE')[1] for scr_id in screening_ids]
    return screening_ids, numbers


# med_cond == 1 feeds the ketamine lists, med_cond == 0 the midazolam lists
ketamine_list, ket_list = _subjects_for(1)
midazolam_list, mid_list = _subjects_for(0)
#mid_list.remove('1480')

In [None]:
# Show both subject lists, ketamine first, one list per line
print(ket_list, mid_list, sep='\n')

In [None]:
# Bare name expression: it has no effect here because another statement follows
# (a notebook only echoes the final expression of a cell).
mid_list
## only for 3rd session
# Drop subject '1315' from the ketamine list; list.remove raises ValueError
# if the id is not in the list.
# NOTE(review): the path cell further down sets ses = '2' — confirm that this
# session-3-only exclusion is meant to run for that session.
ket_list.remove('1315')

In [None]:
# Subjects dropped from the midazolam list for session 3.
# Each id must be present: list.remove raises ValueError otherwise.
for excluded_id in ('1253', '1468', '1480'):
    mid_list.remove(excluded_id)

In [None]:
# only in 2nd session
# Drop subject '1578' from the midazolam list; list.remove raises ValueError
# if the id is not in the list.
mid_list.remove('1578')
# only for 3rd session
# Alternative exclusions, currently disabled:
#mid_list.remove('1253')
#mid_list.remove('1480')

In [None]:
# Session to analyse; it is substituted into the results directory name below
ses = '2'

# Location of each subject's cope7 image: first slot is the session,
# second slot is the subject number
cope_template = '/gpfs/gibbs/pi/levy_ifat/Or/kpe/results/ScriptPart_ses%s_Nosmooth/modelfit/_subject_id_%s/modelestimate/results/cope7.nii.gz'

# One path per subject, in the same order as the subject lists
ket_func = [cope_template % (ses, sub) for sub in ket_list]
mid_func = [cope_template % (ses, sub) for sub in mid_list]

In [None]:
# Number of cross-validation folds.
# Session 3 has many missing (NA) Midazolam subjects, so a fixed fold count is
# used there instead of leave-one-out (per group).
n_folds = 7  # value chosen for the 3rd session

In [None]:
mask_file = '/gpfs/gibbs/pi/levy_ifat/Or/ROI/hippocampus_association-test_z_FDR_0.01.nii.gz'
mask_file = nilearn.image.math_img("a>=13", a=mask_file)
%matplotlib inline
nilearn.plotting.plot_roi(mask_file)#, display_mode='x', cut_coords=[-26], annotate=False)
masker = nilearn.input_data.NiftiMasker(mask_img=mask_file, 
                               sessions=None, smoothing_fwhm=2,
                                        standardize=False, 
                                        detrend=False, verbose=5)

In [None]:
def _extract_betas(paths):
    """Run the current `masker` over every image path in `paths`, in order.

    The path is printed before it is processed. Returns the list of arrays
    produced by `masker.fit_transform`, one entry per path. Defined inside
    this cell so that the cell can be run on its own.
    """
    betas = []
    for path in paths:
        print(f'Running {path}')
        betas.append(masker.fit_transform(path))
    return betas


ketamine = _extract_betas(ket_func)
midazolam = _extract_betas(mid_func)

# One row per subject. reshape(n, -1) flattens whatever trailing shape the
# masker produced, so this works both when each subject yields a (1, voxels)
# array and when it yields a flat (voxels,) array; the previous shape[2]
# indexing only handled the first case.
ketArr = np.array(ketamine)
ketArr_reshape = ketArr.reshape(ketArr.shape[0], -1)

midArr = np.array(midazolam)
midArr_reshape = midArr.reshape(midArr.shape[0], -1)

## Create condition labels (1 = ketamine, 0 = midazolam)
label1 = [1] * ketArr.shape[0]
label2 = [0] * midArr.shape[0]
condition_label = np.concatenate([label1, label2])

# Feature matrix: ketamine rows first, then midazolam rows, matching the
# order of condition_label
X = np.concatenate([ketArr_reshape, midArr_reshape])
X.shape

In [None]:
# Echo the currently configured fold count
n_folds

In [None]:
#from sklearn.model_selection import StratifiedKFold
# NOTE(review): LinearSVC and svm are imported here but not used in the
# visible part of this notebook.
from sklearn.svm import LinearSVC

from sklearn import svm
# XGBoost classifier with library defaults apart from n_jobs=7
model = XGBClassifier(n_jobs=7)

## Here we use stratified K-fold with shuffling to generate different shuffling of leave one subject out
# shuffle=True with no random_state: the fold assignment is not fixed by a seed.
# NOTE(review): n_splits is hard-coded to 11 and does not use the n_folds
# value set in an earlier cell — confirm which one is intended.
cv = StratifiedKFold(n_splits=11, shuffle=True) # running for each subject


In [None]:
# Single cross-validated run: one ROC-AUC value per fold.
# The former `groups=condition_label` argument was removed: StratifiedKFold
# ignores `groups`, and passing the class labels as groups was misleading.
scores = cross_val_score(
    model,
    X,
    y=condition_label,
    cv=cv,
    scoring="roc_auc",
    n_jobs=5,  # set number of CPUs
    # verbose=5,  # set some details of the activity
)

## Use shuffled splits to randomize the folds and run XGBoost N times
- This will create a distribution of the estimated performance level
- We can then better estimate whether it is really more accurate than chance


In [None]:
# Repeat the cross-validation n_iter times and keep the mean ROC-AUC of each
# repetition. The cv object is created with shuffle=True and no fixed seed in
# an earlier cell, so repetitions can use different fold assignments.
n_iter = 1000
rand_score = []
for i in range(n_iter):
    print(f' Running {i+1} iteration')
    # `groups` is not passed: StratifiedKFold ignores it.
    scores = cross_val_score(
        model,
        X,
        y=condition_label,
        cv=cv,
        scoring="roc_auc",  # alternatives tried before: "f1", "accuracy"
        n_jobs=5,  # set number of CPUs
        # verbose=5,  # set some details of the activity
    )
    # Store the scalar mean directly instead of wrapping it in a throwaway
    # one-element list, so rand_score is a flat list of floats.
    rand_score.append(scores.mean())

### Plotting the distribution of the area under the ROC curve and printing the average and standard deviation of the distribution

In [None]:
# Summarise the mean AUCs collected over the repeated cross-validation runs
rand_score = np.array(rand_score)

auc_mean = np.mean(rand_score)
auc_spread = np.std(rand_score) * 2  # reported as +/- two standard deviations
print("Area under curve: %0.2f (+/- %0.2f)" % (auc_mean, auc_spread))

# Empirical 2.5th and 97.5th percentiles of the collected means
print(f'95% CI is {np.quantile(rand_score, [0.025, 0.975])}')

# Histogram / density of the collected means
sns.distplot(rand_score)

## Now we do the same thing with the vmPFC


In [None]:
mask_file = '/gpfs/gibbs/pi/levy_ifat/Or/ROI/vmpfc_association-test_z_FDR_0.01.nii.gz'
mask_file = nilearn.image.math_img("a>=2", a=mask_file)
%matplotlib inline
nilearn.plotting.plot_roi(mask_file)
masker = nilearn.input_data.NiftiMasker(mask_img=mask_file, 
                               smoothing_fwhm=1, standardize=True, 
                                        detrend=False, verbose=5)

In [None]:
def _extract_betas(paths):
    """Run the current `masker` over every image path in `paths`, in order.

    The path is printed before it is processed. Returns the list of arrays
    produced by `masker.fit_transform`, one entry per path. Defined inside
    this cell so that the cell can be run on its own.
    """
    betas = []
    for path in paths:
        print(f'Running {path}')
        betas.append(masker.fit_transform(path))
    return betas


ketamine = _extract_betas(ket_func)
midazolam = _extract_betas(mid_func)

# One row per subject. reshape(n, -1) flattens whatever trailing shape the
# masker produced, so this works both when each subject yields a (1, voxels)
# array and when it yields a flat (voxels,) array; the previous shape[2]
# indexing only handled the first case.
ketArr = np.array(ketamine)
ketArr_reshape = ketArr.reshape(ketArr.shape[0], -1)

midArr = np.array(midazolam)
midArr_reshape = midArr.reshape(midArr.shape[0], -1)

## Create condition labels (1 = ketamine, 0 = midazolam)
label1 = [1] * ketArr.shape[0]
label2 = [0] * midArr.shape[0]
condition_label = np.concatenate([label1, label2])

# Feature matrix: ketamine rows first, then midazolam rows, matching the
# order of condition_label
X = np.concatenate([ketArr_reshape, midArr_reshape])
X.shape

In [None]:
# XGBoost classifier with library defaults apart from n_jobs=7;
# random_state=None leaves the seed unspecified.
model = XGBClassifier(n_jobs=7, 
                      random_state=None)

## Here we use stratified K-fold with shuffling to generate different shuffling of leave one subject out
# shuffle=True with no random_state: the fold assignment is not fixed by a seed.
# NOTE(review): n_splits is hard-coded to 11 and does not use the n_folds
# value set in an earlier cell — confirm which one is intended.
cv = StratifiedKFold(n_splits=11, shuffle=True) # running for each subject


In [None]:
# Single cross-validated run on the vmPFC features: one ROC-AUC value per fold.
# The former `groups=condition_label` argument was removed: StratifiedKFold
# ignores `groups`, and passing the class labels as groups was misleading.
scores = cross_val_score(
    model,
    X,
    y=condition_label,
    cv=cv,
    scoring="roc_auc",
    n_jobs=5,  # set number of CPUs
    # verbose=5,  # set some details of the activity
)

In [None]:
# Echo the per-fold scores from the most recent cross_val_score call
scores

In [None]:
# Repeat the cross-validation n_iter times and keep the mean ROC-AUC of each
# repetition. The cv object is created with shuffle=True and no fixed seed in
# an earlier cell, so repetitions can use different fold assignments.
n_iter = 100
rand_score = []
for i in range(n_iter):
    print(f' Running {i+1} iteration')
    # `groups` is not passed: StratifiedKFold ignores it.
    scores = cross_val_score(
        model,
        X,
        y=condition_label,
        cv=cv,
        scoring="roc_auc",
        n_jobs=8,  # set number of CPUs
    )
    # Store the scalar mean directly instead of wrapping it in a throwaway
    # one-element list, so rand_score is a flat list of floats.
    rand_score.append(scores.mean())

In [None]:
# Summarise the mean AUCs collected over the repeated vmPFC runs
rand_score = np.array(rand_score)

summary_values = (np.mean(rand_score), np.std(rand_score) * 2)  # mean, 2 * SD
print("Area under curve: %0.2f (+/- %0.2f)" % summary_values)

# Empirical 2.5th and 97.5th percentiles of the collected means
print(f'95% CI is {np.quantile(rand_score, [0.025, 0.975])}')

# Histogram / density of the collected means
sns.distplot(rand_score)