# fMRI analysis and visualization - subject averaged data 

#### In this notebook we will do some visualization and analysis of fMRI data which has already been averaged across participants.



In [None]:
import numpy as np 
from matplotlib import pyplot as plt 
from hdf5storage import loadmat, savemat 
from nilearn import plotting,datasets

In [None]:
# Directory holding the data files. File names are appended to it by plain
# string concatenation in the cells below, so keep the trailing slash.
# Option 1: keep the code in the github repository and point at the data archive.
datapath = '/home/ramesh/Teaching/data_archive/hcp_task/'
# Option 2: use this instead if you placed this notebook in the same location as the data.
#datapath = './'


In [None]:
# Parcellation table: column 0 holds the ROI names, column 1 the network of each ROI.
regions = np.load(datapath + 'regions.npy')
roi_names, network_names = regions[:, 0], regions[:, 1]  # one entry per ROI (360 in this parcellation)
networks = np.unique(network_names)  # the distinct network names

#### Load the hcp atlas file.  This file provides a mapping between the 360 roi and the fsaverage (FreeSurfer Average) brain.  fsaverage is the average of 40 brains of healthy individuals.  

In [None]:
# Read every array of the atlas archive into a plain dict. The context manager
# closes the .npz file handle once the arrays are in memory; a bare
# dict(np.load(...)) would leave the archive open.
with np.load(datapath + 'hcp_atlas.npz') as atlas_npz:
    atlas = {key: atlas_npz[key] for key in atlas_npz.files}
# fsaverage surface meshes provided through nilearn.
fsaverage = datasets.fetch_surf_fsaverage()

In [None]:
# Load the subject-averaged SOCIAL-task .mat file; the result is indexed by variable name below.
data = loadmat(datapath+'SOCIAL_fmri_subjectaverage.mat')

In [None]:
# List the variable names stored in the loaded file.
data.keys()

In [None]:
# Pull each stored variable out of the loaded file into a notebook-level name
# of the same spelling (see the README cell below for what each one holds).
(condition_index, conditions, fmri, nconditions,
 nregions, nsubjects, subject, task) = (
    data[field]
    for field in (
        'condition_index', 'conditions', 'fmri', 'nconditions',
        'nregions', 'nsubjects', 'subject', 'task',
    )
)

## README 

#### condition_index - index for each data sample, indicating the experimental condition 
#### conditions - conditions in the experiment 
#### fmri - fmri data averaged over participants, nregions x (nsubjects x nconditions)
#### nconditions - number of conditions
#### nregions - number of regions (always 360)
#### nsubjects - number of subjects (always 100)
#### subject - indexes which subject each average comes from. 
#### task - which task the data comes from.  

In [None]:
# Show the condition names of this task.
print(conditions)

#### For the SOCIAL task 
* #### 'mental' - appearance of social interaction 
* #### 'rnd' - appears to be random 



#### 2.  Standardize the data using z-score, and compute the difference between conditions. Make a bar graph of the difference versus roi. Print out the roi_names and network_names of the 5 regions where activity in `mental` is highest relative to `rnd`.  Print out the roi_names and network_names of the 5 regions where activity in `rnd` is highest relative to `mental`.

In [None]:
from scipy.stats import zscore

# axis=0 is scipy's default, written out here: each column is standardized
# over the rows. Per the README cell the rows are regions and the columns are
# samples -- NOTE(review): confirm this is the intended standardization axis.
z = zscore(fmri, axis=0)

# Column masks for the two conditions. The subtraction pairs the k-th column of
# condition 0 with the k-th column of condition 1, so both conditions must
# contribute the same number of columns.
is_condition0 = condition_index == 0
is_condition1 = condition_index == 1
paired_difference = z[:, is_condition0] - z[:, is_condition1]

# Average the paired differences across samples: one value per region.
diff = np.mean(paired_difference, axis=1)

#### Bar plot of differences to orient to effect size.  

In [None]:
# One bar per ROI showing the mean standardized difference between conditions.
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(np.arange(nregions), diff)
ax.set_xlabel('ROI')
ax.set_ylabel('Mental-Random (sd units)')
plt.show()

#### I am going to sort the effect size (absolute value of differences)

In [None]:
# argsort is ascending, so on |diff| the smallest-magnitude effects come first;
# reversing the index array puts the largest-magnitude differences at the front.
difforder = np.argsort(np.abs(diff))[::-1]

#### Now print out the 10 regions with the highest magnitude differences.  

In [None]:
# Report name and network of the ROIs at the head of the magnitude ordering.
n_report = 10
print('10 regions with strongest differences')
for rank in range(n_report):
    roi = difforder[rank]
    print('ROI: ', roi_names[roi], 'Network: ', network_names[roi])



### Perform a Linear Discriminant Analysis to find the discriminant model that maximizes differences between conditions.  

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# scikit-learn expects one sample per row and one variable per column.
z_T = z.T

# With two classes there is a single discriminant axis (at most classes - 1 components).
fmri_lda = LDA(n_components=1)

# fit() returns the fitted estimator, so the projection into component space is
# chained onto it; squeeze drops the length-1 component axis.
data_lda = np.squeeze(fmri_lda.fit(z_T, condition_index).transform(z_T))

# Weights of the discriminant function, one per column of z_T, flattened to 1-D.
coef_lda = np.squeeze(fmri_lda.coef_)

#### Visualize transformed data. 

In [None]:
# Histogram of the LDA projection, one distribution per condition.
# Both conditions share the same bin edges so the bars line up.
lda_bins = np.arange(-10, 10, 0.5)
# Partial transparency keeps the first histogram visible where the two overlap
# (opaque bars from the second call would cover it); the labels and legend say
# which color belongs to which value of condition_index.
plt.hist(data_lda[condition_index == 0], bins=lda_bins, color='r', alpha=0.6, label='condition 0')
plt.hist(data_lda[condition_index == 1], bins=lda_bins, color='b', alpha=0.6, label='condition 1')
plt.xlabel('LDA 1')
plt.ylabel('Number of Samples')
plt.legend()
plt.show()

#### What is the transformation? 
#### The coef vector contains the **weight** on each ROI to maximally discriminate the 2 classes.  

In [None]:
# Look up the LDA weight for every entry of the right-hemisphere label array,
# spreading the per-ROI weights over the surface.
# (presumably one ROI index per fsaverage surface location -- verify against the atlas file)
surf_label = coef_lda[atlas['labels_R']]
# NOTE(review): the title 'cmax_mental' names neither LDA nor the hemisphere -- confirm it is intended.
plotting.view_surf(
    fsaverage['infl_right'],
    surf_label,
    title='cmax_mental',
    vmax=7,
    symmetric_cmap=True,
    black_bg=True,
)

In [None]:
# Look up the LDA weight for every entry of the left-hemisphere label array,
# spreading the per-ROI weights over the surface.
# (presumably one ROI index per fsaverage surface location -- verify against the atlas file)
surf_label = coef_lda[atlas['labels_L']]
# NOTE(review): the title 'cmax_mental' names neither LDA nor the hemisphere -- confirm it is intended.
plotting.view_surf(
    fsaverage['infl_left'],
    surf_label,
    title='cmax_mental',
    vmax=7,
    symmetric_cmap=True,
    black_bg=True,
)

#### The weights can be used to identify which features of the original data contributed to classification. 

In [None]:
# Order the ROIs by the magnitude of their LDA weight, largest first
# (ascending argsort of |weight|, then reversed).
coef_r = np.argsort(np.abs(coef_lda))[::-1]
print('Regions contributing to classifier')
for rank in range(10):
    roi = coef_r[rank]
    print(roi_names[roi],' Weight: ', coef_lda[roi])



### We learnt last time that the correct way to do a classification analysis is to make use of cross-validation to confirm the predictive value of the model.   

In [None]:

from sklearn.model_selection import KFold

# 5-fold cross-validation of the LDA classifier: every sample is predicted
# exactly once, by a model that was fit without it.
# Shuffle the samples into 5 folds. random_state pins the shuffle so the error
# rate and the stored predictions are identical on every run of the notebook.
# NOTE(review): folds are drawn per sample; if the same subject contributes
# samples to several conditions, that subject can appear in both the train and
# test folds. Consider GroupKFold with `subject` as groups -- confirm the
# layout of `subject` first.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
errors = 0  # running count of misclassified test samples
predictions = np.zeros(np.shape(z_T)[0])  # predicted label of each sample, filled in when it is tested
probability = np.zeros((len(predictions), nconditions))  # class probabilities of each sample when tested
for train_index, test_index in kf.split(z_T):
    data_train = z_T[train_index]
    data_test = z_T[test_index]
    Label_train = condition_index[train_index]
    Label_test = condition_index[test_index]
    # Fit on the training folds only, then score the held-out fold.
    lda = LDA(n_components=1)
    lda.fit(data_train, Label_train)
    predict = lda.predict(data_test)
    test = Label_test == predict  # True where the held-out prediction is correct
    errors = errors + np.count_nonzero(~test)
    predictions[test_index] = predict
    probability[test_index] = lda.predict_proba(data_test)
# Fraction of all samples that were misclassified when held out.
errorrate = errors/len(condition_index)
print(errorrate)

#### We can compute a confusion matrix to look at the pattern of errors.  

In [None]:
# Rows are the true condition, columns the predicted condition; each cell
# counts the samples with that (true, predicted) pair.
confusion_matrix = np.zeros((nconditions, nconditions))
for true_label in range(nconditions):
    predicted_for_label = predictions[condition_index == true_label]
    predicted_labels, n_predicted = np.unique(predicted_for_label, return_counts=True)
    # predictions is a float array, so cast the labels before using them as column indices.
    confusion_matrix[true_label, predicted_labels.astype(int)] = n_predicted
print(confusion_matrix)