# BCCWJ-fMRI regressors

## Import

In [None]:
import pandas as pd
from pandas import Series

import numpy as np
import numpy.linalg as npl

from nilearn.glm.first_level import FirstLevelModel, compute_regressor

import seaborn as sns

In [None]:
# Set the seaborn defaults and override the 'figure.figsize' rc parameter
# to (17, 8) so the plots produced later in this notebook are wide enough
# to read.
sns.set(rc={'figure.figsize':(17,8)})

## Orthogonalization function

In [None]:
'''
This code for orthogonalization is from Christophe Pallier:
https://github.com/chrplr/lpp-scripts3/blob/master/models/en/bottomup-topdown-ortho/orthonormalize.py
'''
def ortho_proj(Y, M):
    """Return the part of Y that is orthogonal to M and to the constant vector.

    Y is regressed (ordinary least squares) on a design made of an intercept
    column of ones followed by the column(s) of M; the residual of that fit
    is returned, so the result has the same shape as Y.

    Y : array with one row per observation (1-D or 2-D).
    M : 1-D vector or 2-D matrix with the same number of rows as Y.
    """
    # A bare vector is promoted to a single-column matrix so it can be stacked.
    regressors = M[:, np.newaxis] if M.ndim == 1 else M

    # Design matrix: intercept first, then the regressors to project out.
    intercept = np.ones((len(regressors), 1))
    design = np.hstack((intercept, regressors))

    # Only the least-squares coefficients are needed from lstsq's 4-tuple.
    coefs = npl.lstsq(design, Y, rcond=None)[0]

    # Fitted values are the component of Y explained by the design;
    # subtracting them leaves the orthogonal remainder.
    fitted = np.dot(design, coefs)
    return Y - fitted

## fMRI predictors

In [None]:
# Load the tab-separated predictor table from the working directory.
# Columns read later in this notebook: section_number, offset, word_length,
# count_ave_log, sent_id, bunsetsu_pos, surp_ngram_five, surp_LSTM_1,
# surp_RNNG_LC_1_4, surp_RNNG_TD_2_10, dis_RNNG_LC_1_4, dis_RNNG_TD_2_10.
predictors = pd.read_csv('./TS_ALL.tsv', sep='\t')
# Bare expression: shows the table as the notebook cell output.
predictors

In [None]:
# Number of volumes acquired in each section. Sections are numbered from 1,
# so section k is looked up as n_scans[k - 1] (see convolve_regressors).
n_scans = [317,311,262,266]

In [None]:
# Total number of volumes over all sections (displayed as the cell output).
sum(n_scans)

In [None]:
# Number of volumes in the first section (displayed as the cell output).
n_scans[0]

## Convolving Regressors

In [None]:
def convolve_regressors(section_num, tr=2.0):
    """Build the HRF-convolved regressors for one section.

    Each regressor is obtained with nilearn's ``compute_regressor`` from
    events of zero duration located at the ``offset`` column of the global
    ``predictors`` table, using the "spm" HRF model and one sample per volume.
    The event amplitude is 1 for ``word_rate`` and the value of a predictor
    column for every other regressor.

    Regressors of non-interest (``word_rate``, ``word_length``, ``word_freq``,
    ``sentid``, ``sentpos``) are stored as convolved. The surprisal and
    distance regressors (5-gram, LSTM, RNNG) are additionally orthogonalized
    against ``word_rate`` (and the constant) with ``ortho_proj``.

    Parameters
    ----------
    section_num : int
        1-based section number. Selects the rows of ``predictors`` whose
        ``section_number`` equals it, and the volume count
        ``n_scans[section_num - 1]``.
    tr : float, optional
        Spacing between consecutive sampling times. The default, 2.0, is the
        value that used to be hard-coded. It must be expressed in the same
        time unit as the ``offset`` column (presumably seconds -- confirm
        against the predictor table).

    Returns
    -------
    pandas.DataFrame
        One row per volume with the columns ``word_rate``, ``word_length``,
        ``word_freq``, ``sentid``, ``sentpos``, ``surp.ngram_five``,
        ``surp.LSTM``, ``surp.RNNG_TD``, ``surp.RNNG_LC``, ``dis_RNNG_TD``,
        ``dis_RNNG_LC`` and ``section_number``, in that order.

    Raises
    ------
    IndexError
        If ``section_num`` is not between 1 and ``len(n_scans)``. Without this
        check, 0 or a negative number would silently wrap around to another
        section's volume count.
    """
    if not 1 <= section_num <= len(n_scans):
        raise IndexError(
            "section_num must be between 1 and %d, got %r"
            % (len(n_scans), section_num)
        )

    # (output column, predictors column) -- stored exactly as convolved.
    plain_columns = (
        ('word_length', 'word_length'),
        ('word_freq', 'count_ave_log'),
        ('sentid', 'sent_id'),
        ('sentpos', 'bunsetsu_pos'),
    )
    # (output column, predictors column) -- orthogonalized against word_rate.
    # The RNNG source columns are the ones the original comments labelled
    # "RNNG_TD beam size 1000" (…_TD_2_10) and "RNNG_LC beam size 400"
    # (…_LC_1_4).
    orthogonalized_columns = (
        ('surp.ngram_five', 'surp_ngram_five'),
        ('surp.LSTM', 'surp_LSTM_1'),
        ('surp.RNNG_TD', 'surp_RNNG_TD_2_10'),
        ('surp.RNNG_LC', 'surp_RNNG_LC_1_4'),
        ('dis_RNNG_TD', 'dis_RNNG_TD_2_10'),
        ('dis_RNNG_LC', 'dis_RNNG_LC_1_4'),
    )

    # Everything below is shared by all regressors, so compute it once
    # instead of re-filtering the table for every argument of every call.
    section = predictors[predictors['section_number'] == section_num]
    n_vols = n_scans[section_num - 1]
    frame_times = np.arange(n_vols) * tr  # exactly n_vols sampling times
    onsets = section['offset']
    durations = np.zeros(len(section))

    def convolve(amplitudes):
        """Convolve events (onsets, zero durations, amplitudes) with the HRF."""
        # exp_condition is a 3 x n_events matrix: onsets, durations, amplitudes.
        exp_condition = np.vstack((onsets, durations, amplitudes))
        # compute_regressor returns a tuple; element 0 is the regressor array.
        return compute_regressor(exp_condition=exp_condition,
                                 hrf_model="spm",
                                 frame_times=frame_times)[0]

    # Unit amplitude for every event.
    word_rate_regressor = convolve(np.ones(len(section)))

    # Insertion order of this dict defines the column order of the result.
    columns = {'word_rate': word_rate_regressor.flatten()}
    for out_name, source in plain_columns:
        columns[out_name] = convolve(section[source]).flatten()
    for out_name, source in orthogonalized_columns:
        regressor = ortho_proj(convolve(section[source]), word_rate_regressor)
        columns[out_name] = regressor.flatten()
    columns['section_number'] = [section_num] * n_vols

    return pd.DataFrame(columns)

In [None]:
# Compute the regressors of every section and stack them into one table.
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# each call), so the per-section frames are collected first and concatenated
# once. As before, the index is not reset: it restarts at 0 for each section.
sections = [1,2,3,4]
section_frames = []
for i in sections:
    data = convolve_regressors(i)
    section_frames.append(data)
big_data = pd.concat(section_frames)

In [None]:
# Bare expression: shows the stacked regressor table as the cell output.
big_data

In [None]:
# Write the regressors of all sections to a tab-separated file in the
# working directory. The row index is written as the first (unnamed) column;
# NOTE(review): since the index is never reset when the sections are stacked,
# it presumably restarts at 0 for every section -- confirm that is intended.
big_data.to_csv('BCCWJ_regressors.tsv',sep='\t')

## Correlations

In [None]:
# NOTE(review): the returned colormap is neither stored nor passed to the
# heatmap below -- confirm whether it was meant to be used as `cmap`.
sns.color_palette("crest", as_cmap=True)

# Pairwise correlations between the regressors; the section label is not a
# regressor, so it is left out.
corr = big_data.drop(columns=['section_number']).corr()

# Non-zero mask cells are hidden: blank the upper triangle and the diagonal
# so that each pair of regressors appears only once.
mask = np.triu(np.ones(corr.shape))

# NOTE(review): center=1.5 lies outside the [-1, 1] range of a correlation.
picture = sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    center=1.5,
    linewidth=.01,
)
figure = picture.get_figure()