HP event segmentation project: 
re-preprocessing fMRI data and mapping into shared response space

1) The preprocessed Harry Potter fMRI dataset can be downloaded from: https://drive.google.com/file/d/1aYEZZSyrlo0UqswDBUiGzE3kGl3RcCn8/view?usp=sharing
2) The fMRI mask is a processed mask from https://github.com/anikethjr/brain_syntactic_representations/blob/main/mask_sub_fedorenko_HP_narrow_mirrored.npy
3) Other information about the data can be found at https://github.com/anikethjr/brain_syntactic_representations

25 Sep 2023 by Huidong Xue

In [1]:
import glob
import os
import sys
import numpy as np
from scipy import stats
from scipy import io as sio
import scipy.spatial.distance as sp_distance
import matplotlib.pyplot as plt
import pandas as pd
import pickle


from brainiak.isc import isc
from brainiak.fcma.util import compute_correlation
import brainiak.funcalign.srm
from brainiak import image, io
from brainiak.isc import isc
import brainiak.funcalign.srm
from brainiak import image, io



In [2]:
import requests

# Fetch the per-subject mask files (<subject>.npy) from the masks/ folder of the
# brain_syntactic_representations repository into the local masks directory.
base_url = "https://github.com/anikethjr/brain_syntactic_representations/raw/main/masks/"
subjects = ["F.npy", "H.npy", "I.npy", "J.npy", "K.npy", "L.npy", "M.npy", "N.npy"]
dest_dir = "/Volumes/my drive/thesis_pipeline/data/masks"

# exist_ok avoids the check-then-create race and keeps the cell safe to re-run.
os.makedirs(dest_dir, exist_ok=True)

for subject in subjects:
    url = base_url + subject
    try:
        # Without a timeout a stalled connection would block the notebook indefinitely.
        response = requests.get(url, timeout=60)
    except requests.RequestException as err:
        # Report network errors the same way as HTTP errors and move on to the next file.
        print(f"Failed to download {subject}: {err}")
        continue
    if response.status_code == 200:
        with open(os.path.join(dest_dir, subject), 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {subject}")
    else:
        print(f"Failed to download {subject}")


Downloaded F.npy
Downloaded H.npy
Downloaded I.npy
Downloaded J.npy
Downloaded K.npy
Downloaded L.npy
Downloaded M.npy
Downloaded N.npy


In [3]:
# Root folder of the pipeline; every other location below is derived from it.
base_path = "/Volumes/my drive/thesis_pipeline"

# Inputs live under <base_path>/data; the ROI-restricted output gets its own folder.
masks_path, sub_space_data_path = (
    os.path.join(base_path, "data", leaf) for leaf in ("masks", "sub_space_data")
)
output_path = os.path.join(base_path, "sub_space_ROIdata")

# Create the output folder only when it is not there yet.
if not os.path.exists(output_path):
    os.makedirs(output_path)


In [4]:
# Subject identifiers; they double as the stem of each subject's .npy files.
all_subjects = list("FHIJKLMN")

# ROI names. Order matters: the ROI extraction loop matches the ROI at
# position i against mask label i + 1.
roi_names = ['PTL', 'ATL', 'AG', 'IFG', 'MFG', 'IFGorb']

# Convenience counts.
NS = len(all_subjects)
n_rois = len(roi_names)

In [6]:
# The combined mask file is read from masks_path; it is not among the files
# fetched by the download cell, so it has to be present there already.
mask_sub_path = os.path.join(masks_path, "mask_sub_fedorenko_HP_narrow_mirrored.npy")

# Indexing with () unwraps the stored object (presumably a dict keyed by
# subject wrapped in a 0-d array -- TODO confirm against the file).
mask_sub = np.load(mask_sub_path, allow_pickle=True, fix_imports=True)[()]

# For every subject, index that subject's entry of mask_sub with the subject's
# own mask array, giving one label vector per subject.
mask_num = {
    s: mask_sub[s][np.load(os.path.join(masks_path, f"{s}.npy"))]
    for s in all_subjects
}

In [7]:
# Build one record per (subject, ROI) containing the columns of the subject's
# data that fall inside that ROI, then persist the table as .npy and .mat.
roi_records = []

for sub in all_subjects:
    # The subject's mask file is not re-read here: the labels needed for ROI
    # selection are already available in mask_num[sub].
    data = np.load(os.path.join(sub_space_data_path, f"{sub}.npy"))
    print(sub)
    print(data.shape)
    for i, roi in enumerate(roi_names):
        # Assumes mask label i + 1 corresponds to roi_names[i] -- TODO confirm
        # against the mask's documentation.
        roi_mask = np.where(mask_num[sub] == (i + 1))[0]
        roi_data = data[:, roi_mask]
        roi_records.append({"Subject": sub, "ROI": roi, "data": roi_data})

# Columns: "Subject", "ROI", "data" (in that order; the SRM preparation cell
# relies on the positions when it reads the saved array back).
ROI_sub_data = pd.DataFrame(roi_records)

# np.save converts the DataFrame to an object array, so it must be loaded
# with allow_pickle=True.
np.save(os.path.join(output_path, "HP_ROI_data.npy"), ROI_sub_data)
# One MATLAB variable per DataFrame column.
sio.savemat(
    os.path.join(output_path, "HP_ROI_data.mat"),
    {column: values.values for column, values in ROI_sub_data.items()},
)

F
(1291, 27905)
H
(1291, 24983)
I
(1291, 25263)
J
(1291, 29650)
K
(1291, 25003)
L
(1291, 24678)
M
(1291, 28752)
N
(1291, 24397)


In [9]:
srm_data_path = os.path.join(base_path, "SRM_data")
if not os.path.exists(srm_data_path):
    os.mkdir(srm_data_path)

# Read back the saved ROI table: an object array whose columns are
# (Subject, ROI, data) as written by the ROI extraction cell.
data = np.load(os.path.join(output_path, "HP_ROI_data.npy"), allow_pickle=True)

# Row boundaries of the four runs: [0, 325), [325, 662), [662, 926), [926, 1291).
run_edges = [0, 325, 662, 926, 1291]

for ROIi in roi_names:
    # Column 1 holds the ROI name, column 2 the data array of each matching row.
    roidata = data[np.flatnonzero(data[:, 1] == ROIi)][:, 2]

    # Z-score every column within each run separately, then stitch the runs
    # back together in their original order for each subject.
    roidata_Z = [
        np.concatenate(
            [
                stats.zscore(d[start:stop, :], axis=0)
                for start, stop in zip(run_edges[:-1], run_edges[1:])
            ],
            axis=0,
        )
        for d in roidata
    ]

    # One pickle per ROI holding the list of per-subject arrays.
    with open(os.path.join(srm_data_path, f"{ROIi}_dataZ.pkl"), 'wb') as file:
        pickle.dump(roidata_Z, file)

In [11]:
srm_data_norTS_path = os.path.join(base_path, "SRM_data_norTS")
num_subs = len(all_subjects)

# Create the SRM normalized data directory if it doesn't exist
os.makedirs(srm_data_norTS_path, exist_ok=True)

# For every ROI: load the per-run z-scored data, drop voxels that contain NaN,
# re-normalize each remaining voxel across the whole (all-runs) time series,
# and record how many voxels survived for each subject.
nVox_ROI = []

for ROIi in roi_names:
    with open(os.path.join(srm_data_path, f"{ROIi}_dataZ.pkl"), 'rb') as file:
        data = pickle.load(file)

    for sub in range(num_subs):
        # Arrays are (time, voxels): keep only the columns (voxels) that have
        # no NaN at any time point.
        good_voxels = ~np.any(np.isnan(data[sub]), axis=0)
        data[sub] = data[sub][:, good_voxels]

        # Normalize across runs: time is axis 0 here (the arrays are only
        # transposed to voxels x samples later, right before SRM fitting), so
        # each voxel's time series is z-scored along axis=0. Using axis=1 on
        # this layout would instead normalize across voxels within each TR.
        data[sub] = stats.zscore(data[sub], axis=0, ddof=1)

    with open(os.path.join(srm_data_norTS_path, f"{ROIi}_dataTVZ_NANnorTS.pkl"), 'wb') as datafile:
        pickle.dump(data, datafile)

    # Number of surviving voxels per subject for this ROI.
    nVox = [data[i].shape[1] for i in range(num_subs)]
    nVox_ROI.append(nVox)

# The counts for ALL ROIs go into a single file. Its name carries the last ROI
# in roi_names, which keeps the path this cell has always written; it is now
# spelled out instead of relying on the loop variable leaking out of the loop.
with open(os.path.join(srm_data_path, f"{roi_names[-1]}_nGoodVox.pkl"), 'wb') as nVoxfile:
    pickle.dump(nVox_ROI, nVoxfile)

In [16]:
srm_data_norTS_path = os.path.join(base_path, "SRM_data_norTS")

# SRM settings, identical for every ROI.
features = 90  # Number of features
n_iter = 10  # Number of iterations of fitting

# Fit one SRM per ROI, then store the model, the shared-space projection and
# the exact input that was used for fitting.
for ROIj in roi_names:
    normalized_pkl = os.path.join(srm_data_norTS_path, f"{ROIj}_dataTVZ_NANnorTS.pkl")
    with open(normalized_pkl, 'rb') as datafSRMfile:
        datafSRMraw = pickle.load(datafSRMfile)

    # The model takes a list with one 2D array per subject, shaped
    # [voxels_i, samples], so every stored array is transposed first.
    datafSRM = [np.transpose(d) for d in datafSRMraw]

    # Create the SRM object
    srm = brainiak.funcalign.srm.SRM(n_iter=n_iter, features=features)

    # Fit the SRM data
    print('Fitting SRM, may take a minute ...')
    srm.fit(datafSRM)
    print('SRM has been fit')

    # Persist the fitted model next to the other SRM outputs.
    output_file_path = os.path.join(srm_data_path, f"{ROIj}_srm")
    srm.save(output_file_path)

    # Project every subject's data into the shared response space and save it.
    shared = srm.transform(datafSRM)
    shared_pkl = os.path.join(srm_data_path, f"{ROIj}_shareddata.pkl")
    with open(shared_pkl, 'wb') as sharedfile:
        pickle.dump(shared, sharedfile)

    # Also keep the transposed input that the model was fitted on.
    used_pkl = os.path.join(srm_data_norTS_path, f"{ROIj}_dataTVZ_NANnorTS_SRM.pkl")
    with open(used_pkl, 'wb') as useddatafile:
        pickle.dump(datafSRM, useddatafile)

Fitting SRM, may take a minute ...
SRM has been fit


  val = np.asanyarray(val)


Fitting SRM, may take a minute ...
SRM has been fit
Fitting SRM, may take a minute ...
SRM has been fit
Fitting SRM, may take a minute ...
SRM has been fit
Fitting SRM, may take a minute ...
SRM has been fit
Fitting SRM, may take a minute ...
SRM has been fit
