In [None]:
# nilearn imports
from nilearn.plotting import plot_roi, plot_stat_map, plot_anat
from nilearn.image import mean_img

In [None]:
# sklearn imports
from sklearn.model_selection import cross_validate, GridSearchCV, KFold
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

In [None]:
# utility imports
import numpy as np
from matplotlib import pyplot as plt
import os.path
from joblib import dump, load
import tqdm

In [14]:
# Set the differents path to data and folders

# path to **PREPROCESSED** dataset of Sherlock or Merlin
local_sherlock_path = "/home/brain/datasets/SherlockMerlin_ds001110/"

# locate the folder containing feature vectors extracted from soundnet for the corresponding movie (merlin_pytorch or sherlock_pytorch
feature_folder = "soundnet_features/sherlock_pytorch/"

# folder for storing the resulting r2 brain maps
result_folder = "results/parcellation_conv{}/"

# path to mask files (make sure to match the naming convention)
### WARNING : be sure to write the mask corresponding to the task (MerlinMovie or SherlockMovie)
generic_mask_name = "/home/brain/datasets/SherlockMerlin_ds001110/sub-{:02d}/func/sub-{:02d}_task-SherlockMovie_bold_space-T1w_brainmask.nii.gz"

In [16]:
# generate folders for organized storage
os.mkdir("fmri_mean")
os.mkdir("wards")
os.mkdir("fmri_ready")
os.mkdir("results")
for id_layer in range(1,8):
    os.mkdir("results/parcellation_conv{}".format(id_layer))

FileExistsError: [Errno 17] File exists: 'results/parcellation_conv1'

In [None]:
os.makedirs('')

# Define utility functions

Used to load data more efficiently and in a modular manner

In [None]:
def parcellate(id_subject, n_frames):
    # Compute the ward parcellation and other fmri data of a given subject and save it
    
    # Function should be called only when the feature vector or number of parcels change
    from nilearn.regions import Parcellations
    from nilearn.input_data import NiftiMasker
    ### Load fmri data
    filename = "sub-{:02d}_task-SherlockMovie_bold_space-T1w_preproc.nii.gz".format(id_subject)
    folder_name = "sub-{:02d}/func".format(id_subject)
    irm_file = os.path.join(local_sherlock_path,folder_name, filename)
    fmri_mean = mean_img(irm_file)
    fmri_mean.to_filename("fmri_mean/sub-{:02d}.nii.gz".format(id_subject))
    print("Saved mean fmri for subject {}".format(id_subject))
    
    ### Compute mask
    filename_mask = generic_mask_name.format(id_subject,id_subject)
    masker = NiftiMasker(mask_img=filename_mask, detrend=True,standardize=True)
    masker.fit()
    ward = Parcellations(method='ward',mask=masker,standardize=True,smoothing_fwhm=None,n_parcels=500)
    ward.fit(irm_file)
    dump(ward, "wards/sub-{:02d}.nii.gz".format(id_subject))
    print("Saved ward mask for subject {}".format(id_subject))
    
    # Compute fmri_ready
    fmri_data = ward.transform(irm_file)
    # Truncate the data because of an offset in the fmri (see dataset description)
    fmri_ready = fmri_data[17:-(fmri_data.shape[0]-17-n_frames)]  
    np.save("fmri_ready/sub-{:02d}".format(id_subject), fmri_ready)    
    print("Saved fmri_ready for subject {}".format(id_subject))
    

In [None]:
def load_feature_vector(id_layer) :
    filename = "conv{}.npz".format(id_layer)
    file_fv = os.path.join(feature_folder, filename)
    fv = np.load(file_fv)['fv']
    # Check the size
    n_frames = fv.shape[0]
    print("layer {}, {} frames, FV dimension is {}".format(id_layer,n_frames, fv.shape[1]))
    return fv, n_frames

In [None]:
def load_fmri_data(id_subject):
    from nilearn.input_data import NiftiLabelsMasker
    fmri_ready = np.load("fmri_ready/sub-{:02d}.npy".format(id_subject))
    fmri_mean = "fmri_mean/sub-{:02d}.nii.gz".format(id_subject)
    ward = load("wards/sub-{:02d}.nii.gz".format(id_subject))
    return fmri_ready, fmri_mean, ward

In [None]:
def load_data(id_layer,id_subject): 
    X, _ = load_feature_vector(id_layer)
    y, fmri_mean, ward = load_fmri_data(id_subject)
    return X, y, fmri_mean, ward

# Data preparation :
We first have to pre-generate the data  that we will use repeatedly. 
We need the length of the feature vector to do so, which is why we use the corresponding function.
We then proceed to generating all the needed data, one subject at a time.

--- It should be noted that we do not have data for subject 5, which is why it is consistently skipped ---

In [None]:
_, n_frames = load_feature_vector(7)
for id_subject in range(1,18):
    if id_subject != 5 :
        parcellate(id_subject, n_frames)

# Model definition :
We use an MLP on multiple folds, with early stopping enabled, and only one hidden layer

In [None]:
def train(X, y, fmri_mean, ward, n_folds,id_layer, id_subject, alpha=0.0001, n_neurons = 1000, plot=False):
    print("Training on subject {} and layer {} with {} folds and alpha = {}".format(id_subject, id_layer, n_folds, alpha))
    mlp_estimator = MLPRegressor(hidden_layer_sizes=(n_neurons,),solver='adam',activation='relu',
                             max_iter=2000,learning_rate_init=0.001, alpha=alpha, 
                             batch_size=50,early_stopping=True,verbose=False, warm_start = False)
    cv = KFold(n_splits = n_folds)
    fold_number = 1
    scores = []
    for train_index, test_index in cv.split(X):
        print("Subject {} Fold {},layer {} index = [{},{}], alpha={},neurons={} :".format(id_subject,fold_number,id_layer, test_index[0], test_index[-1], alpha, n_neurons))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # warm_start is False so each train is fitted on a new MLP
        mlp_estimator.fit(X_train, y_train) 
        predictions = mlp_estimator.predict(X_test)
        # Compare predictions and truth, using the r2 metric
        r2_scores = r2_score(y_test, predictions, multioutput="raw_values")
        r2_scores[r2_scores < 0 ] = 0
        r2_max = np.max(r2_scores)
        print(r2_max)
        # Generate corresponding mapping with the brain
        scores_img = ward.inverse_transform(r2_scores.reshape((1,-1)))
        scores_img.to_filename("./{}/fold{}_sub{:02d}alpha{}_{}neurons.nii.gz".format(layer_result_folder, fold_number, id_subject,alpha, n_neurons))
        
        if plot :
            plot_stat_map(scores_img,bg_img=fmri_mean, title = "{} neurons fold{} sub{:02d} alpha{}".format(n_neurons,fold_number, id_subject, alpha))
            plt.savefig("./{}/{}neurons_fold{}_sub{:02d}alpha{}_.png".format(layer_result_folder, n_neurons,fold_number, id_subject, alpha))
            plt.show()
        fold_number += 1
        scores.append(r2_max)
    # scores is a list of length n_folds ,the  i-th element is the max r2 score (accross 500 parcels) for the i-th fold
    return scores

# Training and evaluation

In [None]:
# Define the parameters

hidden_layer_sizes = [100,500,1000]
n_folds = 4

In [None]:
for id_layer in range(5,8):
    #layer1-3 give non-significant results (10e-5)
    # layer4 gives bigger results but still insignificant
    print("Layer #{}".format(id_layer))
    layer_result_folder = result_folder.format(id_layer)
    filename = "conv{}_scores.pkl".format(id_layer)
    if filename in os.listdir(".") :
        layer_scores = load(filename)
    else: 
        layer_scores = {}
    for n_neurons in hidden_layer_sizes :
        print("Neurons {}".format(n_neurons))
        if n_neurons in layer_scores.keys():
            neurons_scores = layer_scores[n_neurons]
        else :
            neurons_scores = {}
        for id_subject in tqdm.tqdm(range(1,18)):
            print("Subject {}".format(id_subject))
            if id_subject != 5 :
                X, y, fmri_mean, ward = load_data(id_layer,id_subject)
                r2_scores = train(X, y, fmri_mean, ward, n_folds, id_layer, id_subject,0.0001, n_neurons)
            neurons_scores[id_subject] = list(r2_scores)
        layer_scores[n_neurons] = dict(neurons_scores)
        dump(layer_scores,filename) # The scores are stored in a dictionnary
