In [1]:
# nilearn imports
from nilearn.plotting import plot_roi, plot_stat_map, plot_anat
from nilearn.image import mean_img



In [2]:
# sklearn imports
from sklearn.model_selection import cross_validate, GridSearchCV, KFold
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

In [3]:
# utility imports
import numpy as np
from matplotlib import pyplot as plt
import os.path
from joblib import dump, load
import tqdm

In [4]:
### Remove useless warnings from scikit learn

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Rebind the module attribute so every later ``warnings.warn(...)`` lookup
# resolves to the silent replacement above.
warnings.warn = warn

In [5]:
#### CHANGE THE PATH TO CORRESPOND TO THE PATH ON YOUR SYSTEM

def initialize(movie):
    """Create the working folders and resolve every path/pattern used for ``movie``.

    Parameters
    ----------
    movie : str
        One of 'sherlock', 'merlin' or 'twilight-zone'.

    Returns
    -------
    tuple
        ``(local_movie_path, generic_mask_name, generic_filename, anat_filename,
        feature_folder, result_folder, sub_values)`` where

        * ``local_movie_path`` is the root folder of the fmri dataset,
        * ``generic_mask_name`` is a pattern taking the subject id twice
          (empty string for 'twilight-zone', which ships no mask),
        * ``generic_filename`` is a pattern taking the subject id once,
        * ``anat_filename`` is a pattern taking the subject id once,
        * ``feature_folder`` is the folder holding the soundnet features,
        * ``result_folder`` is a pattern taking the layer id once,
        * ``sub_values`` is ``(first subject, last subject, subject without data)``.

    Raises
    ------
    ValueError
        If ``movie`` is not one of the supported names. The check is done
        before anything is written to disk.
    OSError
        If a folder can neither be created nor found afterwards.
    """
    # Configure the subjects corresponding to the movie :
    # Sherlock : [1,18]\{5}
    # Merlin : [19,37]\{25}
    # Twilight-zone : [1,25]
    id_subjects = {
    "sherlock": (1,18,5),
    "merlin": (19,37,25),
    "twilight-zone": (1,25,0)
    }

    # Validate first so that a typo in the movie name does not leave stray folders behind
    if movie not in id_subjects :
        raise ValueError("The movie name has to be 'twilight-zone', 'merlin' or 'sherlock'")

    # Generate folders for organized storage
    folders = ["fmri_mean_{}", "wards_{}", "fmri_ready_{}", "results"]
    for folder in folders :
        folder_name = folder.format(movie)
        try :
            os.mkdir(folder_name)
        except OSError :
            # Only an existing directory is an expected failure; anything else
            # (permissions, missing parent, a plain file in the way) is re-raised
            if not os.path.isdir(folder_name) :
                raise
            print("Folder already exists, skipping creation ({})".format(folder_name))
        else :
            print("Folder created  ({})".format(folder_name))

    # One result folder per soundnet layer (conv1 .. conv7)
    result_folders = ["results/{}_conv{}".format(movie, id_layer) for id_layer in range(1,8)]
    for result_folder in result_folders :
        try :
            os.mkdir(result_folder)
        except OSError :
            if not os.path.isdir(result_folder) :
                raise
            print("Folder already exists, skipping creation ({})".format(result_folder))
        else :
            print("Folder created ({})".format(result_folder))

    # Set the different paths to data and folders

    # path to dataset of Sherlock or Merlin or TwilightZone
    if movie in ("sherlock","merlin"):
        # fmri data location

        ### CHANGE
        local_movie_path = "/home/brain/datasets/SherlockMerlin_ds001110/"

        # mask name
        movie_mask =  "{}Movie_bold_space-T1w_brainmask.nii.gz".format(movie.capitalize())

        ### CHANGE
        generic_mask_name = "/home/brain/datasets/SherlockMerlin_ds001110/sub-{:02d}/func/sub-{:02d}_task-" + movie_mask

        # fmri file name
        generic_filename = "sub-{:02d}_task-" + "{}Movie_bold_space-T1w_preproc.nii.gz".format(movie.capitalize())

        # anat file name
        ### CHANGE
        anat_filename = "/home/brain/victor/datasets/fmri_anat_{}/".format(movie) + "sub-{:02d}.nii.gz"

    else :
        # movie == "twilight-zone" (the only remaining valid name)
        # fmri data location
        ### CHANGE
        local_movie_path = "/home/brain/victor/datasets/twilight-zone"

        # there are no pre-existing masks
        generic_mask_name = ""

        # fmri file name
        generic_filename = "sub-{:02d}_task-watchmovie_bold.nii.gz"

        # No anatomical path is configured for this movie: fall back to the
        # per-subject mean fmri image written by parcellate(), so the value is
        # always defined and usable as a plotting background.
        ### CHANGE if anatomical images are available
        anat_filename = "fmri_mean_{}/".format(movie) + "sub-{:02d}.nii.gz"

    # locate the folder containing feature vectors extracted from soundnet
    # for the corresponding movie (e.g. merlin_pytorch or sherlock_pytorch)
    ### CHANGE
    feature_folder = "soundnet_features/{}_pytorch/".format(movie)

    # folder for storing the resulting r2 brain maps
    folder_name = "results/{}_".format(movie)
    result_folder = folder_name + "conv{}/"

    sub_values = id_subjects[movie]

    return local_movie_path, generic_mask_name, generic_filename, anat_filename, feature_folder, result_folder, sub_values


In [6]:
# Movie has to be 'sherlock', 'merlin' or 'twilight-zone'
movie = 'sherlock'
# Create the working folders and resolve the dataset paths/patterns for the chosen movie
local_movie_path, generic_mask_name, generic_filename, anat_filename, feature_folder, result_folder, sub_values = initialize(movie)
# sub_values is the 3-tuple stored in initialize()'s id_subjects table:
# (first subject id, last subject id, id of the subject to skip)
min_sub, max_sub, null_sub = sub_values

Folder already exists, skipping creation (fmri_mean_sherlock)
Folder already exists, skipping creation (wards_sherlock)
Folder already exists, skipping creation (fmri_ready_sherlock)
Folder already exists, skipping creation (results)
Folder already exists, skipping creation (results/sherlock_conv1)
Folder already exists, skipping creation (results/sherlock_conv2)
Folder already exists, skipping creation (results/sherlock_conv3)
Folder already exists, skipping creation (results/sherlock_conv4)
Folder already exists, skipping creation (results/sherlock_conv5)
Folder already exists, skipping creation (results/sherlock_conv6)
Folder already exists, skipping creation (results/sherlock_conv7)


# Define utility functions

These helpers are used to load the data efficiently and in a modular manner.

In [7]:
from nilearn.regions import Parcellations
from nilearn.input_data import NiftiMasker
from nilearn.masking import compute_background_mask
import os

def parcellate(id_subject, n_frames, compute_mean = True, compute_ward = True, compute_ready = True):
    """Compute and save the mean image, ward parcellation and parcel signals of one subject.

    Relies on the module-level names ``movie``, ``local_movie_path``,
    ``generic_filename`` and ``generic_mask_name`` set from ``initialize``.

    Parameters
    ----------
    id_subject : int
        Subject number, used to build every input and output path.
    n_frames : int
        Number of fmri frames to keep after the initial offset (the number of
        frames of the feature vectors).
    compute_mean, compute_ward, compute_ready : bool
        When True the corresponding output is recomputed even if its file
        already exists; when False an existing file is reused.

    Outputs (written relative to the current directory)
    ---------------------------------------------------
    * ``fmri_mean_<movie>/sub-XX.nii.gz`` : mean of the fmri across time.
    * ``wards_<movie>/sub-XX.nii.gz``     : the fitted ``Parcellations`` object,
      serialised with joblib (despite the extension it is not a NIfTI image).
    * ``fmri_ready_<movie>/sub-XX.npy``   : parcel signals truncated to ``n_frames``.
    """
    folder_name = "sub-{:02d}/func".format(id_subject)
    subject_filename = generic_filename.format(id_subject)
    fmri_file = os.path.join(local_movie_path,folder_name, subject_filename)

    # Output locations, built once and reused below
    mean_path = "fmri_mean_{}/sub-{:02d}.nii.gz".format(movie, id_subject)
    ward_path = "wards_{}/sub-{:02d}.nii.gz".format(movie, id_subject)
    ready_path = "fmri_ready_{}/sub-{:02d}.npy".format(movie, id_subject)

    # Compute the mean of fmri across time
    if compute_mean or not os.path.isfile(mean_path):
        mean_img(fmri_file).to_filename(mean_path)
        print("Saved mean fmri for subject {}".format(id_subject))
    else :
        print("Mean fmri already exists for subject {}".format(id_subject))

    # Compute ward parcellation
    if compute_ward or not os.path.isfile(ward_path):
        # The mask is only needed to fit a new parcellation, so it is resolved
        # here rather than on every call.
        if movie == "twilight-zone":
            # No mask is shipped with this movie: derive one from the data
            print("Computing background mask")
            mask_img = compute_background_mask(fmri_file)
        else :
            print("Loading pre-generated mask")
            mask_img = generic_mask_name.format(id_subject,id_subject)
        masker = NiftiMasker(mask_img=mask_img, detrend=True,standardize=True)
        masker.fit()
        ward = Parcellations(method='ward',mask=masker,standardize=True,smoothing_fwhm=None,n_parcels=500)
        ward.fit(fmri_file)
        dump(ward, ward_path)
        print("Saved ward mask for subject {}".format(id_subject))
    else :
        ward = load(ward_path)
        print("Ward mask exists for subject {}".format(id_subject))

    # Compute fmri_ready
    if compute_ready or not os.path.isfile(ready_path) :
        print("fmri_file : ", fmri_file)
        fmri_data = ward.transform(fmri_file)
        # Truncate the data because of an offset in the fmri (see dataset description)
        if movie in ("merlin", "sherlock"):
            # 25 seconds of offset, and 1.5s per frame
            offset = 17
        else :
            # 15 TR of offset, and 1.5s per frame
            offset = 15
        # Explicit end index: a negative-stop slice would turn into [offset:0]
        # (empty) when the run ends exactly on the last needed frame.
        fmri_ready = fmri_data[offset:offset + n_frames]
        if fmri_ready.shape[0] != n_frames :
            print("WARNING : only {} frames available after the offset for subject {} ({} expected)".format(
                fmri_ready.shape[0], id_subject, n_frames))
        np.save(ready_path, fmri_ready)
        print("Saved fmri_ready for subject {}".format(id_subject))
    else :
        print("fmri_ready exists for subject {} and movie {}".format(id_subject, movie))


In [8]:
def load_feature_vector(id_layer) :
    """Load the feature vectors of one soundnet layer.

    Reads ``conv<id_layer>.npz`` from the module-level ``feature_folder`` and
    extracts the array stored under the key ``'fv'``.

    Parameters
    ----------
    id_layer : int
        Number of the layer, used to build the file name.

    Returns
    -------
    fv : numpy.ndarray
        The array stored under ``'fv'``; it is indexed on two axes below, so it
        must be at least 2-D.
    n_frames : int
        ``fv.shape[0]``, i.e. the number of frames.
    """
    filename = "conv{}.npz".format(id_layer)
    file_fv = os.path.join(feature_folder, filename)
    # np.load on an .npz returns an archive object holding an open file handle;
    # the context manager closes it as soon as the array has been read.
    with np.load(file_fv) as archive :
        fv = archive['fv']
    print("Loaded feature vector from {}".format(file_fv))
    # Check the size
    n_frames = fv.shape[0]
    print("layer {}, {} frames, FV dimension is {}".format(id_layer,n_frames, fv.shape[1]))
    return fv, n_frames

In [9]:
def load_fmri_data(id_subject):
    """Fetch the pre-computed fmri artefacts of one subject.

    Returns a 3-tuple: the parcel signals read from the ``.npy`` file, the
    *path* of the mean fmri image (the image itself is not opened here), and
    the ward object deserialised with joblib.
    """
    # Resolve the three locations up front; nothing is read yet
    ready_file = "fmri_ready_{}/sub-{:02d}.npy".format(movie, id_subject)
    mean_file = "fmri_mean_{}/sub-{:02d}.nii.gz".format(movie, id_subject)
    ward_file = "wards_{}/sub-{:02d}.nii.gz".format(movie, id_subject)

    signals = np.load(ready_file)
    print("Loaded {}".format(ready_file))

    # Only the path is reported and handed back for the mean image
    print("Loaded {}".format(mean_file))

    parcellation = load(ward_file)
    print("Loaded {}".format(ward_file))

    return signals, mean_file, parcellation

In [10]:
def load_data(id_layer,id_subject):
    """Gather the model inputs for one (layer, subject) pair.

    Returns ``(X, y, fmri_mean, ward)``: the layer's feature vectors, then the
    three values produced by ``load_fmri_data`` for the subject.
    """
    # The frame count returned alongside the features is not needed here
    features = load_feature_vector(id_layer)[0]
    signals, mean_path, parcellation = load_fmri_data(id_subject)
    return features, signals, mean_path, parcellation

# Data preparation :
We first have to pre-generate the data that we will use repeatedly.
To do so we need the number of frames of the feature vectors, which is why we call the corresponding loading function first.
We then generate all the needed data, one subject at a time.

--- Note that we do not have data for the subject whose id is `null_sub`, which is why this subject is consistently skipped ---

In [19]:
# Only the number of frames is needed here; every layer is expected to share it,
# so the last layer is used.
_, n_frames = load_feature_vector(7)
# Subject ids are documented as closed intervals (e.g. Sherlock : [1,18]),
# so the upper bound has to be included in the range.
for id_subject in range(min_sub, max_sub + 1):
    # No data is available for null_sub
    if id_subject == null_sub :
        continue
    # All three flags are True: every output is recomputed even if it already exists
    parcellate(id_subject, n_frames,compute_mean = True, compute_ward=True, compute_ready=True)

Loaded feature vector from soundnet_features/sherlock_pytorch/conv7.npz
layer 7, 946 frames, FV dimension is 1024
Saved mean fmri for subject 1
Loading pre-generated mask
[MultiNiftiMasker.fit] Loading data from None
[MultiNiftiMasker.transform] Resampling mask
[Parcellations] Loading data
[MultiNiftiMasker.transform_single_imgs] Loading data from Nifti1Image('/home/brain/datasets/SherlockMerlin_ds001110/sub-01/func/sub-01_task-SherlockMovie_bold_space-T1w_preproc.nii.gz')
[MultiNiftiMasker.transform_single_imgs] Extracting region signals



KeyboardInterrupt



# Model definition :
We use an MLP on multiple folds, with early stopping enabled, and only one hidden layer

In [11]:
def train(X, y, fmri_mean, ward, n_folds,id_layer, id_subject, alpha=0.0001, n_neurons = 1000, plot=False):
    """Cross-validate an MLP predicting the parcel signals from the feature vectors.

    For every fold a one-hidden-layer ``MLPRegressor`` is fitted, the r2 score
    of each output is computed on the held-out part (negative scores are set to
    0), projected back to a brain image with ``ward.inverse_transform`` and
    saved in the result folder of ``id_layer``.

    Parameters
    ----------
    X : array, one row per frame
        Feature vectors.
    y : array, one row per frame
        Parcel signals; must have as many rows as ``X``.
    fmri_mean :
        Unused by this function; kept for interface compatibility.
    ward :
        Object providing ``inverse_transform`` to map scores back to an image.
    n_folds : int
        Number of contiguous (unshuffled) folds.
    id_layer, id_subject : int
        Used for logging and to build the output file names.
    alpha : float
        L2 penalty passed to the MLP.
    n_neurons : int
        Width of the single hidden layer.
    plot : bool
        When True, each fold map is also plotted and saved as a png.

    Returns
    -------
    list
        One value per fold: the maximum (clipped) r2 score across outputs.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.
    """
    if len(X) != len(y) :
        raise ValueError("X has {} frames but y has {} for subject {}".format(len(X), len(y), id_subject))

    # Derive the output folder from id_layer instead of relying on a global set
    # by the calling loop, which is stale or missing when train() is called directly.
    output_folder = result_folder.format(id_layer)

    print("Training on subject {} and layer {} with {} folds and alpha = {}".format(id_subject, id_layer, n_folds, alpha))
    mlp_estimator = MLPRegressor(hidden_layer_sizes=(n_neurons,),solver='adam',activation='relu',
                             max_iter=2000,learning_rate_init=0.001, alpha=alpha,
                             batch_size=50,early_stopping=True,verbose=False, warm_start = False)
    cv = KFold(n_splits = n_folds)
    scores = []
    for fold_number, (train_index, test_index) in enumerate(cv.split(X), start=1):
        print("Subject {} Fold {},layer {} index = [{},{}], alpha={},neurons={} :".format(id_subject,fold_number,id_layer, test_index[0], test_index[-1], alpha, n_neurons))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # warm_start is False so each train is fitted on a new MLP
        mlp_estimator.fit(X_train, y_train)
        predictions = mlp_estimator.predict(X_test)
        # Compare predictions and truth, using the r2 metric
        r2_scores = r2_score(y_test, predictions, multioutput="raw_values")
        r2_scores[r2_scores < 0 ] = 0
        r2_max = np.max(r2_scores)
        print(r2_max)
        # Generate corresponding mapping with the brain
        scores_img = ward.inverse_transform(r2_scores.reshape((1,-1)))
        scores_img.to_filename(os.path.join(output_folder, "fold{}_sub{:02d}alpha{}_{}neurons.nii.gz".format(fold_number, id_subject,alpha, n_neurons)))
        if plot :
            # The background image is only needed for plotting
            fmri_anat = anat_filename.format(id_subject)
            plot_stat_map(scores_img,bg_img=fmri_anat, title = "{} neurons fold{} sub{:02d} alpha{}".format(n_neurons,fold_number, id_subject, alpha))
            plt.savefig(os.path.join(output_folder, "{}neurons_fold{}_sub{:02d}alpha{}_.png".format(n_neurons,fold_number, id_subject, alpha)))
            plt.show()
            # Release the figure so that they do not pile up across folds and subjects
            plt.close()
        scores.append(r2_max)
    # scores is a list of length n_folds, the i-th element is the max r2 score (across outputs) for the i-th fold
    return scores

# Training and evaluation

In [12]:
# Define the parameters

# Hidden-layer widths to evaluate; the training loop runs once per value
hidden_layer_sizes = [1000]
# Number of cross-validation folds passed to train()
n_folds = 4

In [None]:
for id_layer in range(7,8):
    #layer1-3 give non-significant results (10e-5)
    # layer4 gives bigger results but still insignificant
    print("Layer #{}".format(id_layer))
    # Folder receiving the maps of this layer (module-level so that helper functions can see it)
    layer_result_folder = result_folder.format(id_layer)
    # Scores of previous runs are reloaded so that new results are merged into them
    filename = "{}_conv{}_scores.pkl".format(movie, id_layer)
    if os.path.isfile(filename) :
        layer_scores = load(filename)
    else:
        layer_scores = {}
    for n_neurons in hidden_layer_sizes :
        print("Neurons {}".format(n_neurons))
        neurons_scores = dict(layer_scores.get(n_neurons, {}))
        # Subject ids are documented as closed intervals, so max_sub is included
        for id_subject in tqdm.tqdm(range(min_sub, max_sub + 1)):
            print("Subject {}".format(id_subject))
            # No data for null_sub: skip it entirely so that it does not
            # inherit the scores of the previous subject
            if id_subject == null_sub :
                continue
            ready_path = "fmri_ready_{}/sub-{:02d}.npy".format(movie, id_subject)
            if not os.path.isfile(ready_path) :
                # Data preparation has not been run for this subject
                print("No prepared fmri data for subject {} ({}), skipping".format(id_subject, ready_path))
                continue
            X, y, fmri_mean, ward = load_data(id_layer,id_subject)
            r2_scores = train(X, y, fmri_mean, ward, n_folds, id_layer, id_subject,0.0001, n_neurons, plot = True)
            neurons_scores[id_subject] = list(r2_scores)
        layer_scores[n_neurons] = dict(neurons_scores)
        # The scores are stored in a dictionary : {n_neurons: {id_subject: [score per fold]}}
        dump(layer_scores,filename)
