In [1]:
import os, librosa
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

#extract embedding
import torch
from torch import Tensor
from torchvision.models import feature_extraction
from torch.utils.data import DataLoader

#ridge regression
from sklearn.metrics import r2_score
from sklearn.linear_model import RidgeCV

#functions from directory
from models import soundnet_model as sm
import encoding_utils as eu

#path to the pretrained SoundNet weights (loaded with torch.load when the network is built)
soundnet_param_path = 'models/sound8.pth'
#directory of audio stimuli; any sample rate is ok --> librosa resamples to 22050Hz at load time
audio_dataset = 'dataset/audio'
#directory of processed fMRI outputs (parcellation, voxels, ...) stored as npz files;
#the array is read from the 'X' key of each file
fMRI_dataset = 'dataset/fMRI'

#repetition time (TR) of the fMRI acquisition, in seconds.
#NOTE(review): not referenced by any of the visible cells -- the embedding/fMRI
#alignment below simply assumes this TR instead of using the variable
tr=1.49

1 - Prepare your dataset

In [2]:
#create pairs of audio-fMRI --> input/output for training
training_dataset = []

# os.listdir returns entries in arbitrary order. The train/test split done
# later is purely index-based, so both listings are sorted to make the order
# of the pairs reproducible across machines and runs.
fMRI_files = sorted(os.listdir(fMRI_dataset))

for audio in sorted(os.listdir(audio_dataset)):
    audiopath = os.path.join(audio_dataset, audio)

    # stimulus identifier = filename without extension and without its first
    # len('movie10_') characters (assumes every audio file carries that prefix)
    audioname = os.path.splitext(audio)[0]
    stimuli = audioname[len('movie10_'):]

    # the waveform is decoded lazily on the first matching fMRI file and then
    # reused for every further match, instead of re-decoding the same audio
    wav = None
    for fMRI in fMRI_files:
        # an fMRI file belongs to this audio when its name contains the stimulus id
        if stimuli not in fMRI:
            continue

        fMRIpath = os.path.join(fMRI_dataset, fMRI)
        if wav is None:
            wav, sr = librosa.load(audiopath, sr=22050)

        # np.load on an .npz returns a lazy archive that holds an open file
        # handle; the context manager closes it once 'X' has been read
        with np.load(fMRIpath) as fMRI_npz:
            fMRI_arr = fMRI_npz['X']
        training_dataset.append((wav, fMRI_arr))

#convert list of pairs into a usable pytorch dataset (using a custom dataset class)
test_dataset = eu.soundnet_dataset(training_dataset)
test_dataset.convert_input_to_tensor()
# DataLoader defaults (batch_size=1, shuffle=False) keep one pair per batch,
# in the same order as training_dataset
testloader = DataLoader(test_dataset)

2 - Prepare your network

In [3]:
#create your network
soundnet = sm.SoundNet8_pytorch()

#add parameters from previous training
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on a
# CPU-only machine; load_state_dict then copies the values into the model
soundnet.load_state_dict(torch.load(soundnet_param_path, map_location='cpu'))

# The network is only used to extract embeddings, so switch to inference mode:
# the BatchNorm layers then use their stored running statistics instead of the
# statistics of the current batch.
# NOTE(review): harmless if eu.test already calls .eval() -- confirm there.
soundnet.eval()

#look at your network architecture
# print the module itself; `soundnet.modules` is a bound method, so printing it
# shows "<bound method Module.modules of ...>" rather than a clean summary
print(soundnet)
train_nodes, _ = feature_extraction.get_graph_node_names(soundnet)
print(train_nodes)

#transform your model in a version where you can access internal embeddings
# maps the graph node 'conv7.2' to the output key 'conv7'
return_nodes = {'conv7.2':'conv7'}
soundnet_feat = feature_extraction.create_feature_extractor(soundnet, return_nodes=return_nodes)

<bound method Module.modules of SoundNet8_pytorch(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(64, 1), stride=(2, 1), padding=(32, 0))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (pool1): MaxPool2d(kernel_size=(8, 1), stride=(8, 1), padding=0, dilation=1, ceil_mode=False)
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(32, 1), stride=(2, 1), padding=(16, 0))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (pool2): MaxPool2d(kernel_size=(8, 1), stride=(8, 1), padding=0, dilation=1, ceil_mode=False)
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(16, 1), stride=(2, 1), padding=(8, 0))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(8, 1), stride=(2, 1), padding=(4, 0

3 - Pass each wav of the dataset through SoundNet and extract the embedding at layer 7

In [4]:
# Run the dataset through the feature extractor. The result is indexed by the
# output names declared in return_nodes (here 'conv7').
out_p = eu.test(
    testloader,
    net=soundnet_feat,
    return_nodes=return_nodes,
)

4 - Convert the SoundNet layer 7 embedding to the temporal resolution of the fMRI (TO ADAPT: simple version that only works for TR=1.49s)

In [5]:
converted_embedding, fMRI_output = [], []

for embedding, output in out_p['conv7']:
    # detach + move to CPU first so .numpy() also works for tensors that live
    # on a GPU or still track gradients (no-op for plain CPU tensors)
    embedding_arr = embedding.detach().cpu().squeeze().numpy()
    output_arr = output.detach().cpu().squeeze().numpy()

    # output_arr is (time points, ROIs); embedding_arr is (features, frames)
    nb_tr, nb_roi = output_arr.shape

    # Keep only the time steps present in BOTH arrays. Truncating just the
    # embedding leaves mismatched lengths whenever the embedding is the
    # shorter of the two, which breaks the regression fit later on.
    nb_tr = min(nb_tr, embedding_arr.shape[1])
    embedding_arr = embedding_arr[:, :nb_tr]
    output_arr = output_arr[:nb_tr]

    # transpose so both arrays are (time points, features/ROIs)
    converted_embedding.append(embedding_arr.T)
    fMRI_output.append(output_arr)
    print(embedding_arr.shape, output_arr.shape, nb_tr)

# The first n_train_runs runs are used for training and the next one for testing.
n_train_runs = 4
if len(converted_embedding) <= n_train_runs:
    raise ValueError(
        'need at least %d runs (%d for training + 1 for testing), got %d'
        % (n_train_runs + 1, n_train_runs, len(converted_embedding)))

Ridge_training_embedding = converted_embedding[:n_train_runs]
Ridge_training_output = fMRI_output[:n_train_runs]
Ridge_testing_embedding = converted_embedding[n_train_runs]
Ridge_testing_output = fMRI_output[n_train_runs]

(1024, 406) (406, 210) 406
(1024, 410) (410, 210) 410
(1024, 392) (392, 210) 392
(1024, 406) (406, 210) 406
(1024, 410) (410, 210) 410


5 - Train a ridge regression to predict fMRI features from SoundNet features

In [6]:
# Stack the per-run training arrays along the first (time) axis so that every
# row of x (SoundNet features) lines up with the same row of y (fMRI signal).
x = np.concatenate(Ridge_training_embedding, 0)
y = np.concatenate(Ridge_training_output, 0)
print(x.shape, y.shape)

# Ridge regression with built-in cross-validation over the candidate alphas;
# a single alpha is selected for all targets (alpha_per_target=False).
ridge_options = dict(
    alphas=(0.1, 1.0, 10.0),
    fit_intercept=True,
    cv=None,
    alpha_per_target=False,
)
model = RidgeCV(**ridge_options)

model.fit(x, y)

(1614, 1024) (1614, 210)


6 - Test the trained ridge regression on the testing dataset

In [9]:
# Evaluate the fitted ridge model on the run that was kept out of training.
x_test, y_test = Ridge_testing_embedding, Ridge_testing_output

y_predict = model.predict(x_test)
print(y_test.shape, y_predict.shape)

# multioutput='raw_values' yields one R2 score per output column (ROI)
r2 = r2_score(y_test, y_predict, multioutput='raw_values')

# report the best score and the index of the ROI that reaches it
best_r2 = max(r2)
best_roi = np.argmax(r2)
print('r2: ', best_r2, ', roi: ', best_roi)

(410, 210) (410, 210)
r2:  0.2715117274206007 , roi:  153
