In [1]:
import os, pickle
import numpy as np
import pandas as pd
import librosa

In [3]:
#paths
#NOTE(review): absolute, machine-specific locations; adjust when running elsewhere
tracks_path = '/home/maellef/DataBase/stimuli/movie10/'
data_path = '/home/maellef/DataBase/MuteMusic/Preprocessed_fMRI'
scale = 'MISTROI'
sub = 'sub-03'
filename = 'sub-03_ses-001_task-mutemusic_run-1.npz'

#tab-separated tables: tracks presented in each sub/session/run, and the
#silence timestamps (S1..S4 start/stop/duration) of each track
sessions_data = './data/sub_session_data.tsv'
sessions_df = pd.read_csv(sessions_data, sep='\t')
silences_data = './data/silences_data.tsv'
silences_df = pd.read_csv(silences_data, sep='\t')

#load the 'X' array of a single run; the context manager closes the .npz
#archive on exit, so no explicit close() is needed afterwards
path = os.path.join(data_path, scale, sub, filename)
with np.load(path) as data:
    x = data['X']

In [None]:
def find_label_value(name, label):
    """Return the value of a ``label-value`` field embedded in ``name``.

    The value is the text that follows ``label-`` up to (but excluding) the
    next ``_`` or ``.``, or up to the end of ``name`` if neither occurs.

    Parameters
    ----------
    name : str
        String to search, e.g. ``'sub-03_ses-001_task-mutemusic_run-1.npz'``.
    label : str
        Field label, with or without its trailing ``-`` (``'ses'`` and
        ``'ses-'`` are equivalent).

    Returns
    -------
    str
        The raw value, e.g. ``'001'`` for label ``'ses'`` in the example above.

    Raises
    ------
    ValueError
        If ``label-`` does not occur in ``name``.
    """
    # Append the separator only when it is missing. The previous conditional
    # expression bound to the whole right-hand side of ``+=`` and doubled a
    # label that already ended with '-'.
    prefix = label if label.endswith('-') else label + '-'

    position = name.find(prefix)
    if position == -1:
        # str.find returns -1 when absent; slicing from there would silently
        # return an unrelated piece of the name.
        raise ValueError(f"label {prefix!r} not found in {name!r}")

    startval = position + len(prefix)
    endval = startval
    while endval < len(name) and name[endval] not in ('_', '.'):
        endval += 1
    return name[startval:endval]

In [None]:
#Build (wav, BOLD) segment pairs for every track of every run of `sub`.
#Each track is cut into alternating music ('m') and silence ('s') segments;
#for each segment one metadata record and one (wav_segment, bold_segment)
#tuple are stored at the same position in `segment_records` / `wavbold`.
segment_columns = ['stimuli', 'groupe', 'category', 'score']
segment_records = []
wavbold = []
#general args
tr = 1.49       #repetition time used to convert times into TR indices
hrf_nb_tr = 2   #number of TRs the BOLD window is shifted by (HRF delay)
sr = 22050      #sampling rate requested from librosa.load

sub_path = os.path.join(data_path, scale, sub)

#os.listdir returns entries in arbitrary order: sort for reproducible output
#and keep only the .npz run archives
run_files = sorted(f for f in os.listdir(sub_path) if f.endswith('.npz'))

for filename in run_files:
    print(filename)
    #load bold data (the context manager closes the archive on exit)
    runpath = os.path.join(sub_path, filename)
    with np.load(runpath) as data:
        x = data['X']

    #args necessary to extract infos from run
    ses = int(find_label_value(filename, 'ses'))
    run = int(find_label_value(filename, 'run'))

    run_data = sessions_df.loc[(sessions_df['sub']==sub)&
                               (sessions_df['session']==ses)&
                               (sessions_df['run']==run)]

    #extract data from each track in a run
    for _, track in run_data.iterrows():
        #track infos
        groupe = track['Groupe']
        category = track['category']
        score = track['value']
        title_silenced = track['title']
        title = title_silenced.replace('_silenced','')
        #NOTE(review): onset/duration are rounded here and rounded again after
        #the division by tr below; confirm this double rounding is intended
        onset = round(track['onset'])
        duration = round(track['total_duration'])

        #load silenced track; the returned rate equals the requested `sr`,
        #so it is discarded instead of rebinding `sr`
        wav_path = os.path.join(tracks_path, category, title_silenced)
        wav, _ = librosa.load(wav_path, sr=sr)

        #bold data for track (WITH HRF delay !!!!!)
        tr_onset = round(onset/tr) + hrf_nb_tr
        tr_duration = round(duration/tr)
        track_fmri = x[tr_onset:tr_onset+tr_duration, :]

        #define timestamps to extract both music and silence
        sdf = silences_df.loc[silences_df['track']==title]
        print(title)
        if sdf.empty:
            raise ValueError(f'no silence timestamps found for track {title!r}')
        #one row per silence slot S1..S4, unused (NaN) slots dropped
        timestamps_s = {
            key: [sdf[f'S{slot}_{key}'].values[0] for slot in range(1, 5)]
            for key in ('duration', 'start', 'stop')
        }
        siltt_df = pd.DataFrame(timestamps_s).sort_values(by='start').dropna()

        #alternate music / silence spans; start and stop are presumably in
        #audio samples, since they are divided by sr below (TODO confirm)
        alltt = []
        music_start = 0
        for silence_tt in siltt_df.itertuples(index=False):
            silence_start = int(silence_tt.start)
            silence_stop = int(silence_tt.stop)
            alltt += [('m', music_start, silence_start-1),
                      ('s', silence_start, silence_stop)]
            music_start = silence_stop+1
        alltt.append(('m', music_start, duration*sr))

        #extract corresponding wav/bold; both are cut on TR boundaries so the
        #audio segment covers the same span as the bold segment
        for stim, start, stop in alltt:
            tr_start = round(start/sr/tr)
            tr_stop = round(stop/sr/tr)
            bold_segment = track_fmri[tr_start:tr_stop]
            wav_segment = wav[int(tr_start*tr*sr):int(tr_stop*tr*sr)]
            segment_records.append({
                'stimuli':stim,
                'groupe':groupe,
                'category':category,
                'score':score})
            wavbold.append((wav_segment, bold_segment))

#build the metadata table once instead of concatenating inside the loop
df = pd.DataFrame(segment_records, columns=segment_columns)
df.to_csv(f'{sub}_metadata.tsv', sep='\t')
with open(f'{sub}_pairWavBold', 'wb') as f:
    pickle.dump(wavbold, f)