In [1]:
import os, pickle
import numpy as np
import pandas as pd
import librosa

In [2]:
#paths
stim_type = 'silenced'
tracks_path = f'/home/maellef/DataBase/mutemusic/stimuli/{stim_type}'
data_path = '/home/maellef/DataBase/mutemusic/Preprocessed_fMRI'
scale = 'MISTROI'
sub = 'sub-03'
filename = 'sub-03_ses-001_task-mutemusic_run-1.npz'

sessions_data = '/home/maellef/git/MuteMusic_analysis/data/sub_session_data.tsv'
sessions_df = pd.read_csv(sessions_data, sep='\t')
silences_data = '/home/maellef/git/MuteMusic_analysis/data/silences_data.tsv'
silences_df = pd.read_csv(silences_data, sep='\t')

path = os.path.join(data_path, scale, sub, filename)
with np.load(path) as data:
    x = data['X']
data.close()

In [3]:
def find_label_value(name, label):
    label += '-' if label[-1] != '-' else label
    startval = name.find(label)+len(label)
    endval = startval
    for i in range(startval, len(name)):
        if name[i] in ['_', '.']:
            break
        endval+=1
    return name[startval:endval]

In [18]:
#structure to store all data pair (wav-fMRI)
df = pd.DataFrame({
    'stimuli':[],
    'groupe':[],
    'category':[],
    'score':[],
})
wavbold = []
#general args
tr = 1.49
hrf_nb_tr = 2
sr=22050

sub_path = os.path.join(data_path, scale, sub)

for filename in os.listdir(sub_path):
    print(filename)
    #load bold data
    runpath = os.path.join(sub_path, filename)
    with np.load(runpath) as data:
        x = data['X']
    data.close()
    
    #args necessary to extract infos from run
    sub = sub
    ses = int(find_label_value(filename, 'ses'))
    run = int(find_label_value(filename, 'run'))
    
    run_data = sessions_df.loc[(sessions_df['sub']==sub)&
                                (sessions_df['session']==ses)&
                                (sessions_df['run']==run)]
    
    #extract data from each track in a run
    for i, track in run_data.iterrows():
        #track infos
        groupe = track['Groupe']
        category = track['category']
        score = track['value']
        title_silenced = track['title']
        title = title_silenced.replace('_silenced','')
        onset = round(track['onset'])
        duration = round(track['total_duration'])

        #load silenced track
        wav_path = os.path.join(tracks_path, category, title_silenced)
        wav, sr = librosa.load(wav_path, sr=sr)

        #bold data for track (WITH HRF delay !!!!!)
        tr_onset = round(onset/tr) + hrf_nb_tr
        tr_duration = round(duration/tr)
        track_fmri = x[tr_onset:tr_onset+tr_duration, :]
        print(title, sr, onset, tr_onset, duration, tr_duration, len(track_fmri))

        #define timestamps to extract both music and silence    
        sdf = silences_df.loc[silences_df['track']==title]
        timestamps_s = {'duration':[sdf['S1_duration'].values[0], sdf['S2_duration'].values[0], sdf['S3_duration'].values[0], sdf['S4_duration'].values[0]],
                        'start':[sdf['S1_start'].values[0], sdf['S2_start'].values[0], sdf['S3_start'].values[0], sdf['S4_start'].values[0]],
                        'stop':[sdf['S1_stop'].values[0], sdf['S2_stop'].values[0], sdf['S3_stop'].values[0], sdf['S4_stop'].values[0]]}
        siltt_df = pd.DataFrame(timestamps_s).sort_values(by='start').dropna()
        
        alltt = []
        music_start = 0
        #print(siltt_df)
        
        for i, silence_tt in siltt_df.iterrows():
            silence_start = int(silence_tt['start'])
            music_stop = silence_start-1
            silence_stop = int(silence_tt['stop'])
            alltt+=[('m', music_start, music_stop), ('s', silence_start, silence_stop)]
            music_start = silence_stop+1
        alltt.append(('m', music_start, duration*sr))
        
        #extract corresponding wav/bold
        alltr = [(stim, round(start/sr/tr), round(stop/sr/tr)) for (stim, start, stop) in alltt]
        for (stim, tr_start, tr_stop) in alltr:
            bold_segment = track_fmri[tr_start:tr_stop]
            wav_segment = wav[int(tr_start*tr*sr):int(tr_stop*tr*sr)]
            segment_df = pd.Series({
                'stimuli':stim,
                'groupe':groupe,
                'category':category,
                'score':score}).to_frame().T
            df = pd.concat([df, segment_df], axis='rows', ignore_index=True)  
            wavbold.append((wav_segment, bold_segment))
df.to_csv(f'{sub}_metadata_test.tsv', sep='\t')
with open(f'{sub}_pairWavBold_test', 'wb') as f: 
    pickle.dump(wavbold, f) 

sub-03_ses-001_task-mutemusic_run-5.npz
Super_Mario_64_BobOmb_Battlefield_(Extended)_Theme.wav 22050 6 6 54 36 36
m 0 30
30 985635
s 30 37
6 205065
m 37 57
0 0
s 57 64
0 0
m 64 72
0 0
s 72 78
0 0
m 78 36
0 0
Super_Mario_Bros_3_Underwater_Theme.wav 22050 69 48 46 31 31
m 0 18
18 591381
s 18 24
6 197127
m 24 39
7 225792
s 39 44
0 0
m 44 56
0 0
s 56 61
0 0
m 61 31
0 0
Space_Harrier_Music_MAIN_THEME.wav 22050 125 86 48 32 32
m 0 8
8 262836
s 8 16
8 262836
m 16 40
16 532728
s 40 48
0 0
m 48 32
0 0
Dreaming_of_Bag_End.wav 22050 182 124 52 35 35
m 0 11
11 361399
s 11 17
6 197127
m 17 39
18 588074
s 39 46
0 0
m 46 56
0 0
s 56 62
0 0
m 62 35
0 0
Creep.wav 22050 244 166 49 33 33
m 0 11
11 361399
s 11 16
5 164273
m 16 41
17 554778
s 41 47
0 0
m 47 62
0 0
s 62 68
0 0
m 68 33
0 0
Super_Mario_Bros_3_Athletic_Theme.wav 22050 303 205 50 34 34
m 0 16
16 525672
s 16 25
9 295690
m 25 56
9 281138
s 56 65
0 0
m 65 34
0 0
Star_Trek_the_Next_Generation_(Main_Title).wav 22050 362 245 56 38 38
m 0 11
11 361399

In [None]:
        #define timestamps to extract both music and silence    
        sdf = silences_df.loc[silences_df['track']==title]
        print(title)
        timestamps_s = {'duration':[sdf['S1_duration'].values[0], sdf['S2_duration'].values[0], sdf['S3_duration'].values[0], sdf['S4_duration'].values[0]],
                        'start':[sdf['S1_start'].values[0], sdf['S2_start'].values[0], sdf['S3_start'].values[0], sdf['S4_start'].values[0]],
                        'stop':[sdf['S1_stop'].values[0], sdf['S2_stop'].values[0], sdf['S3_stop'].values[0], sdf['S4_stop'].values[0]]}
        siltt_df = pd.DataFrame(timestamps_s).sort_values(by='start').dropna()
        
        alltt = []
        music_start = 0
        for i, silence_tt in siltt_df.iterrows():
            silence_start = int(silence_tt['start'])
            music_stop = silence_start-1
            silence_stop = int(silence_tt['stop'])
            alltt+=[('m', music_start, music_stop), ('s', silence_start, silence_stop)]
            music_start = silence_stop+1
        alltt.append(('m', music_start, duration*sr))