## Loading functions

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
import scipy.io
import re
import itertools

import os
from os.path import join
import contextlib
from copy import deepcopy
import imp 
import time 
import sys

  import imp


## Settings

In [15]:
# settings
# pp is the participant number; it is interpolated into every path and file name below
pp = 1
task = 0     # 0: main, 1: localizer
loc_run = 1  # run 1 or run 2

# MEG directory and file names
# `task` indexes into `tasks` to pick the name used in the paths.
# input_dir is also the directory the behavioural CSV (out_fn) is written to and
# read back from in later cells; pp is zero-padded to three digits here.
tasks = ['main', 'loc']
input_dir = f'/project/3018063.01/preproc/sub-{pp:03}/preproc/{tasks[task]}/'
out_fn = f'{tasks[task]}_behdf_sub-{pp:03}.csv'

# set behavioural directories (these use the unpadded participant number)
# NOTE(review): loud_dir and loc_run are not used by any cell visible in this notebook
stim_dir = f'/project/3018063.01/beh/stimuli/{pp}'
loud_dir = f'/project/3018063.01/beh/loudness/{pp}'
data_dir = f'/project/3018063.01/beh/data/{pp}'

# file names: the sync .mat file is looked up inside data_dir
sync_fn = f'MEG_sync_sub-{pp:03d}.mat'

# loading log data

In [4]:
# main loading
def data_load(pp,data_dir, stim_dir):
    """Read the main-task log file and the stimulus file of one participant.

    pp: participant identifier, used as prefix of both file names
    data_dir: directory holding '<pp>-mainpred.mat'
    stim_dir: directory holding '<pp>_main_stims.mat'
    returns (mat, stimuli): the two dictionaries returned by scipy.io.loadmat"""
    # log file first, so a missing log is reported before the stimulus file is touched
    log_path = join(data_dir, f'{pp}-mainpred.mat')
    mat = scipy.io.loadmat(log_path)

    stim_path = join(stim_dir, f'{pp}_main_stims.mat')
    stimuli = scipy.io.loadmat(stim_path)

    return mat, stimuli


def stims_load(mat, stimuli):
    """Build the stimulus-domain dataframe (one row per stimulus) from the loaded structs.

    mat: loaded log file; rows of mat['timingz'] and mat['segmentz'] are read by
         fixed row index (see the comments below)
    stimuli: loaded stimulus file; row (block - 1) of stimuli['pres_freq'] supplies
             the 'frequencies' of that block
    returns a Pandas dataframe in which the blocks are concatenated in ascending
    block number, with log2 ('_oct') versions of the frequency columns and a
    1-based 'stimulus' counter"""

    trial_log = mat['timingz']

    # one list of per-block chunks for every column that is read from the structs
    chunks = {name: [] for name in ('frequencies', 'timing', 'timing_offset',
                                    'run', 'block', 'segment',
                                    'center_freq_a', 'center_freq_b',
                                    'probability_a', 'probability_b')}

    # segmentz row -> column that receives that row as-is
    plain_rows = (('run', 0), ('block', 1), ('segment', 2),
                  ('probability_a', 5), ('probability_b', 6))
    # segmentz row -> column that receives 2**row (cent freq a / b)
    exponent_rows = (('center_freq_a', 7), ('center_freq_b', 8))

    for blk in np.arange(1, trial_log[1].max()+1):
        segment_log = mat['segmentz']

        # entries of the two logs that belong to this block (row 1 holds the block number)
        in_block = np.where(trial_log[1] == blk)
        segs_in_block = np.where(segment_log[1] == blk)

        # number of entries of this block whose timingz row 3 equals 1; every
        # segment-level value is repeated this many times
        n_rep = np.sum(trial_log[3, in_block] == 1)

        chunks['frequencies'].append(stimuli['pres_freq'][int(blk)-1, :])
        chunks['timing'].append(trial_log[6, in_block])
        chunks['timing_offset'].append(trial_log[7, in_block])

        for name, row in plain_rows:
            chunks[name].append(np.repeat(segment_log[row][segs_in_block], n_rep))
        for name, row in exponent_rows:
            chunks[name].append(2**np.repeat(segment_log[row][segs_in_block], n_rep))

    def _stack(parts):
        # flatten every chunk and glue them behind an empty float array, which is
        # what appending them one by one to np.array([]) yields
        return np.concatenate([np.array([])] + [np.ravel(part) for part in parts])

    col = {name: _stack(parts) for name, parts in chunks.items()}

    # assemble in the fixed column order, adding the octave (log2) variants
    stim_df = pd.DataFrame({
        'frequencies': col['frequencies'],
        'frequencies_oct': np.log2(col['frequencies']),
        'timing': col['timing'],
        'timing_offset': col['timing_offset'],
        'run': col['run'],
        'block': col['block'],
        'segment': col['segment'],
        'center_freq_a': col['center_freq_a'],
        'center_freq_b': col['center_freq_b'],
        'center_freq_a_oct': np.log2(col['center_freq_a']),
        'center_freq_b_oct': np.log2(col['center_freq_b']),
        'probability_a': col['probability_a'],
        'probability_b': col['probability_b'],
    })

    # running stimulus number, starting at 1
    stim_df['stimulus'] = stim_df.index + 1
    return stim_df


def sync_timing(df, sync_val, timingname='timing', new_timingname='timing_meg',
                              timingname_offset='timing_offset', new_timingname_offset='timing_offset_meg'):
    """Shift the onset and offset timings by the sync value (stim-pc clock -> MEG clock).

    df: Pandas dataframe holding the original timing columns; it is modified in place
    sync_val: value added to both timing columns
    timingname / timingname_offset: (optional) names of the existing onset / offset columns
    new_timingname / new_timingname_offset: (optional) names of the shifted columns to write
    returns the same dataframe with the two shifted columns added"""

    # onset first, then offset; each new column is its source column plus sync_val
    column_pairs = ((timingname, new_timingname),
                    (timingname_offset, new_timingname_offset))
    for source_col, shifted_col in column_pairs:
        df[shifted_col] = df[source_col] + sync_val

    return df

In [6]:
# load sync files
# sync_val is the single scalar stored in field 'mn' of the 'MEG_sync' struct;
# sync_timing() below adds it to the onset and offset timings.
# NOTE(review): presumably the mean offset between the stimulus-PC and MEG clocks - confirm
sync_mat = scipy.io.loadmat(join(data_dir, sync_fn))
sync_val = sync_mat['MEG_sync']['mn'][0][0][0,0]

# get mat and stimuli struct (for the participant selected in the settings cell)
mat, stimuli = data_load(pp, data_dir, stim_dir)

# put in dataframe: one row per stimulus, then add the MEG-clock timing columns
df_beh = stims_load(mat, stimuli)
df_beh = sync_timing(df_beh, sync_val)

In [None]:
# settings
# Batch version of the settings + loading cells above: for every participant in the
# list, rebuild the paths, create the behavioural dataframe and write it to CSV.
# Note that this rebinds the notebook-level names (pp, input_dir, out_fn, df_beh, ...),
# so after the loop they hold the values of the last participant in the list.
# NOTE(review): participants 3 and 6 are not in the list - the reason is not visible here

for pp in [1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14]:
    task = 0     # 0: main, 1: localizer
    loc_run = 1  # run 1 or run 2

    # MEG directory and file names
    tasks = ['main', 'loc']
    input_dir = f'/project/3018063.01/preproc/sub-{pp:03}/preproc/{tasks[task]}/'
    out_fn = f'{tasks[task]}_behdf_sub-{pp:03}.csv'

    # set behavioural directories
    stim_dir = f'/project/3018063.01/beh/stimuli/{pp}'
    loud_dir = f'/project/3018063.01/beh/loudness/{pp}'
    data_dir = f'/project/3018063.01/beh/data/{pp}'

    # file names
    sync_fn = f'MEG_sync_sub-{pp:03d}.mat'

    # load sync files
    sync_mat = scipy.io.loadmat(join(data_dir, sync_fn))
    sync_val = sync_mat['MEG_sync']['mn'][0][0][0,0]

    # get mat and stimuli struct
    mat, stimuli = data_load(pp, data_dir, stim_dir)

    # put in dataframe
    df_beh = stims_load(mat, stimuli)
    df_beh = sync_timing(df_beh, sync_val)

    # write this participant's dataframe into its preproc directory, without the index column
    df_beh.to_csv(join(input_dir, out_fn), index=False)

In [26]:
# save df file to csv
# writes the current df_beh to join(input_dir, out_fn), i.e. for whichever participant
# those notebook-level names currently refer to; the index column is not written
df_beh.to_csv(join(input_dir, out_fn), index=False)

## loading dataframe data

In [32]:
# read the behavioural dataframe back from the CSV at join(input_dir, out_fn)
df_beh = pd.read_csv(join(input_dir, out_fn))

'main_behdf_sub-014.csv'

In [33]:
# display the dataframe that was just read back (last expression of the cell)
df_beh

Unnamed: 0,frequencies,frequencies_oct,timing,timing_offset,run,block,segment,center_freq_a,center_freq_b,center_freq_a_oct,center_freq_b_oct,probability_a,probability_b,stimulus,timing_meg,timing_offset_meg
0,683.438005,9.416667,191025.80758,191026.009564,1.0,1.0,1.0,645.854171,1798.010924,9.335065,10.812186,0.891211,0.108789,1,122903.348354,122903.550338
1,683.438005,9.416667,191026.05758,191026.259580,1.0,1.0,1.0,645.854171,1798.010924,9.335065,10.812186,0.891211,0.108789,2,122903.598354,122903.800354
2,683.438005,9.416667,191026.30758,191026.509560,1.0,1.0,1.0,645.854171,1798.010924,9.335065,10.812186,0.891211,0.108789,3,122903.848354,122904.050334
3,767.133223,9.583333,191026.55758,191026.759608,1.0,1.0,1.0,645.854171,1798.010924,9.335065,10.812186,0.891211,0.108789,4,122904.098354,122904.300382
4,912.280287,9.833333,191026.80758,191027.009048,1.0,1.0,1.0,645.854171,1798.010924,9.335065,10.812186,0.891211,0.108789,5,122904.348354,122904.549821
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11515,2169.780417,11.083333,194544.45757,194544.659529,12.0,24.0,10.0,645.854171,1798.010924,9.335065,10.812186,0.074206,0.925794,11516,126421.998344,126422.200303
11516,1448.154688,10.500000,194544.70757,194544.909898,12.0,24.0,10.0,645.854171,1798.010924,9.335065,10.812186,0.074206,0.925794,11517,126422.248344,126422.450671
11517,2169.780417,11.083333,194544.95757,194545.159566,12.0,24.0,10.0,645.854171,1798.010924,9.335065,10.812186,0.074206,0.925794,11518,126422.498344,126422.700340
11518,2580.318310,11.333333,194545.20757,194545.409397,12.0,24.0,10.0,645.854171,1798.010924,9.335065,10.812186,0.074206,0.925794,11519,126422.748344,126422.950170


## transformation into the time domain
Below are some functions that help translate stimulus-domain dataframes into time-domain ones (as needed for pupil or MEG data).
+ `map_stim_to_time` takes the time-domain (*df*) and stimulus-domain (*df_beh*) dataframes and uses the onset and offset times (which must be in the same clock domain as the timestamps) to write the stimulus number into the time-domain dataframe (*df*).
    + internally it applies a custom `_assign_stimulus` function to every timestamp; that function uses `np.searchsorted` on the onset times and then checks the matching offset time
    + i.e. a stimulus dataframe with stimuli [1 2 3 4 5] is laid out in the time domain as [00001110022200333004440055500]
+ `map_columns_to_time` takes the adjusted time-domain dataframe (which now contains the stimulus number) and copies the columns of interest into it
    + internally it uses `df[indicator_nm].map(stimulus_to_col).fillna(0)` to fill the time course with the per-stimulus information
    + e.g. a stimulus dataframe column [frequency information] with values [200 100 50 200] is laid out in the time domain as [0 0 0 0 200 200 200 0 0 100 etc.]

In [41]:
## MOVE FUNCTIONS FROM STIMULUS DOMAIN INTO TIMEDOMAIN
## MAP 'STIMULUS' TO INDEX OF WHAT STIMULI, FOR EASY MAPPING

def stim_save_segments(df_beh, groupby_nm=['block', 'segment']):
    """Add a 'segment_all' column to the behavioural dataframe: a continuous numerical
    indicator (0, 1, 2, ...) of which block/segment combination a row belongs to.

    df_beh: Pandas dataframe - stim domain; it is modified in place
    groupby_nm: (optional) column name(s) whose unique combinations define a segment
    returns the same dataframe with 'segment_all' added (float column); combinations are
    numbered in sorted order, and rows with a missing value in one of the groupby
    columns keep NaN"""

    # number every row by the combination of groupby_nm it falls in; this follows the
    # requested columns instead of always matching on 'block' and 'segment'
    group_number = df_beh.groupby(groupby_nm).ngroup().astype(float)

    # depending on the pandas version, rows with a missing key come back as -1 or NaN;
    # store NaN for them in both cases
    df_beh['segment_all'] = group_number.where(group_number >= 0)

    # return the dataframe
    return df_beh


def map_stim_to_time(df, df_beh, cn_stim='stimulus', cn_ts='TIMESTAMP', 
                     beh_cn_onset='timing_meg', beh_cn_offset='timing_offset_meg'):
    """transform / map stimulus dataframe into the time domain
    input: df: Pandas dataframe - time domain; it is modified in place
           df_beh: Pandas dataframe - stim domain
           cn_stim: (optional) column name of the stimulus indicator in df_beh
           cn_ts: (optional) column name of the timestamps in the time domain df
           beh_cn_onset: (optional) column name for onset time in df_beh
           beh_cn_offset: (optional) column name for offset time in df_beh
    returns the same time domain df (Pandas dataframe) with a 'stimulus' column holding,
    per timestamp, the value _assign_stimulus returns for it. The output column is
    always called 'stimulus', whatever cn_stim is."""

    # look up the stimulus for every timestamp; the timestamps are read from the column
    # named by cn_ts (previously 'TIMESTAMP' was hard-coded and cn_ts was ignored)
    df['stimulus'] = df[cn_ts].apply(_assign_stimulus,
                                     df_beh=df_beh,
                                     cn_onset=beh_cn_onset,
                                     cn_offset=beh_cn_offset,
                                     cn_stimulus=cn_stim)
    # Return the dataframe
    return df


def map_columns_to_time(df, df_beh, col_to_trans, indicator_nm='stimulus'):
    """Copy columns from the stim domain dataframe into the time domain dataframe.

    df: dataframe in timedomain; must already contain the indicator column, modified in place
    df_beh: dataframe in stim domain; contains the indicator column and the columns to copy
    col_to_trans: names of all columns to transfer
    indicator_nm: (optional) name of the column, present in both dataframes, that links them
    returns the same time domain dataframe; samples whose indicator value has no match
    in df_beh get 0 in the transferred columns"""

    for target_col in col_to_trans:
        # indicator value -> value of this column in the stim domain
        beh_by_indicator = df_beh.set_index(indicator_nm)
        lookup = beh_by_indicator[target_col].to_dict()

        # translate every sample's indicator value, then zero-fill the unmatched samples
        translated = df[indicator_nm].map(lookup)
        df[target_col] = translated.fillna(0)

    return df


def map_block_to_run(df, df_beh, run_nm='run', block_nm_beh='block', block_nm='BLOCK'):
    """Add the run number to the time domain dataframe, based on its block number.

    df: dataframe in timedomain, holding the block number in column block_nm; modified in place
    df_beh: dataframe in stim domain, holding block_nm_beh and run_nm
    run_nm: (optional) name of the run column (read from df_beh, written to df)
    block_nm_beh: (optional) name of the block column in df_beh
    block_nm: (optional) name of the block column in df
    returns the same time domain dataframe; blocks that do not occur in df_beh get run 0"""

    # block number -> run number, as paired in the stim domain
    run_of_block = df_beh.set_index(block_nm_beh)[run_nm].to_dict()

    # translate per sample, 0 where the block is unknown, and store as integers
    run_per_sample = df[block_nm].map(run_of_block)
    df[run_nm] = run_per_sample.fillna(0).astype(int)

    return df


def time_save_segments(df, df_beh,
                       groupby_nm=['block', 'segment'],
                       onset_nm='timing_meg',
                       offset_nm='timing_offset_meg',
                       timing_mm='TIMESTAMP'
                      ):
    """save segments into the time domain dataframe
    df: timedomain dataframe; it is modified in place
    df_beh: stimdomain dataframe
    groupby_nm: (optional) list of names to groupby
    onset_nm: (optional) what to use as onset timings - in same timeframe
    offset_nm: (optional) what to use as offset timings - in same timeframe
    timing_mm: (optional) time indicator in the timedomain dataframe
    returns the same dataframe with the columns 'block', 'segment' and 'segment_all'
    (re)created. Samples between the first onset and the last offset of a group (both
    ends included) get that group's 'block' / 'segment' value, where these are groupby
    columns, and the group's running number in 'segment_all'. All other samples are NaN."""

    # per group: onset of its first stimulus and offset of its last stimulus
    grouped = df_beh.groupby(groupby_nm)
    onset_df = grouped.first()[onset_nm].reset_index()
    offset_df = grouped.last()[offset_nm].reset_index()

    # predefine all new columns in our timedomain dataframe
    df['block'] = np.nan
    df['segment'] = np.nan
    df['segment_all'] = np.nan

    # only group keys can be copied over; 'block' used to be created but never filled,
    # and 'segment' was required to be a groupby column
    key_columns = [name for name in ('block', 'segment') if name in onset_df.columns]

    timestamps = df[timing_mm]

    # loop over all groups (rows of onset_df and offset_df are in the same order)
    for idx in range(len(onset_df)):

        # samples that fall inside this group's time window (computed once per group)
        in_window = ((timestamps >= onset_df[onset_nm].iloc[idx])
                     & (timestamps <= offset_df[offset_nm].iloc[idx]))

        # copy the group keys to those samples
        for name in key_columns:
            df.loc[in_window, name] = onset_df[name].iloc[idx]

        # save per segment indicator
        df.loc[in_window, 'segment_all'] = idx

    return df


def time_save_onoff(df, onoff_nm='onoff', indicator='stimulus'):
    """save onoff value (0 / 1), based on indicator value

    df: timedomain dataframe containing the indicator column; it is modified in place
    onoff_nm: (optional) name of the column to write
    indicator: (optional) name of the column to test
    returns the same dataframe, with onoff_nm set to 1 wherever the indicator is
    larger than 0 and to 0 everywhere else (including where the indicator is NaN)"""

    # predefine as "off"
    df[onoff_nm] = 0
    # wherever there is any stimulus - set to 1 (uses the requested column names;
    # 'onoff' and 'stimulus' used to be hard-coded here)
    df.loc[df[indicator] > 0, onoff_nm] = 1

    return df


# create a function to assign stimuli based on timing
def _assign_stimulus(timing, 
                     df_beh, 
                     cn_onset='timing_meg', 
                     cn_offset='timing_offset_meg',
                     cn_stimulus='stimulus'):
    """pandas apply function to get stimuli into the time domain

    timing: a single timestamp, in the same clock domain as the onset / offset columns
    df_beh: stim domain dataframe; its onset column must be sorted ascending, as
            np.searchsorted requires
    cn_onset: (optional) columnname of onset time
    cn_offset: (optional) columnname of offset time
    cn_stimulus: (optional) columnname of stimulus
    returns the stimulus value of the row whose window [onset, offset) contains the
    timestamp, or 0 when the timestamp falls outside every window"""

    # number of stimuli that have started at or before this timestamp; side='right'
    # makes the onset itself part of the stimulus (with the default side a sample
    # exactly at an onset was attributed to the previous stimulus or to no stimulus)
    n_started = np.searchsorted(df_beh[cn_onset], timing, side='right')

    # before the very first onset
    if n_started == 0:
        return 0

    # the most recently started stimulus is the only candidate; it no longer counts
    # once its offset has been reached
    candidate = n_started - 1
    if timing >= df_beh[cn_offset].iloc[candidate]:
        return 0
    return df_beh[cn_stimulus].iloc[candidate]


### Example of use

Below are some examples of use. Note that these examples were copied from my MEG script and are not directly translatable; however, they should give you the gist of how the approach works.

- I also left in the mapping of segment data, to show that these functions should be relatively robust in multiple scenarios,
where `map_columns_to_time(df, df_beh, col_to_trans, indicator_nm='stimulus')` will fit columns based on stimulus as indicator and
`map_columns_to_time(df, df_beh, col_to_trans, indicator_nm='segment_all')` will fit columns based on segment (but can also be used for blocks etc.)

In [43]:
# NOTE(review): `df` (the time domain dataframe) is not created anywhere in the visible
# notebook; with the defaults used below it needs a 'TIMESTAMP' and a 'BLOCK' column.

# map stimulus indexing to timedomain (123 > 00011100222000333)
df = map_stim_to_time(df, df_beh, cn_stim='stimulus', cn_ts='TIMESTAMP', 
                     beh_cn_onset='timing_meg', beh_cn_offset='timing_offset_meg')

# get columns of interest to transfer and apply stim specific mapping
# NOTE(review): only 'frequencies' and 'frequencies_oct' are produced by stims_load;
# the other columns are presumably added to df_beh elsewhere - confirm before running
col_to_trans = ['frequencies', 'frequencies_oct', 'forward_adapation', 'forward_adapted_activation', 'surprisal', 'pred_prob']
df = map_columns_to_time(df, df_beh, col_to_trans)

# save segment and segment all in timedomain and segment all in main stimulus domain dataframe
# (both dataframes need 'segment_all' before it can be used as indicator further down)
df = time_save_segments(df, df_beh)
df_beh = stim_save_segments(df_beh)

# use the blocknumber to runnumber pairing in stimulus domain to map block to run in time domain
# (by default this reads the block number from df['BLOCK'])
df = map_block_to_run(df, df_beh)

# map segment specific data onto current segment
col_to_trans = ['center_freq_a', 'center_freq_b', 'center_freq_a_oct', 'center_freq_b_oct', 'probability_a', 'probability_b']
df = map_columns_to_time(df, df_beh, col_to_trans, indicator_nm='segment_all')

# save onoff inside timing dataframe (1 wherever a stimulus number was assigned)
df = time_save_onoff(df)