In [8]:
import os

import numpy as np
import pandas as pd

In [9]:
# Input CSV loaded with pd.read_csv below.
DATA = '../data/malcolm_clean_w_duration.csv'
# Output directory prefix; one CSV per threshold is written under it.
# The trailing slash matters where the path is built by string concatenation.
SAVE_DIR = '../data/2023-04-05/'
# Minimum-duration thresholds; rows with duration >= threshold are kept for each run.
# NOTE(review): the unit of `duration` is not visible here (presumably days) -- confirm.
DURATIONS = [21, 25, 27, 28]

In [10]:
# Load the input table and report its size; equal counts mean one row per MID.
df = pd.read_csv(DATA)
print('number of rows: {}'.format(len(df)))
print('number of unique mids: {}'.format(df.MID.nunique()))

number of rows: 1663
number of unique mids: 1663


In [15]:
def _with_group_mean(frame, value_col, mean_col, keys=('Study', 'Drug')):
    """Return a copy of `frame` with `mean_col` = mean of `value_col` within each `keys` group."""
    keys = list(keys)
    group_means = frame.groupby(keys)[value_col].mean().reset_index(name=mean_col)
    merged = frame.merge(group_means, on=keys, validate='many_to_one')
    # The merge is an inner join and groupby skips NaN keys, so rows with a
    # missing key would vanish silently; fail loudly instead.
    assert len(merged) == len(frame), (
        'row count changed while adding ' + mean_col)
    return merged


def _add_centered(frame, value_col):
    """Append `<value_col>_sm` (Study-Drug mean) and `<value_col>_cen` (value minus that mean)."""
    mean_col = value_col + '_sm'
    out = _with_group_mean(frame, value_col, mean_col)
    out[value_col + '_cen'] = out[value_col] - out[mean_col]
    return out


def process_data(df, min_duration, fn):
    """Filter by duration, derive normalised volume columns, and write them to CSV.

    Steps:
      1. Keep rows with ``duration >= min_duration`` and the columns Study,
         Group, Drug, Control, MID, start_size (renamed ``V0``), end_size
         (renamed ``V``) and duration.
      2. Add ``C0``: the mean ``V0`` over the control rows (``Control == 1``)
         of the same Study. Studies with no control row left after the
         duration filter are dropped; the number of MIDs lost is printed.
      3. Add ``V/V0``, ``V/C0``, ``lg_V/C0`` (natural log), ``log_V/V0``
         (log2), ``V0_samp`` (Study-Drug mean of ``V0``) and ``log_V/V0_samp``
         (log2 of ``V / V0_samp``). Every ratio/log column also gets ``_sm``
         (its Study-Drug mean) and ``_cen`` (value minus that mean).

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    min_duration : number
        Inclusive lower bound on ``duration``.
    fn : str or path-like
        Destination CSV, written without the index.

    Returns
    -------
    pandas.DataFrame
        The processed table that was written to ``fn``.

    Raises
    ------
    AssertionError
        If no control rows survive the filter, a study has more than one
        control Drug or Group, a merge changes the row count, or the number of
        dropped MIDs does not match the studies lacking a control.
    """
    d = df.loc[df.duration >= min_duration]
    print('min_duration = ' + str(min_duration))
    print('num rows: ' + str(len(d)))
    d = d[['Study', 'Group', 'Drug', 'Control', 'MID',
           'start_size', 'end_size', 'duration']]
    d = d.rename(columns={'start_size': 'V0', 'end_size': 'V'})

    # V/V0, centered within each (Study, Drug) group
    d['V/V0'] = d['V'] / d['V0']
    d = _add_centered(d, 'V/V0')

    # Each study must have exactly one control (one Drug, one Group).
    controls = d.loc[d.Control == 1]
    assert not controls.empty, (
        'no control rows with duration >= ' + str(min_duration))
    controls_by_study = controls.groupby('Study')
    assert (controls_by_study.Drug.nunique() == 1).all(), (
        'a study has more than one control Drug')
    assert (controls_by_study.Group.nunique() == 1).all(), (
        'a study has more than one control Group')

    # C0 = mean control V0 per study. The inner merge drops every study that
    # has no control row, so count the affected MIDs first and verify after.
    mids_before = d.MID.nunique()
    has_control = d.Study.isin(controls.Study.unique())
    nmids_wo_control = d.loc[~has_control].MID.nunique()
    c0 = controls_by_study['V0'].mean().reset_index(name='C0')
    d = d.merge(c0, on=['Study'], validate='many_to_one')
    print('dropped ' + str(nmids_wo_control) + ' MIDs without a control with duration >= ' + str(min_duration))
    assert mids_before - d.MID.nunique() == nmids_wo_control, (
        'MIDs dropped by the C0 merge do not match studies lacking a control')

    # V/C0, centered
    d['V/C0'] = d['V'] / d['C0']
    d = _add_centered(d, 'V/C0')

    # log(V/C0), centered
    # NOTE(review): this column uses the natural log while the V/V0 columns
    # below use log2; kept as-is so the output is unchanged -- confirm intent.
    d['lg_V/C0'] = np.log(d['V/C0'])
    d = _add_centered(d, 'lg_V/C0')

    # log2(V/V0), centered
    d['log_V/V0'] = np.log2(d['V/V0'])
    d = _add_centered(d, 'log_V/V0')

    # V0_samp: Study-Drug mean of V0 (no centered column for this one)
    d = _with_group_mean(d, 'V0', 'V0_samp')

    # log2(V/V0_samp), centered
    d['log_V/V0_samp'] = np.log2(d['V'] / d['V0_samp'])
    d = _add_centered(d, 'log_V/V0_samp')

    d.to_csv(fn, index=False)
    return d

In [16]:
# process data for each duration
# Write one processed CSV per minimum-duration threshold.
# Create the output directory first so a fresh checkout does not fail when
# process_data writes its CSV.
os.makedirs(SAVE_DIR, exist_ok=True)
for min_duration in DURATIONS:
    fn = os.path.join(SAVE_DIR, 'min_duration_' + str(min_duration) + '.csv')
    process_data(df, min_duration, fn)

min_duration = 21
num rows: 1445
dropped 75 MIDs without a control with duration >= 21
min_duration = 25
num rows: 1159
dropped 59 MIDs without a control with duration >= 25
min_duration = 27
num rows: 911
dropped 59 MIDs without a control with duration >= 27
min_duration = 28
num rows: 817
dropped 52 MIDs without a control with duration >= 28
