In [1]:
import numpy as np
import pandas as pd

In [2]:
# Path of the input CSV that is loaded into `df` with pd.read_csv.
DATA = '../data/malcolm_clean_w_duration.csv'
# Output prefix; file names are appended by plain string concatenation,
# so the trailing slash is required.
SAVE_DIR = '../data/'
# Minimum-duration thresholds: one processed CSV is written per value, keeping
# only rows whose `duration` is >= the threshold.
# NOTE(review): units are presumably days -- confirm against the data source.
DURATIONS = [21, 25, 27, 28]

In [3]:
# Load the input table; matching counts below mean every row has a distinct MID.
df = pd.read_csv(DATA)
print(f'number of rows: {len(df)}')
print(f"number of unique mids: {df['MID'].nunique()}")

number of rows: 1663
number of unique mids: 1663


In [4]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,Study,Group,Drug,Control,MID,start,start_size,end,end_size,duration
0,J000100672,3,Cisplatin,0.0,396-11,0.0,212.49,27.0,1275.52,27.0
1,J000100672,3,Cisplatin,0.0,396-20,0.0,180.48,27.0,1115.51,27.0
2,J000100672,3,Cisplatin,0.0,396-21,0.0,76.94,27.0,1056.95,27.0
3,J000100672,3,Cisplatin,0.0,396-31,0.0,151.44,27.0,1330.26,27.0
4,J000100672,3,Cisplatin,0.0,396-32,0.0,127.7,27.0,746.38,27.0


In [5]:
def _add_group_centered(d, col, keys=('Study', 'Drug')):
    """Append the per-group mean of `col` and the mean-centered `col`.

    Returns a new frame with two extra columns: `<col>_sm`, the mean of `col`
    within each `keys` group, and `<col>_cen`, `col` minus that group mean.
    The input frame is not modified.
    """
    keys = list(keys)
    group_means = d.groupby(keys)[col].mean().reset_index(name=col + '_sm')
    # Explicit keys + many_to_one: each row must match exactly one group mean.
    d = d.merge(group_means, on=keys, validate='many_to_one')
    d[col + '_cen'] = d[col] - d[col + '_sm']
    return d


def process_data(df, min_duration, fn):
    """Build the normalized tumor-size table for one duration cutoff and save it.

    Steps:
      1. Keep rows with ``duration >= min_duration`` and rename
         ``start_size`` -> ``V0`` and ``end_size`` -> ``V``.
      2. Compute ``V/V0`` and center it on its (Study, Drug) mean.
      3. Compute ``C0``, the mean ``V0`` of the control rows (``Control == 1``)
         of each Study, and attach it to every row of that Study.
      4. Compute ``V/C0`` and ``lg_V/C0`` (natural log of ``V/C0``), each
         centered on its (Study, Drug) mean.

    Each centered quantity ``X`` contributes the columns ``X``, ``X_sm`` (group
    mean) and ``X_cen`` (``X - X_sm``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Study, Group, Drug, Control, MID, start_size,
        end_size and duration. It is not modified.
    min_duration : number
        Inclusive lower bound on ``duration``.
    fn : str or path-like
        Destination CSV path (written without the index).

    Returns
    -------
    pandas.DataFrame
        The frame that was written to `fn`.

    Raises
    ------
    AssertionError
        If no control rows remain after filtering, or if the control rows of
        any Study span more than one Drug or more than one Group.

    Notes
    -----
    ``C0`` is attached with an inner merge, so Studies that have no control
    rows are dropped from the output. Zero ``V0`` and non-positive ``V/C0``
    values are not guarded against.
    """
    d = df.loc[df.duration >= min_duration]
    d = d[['Study', 'Group', 'Drug', 'Control', 'MID', 'start_size', 'end_size', 'duration']]
    d = d.rename(columns={'start_size': 'V0', 'end_size': 'V'})

    # V/V0, centered per (Study, Drug)
    d['V/V0'] = d['V'] / d['V0']
    d = _add_group_centered(d, 'V/V0')

    # Control rows: every Study that has controls must have exactly one
    # control Drug and exactly one control Group.
    c = d.loc[d.Control == 1]
    assert not c.empty, 'no control rows (Control == 1) remain after the duration filter'
    controls_by_study = c.groupby('Study')
    assert (controls_by_study['Drug'].nunique() == 1).all(), \
        'a Study has control rows spanning zero or several Drugs'
    assert (controls_by_study['Group'].nunique() == 1).all(), \
        'a Study has control rows spanning zero or several Groups'

    # C0 = mean starting size of the control rows, one value per Study
    c0 = controls_by_study['V0'].mean().reset_index(name='C0')
    d = d.merge(c0, on=['Study'], validate='many_to_one')

    # V/C0, centered per (Study, Drug)
    d['V/C0'] = d['V'] / d['C0']
    d = _add_group_centered(d, 'V/C0')

    # log(V/C0), centered per (Study, Drug)
    d['lg_V/C0'] = np.log(d['V/C0'])
    d = _add_group_centered(d, 'lg_V/C0')

    d.to_csv(fn, index=False)
    return d

In [6]:
# Write one processed CSV into SAVE_DIR for every minimum-duration threshold.
for min_duration in DURATIONS:
    fn = f'{SAVE_DIR}min_duration_{min_duration}.csv'
    process_data(df, min_duration, fn)

In [8]:
# Sanity check: number of distinct MIDs with duration >= 21.
df.loc[df['duration'] >= 21, 'MID'].nunique()

1445