In [1]:
import numpy as np
import pandas as pd
from scipy import interpolate
import sys

# Study day passed to set_duration as end_day: the day whose log(V_V0) is kept per mid.
END_DAY = 22
# Lower bound applied to end volumes in get_log_volume before taking the ratio.
MIN_VOL = 1.0
# NOTE(review): not referenced in the cells visible here — presumably left over
# from a command-line version of this code; confirm before removing.
NUM_ARGS = 4

def get_start(df):
    """Return, for each MID, the row holding its smallest 'Day'.

    The selected rows keep their original index labels; the 'Day' and
    'Volume' columns are renamed to 'start' and 'start_vol'. The input
    frame is not modified.
    """
    first_day_idx = df.groupby('MID')['Day'].idxmin()
    first_rows = df.loc[first_day_idx]
    renames = {'Day': 'start', 'Volume': 'start_vol'}
    return first_rows.rename(columns=renames)

def get_end(df):
    """Return, for each MID, the row holding its largest 'Day'.

    The selected rows keep their original index labels; the 'Day' and
    'Volume' columns are renamed to 'end' and 'end_vol'. The input frame
    is not modified.
    """
    last_day_idx = df.groupby('MID')['Day'].idxmax()
    last_rows = df.loc[last_day_idx]
    renames = {'Day': 'end', 'Volume': 'end_vol'}
    return last_rows.rename(columns=renames)

def get_log_volume(df):
    """Add the volume ratio and its base-2 logarithm to ``df``.

    Reads the 'end_vol' and 'start_vol' columns. 'end_vol' is overwritten
    with its value clipped below at MIN_VOL, then 'V_V0'
    (end_vol / start_vol) and 'log(V_V0)' (log2 of that ratio) are written.
    The frame is modified in place and the same object is returned.
    """
    # floor the end volumes at MIN_VOL so the ratio's numerator is never zero
    floored_end = df['end_vol'].clip(lower=MIN_VOL)
    df['end_vol'] = floored_end
    ratio = floored_end / df['start_vol']
    df['V_V0'] = ratio
    df['log(V_V0)'] = np.log2(ratio)
    return df

def split_by_duration(df, end_day):
    """Partition ``df`` by whether each MID's 'end' is before ``end_day``.

    Returns a 4-tuple ``(short_mids, short_df, normal_mids, normal_df)``:
    the unique MIDs on rows with ``end < end_day`` and all rows of those
    MIDs, followed by the same pair for rows with ``end >= end_day``.
    Asserts that the two MID groups are disjoint and together account for
    every distinct MID in ``df``.
    """
    total_mids = df['MID'].nunique()

    # identify the two groups of mids first, then pull all rows of each group
    short_mids = df.loc[df['end'] < end_day, 'MID'].unique()
    normal_mids = df.loc[df['end'] >= end_day, 'MID'].unique()
    short_df = df.loc[df['MID'].isin(short_mids)]
    normal_df = df.loc[df['MID'].isin(normal_mids)]

    # every mid must land in exactly one group
    assert set(short_mids).isdisjoint(set(normal_mids))
    assert len(short_mids) + len(normal_mids) == total_mids
    return short_mids, short_df, normal_mids, normal_df

def assign_single_value(df, mids, max_val):
    """Give every row the same 'log(V_V0)' and collapse to one row per mid.

    The 'log(V_V0)' column of ``df`` is overwritten in place with
    ``max_val`` (pass a copy to protect the caller's frame). The result
    keeps only 'MID', 'Sample', 'Drug' and 'log(V_V0)' with duplicate rows
    removed. Asserts that the result has ``len(mids)`` rows and that every
    entry of ``mids`` is present.
    """
    wanted = ['MID', 'Sample', 'Drug', 'log(V_V0)']
    df['log(V_V0)'] = max_val
    per_mid = df.loc[:, wanted].drop_duplicates()
    assert len(per_mid) == len(mids)
    assert set(mids) <= set(per_mid['MID'].unique())
    return per_mid

def assign_end_value(df, mids, end_day):
    """Return one row per mid holding its 'log(V_V0)' on ``end_day``.

    Parameters
    ----------
    df : DataFrame with 'MID', 'Sample', 'Drug', 'Day' and 'log(V_V0)' columns.
    mids : collection of MIDs expected in the result.
    end_day : value of 'Day' whose rows are kept.

    Returns
    -------
    DataFrame with columns 'MID', 'Sample', 'Drug', 'log(V_V0)', duplicates
    removed. ``df`` itself is not modified.

    Raises
    ------
    AssertionError
        If the number of resulting rows differs from ``len(mids)`` or any
        entry of ``mids`` has no row on ``end_day``.
    """
    # only keep rows for the end_day
    at_end = df.loc[df.Day == end_day]
    at_end = at_end[['MID', 'Sample', 'Drug', 'log(V_V0)']].drop_duplicates()
    # ensure no mid's dropped or duplicated; the length comparison must be
    # between two counts, not the length of an element-wise comparison
    assert len(at_end) == len(mids), \
        'expected exactly one end_day row per mid'
    assert set(at_end.MID.unique()) >= set(mids), \
        'some mids have no row on end_day'
    return at_end

def set_duration(df, end_day):
    """Reduce ``df`` to one 'log(V_V0)' row per mid, referenced to ``end_day``.

    Mids whose 'end' is before ``end_day`` all receive the largest
    'log(V_V0)' found on rows with ``Day <= end_day``; the remaining mids
    keep the value found on their ``Day == end_day`` row. The two groups
    are concatenated (short-duration mids first) with a fresh 0..n-1 index.
    Asserts that the result has one row per distinct MID of ``df`` and
    covers all of them.

    NOTE(review): when 'log(V_V0)' comes from get_log_volume it is built
    from each mid's last-day volume ('end_vol'), so the value read from the
    end_day row is not the ratio measured on end_day itself — confirm this
    is intended.
    """
    all_mids = df['MID'].unique()
    expected_rows = df['MID'].nunique()

    # largest value seen on any row up to and including end_day
    ceiling = df.loc[df['Day'] <= end_day, 'log(V_V0)'].max()

    # separate mids that stop early from those measured through end_day
    short_mids, short_df, normal_mids, normal_df = split_by_duration(df, end_day)

    # early-ending mids get the ceiling; work on a copy because the helper
    # overwrites the column in place
    short_part = assign_single_value(short_df.copy(deep=True), short_mids, ceiling)
    # the others keep their own value from the end_day row
    normal_part = assign_end_value(normal_df, normal_mids, end_day)

    combined = pd.concat([short_part, normal_part]).reset_index(drop=True)
    assert len(combined) == expected_rows
    assert set(combined['MID'].unique()) >= set(all_mids)
    return combined
    
def get_start_and_end(df):
    """Attach each mid's start/end day and volume to every row of ``df``.

    Uses get_start and get_end to find one first-day and one last-day row
    per MID, asserts that each has exactly one row per distinct MID, and
    merges both onto ``df`` by 'MID', 'Sample' and 'Drug'. Each merge is
    validated as many-to-one. Returns the merged frame.
    """
    keys = ['MID', 'Sample', 'Drug']
    first_rows, last_rows = get_start(df), get_end(df)
    mid_count = df['MID'].nunique()
    assert (len(first_rows) == mid_count) and (len(last_rows) == mid_count)
    merged = df
    # start columns are merged before end columns, fixing the column order
    for per_mid in (first_rows, last_rows):
        merged = merged.merge(per_mid, on=keys, validate='many_to_one')
    return merged

In [2]:
# Relative path of the CSV loaded with pd.read_csv in the next cell.
read_fn = '../results/2023-06-08/clean_and_split_data/welm_pdx_clean_mid.csv'

# NOTE(review): the commented-out calls below use helpers (get_start_day,
# add_volume_columns, drop_short_duration_mids, create_end_day) and variables
# (end_day, write_dir) that are not defined in the cells shown above —
# presumably an earlier version of this pipeline. Delete once confirmed obsolete.
#df['Volume'] = df['Volume'].clip(lower=MIN_VOL)
#start_day = get_start_day(df)
#df = add_volume_columns(df, start_day)
#df = drop_short_duration_mids(df, start_day, end_day)
#end_df = create_end_day(df, end_day)
#end_df.to_csv(write_dir + '/welm_pdx_clean_mid_volume.csv', index=False)

In [3]:
# load the measurements from read_fn
df = pd.read_csv(read_fn)
# attach each mid's first/last day and the volumes recorded on those days
d = get_start_and_end(df)
# every mid is expected to have its first measurement on day 1
assert d['start'].eq(1).all()
# add log(V/V0), then reduce to one row per mid: mids that end before END_DAY
# are assigned the max log(V/V0), the others keep their END_DAY value
d = set_duration(get_log_volume(d), END_DAY)
d.head()

Unnamed: 0,MID,Sample,Drug,log(V_V0)
0,21,HCI-023,Docetaxel,5.306245
1,22,HCI-023,Docetaxel,5.306245
2,23,HCI-023,Docetaxel,5.306245
3,24,HCI-023,Vehicle,5.306245
4,25,HCI-023,Vehicle,5.306245


In [4]:
# number of rows in the final table (set_duration asserts one per mid)
d.shape[0]

264