In [1]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf

# Day cutoff passed as `end_day` to get_max_end_value / split_by_duration
END_DAY = 2
# Lower bound applied to end volumes by get_log_volume before taking the log
MIN_VOL = 1.0
# NOTE(review): not referenced in any of the visible cells -- confirm it is still needed
NUM_ARGS = 4
# y-axis limits used for the per-MID volume-vs-day plots
YMIN = 0
YMAX = 1000

In [2]:
def get_start(df):
    """Return, for each MID, the row with the smallest ``Day``.

    The ``Day`` and ``Volume`` columns of the selected rows are renamed to
    ``start`` and ``start_vol``; all other columns are passed through.
    """
    first_idx = df.groupby('MID')['Day'].idxmin()
    first_rows = df.loc[first_idx]
    return first_rows.rename(columns={'Day': 'start', 'Volume': 'start_vol'})

def get_end(df):
    """Return, for each MID, the row with the largest ``Day``.

    The ``Day`` and ``Volume`` columns of the selected rows are renamed to
    ``end`` and ``end_vol``; all other columns are passed through.
    """
    last_idx = df.groupby('MID')['Day'].idxmax()
    last_rows = df.loc[last_idx]
    return last_rows.rename(columns={'Day': 'end', 'Volume': 'end_vol'})

def get_log_volume(df, min_vol=None):
    """Add the end/start volume ratio and its base-2 logarithm to ``df``.

    The frame is modified in place (``end_vol`` is overwritten with its
    clipped values, ``V_V0`` and ``log(V_V0)`` are added) and the same
    object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``start_vol`` and ``end_vol`` columns.
    min_vol : float, optional
        Lower bound applied to ``end_vol`` before dividing. When omitted,
        the module-level ``MIN_VOL`` is used. It must be positive for the
        logarithm to stay finite.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the columns described above.
    """
    if min_vol is None:
        min_vol = MIN_VOL
    # clip end volumes so that all are non-zero (a zero ratio would give log2 = -inf)
    df['end_vol'] = df['end_vol'].clip(lower=min_vol)
    df['V_V0'] = df['end_vol'].div(df['start_vol'])
    df['log(V_V0)'] = np.log2(df['V_V0'])
    return df

def split_by_duration(df, end_day):
    """Partition ``df`` by whether each MID's ``end`` is before ``end_day``.

    Returns a 4-tuple ``(short_mids, short_df, normal_mids, normal_df)``:
    the MIDs with ``end < end_day`` and their rows, followed by the MIDs
    with ``end >= end_day`` and their rows. Asserts that the two MID sets
    are disjoint and together account for every distinct MID in ``df``.
    """
    total_mids = df['MID'].nunique()

    ends_early = df['end'] < end_day
    short_mids = df.loc[ends_early, 'MID'].unique()
    short_df = df.loc[df['MID'].isin(short_mids)]

    ends_on_or_after = df['end'] >= end_day
    normal_mids = df.loc[ends_on_or_after, 'MID'].unique()
    normal_df = df.loc[df['MID'].isin(normal_mids)]

    # every MID must land in exactly one of the two groups
    assert set(short_mids).isdisjoint(set(normal_mids))
    assert len(short_mids) + len(normal_mids) == total_mids
    return short_mids, short_df, normal_mids, normal_df

def assign_single_value(df, mids, max_val):
    """Set ``log(V_V0)`` to ``max_val`` on every row and collapse to unique rows.

    ``df`` is written to in place (its ``log(V_V0)`` column is overwritten),
    so pass a copy if the caller's frame must stay untouched. The returned
    frame holds only MID, Sample, Drug and log(V_V0) with duplicates removed.
    Asserts that the result has ``len(mids)`` rows and contains every MID
    in ``mids``.
    """
    keep_cols = ['MID', 'Sample', 'Drug', 'log(V_V0)']
    df['log(V_V0)'] = max_val
    per_mid = df[keep_cols].drop_duplicates()
    assert len(per_mid) == len(mids)
    assert set(per_mid['MID'].unique()) >= set(mids)
    return per_mid

def assign_end_value(df, mids, end_day):
    """Return one ``log(V_V0)`` row per MID, taken from the rows at ``end_day``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain MID, Sample, Drug, Day and log(V_V0) columns.
    mids : sequence
        The MIDs expected in the result.
    end_day : number
        Only rows whose ``Day`` equals this value are kept.

    Returns
    -------
    pandas.DataFrame
        Columns MID, Sample, Drug, log(V_V0), duplicates removed.

    Raises
    ------
    AssertionError
        If the number of distinct rows differs from ``len(mids)`` (a MID is
        missing or appears with more than one distinct row), or if a MID in
        ``mids`` has no row on ``end_day``.
    """
    # only keep rows for the end_day
    df = df.loc[df.Day == end_day]
    df = df[['MID', 'Sample', 'Drug', 'log(V_V0)']].drop_duplicates()
    # ensure no mid's dropped: compare the row count itself --
    # `len(df == len(mids))` would only test that the frame is non-empty
    assert len(df) == len(mids), (
        'expected %d rows at end_day, got %d' % (len(mids), len(df))
    )
    assert set(df.MID.unique()) >= set(mids), 'some MIDs have no row at end_day'
    return df

def get_max_end_value(df, end_day):
    """Return the largest ``log(V_V0)`` among each MID's latest row up to ``end_day``.

    Rows with ``Day > end_day`` are ignored. For every MID the row with the
    greatest remaining ``Day`` is selected, and the maximum ``log(V_V0)``
    over those rows is returned. Asserts that every MID in ``df`` still has
    a row after the day filter.
    """
    expected_mids = df['MID'].nunique()
    within_window = df.loc[df['Day'] <= end_day]
    # group by MID to get latest day for each MID
    latest_idx = within_window.groupby('MID')['Day'].idxmax()
    latest_rows = within_window.loc[latest_idx]
    assert len(latest_rows) == expected_mids
    return latest_rows['log(V_V0)'].max()

def set_duration(df, end_day):
    """Build a one-row-per-MID table of ``log(V_V0)`` values for ``end_day``.

    MIDs whose ``end`` is before ``end_day`` all receive the value returned
    by ``get_max_end_value``; the remaining MIDs get their own value from
    the row at ``end_day``. Asserts that the combined table has one row per
    distinct MID in ``df`` and covers all of them.
    """
    all_mids = df.MID.unique()
    expected_rows = df.MID.nunique()

    # get max value through end_day
    ceiling = get_max_end_value(df, end_day)

    # split data between mids with short durations and the rest
    parts = split_by_duration(df, end_day)
    short_mids, short_rows, normal_mids, normal_rows = parts

    # short mids share the single max value; work on a deep copy because
    # assign_single_value writes into the frame it is given
    short_part = assign_single_value(short_rows.copy(deep=True), short_mids, ceiling)
    # normal mids keep their individual value at end_day
    normal_part = assign_end_value(normal_rows, normal_mids, end_day)

    combined = pd.concat([short_part, normal_part]).reset_index(drop=True)
    assert len(combined) == expected_rows
    assert set(combined.MID.unique()) >= set(all_mids)
    return combined
    
def get_start_and_end(df):
    """Merge each MID's first and last measurement onto every row of ``df``.

    The per-MID frames from ``get_start`` and ``get_end`` are joined on
    MID, Sample and Drug with a many-to-one check. Asserts that both
    per-MID frames have exactly one row per distinct MID.

    NOTE(review): columns other than the join keys, Day and Volume are
    present on both sides of each merge, so pandas will suffix them --
    confirm that is acceptable for the input table.
    """
    key_cols = ['MID', 'Sample', 'Drug']
    first_rows = get_start(df)
    last_rows = get_end(df)
    expected = df.MID.nunique()
    assert (len(first_rows) == expected) and (len(last_rows) == expected)

    merged = df
    for per_mid in (first_rows, last_rows):
        merged = merged.merge(per_mid, on=key_cols, validate='many_to_one')
    return merged

In [3]:
# Relative path to the per-MID volume table (columns seen in the preview: MID, Sample, Drug, Day, Volume)
data = '../results/2023-06-13/clean_and_split_data/welm_pdx_clean_mid.csv'
df = pd.read_csv(data)
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index -- confirm whether it should be read with index_col=0
df.head()

Unnamed: 0,MID,Sample,Drug,Day,Volume
0,0,HCI-010,Navitoclax,1.0,163.9208
1,0,HCI-010,Navitoclax,4.0,158.374381
2,0,HCI-010,Navitoclax,8.0,197.154048
3,0,HCI-010,Navitoclax,11.0,158.582177
4,0,HCI-010,Navitoclax,15.0,176.645


In [4]:
# attach each MID's first/last measurement (start, start_vol, end, end_vol) to every row
d = get_start_and_end(df)
# check all mids start on day 1
assert (d.start == 1).all()
# add the V_V0 ratio (end_vol / start_vol) and its log2; get_log_volume also clips end_vol in place
d = get_log_volume(d)
n_mids = d.MID.nunique()
mids = d.MID.unique()
# get max value through end_day
max_val = get_max_end_value(d, END_DAY)
# split data between mids whose last day is before END_DAY (short) and mids
# whose last day is on or after END_DAY (normal)
short_mids, short_df, normal_mids, normal_df = split_by_duration(d, END_DAY)

In [5]:
# display the MIDs whose last measurement day is before END_DAY
short_mids

array([ 21,  22,  23,  24,  25,  26,  27,  28,  29,  57,  58,  59,  60,
        61,  62, 195, 198, 200, 233, 235, 236, 237, 240])

In [6]:
# Plot volume vs. day for every short-duration MID and write all pages to one PDF.
# NOTE(review): this rebinds `df` (the table loaded from CSV) to the merged
# frame `d`; re-running the cell that calls get_start_and_end(df) afterwards
# would operate on the merged frame -- confirm that is intended.
df = d
fig_list = []
for mid in short_mids:
    # look up the rows and labels first so no figure is created if a lookup fails
    m = df.loc[df.MID == mid]
    sample = m.Sample.unique()[0]
    drug = m.Drug.unique()[0]
    f, ax = plt.subplots()
    ax.set_title('MID: ' + str(mid) + ', Sample: ' + sample + ', Drug: ' + drug)
    ax.set_xlabel('Day')
    ax.set_ylabel('Volume mm^3')
    # fixed y-range so the pages are directly comparable
    ax.set_ylim([YMIN, YMAX])
    ax.plot(m.Day, m.Volume)
    fig_list.append(f)
    # unregister from pyplot; the Figure object kept in fig_list is saved below
    plt.close(f)
# save all figures to one pdf; the context manager closes the file even if
# saving a page raises
with matplotlib.backends.backend_pdf.PdfPages('short_mids_volume_vs_day.pdf') as p:
    for f in fig_list:
        p.savefig(f)