In [1]:
import numpy as np
import pandas as pd
from scipy import interpolate

# Lower bound applied to the Volume column before ratios and logs are computed.
MIN_VOL = 1.0
# Day at which the end-point value is read (or interpolated) for every MID.
end_day = 22

In [2]:
def get_start_day(df):
	"""Return, for every MID, the row holding its smallest Day.

	The ``Day`` and ``Volume`` columns of the selected rows are renamed to
	``start`` and ``V0``; all other columns and the original index labels
	are kept as they are.
	"""
	# index label of the earliest observation of each MID
	first_obs_idx = df.groupby('MID')['Day'].idxmin()
	first_obs = df.loc[first_obs_idx]
	renamed_columns = {'Day': 'start', 'Volume': 'V0'}
	return first_obs.rename(columns=renamed_columns)

def get_last_day(df):
	"""Return, for every MID, the row holding its largest Day.

	The ``Day`` and ``Volume`` columns of the selected rows are renamed to
	``end`` and ``end_vol``; all other columns and the original index labels
	are kept as they are.
	"""
	# index label of the latest observation of each MID
	final_obs_idx = df.groupby('MID')['Day'].idxmax()
	final_obs = df.loc[final_obs_idx]
	renamed_columns = {'Day': 'end', 'Volume': 'end_vol'}
	return final_obs.rename(columns=renamed_columns)

def add_volume_columns(df, start_day):
	"""Attach each MID's start day / starting volume and derived volume columns.

	Parameters
	----------
	df : DataFrame with at least ``MID`` and ``Volume`` columns.
	start_day : DataFrame with one row per MID and ``MID``, ``start``, ``V0`` columns.

	Returns
	-------
	A new DataFrame: ``df`` merged with ``start``/``V0`` on ``MID`` plus
	``V_V0`` (Volume / V0) and ``log(V_V0)`` (base-2 log of that ratio).

	Raises
	------
	AssertionError if ``start_day`` does not have exactly one row per MID in
	``df`` or if the merge changes the number of rows.
	"""
	n_rows_before = len(df)
	assert len(start_day) == df.MID.nunique()
	baseline = start_day[['MID', 'start', 'V0']]
	merged = df.merge(baseline, on='MID', validate='many_to_one')
	# the merge must neither drop nor duplicate observations
	assert n_rows_before == len(merged)
	# compute functions of volume
	volume_ratio = merged['Volume'] / merged['V0']
	merged['V_V0'] = volume_ratio
	merged['log(V_V0)'] = np.log2(volume_ratio)
	return merged

def drop_short_duration_mids(df, start_day, end_day):
	"""Return the rows of ``df`` belonging to MIDs observed up to ``end_day`` or later.

	A MID is removed when its last observed Day (as reported by
	``get_last_day``) is strictly before ``end_day``.

	Raises
	------
	AssertionError if any row of ``start_day`` has a ``start`` other than 1.
	"""
	# ensure all mids start at day 1
	assert (start_day['start'] == 1).all()
	final_obs = get_last_day(df)
	ended_early = final_obs['end'] < end_day
	short_mids = final_obs.loc[ended_early, 'MID'].unique()
	keep_row = ~df['MID'].isin(short_mids)
	return df.loc[keep_row]

def create_end_day(df, end_day):
	"""Return the rows of ``df`` observed exactly on ``end_day``.

	Only the ``MID``, ``Sample``, ``Drug``, ``Volume``, ``V_V0`` and
	``log(V_V0)`` columns are returned.

	Raises
	------
	AssertionError if some MID in ``df`` has no row with ``Day == end_day``.
	"""
	on_end_day = df['Day'] == end_day
	end_rows = df.loc[on_end_day]
	# every MID must be represented on the end day
	assert df['MID'].nunique() == end_rows['MID'].nunique()
	assert end_rows['Day'].isin([end_day]).all()
	output_columns = ['MID', 'Sample', 'Drug', 'Volume', 'V_V0', 'log(V_V0)']
	return end_rows[output_columns]

In [3]:
# Load the cleaned per-MID measurements and preview the first rows.
csv_path = '../results/2023-05-26/clean_and_split_data/welm_pdx_clean_mid.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,MID,Sample,Drug,Day,Volume
0,0,HCI-010,Navitoclax,1.0,163.9208
1,0,HCI-010,Navitoclax,4.0,158.374381
2,0,HCI-010,Navitoclax,8.0,197.154048
3,0,HCI-010,Navitoclax,11.0,158.582177
4,0,HCI-010,Navitoclax,15.0,176.645


In [4]:
# Report how many distinct MIDs the loaded frame contains.
print(f"num MIDs: {df['MID'].nunique()}")

num MIDs: 264


In [5]:
# clip volumes
df['Volume'] = df['Volume'].clip(lower=MIN_VOL)
start_day = get_start_day(df)
df = add_volume_columns(df, start_day)
df = drop_short_duration_mids(df, start_day, end_day)
end_df = create_end_day(df, end_day)

In [None]:
# Number of distinct MIDs currently in df.
df['MID'].nunique()

In [None]:
# Number of distinct MIDs currently in end_df.
end_df['MID'].nunique()

In [None]:
# Preview the first rows of end_df.
end_df.head(n=5)

In [None]:
# Report how many distinct MIDs df contains at this point.
print(f"num MIDs: {df['MID'].nunique()}")

In [None]:
# Preview the first rows of df.
df.head(n=5)

In [None]:
def get_vol_value(mid, df, end_day, interpolation_kind):
    """Return ``log(V_V0)`` of one MID at ``end_day``.

    If the MID has a row whose ``Day`` equals ``end_day``, the value of the
    first such row is returned. Otherwise the value is interpolated at
    ``end_day`` from all of the MID's (Day, log(V_V0)) points.

    Parameters
    ----------
    mid : value compared against ``df.MID`` to select the rows.
    df : DataFrame with ``MID``, ``Day`` and ``log(V_V0)`` columns.
    end_day : day at which the value is wanted.
    interpolation_kind : passed to ``scipy.interpolate.interp1d`` as ``kind``
        (e.g. 'linear', 'quadratic', 'cubic').

    Returns
    -------
    float

    Raises
    ------
    AssertionError if interpolation is needed but the MID's last observed
    day is not after ``end_day``.
    """
    mid_rows = df.loc[df.MID == mid]
    # Look the value up with the requested end_day; the lookup used to be
    # hard-coded to day 22 and failed for any other end_day.
    observed = mid_rows.loc[mid_rows.Day == end_day, 'log(V_V0)']
    if len(observed) > 0:
        return float(observed.iloc[0])
    x = mid_rows['Day'].to_numpy()
    y = mid_rows['log(V_V0)'].to_numpy()
    assert np.max(x) > end_day
    f = interpolate.interp1d(x, y, kind=interpolation_kind)
    # interp1d returns a 0-d array for scalar input; convert so both
    # branches return a plain float
    return float(f(end_day))

In [None]:
# initialize new matrix: MID, Day, log(V_V0)
end_df = pd.DataFrame({'MID': [], 'end': [], 'log(V_V0)': []})
for mid in list(df.MID.unique()):
    vol_value = get_vol_value(mid, df, end_day, 'linear')
    end_df.loc[len(end_df)] = [mid, end_day, vol_value]
# merge end_df with df on MID
m = df[['MID', 'Sample', 'Drug', 'start', 'V0']].drop_duplicates()
assert len(m) == df.MID.nunique()
m = m.merge(end_df, on='MID', validate='many_to_one')
m.head()

In [None]:
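# Plan for choosing the interpolation method:
# 1. keep only the MIDs that have a measurement on Day == 22
# 2. hold out each MID's Day == 22 value and keep it in its own column
# 3. for each type of interpolation, create a column with the Day 22 value
#    interpolated from the remaining days
# 4. compute the MSE of each interpolated column against the held-out value
# 5. output the interpolation function that fits best (lowest MSE)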

In [None]:
# Count the MIDs measured exactly on end_day and the MIDs with any
# measurement on or after it. Uses `end_day` from the config cell: END_DAY is
# only assigned in a later cell, so referencing it here fails on a fresh run.
print('day22 mids in df: ' + str(df.loc[df.Day == end_day].MID.nunique()))
print('>=22 mids in df: ' + str(df.loc[df.Day >= end_day].MID.nunique()))

In [None]:
# CHOOSING INTERPOLATION FUNCTIONS
END_DAY = 22
mids_to_keep = df.loc[df.Day == END_DAY].MID.unique()
d = df.loc[df.MID.isin(mids_to_keep)].reset_index(drop=True)
d = d[['MID', 'Day', 'log(V_V0)']]
holdout = d.loc[d.Day == END_DAY].reset_index(drop=True)
holdout = holdout.rename(columns = {'log(V_V0)': 'log(V_V0)_test'})
d = d.loc[~(d.Day == END_DAY)].reset_index(drop=True)
# create end_df
end_df = pd.DataFrame({'MID': [], 'end': [], 'linear': [], 'quadratic': [], 'cubic': []})
for mid in list(d.MID.unique()):
    linear_value = get_vol_value(mid, d, end_day, 'linear')
    quadratic_value = get_vol_value(mid, d, end_day, 'quadratic')
    cubic_value = get_vol_value(mid, d, end_day, 'cubic')
    end_df.loc[len(end_df)] = [mid, end_day, linear_value, quadratic_value, cubic_value]
end_df.head()

In [None]:
# Distinct MIDs in the held-out end-day rows and in the remaining rows.
print(holdout['MID'].nunique())
print(d['MID'].nunique())

In [None]:
# Preview the first rows of the held-out end-day values.
holdout.head(n=5)

In [None]:
# Summed difference between the linear and quadratic estimates.
linear_minus_quadratic = end_df['linear'] - end_df['quadratic']
np.sum(linear_minus_quadratic)

In [None]:
# Summary statistics of the starting volume. `m` is built from the columns
# MID, Sample, Drug, start, V0 (plus the merged end-day columns), so the
# starting volume lives in 'V0'; there is no 'start_vol' column.
m['V0'].describe()

In [None]:
# Scratch: create an empty frame with MID / Day / log(V_V0) columns.
# NOTE(review): this rebinds `end_df`, discarding whatever an earlier cell
# stored under that name.
end_df = pd.DataFrame({'MID': [], 'Day': [], 'log(V_V0)': []})
end_df

In [None]:
# Scratch: write one hand-typed row at index label 0 to try out row
# assignment via .loc.
end_df.loc[0] = [203, 22, 1.3]
end_df

In [None]:
# Preview the first rows of end_df.
end_df.head(n=5)

In [None]:
# Summary statistics of the Volume column.
df['Volume'].describe()

In [None]:

# NOTE(review): `interpolate_end_day` is not defined in any visible cell -
# confirm where it comes from, otherwise this call raises NameError.
interpolate_end_day(df, end_day)
# Rows of df measured exactly on end_day (asserts every MID has one).
create_end_day(df, end_day)

In [None]:
# Scratch: small two-column frame for trying out pandas calls.
# NOTE(review): this rebinds `df`, so any cell run after it sees this 3-row
# toy frame instead of the frame built by the earlier cells.
df = pd.DataFrame({'col1':[0,1,0],
                   'col2':[2,3,1]})
df

In [None]:

# True when every value of col1 is one of the allowed values.
allowed_values = [0, 1]
test = df['col1'].isin(allowed_values).all()
