In [1]:
import numpy as np
import pandas as pd

MIN_VOL = 1.0
end_day = 22

In [2]:
def get_start_day(df):
	start_day = df.loc[df.groupby('MID')['Day'].idxmin()]
	return start_day.rename(columns = {'Day': 'start', 'Volume': 'start_vol'})

def get_last_day(df):
	last_day = df.loc[df.groupby('MID')['Day'].idxmax()]
	return last_day.rename(columns = {'Day': 'end', 'Volume': 'end_vol'})

def add_volume_columns(df, start_day):
	assert len(start_day) == df.MID.nunique()
	old_len = len(df)
	df = df.merge(start_day[['MID', 'start', 'start_vol']], on='MID', validate='many_to_one')
	assert old_len == len(df)
	# compute functions of volume
	df['V_V0'] = df['Volume'].div(df['start_vol'])
	df['log(V_V0)'] = np.log2(df['V_V0'])
	return df

def drop_short_duration_mids(df, start_day, end_day):
	# ensure all mids start at day 1
	assert (start_day.start == 1).all()
	last_day = get_last_day(df)
	mids_to_drop = last_day.loc[last_day.end < end_day].MID.unique()
	return df.loc[~df.MID.isin(mids_to_drop)]

def interpolate_end_day(df, end_day):
	return df

def create_end_day(df, end_day):
	end_df = df.loc[df.Day == end_day]
	assert end_df.Day.isin([end_day]).all()
	return end_df[['MID', 'Sample', 'Drug', 'Volume', 'V_V0', 'log(V_V0)']]

# clip volumes to 1 mm^3
# Find V0 for each MID
# Add V/V0 col, grouped by each MID
# Add log(V/V0) col for each MID
# Select MIDs with end-day > 21
# For ea MID, interpolate log(V/V0) at day 21
#

In [3]:
df = pd.read_csv('../results/2023-05-26/clean_and_split_data/welm_pdx_clean_mid.csv')
df.head()

Unnamed: 0,MID,Sample,Drug,Day,Volume
0,0,HCI-010,Navitoclax,1.0,163.9208
1,0,HCI-010,Navitoclax,4.0,158.374381
2,0,HCI-010,Navitoclax,8.0,197.154048
3,0,HCI-010,Navitoclax,11.0,158.582177
4,0,HCI-010,Navitoclax,15.0,176.645


In [4]:
print('num MIDs: ' + str(df.MID.nunique()))

num MIDs: 264


In [5]:
# clip volumes
df['Volume'] = df['Volume'].clip(lower=MIN_VOL)
start_day = get_start_day(df)
df = add_volume_columns(df, start_day)

In [6]:
df = drop_short_duration_mids(df, start_day, end_day)

In [7]:
print('num MIDs: ' + str(df.MID.nunique()))

num MIDs: 241


In [8]:
df.head()

Unnamed: 0,MID,Sample,Drug,Day,Volume,start,start_vol,V_V0,log(V_V0)
0,0,HCI-010,Navitoclax,1.0,163.9208,1.0,163.9208,1.0,0.0
1,0,HCI-010,Navitoclax,4.0,158.374381,1.0,163.9208,0.966164,-0.04966
2,0,HCI-010,Navitoclax,8.0,197.154048,1.0,163.9208,1.20274,0.266324
3,0,HCI-010,Navitoclax,11.0,158.582177,1.0,163.9208,0.967432,-0.047768
4,0,HCI-010,Navitoclax,15.0,176.645,1.0,163.9208,1.077624,0.107854


In [12]:
def get_vol_value(mid, df, end_day):
    if len(df.loc[(df.MID == mid) & (df.Day == end_day)]) > 0:
        return df.loc[(df.MID == mid) & (df.Day == end_day)]['log(V_V0)']
    x = df.loc[df.MID == mid]['Day'].to_numpy()
    y = df.loc[df.MID == mid]['log(V_V0)'].to_numpy()
    assert np.max(x) > end_day
    f = scipy.interpolate.interp1d(x, y)
    vfcn = f(end_day)
    return vfcn

In [15]:
# initialize new matrix: MID, Day, log(V_V0)
end_df = pd.DataFrame({'MID': [], 'Day': [], 'log(V_V0)': []})
for mid in list(df.MID.unique()):
    vol_value = get_vol_value(mid, df, end_day)
    end_df.loc[len(end_df)] = [mid, end_day, vol_value]
    print(vol_value)
    # append MID, end_day, end_vol to end_df 
end_df

  element = np.asarray(element)


KeyError: 0

In [None]:
end_df = pd.DataFrame({'MID': [], 'Day': [], 'log(V_V0)': []})
end_df

In [None]:
end_df.loc[0] = [203, 22, 1.3]
end_df

In [None]:
end_df.head()

In [None]:
df.Volume.describe()

In [None]:

interpolate_end_day(df, end_day)
create_end_day(df, end_day)

In [None]:
df = pd.DataFrame({'col1':[0,1,0],
                   'col2':[2,3,1]})
df

In [None]:

test = df['col1'].isin([0,1]).all()
