In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load data/welm_pdx_clean.csv and report how many rows were read.
df = pd.read_csv('data/welm_pdx_clean.csv')
print(f'Number of rows: {len(df)}')

# Count the distinct combinations of the key columns; this count is compared
# against the number of assigned MIDs in a later assertion.
cols = ['Sample', 'Drug', 'Replicate Number', 'excel_sheet']
estimated_mids = df.drop_duplicates(subset=cols).shape[0]

Number of rows: 2660


In [3]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Day,Drug,Other Info,Replicate Number,Sample,Tumor Volume mm3,source_file,excel_sheet
0,1.0,Navitoclax,,M0,HCI-010,163.9208,43018_2022_337_MOESM7_ESM.xlsx,6d left
1,4.0,Navitoclax,,M0,HCI-010,158.374381,43018_2022_337_MOESM7_ESM.xlsx,6d left
2,8.0,Navitoclax,,M0,HCI-010,197.154048,43018_2022_337_MOESM7_ESM.xlsx,6d left
3,11.0,Navitoclax,,M0,HCI-010,158.582177,43018_2022_337_MOESM7_ESM.xlsx,6d left
4,15.0,Navitoclax,,M0,HCI-010,176.645,43018_2022_337_MOESM7_ESM.xlsx,6d left


### Give each (Sample, Drug, Replicate Number, excel_sheet) combination a unique MID

In [4]:
def enumerate_mid_names(mid_names):
    """Map each MID label to its position in ``mid_names``.

    Parameters
    ----------
    mid_names : iterable of hashable
        Labels to number, e.g. the array returned by ``Series.unique()``.
        Any iterable is accepted (list, tuple, NumPy array, generator).

    Returns
    -------
    dict
        ``{label: position}`` with positions starting at 0. If a label occurs
        more than once, the position of its last occurrence is kept.
    """
    # Iterate instead of indexing by position so that inputs without len() or
    # integer indexing (generators, label-indexed Series) are numbered by
    # iteration order.
    return {name: position for position, name in enumerate(mid_names)}

# Row count before the merge; compared against len(df) in a later assertion.
old_len = len(df)

# Columns that together identify one MID.
mid_key_cols = ['Sample', 'Drug', 'Replicate Number', 'excel_sheet']

# Drop any MID column left over from a previous run of this cell so the merge
# below cannot produce suffixed MID_x / MID_y columns.
df = df.drop(columns='MID', errors='ignore')

# One row per distinct key combination, labelled with the tuple of its key
# values. drop_duplicates keeps combinations that contain NaN, whereas
# groupby drops them by default and the inner merge would then silently
# remove those rows from df. It also avoids groupby.apply, which warns about
# operating on the grouping columns in recent pandas versions.
mid_lookup = df[mid_key_cols].drop_duplicates().reset_index(drop=True)
mid_lookup['MID'] = pd.Series(
    list(mid_lookup.itertuples(index=False, name=None)),
    index=mid_lookup.index,
    dtype=object,
)

# Attach the tuple label to every row; validate raises if the lookup ever
# contains a key combination more than once.
df = df.merge(mid_lookup, on=mid_key_cols, validate='many_to_one')

In [5]:
# Keep the original MID labels, in order of first appearance, before the MID
# column is replaced by integers in the next cell. The investigation cells
# at the end of the notebook index into this array.
mid_names = df['MID'].unique()

In [6]:
# Replace each MID label with an integer id, numbered in the order in which
# the labels first appear in the column.
unique_mid_labels = df['MID'].unique()
mid_dict = enumerate_mid_names(unique_mid_labels)
df['MID'] = df['MID'].map(mid_dict)

In [7]:
# Sanity checks: the number of distinct MIDs equals the number of distinct key
# combinations counted at load time, and the row count is unchanged since
# before the merge.
assert df['MID'].nunique() == estimated_mids
assert df.shape[0] == old_len

### Rename and select columns

In [8]:
# Shorten the tumor-volume column name, then keep only the columns that are
# written to the output file.
df = df.rename(columns={'Tumor Volume mm3': 'Volume'})
cols = ['MID', 'Sample', 'Drug', 'Day', 'Volume']
df_out = df.loc[:, cols]
df_out.head()

Unnamed: 0,MID,Sample,Drug,Day,Volume
0,0,HCI-010,Navitoclax,1.0,163.9208
1,0,HCI-010,Navitoclax,4.0,158.374381
2,0,HCI-010,Navitoclax,8.0,197.154048
3,0,HCI-010,Navitoclax,11.0,158.582177
4,0,HCI-010,Navitoclax,15.0,176.645


In [9]:
# Write the MID-annotated table; index=False leaves the row index out of the CSV.
df_out.to_csv('data/welm_pdx_clean_mid.csv', index=False)

### Computing Stats

In [10]:
# Number of distinct MIDs in the exported table.
df_out.MID.nunique()

264

### Investigating odd MIDs

In [None]:
# Load welm_pdx.csv for manual inspection.
# NOTE(review): this path points at '../data/' while the loading cell at the
# top of the notebook reads from 'data/' — confirm which working directory
# this cell is meant to run from.
d = pd.read_csv('../data/welm_pdx.csv')
d.head()

In [None]:
# Original label of integer MID 21: mid_names holds the labels in the same
# order that was used to number them.
mid_names[21]

In [None]:
# Rows of the raw table for sample HCI-015, drug Vehicle, replicate M0.
# NOTE(review): excel_sheet is part of the MID key but is not filtered here,
# so the result may mix rows that belong to different MIDs — confirm intended.
s = d[
    d['Sample'].eq('HCI-015')
    & d['Drug'].eq('Vehicle')
    & d['Replicate Number'].eq('M0')
]
s

In [None]:
# Original label of integer MID 24: mid_names holds the labels in the same
# order that was used to number them.
mid_names[24]

In [None]:
# Rows of the raw table for sample HCI-015, drug Vehicle, replicate M3.
# NOTE(review): excel_sheet is part of the MID key but is not filtered here,
# so the result may mix rows that belong to different MIDs — confirm intended.
f = d[
    d['Sample'].eq('HCI-015')
    & d['Drug'].eq('Vehicle')
    & d['Replicate Number'].eq('M3')
]
f