In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the measurement table.
# NOTE(review): every later cell operates on `mf`, which was never assigned
# anywhere in this notebook (it only existed as leftover kernel state).
# Binding it to the loaded frame makes Restart & Run All possible --
# confirm that this CSV is the intended input for `mf`.
df = pd.read_csv('../data/welm_pdx_w_mid.csv')
mf = df.copy()

In [3]:
def count_mice(mf):
    """Return the number of mice in ``mf``.

    Each distinct 'Replicate Number' within a ('Sample', 'Drug') pair is
    counted once; the per-pair counts are then added together.
    """
    replicates_per_arm = mf.groupby(['Sample', 'Drug'])['Replicate Number'].nunique()
    return sum(replicates_per_arm)

In [4]:
# Preview the first rows of the measurement table.
mf.head()

Unnamed: 0,Day,Drug,Other Info,Replicate Number,Sample,Tumor Volume mm3,source_file,excel_sheet
0,1.0,Navitoclax,,M0,HCI-010,163.9208,43018_2022_337_MOESM7_ESM.xlsx,6d left
1,4.0,Navitoclax,,M0,HCI-010,158.374381,43018_2022_337_MOESM7_ESM.xlsx,6d left
2,8.0,Navitoclax,,M0,HCI-010,197.154048,43018_2022_337_MOESM7_ESM.xlsx,6d left
3,11.0,Navitoclax,,M0,HCI-010,158.582177,43018_2022_337_MOESM7_ESM.xlsx,6d left
4,15.0,Navitoclax,,M0,HCI-010,176.645,43018_2022_337_MOESM7_ESM.xlsx,6d left


In [5]:
# Report the table size: number of measurement rows and number of mice.
print(f'num rows: {len(mf)}')
print(f'num mice: {count_mice(mf)}')

num rows: 5955
num mice: 399


In [6]:
# List the distinct Sample identifiers present in the table.
print(f"There are {mf['Sample'].nunique()} samples: ")
print(mf['Sample'].unique())

There are 24 samples: 
['HCI-010' 'HCI-024' 'HCI-015' 'HCI-027' 'HCI-002' 'HCI-023' 'HCI-019'
 'HCI-016' 'HCI-001' 'HCI-018' 'HCI-013' 'HCI-013EI' 'HCI-032' 'HCI-032EI'
 'HCI-040' 'HCI-040EI' 'HCI-044' 'HCI-044EI' 'HCI-012' 'HCI-003' 'HCI-011'
 'HCI-017' 'HCI-043' 'HCI-051']


### Give each Sample-Drug-Replicate Number combination a unique MID

In [7]:
# Number of distinct (Sample, Drug, Replicate Number) combinations.
sample_drug_replicate = mf.drop_duplicates(subset=['Sample', 'Drug', 'Replicate Number']).shape[0]

In [8]:
# Show the number of distinct (Sample, Drug, Replicate Number) combinations.
print(sample_drug_replicate)

399


In [9]:
# Assign each Sample-Drug-Replicate an MID.
# MIDs are consecutive integers starting at 0, numbered in the order in which
# each (Sample, Drug, Replicate Number) combination first appears in `mf`.
mid_keys = ['Sample', 'Drug', 'Replicate Number']
# Rows with a missing key cannot be attributed to a mouse; drop them and
# renumber the index (the previous groupby + inner merge did the same).
mf = mf.dropna(subset=mid_keys).reset_index(drop=True)
# ngroup() labels every row with its group number; sort=False keeps
# first-appearance order. Overwriting 'MID' makes this cell safe to re-run.
mf = mf.assign(MID=mf.groupby(mid_keys, sort=False).ngroup())

In [10]:
# Taken together, the two checks below establish a one-to-one mapping between
# sample-drug-replicate groups and MIDs.

# 1) There are exactly as many MIDs as sample-drug-replicate groups.
assert sample_drug_replicate == mf['MID'].nunique()

# 2) The number of distinct (sample-drug-replicate, MID) pairings equals the number of MIDs.
assert mf.drop_duplicates(subset=['Sample', 'Drug', 'Replicate Number', 'MID']).shape[0] == mf['MID'].nunique()

### Drugs
- Divide drugs into treatment and control
- Make sure that each sample has a control drug
- Add a `Control` label column (1 = control drug, 0 = otherwise)

In [11]:
# List the distinct Drug labels present in the table.
print(f"There are {mf['Drug'].nunique()} drugs: ")
print(mf['Drug'].unique())

There are 25 drugs: 
['Navitoclax' 'Vehicle' 'Docetaxel' 'E2 pellet only'
 'E2 pellet + E2 water ' 'E2 ' 'OVX' 'Intact' 'E2' 'Birinapant' 'vehicle'
 'RO4929097' 'Irinotecan' 'Birinapant + Irinotecan' 'Birinapant '
 ' Irinotecan' 'Fulvestrant (40 mg/kg)' 'Fulvestrant (200 mg/kg)'
 'Fulvestrant' ' Fulvestrant ' 'AC-T' 'Eribulin' 'Enzalutamide'
 'Cabozantinib' 'Talazoparib']


In [12]:
# Partition the raw Drug labels into control, ignored, and treatment groups.
# Labels are compared as exact strings, so variants that differ only in case
# or surrounding whitespace (e.g. 'Birinapant' vs 'Birinapant ') stay distinct.
drugs = mf['Drug'].unique()
control_drugs = ['Vehicle', 'vehicle']
drugs_to_ignore = ['E2 pellet only', 'E2 pellet + E2 water ', 'E2 ', 'OVX', 'Intact', 'E2']
# Everything that is neither a control nor explicitly ignored is a treatment.
treatment_drugs = [
    label for label in drugs
    if not (label in control_drugs or label in drugs_to_ignore)
]

In [13]:
# Display the Drug labels treated as controls.
control_drugs

['Vehicle', 'vehicle']

In [14]:
# Display the Drug labels treated as treatments.
treatment_drugs

['Navitoclax',
 'Docetaxel',
 'Birinapant',
 'RO4929097',
 'Irinotecan',
 'Birinapant + Irinotecan',
 'Birinapant ',
 ' Irinotecan',
 'Fulvestrant (40 mg/kg)',
 'Fulvestrant (200 mg/kg)',
 'Fulvestrant',
 ' Fulvestrant ',
 'AC-T',
 'Eribulin',
 'Enzalutamide',
 'Cabozantinib',
 'Talazoparib']

In [15]:
# Drop every row whose Drug label is listed in drugs_to_ignore and report
# how many MIDs disappear as a result.
mids_before = mf['MID'].nunique()
mf = mf[~mf['Drug'].isin(drugs_to_ignore)]
mids_after = mf['MID'].nunique()
print(f'Removed drugs to ignore, dropping {mids_before - mids_after} unique MIDs')

Removed drugs to ignore, dropping 77 unique MIDs


In [16]:
# Flag control rows: Control is 1 when the Drug label is in control_drugs, else 0.
mf = mf.assign(Control=lambda frame: frame['Drug'].isin(control_drugs).astype(int))
mf.head()

Unnamed: 0,Day,Drug,Other Info,Replicate Number,Sample,Tumor Volume mm3,source_file,excel_sheet,MID,Control
0,1.0,Navitoclax,,M0,HCI-010,163.9208,43018_2022_337_MOESM7_ESM.xlsx,6d left,0,0
1,4.0,Navitoclax,,M0,HCI-010,158.374381,43018_2022_337_MOESM7_ESM.xlsx,6d left,0,0
2,8.0,Navitoclax,,M0,HCI-010,197.154048,43018_2022_337_MOESM7_ESM.xlsx,6d left,0,0
3,11.0,Navitoclax,,M0,HCI-010,158.582177,43018_2022_337_MOESM7_ESM.xlsx,6d left,0,0
4,15.0,Navitoclax,,M0,HCI-010,176.645,43018_2022_337_MOESM7_ESM.xlsx,6d left,0,0


In [17]:
# Assert treatment and control drugs are labeled correctly: every
# treatment-drug row must have Control == 0 and every control-drug row
# Control == 1.
# Comparing value *sets* fails with a plain AssertionError when labels are
# mixed or when no rows match; the previous `unique() == [0]` form raised an
# ambiguous-truth-value ValueError on mixed labels and relied on the
# truthiness of an empty array when there were no rows.
assert set(mf.loc[mf.Drug.isin(treatment_drugs), 'Control']) == {0}
assert set(mf.loc[mf.Drug.isin(control_drugs), 'Control']) == {1}

### Columns: Sample, Control, Drug, MID, Day, Volume

In [18]:
# Shorten the volume column name to 'Volume' and keep only the columns
# used from here on.
mf = (
    mf
    .rename(columns={'Tumor Volume mm3': 'Volume'})
    .loc[:, ['Sample', 'Control', 'Drug', 'MID', 'Day', 'Volume']]
)

### Add duration

In [20]:
# Summary statistics of the last recorded Day per MID.
print('Maximum Day for each MID')
mf.groupby('MID')['Day'].agg('max').describe()

Maximum Day for each MID


count    322.000000
mean      34.611801
std       24.984012
min       12.000000
25%       22.000000
50%       32.000000
75%       36.000000
max      198.000000
Name: Day, dtype: float64

In [21]:
# Keep only measurements taken on or before Day 35.
mf = mf[mf['Day'].le(35)]

In [22]:
# Collapse the time series to one summary row per MID.

# Row with the earliest Day for each MID -> 'start' and 'start_vol'.
start_day = mf.loc[mf.groupby('MID')['Day'].idxmin()]
assert len(start_day) == mf['MID'].nunique()
start_day = start_day.rename(columns={'Day': 'start', 'Volume': 'start_vol'})

# Row with the latest Day for each MID -> 'end' and 'end_vol'.
end_day = mf.loc[mf.groupby('MID')['Day'].idxmax()]
assert len(end_day) == mf['MID'].nunique()
end_day = end_day.rename(columns={'Day': 'end', 'Volume': 'end_vol'})

# Per-MID metadata; every MID must map to exactly one (Sample, Control, Drug).
id_columns = ['Sample', 'Control', 'Drug', 'MID']
m = mf.drop_duplicates(subset=id_columns)[id_columns]
nmids = m['MID'].nunique()
assert len(m) == nmids

# Attach the start and end measurements; neither merge may add or drop MIDs.
m = m.merge(start_day[['MID', 'start', 'start_vol']], on='MID', validate='one_to_one')
assert len(m) == nmids
m = m.merge(end_day[['MID', 'end', 'end_vol']], on='MID', validate='one_to_one')
assert len(m) == nmids

# Observation span between the first and last retained measurement.
m = m.assign(duration=m['end'] - m['start'])
m.head()

In [24]:
# Distribution of starting volumes before filtering on start_vol.
m.start_vol.describe()

count    322.000000
mean     129.108247
std       55.437597
min        0.000000
25%       90.944000
50%      113.700000
75%      153.978861
max      352.352000
Name: start_vol, dtype: float64

In [25]:
# Keep only MIDs whose starting volume is at least 1; start_vol is the
# denominator of the fold change computed afterwards.
m = m[m['start_vol'].ge(1)]

In [26]:
# Distribution of starting volumes after filtering on start_vol.
m.start_vol.describe()

count    321.000000
mean     129.510453
std       55.051599
min       43.218000
25%       90.944000
50%      114.000000
75%      154.212500
max      352.352000
Name: start_vol, dtype: float64

In [27]:
# log2 fold change of the volume between the first and last measurement.
# An end_vol of 0 yields -inf; those rows are separated out downstream, so
# the divide-by-zero RuntimeWarning from log2(0) is silenced for this
# computation only.
# assign() builds a new frame instead of writing a column into `m`, which is
# a filtered slice (avoids pandas' SettingWithCopyWarning).
with np.errstate(divide='ignore'):
    m = m.assign(log_V_V0=np.log2(m['end_vol'] / m['start_vol']))
m.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,Sample,Control,Drug,MID,start,start_vol,end,end_vol,duration,log_V_V0
0,HCI-010,0,Navitoclax,0,1.0,163.9208,22.0,230.687446,21.0,0.492941
1,HCI-010,0,Navitoclax,1,1.0,119.794563,22.0,111.587111,21.0,-0.102392
2,HCI-010,0,Navitoclax,2,1.0,132.027025,22.0,110.372796,21.0,-0.258449
3,HCI-010,1,Vehicle,3,1.0,159.269328,22.0,261.789003,21.0,0.716936
4,HCI-010,1,Vehicle,4,1.0,222.6064,22.0,475.215494,21.0,1.094087


In [28]:
# MIDs whose final volume is 0, i.e. whose log2 fold change is -inf.
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
m_end0 = m.loc[m['log_V_V0'] == -np.inf]
print('There are ' + str(len(m_end0)) + ' MIDs with end_vol = 0.')
m_end0

There are 27 MIDs with end_vol = 0.


Unnamed: 0,Sample,Control,Drug,MID,start,start_vol,end,end_vol,duration,log_V_V0
66,HCI-016,0,Docetaxel,66,1.0,216.302,22.0,0.0,21.0,-inf
68,HCI-016,0,Docetaxel,68,1.0,65.0,22.0,0.0,21.0,-inf
80,HCI-027,0,Docetaxel,80,1.0,139.264,22.0,0.0,21.0,-inf
96,HCI-027,0,Birinapant,173,1.0,93.775,32.0,0.0,31.0,-inf
97,HCI-027,0,Birinapant,174,1.0,79.768,32.0,0.0,31.0,-inf
98,HCI-027,0,Birinapant,175,1.0,82.8655,32.0,0.0,31.0,-inf
99,HCI-027,0,Birinapant,176,1.0,67.626,32.0,0.0,31.0,-inf
100,HCI-027,0,Birinapant,177,1.0,124.93,32.0,0.0,31.0,-inf
129,HCI-023,0,Birinapant,206,1.0,171.0,33.0,0.0,32.0,-inf
217,HCI-023,0,Birinapant,294,1.0,171.0,33.0,0.0,32.0,-inf


In [29]:
# Keep only MIDs whose log2 fold change is greater than -inf (this also
# excludes NaN values, which compare False).
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
d = m.loc[m['log_V_V0'] > -np.inf]
d

Unnamed: 0,Sample,Control,Drug,MID,start,start_vol,end,end_vol,duration,log_V_V0
0,HCI-010,0,Navitoclax,0,1.0,163.920800,22.0,230.687446,21.0,0.492941
1,HCI-010,0,Navitoclax,1,1.0,119.794563,22.0,111.587111,21.0,-0.102392
2,HCI-010,0,Navitoclax,2,1.0,132.027025,22.0,110.372796,21.0,-0.258449
3,HCI-010,1,Vehicle,3,1.0,159.269328,22.0,261.789003,21.0,0.716936
4,HCI-010,1,Vehicle,4,1.0,222.606400,22.0,475.215494,21.0,1.094087
...,...,...,...,...,...,...,...,...,...,...
311,HCI-051,0,AC-T,388,1.0,74.253155,32.0,348.496159,31.0,2.230619
317,HCI-051,1,Vehicle,394,1.0,131.720430,32.0,2162.088374,31.0,4.036874
318,HCI-051,1,Vehicle,395,1.0,112.248563,32.0,2161.162866,31.0,4.267039
320,HCI-051,1,Vehicle,397,1.0,131.439447,32.0,2399.032779,31.0,4.189983


In [30]:
# Compute log2(V/V0) centered within each (Sample, Drug) group:
#   log_V_V0_sm  = mean log_V_V0 over the rows sharing that Sample and Drug
#   log_V_V0_cen = log_V_V0 - log_V_V0_sm
old_len = len(d)
# Renumber rows 0..n-1, matching the index the previous merge-based version
# produced (the index is written out by the to_csv calls that follow).
d = d.reset_index(drop=True)
# transform('mean') broadcasts each group's mean back onto its rows with
# explicit grouping keys, and overwriting the columns keeps this cell
# re-runnable.
d = d.assign(log_V_V0_sm=d.groupby(['Sample', 'Drug'])['log_V_V0'].transform('mean'))
d = d.assign(log_V_V0_cen=d['log_V_V0'] - d['log_V_V0_sm'])
assert len(d) == old_len
# Every row must have received a group mean (fails on a missing Sample/Drug key).
assert d['log_V_V0_sm'].notna().all()

In [31]:
# Preview the per-MID table with the group mean and centered fold change.
d.head()

Unnamed: 0,Sample,Control,Drug,MID,start,start_vol,end,end_vol,duration,log_V_V0,log_V_V0_sm,log_V_V0_cen
0,HCI-010,0,Navitoclax,0,1.0,163.9208,22.0,230.687446,21.0,0.492941,0.044033,0.448907
1,HCI-010,0,Navitoclax,1,1.0,119.794563,22.0,111.587111,21.0,-0.102392,0.044033,-0.146425
2,HCI-010,0,Navitoclax,2,1.0,132.027025,22.0,110.372796,21.0,-0.258449,0.044033,-0.302482
3,HCI-010,1,Vehicle,3,1.0,159.269328,22.0,261.789003,21.0,0.716936,0.893129,-0.176193
4,HCI-010,1,Vehicle,4,1.0,222.6064,22.0,475.215494,21.0,1.094087,0.893129,0.200957


In [32]:
# Save the per-MID summary table; the row index is written as the first column.
d.to_csv('../data/welm_w_duration.csv')

In [33]:
# Keep only MIDs whose duration is at least 21 and save that subset.
d21 = d[d['duration'].ge(21)]
d21.to_csv('../data/welm_min_duration_21.csv')