In [1]:
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Input CSV passed to pd.read_csv by the loading cell.
DATA = "../paper/peerj-07-6586-s003.csv"
# Destination CSV written by the final df.to_csv call.
SAVE_FN = "../data/malcolm_clean.csv"

In [3]:
# Load the raw table and record baseline counts; len1 is used by the
# NaN-removal step to report how many rows it drops.
# No mid-cell `df.head()` here: only a cell's final expression is rendered, so a
# preview in the middle of the cell has no effect (the next cell shows it).
df = pd.read_csv(DATA)
len1 = len(df)
nmid1 = df.MID.nunique()
print('number of rows: ' + str(len1))
print('number of unique MIDs: ' + str(nmid1))

number of rows: 15433
number of unique MIDs: 1776


In [4]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,Study,Study_ID,Group,Drug,Control,MID,Day,Size
0,J000100672,42396T_2CAOS,3,Cisplatin,0.0,396-11,0.0,212.49
1,J000100672,42396T_2CAOS,3,Cisplatin,0.0,396-11,3.0,322.6
2,J000100672,42396T_2CAOS,3,Cisplatin,0.0,396-11,6.0,368.0
3,J000100672,42396T_2CAOS,3,Cisplatin,0.0,396-11,10.0,538.42
4,J000100672,42396T_2CAOS,3,Cisplatin,0.0,396-11,13.0,743.13


In [5]:
# Discard every row that has a missing value in any column, then report the
# effect on the row count and on the number of distinct MIDs.
df = df.dropna(axis=0, how='any')
len2 = df.shape[0]
nmid2 = df['MID'].nunique()
print('dropped {} rows with NaNs'.format(len1 - len2))
print('number of unique MIDs: {}'.format(nmid2))

dropped 638 rows with NaNs
number of unique MIDs: 1776


In [6]:
# Exclude the MIDs flagged as having incorrect data: "217-01" and "703-029"
# ('peerj-07-6586-s004.r', line 30, instructs to remove these MIDs).
# bad_mids keeps only the flagged MIDs actually present in df, so the
# bookkeeping assertion below compares like with like.
bad_mids = list({"217-01", "703-029"} & set(df.MID.unique()))
df = df.loc[~df['MID'].isin(bad_mids)]
len3 = df.shape[0]
nmid3 = df['MID'].nunique()
assert nmid2 - nmid3 == len(bad_mids)
print('dropped {} rows corresponding to MIDs with incorrect data'.format(len2 - len3))
print('number of unique MIDs: {}'.format(nmid3))

dropped 9 rows corresponding to MIDs with incorrect data
number of unique MIDs: 1775


In [7]:
# Drop every MID that has both control rows (Control == 1) and treatment rows
# (Control == 0).
mids_control = df.loc[df['Control'] == 1, 'MID'].unique()
mids_treat = df.loc[df['Control'] == 0, 'MID'].unique()
mids_in_treat_and_control = list(set(mids_control) & set(mids_treat))
df = df.loc[~df['MID'].isin(mids_in_treat_and_control)]
len4 = df.shape[0]
nmid4 = df['MID'].nunique()
# Each MID removed must be one of the MIDs flagged above.
assert nmid3 - nmid4 == len(mids_in_treat_and_control)
print('dropped {} rows corresponding to MIDs in treatment and control groups'.format(len3 - len4))
print('number of unique MIDs: {}'.format(nmid4))

dropped 228 rows corresponding to MIDs in treatment and control groups
number of unique MIDs: 1762


In [8]:
# Verify that Study and Study_ID are in one-to-one correspondence: the number of
# distinct (Study, Study_ID) pairs must equal the number of distinct values in
# each column taken on its own.
study_df = df[['Study', 'Study_ID']].drop_duplicates()
n_pairs = len(study_df)
assert n_pairs == df.Study.nunique() and n_pairs == df.Study_ID.nunique()

# Given that mapping, Study_ID is redundant, and the R code for the Malcolm paper
# only seems to reference Study, so keep just the remaining columns.
keep_cols = ['Study', 'Group', 'Drug', 'Control', 'MID', 'Day', 'Size']
df = df[keep_cols]

In [9]:
# Summarise the Study column: number of distinct values, then the values themselves.
print('number of studies: {}'.format(df['Study'].nunique()))
print('Study: ')
print(list(df['Study'].unique()))

number of studies: 72
Study: 
['J000100672', 'TM00090', 'TM00096', 'TM00091', 'TM00097', 'TM00098', 'TM00099', 'TM01278', 'TM01117', 'TM01273', 'J000101173', 'J000100674', 'J000100675', 'J000103634', 'TM00103', 'TM00386', 'J000103917', 'J000102184', 'TM00107', 'TM00999', 'J000080739', 'J000099327', 'TM01079', 'J000100646', 'J000104256', 'TM01029', 'J000104518', 'J000105006', 'J000104944', 'TM00877', 'TM01563', 'TM00199', 'TM00219', 'TM00256', 'TM00194', 'TM00246', 'J000096652', 'TM00186', 'TM00355', 'TM00193', 'TM00212', 'TM00214', 'TM00222', 'TM00233', 'TM00253', 'TM00832', 'J000079689', 'TM00387', 'TM01039', 'TM00298', 'TM00335', 'TM00702', 'TM01149', 'J000106560', 'TM01075', 'J000102680', 'J000102630', 'TM00185', 'TM00188', 'TM00356', 'TM00192', 'TM00196', 'TM00202', 'TM00203_1', 'TM00208', 'TM00213', 'TM00226', 'TM00231', 'TM00784', 'TM00302', 'TM01087', 'TM00203_2']


In [10]:
# Summarise the Group column: number of distinct labels, then the labels themselves.
print('number of groups: {}'.format(df['Group'].nunique()))
print('Group: ')
print(list(df['Group'].unique()))

number of groups: 73
Group: 
['3', 'Gr 3: Cisplatin', 'Group 3 Cisplatin', 'Group 3 IV Cisplatin', 'Group 3 Cisplatin IV', 'IV Cisplatin', 'Group 3: Cisplatin', '3: Cisplatin', 'Cisplatin', 'GRP2 - Cisplatin', 'Group 2 IV Cisplatin', 'GRP3 - Cisplatin', 'Group 2 Ciplatin IV', 'Grp3- IV Cisplatin', 'Grp3_Cisplatin', 'Grp3 IV Cisplatin', 'Grp8_Cisplatin', 'Grp3 Cis Top', 'Group 2 Cisplatin IV', 'Grp 2 IV Cisplatin', 'GRP3 IV Cisplatin', 'Group 2', 'Group 3', 'Grp3_IV Cisplatin', 'CMC', 'Group 1 Vechicle PO', 'GRP1 - Vehicle', 'Gr 1: Vehicle', 'Group 1 D5W', 'Group 1 IV D5W', 'Group 1 D5W IV', 'IV D5W', 'Group 1 Vehicle', 'Group 1: D5W', '1: D5W', 'Vehicle', 'Vehicle (D5W)', '1', 'Grp1- IV D5W', 'Grp1_vehicle', 'Grp1 IV D5W', 'Grp1 D5W', 'Grp 1 IV D5W', 'GRP1 IV D5W', 'Group 1', 'Grp1 IV D5w', 'Grp1_IV D5W', 'Grp 1 D5W+ 0.5% CMC', 'Group 1 IV+PO Vehicl', 'Grp1_Vehicle', 'Gr 2: Docetaxel', 'Group 2 Docetaxel', 'Group 2 IV Docetaxel', 'Group 2 Decetaxel IV', 'IV Docetaxel', 'Group 2: Doceta

In [11]:
# Summarise the Drug column: number of distinct values, then the values themselves.
print('number of drugs: {}'.format(df['Drug'].nunique()))
print('Drug: ')
print(list(df['Drug'].unique()))

number of drugs: 7
Drug: 
['Cisplatin', 'CMC', 'D5W', 'D5W + CMC', 'DMSO', 'Docetaxel', 'None']


### Description of drugs and vehicle control from Malcolm et al.
"Mice were then randomly categorized into study groups and treated with either 2.0 mg/kg IV cisplatin once per week for up to three weeks, 10.0 mg/kg IV docetaxel once per week for up to three weeks, or a vehicle control. The compound and dosing schedule for vehicle controls varied across studies because of other treatment groups that were run simultaneously with cisplatin or docetaxel. Most models (66) used 5.0 ml/kg IV 5% dextrose in water; the remaining used one of the following alternatives: (1) 10.0 ml/kg PO 0.5% CMC daily (3 models), (2) 5.0 ml/kg PO 2.5% DMSO in PBS daily (1 model), (3) 5.0 ml/kg IV combination of 5% dextrose in water and 0.5% CMC (1 model)."

From this, I'm assuming that the treatments are:    
Cisplatin --> 2.0 mg/kg IV cisplatin once per week for up to three weeks    
Docetaxel --> 10.0 mg/kg IV docetaxel once per week for up to three weeks    

And that the controls are:    
CMC --> 10.0 ml/kg PO 0.5% CMC daily (3 models)         
D5W --> 5.0 ml/kg IV 5% dextrose in water (66 models)        
D5W + CMC --> 5.0 ml/kg IV combination of 5% dextrose in water and 0.5% CMC (1 model)        
DMSO --> 5.0 ml/kg PO 2.5% DMSO in PBS daily (1 model)

In [12]:
# Discard rows whose Drug is the literal string 'None' and report how many went.
# NOTE(review): some pandas versions appear to treat the string 'None' as a
# missing value in read_csv by default; if so, these rows would already have
# been removed by the earlier dropna and this count would be 0 -- confirm
# against the installed pandas version.
df = df.loc[df['Drug'].ne('None')]
len5 = df.shape[0]
print('dropped {} rows with Drug = None'.format(len4 - len5))

dropped 72 rows with Drug = None


In [13]:
# Count the distinct studies in which each treatment drug appears.
print('number of studies for each treatment drug')
for drug_name in ('Cisplatin', 'Docetaxel'):
    n_studies = df.loc[df['Drug'] == drug_name, 'Study'].nunique()
    print('{}: {}'.format(drug_name, n_studies))

number of studies for each treatment drug
Cisplatin: 71
Docetaxel: 58


### Note on control drugs
Note that the paper says that there are 66 D5W studies and 1 D5W + CMC study, but we found 65 D5W studies and 2 D5W + CMC studies (see the counts printed by the next cell).

In [14]:
# Count the distinct studies in which each control vehicle appears.
print('number of studies for each control drug')
for drug_name in ('CMC', 'D5W', 'D5W + CMC', 'DMSO'):
    n_studies = df.loc[df['Drug'] == drug_name, 'Study'].nunique()
    print('{}: {}'.format(drug_name, n_studies))

number of studies for each control drug
CMC: 3
D5W: 65
D5W + CMC: 2
DMSO: 1


In [15]:
treat_drugs = ['Cisplatin', 'Docetaxel']
control_drugs = ['CMC', 'D5W', 'D5W + CMC', 'DMSO']

# Every remaining row must use one of the drugs listed above: restricting df to
# those drugs must not remove any row.
known_drugs = treat_drugs + control_drugs
df = df.loc[df['Drug'].isin(known_drugs)]
len6 = df.shape[0]
assert len6 == len5

In [16]:
# Sanity-check the Control flag against the drug lists defined in the previous cell.
# The comparison is done element-wise and reduced with .all(): asserting directly
# on the array returned by .unique() raises "truth value of an array is ambiguous"
# as soon as more than one distinct value is present, instead of a clean
# AssertionError. The explicit non-empty check keeps an empty selection a failure.

# all treatment drugs must be labeled Control=0
treat_control = df.loc[df.Drug.isin(treat_drugs), 'Control']
assert not treat_control.empty, 'no rows found for the treatment drugs'
assert (treat_control == 0).all(), 'found treatment-drug rows with Control != 0'

# and all control drugs must be labeled Control=1
ctrl_control = df.loc[df.Drug.isin(control_drugs), 'Control']
assert not ctrl_control.empty, 'no rows found for the control drugs'
assert (ctrl_control == 1).all(), 'found control-drug rows with Control != 1'

### Remove MIDs in multiple studies (i.e., mice with multiple tumor samples)

In [17]:
# Count the distinct studies each MID appears in and collect the MIDs that
# appear in more than one (they are dropped in the next cell).
mid_studies = (
    df.groupby('MID')['Study']
    .nunique()
    .reset_index(name='num_studies')
)
mid_mult_studies = mid_studies.loc[mid_studies['num_studies'] > 1, 'MID']

In [18]:
# Drop the MIDs that appear in more than one study and report the new MID count.
old_nmids = df['MID'].nunique()
df = df.loc[~df['MID'].isin(mid_mult_studies)]
print('dropped {} MIDs associated with multiple studies'.format(len(mid_mult_studies)))
print('number of unique mids: {}'.format(len(df['MID'].unique())))

dropped 20 MIDs associated with multiple studies
number of unique mids: 1734


In [19]:
# The number of MIDs removed must equal the number flagged as multi-study.
n_removed = old_nmids - df['MID'].nunique()
assert n_removed == len(mid_mult_studies)
# Each remaining MID must belong to exactly one study.
studies_per_mid = df.groupby('MID')['Study'].nunique()
assert studies_per_mid.max() == 1

### Groups
Mice in the same group should all receive the same treatment; groups that turn out to contain more than one drug are removed below. A group label describes only the treatment, not the study or tumor status of the mice, so mice in the same group may belong to different studies.

In [20]:
# Each MID must be assigned to exactly one Group label.
groups_per_mid = df.groupby('MID')['Group'].nunique()
assert groups_per_mid.max() == 1

In [21]:
# Find Group labels whose rows span more than one drug and drop those groups
# entirely, reporting how many groups and MIDs are removed.
grp_drugs = (
    df.groupby('Group')['Drug']
    .nunique()
    .reset_index(name='num_drugs')
)
groups_w_multiple_drugs = grp_drugs.loc[grp_drugs['num_drugs'] > 1, 'Group'].unique()
old_ngrps = df['Group'].nunique()
old_nmids = df['MID'].nunique()
df = df.loc[~df['Group'].isin(groups_w_multiple_drugs)]
# Exactly the flagged groups must have disappeared.
assert df['Group'].nunique() == old_ngrps - len(groups_w_multiple_drugs)
print('dropped {} groups with multiple drugs'.format(len(groups_w_multiple_drugs)))
print('dropped {} MIDS in groups with multiple drugs'.format(old_nmids - df['MID'].nunique()))
print('number of unique mids: {}'.format(df['MID'].nunique()))

dropped 1 groups with multiple drugs
number of unique mids: 1683


In [22]:
# Each Group label must map to exactly one drug.
drugs_per_group = df.groupby('Group')['Drug'].nunique()
assert drugs_per_group.max() == 1

In [23]:
# Each Group label must map to exactly one Control value.
controls_per_group = df.groupby('Group')['Control'].nunique()
assert controls_per_group.max() == 1

### Do (Study, Drug) pairs have multiple Groups? Are the resulting Groups mostly the same?

In [24]:
# Count the distinct Group labels per (Study, Drug) pair and show the pairs
# that have more than one.
gf = (
    df.groupby(['Study', 'Drug'])['Group']
    .nunique()
    .reset_index(name='num_groups')
)
multi_group_pairs = gf.loc[gf['num_groups'] > 1]
print('number of (Study, Drug) pairs with > 1 Group: {}'.format(len(multi_group_pairs)))
multi_group_pairs

number of (Study, Drug) pairs with > 1 Group: 1


Unnamed: 0,Study,Drug,num_groups
119,TM00231,Cisplatin,2


In [25]:
# List the Group labels used by the (TM00231, Cisplatin) pair reported by the
# preceding cell.
in_pair = (df['Study'] == 'TM00231') & (df['Drug'] == 'Cisplatin')
gf_mult_groups = list(df.loc[in_pair, 'Group'].unique())
print('groups corresponding to (TM00231, Cisplatin): {}'.format(gf_mult_groups))

groups corresponding to (TM00231, Cisplatin): ['Group 2', 'Group 3']


In [26]:
# Remove every row of the (TM00231, Cisplatin) pair and report the rows and MIDs lost.
old_len = df.shape[0]
old_mids = df['MID'].nunique()
# Keep a row when it is NOT (Study == TM00231 and Drug == Cisplatin).
keep_row = (df['Study'] != 'TM00231') | (df['Drug'] != 'Cisplatin')
df = df.loc[keep_row]
new_len = df.shape[0]
new_mids = df['MID'].nunique()
print('num rows dropped due to removing (TM00231, Cisplatin) pair: {}'.format(old_len - new_len))
print('dropped {} MIDs in (Study, Drug) pairs with multiple groups'.format(old_mids - new_mids))
print('number of unique mids: {}'.format(new_mids))

num rows dropped due to removing (TM00231, Cisplatin) pair: 140
number of unique mids: 1663


Now we can stratify mice by Study and Drug. Any mice in the same Study and Drug receive the same PDX and treatment.

In [27]:
# Final invariants checked before saving.
# Each MID must have exactly one Study, one Drug, one Group and one Control value.
for column in ('Study', 'Drug', 'Group', 'Control'):
    assert df.groupby('MID')[column].nunique().max() == 1
# Each (Study, Drug) pair must have exactly one Group and one Control value.
for column in ('Group', 'Control'):
    assert df.groupby(['Study', 'Drug'])[column].nunique().max() == 1

In [28]:
# Write the cleaned table to SAVE_FN as CSV, omitting the DataFrame index.
df.to_csv(path_or_buf=SAVE_FN, index=False)