In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the raw PDX table (welm_pdx.csv); edit DATA to point at your copy.
DATA = '../welm/data/welm_pdx.csv'
df = pd.read_csv(filepath_or_buffer=DATA)
n_rows_raw = len(df)
print('Number of rows: ' + str(n_rows_raw))

Number of rows: 5955


In [3]:
# Normalise drug labels: trim surrounding whitespace, then fold the
# lower-case 'vehicle' spelling into 'Vehicle'.
drug_stripped = df['Drug'].str.strip()
df['Drug'] = drug_stripped.replace('vehicle', 'Vehicle')
df['Drug'].unique()

array(['Navitoclax', 'Vehicle', 'Docetaxel', 'E2 pellet only',
       'E2 pellet + E2 water', 'E2', 'OVX', 'Intact', 'Birinapant',
       'RO4929097', 'Irinotecan', 'Birinapant + Irinotecan',
       'Fulvestrant (40 mg/kg)', 'Fulvestrant (200 mg/kg)', 'Fulvestrant',
       'AC-T', 'Eribulin', 'Enzalutamide', 'Cabozantinib', 'Talazoparib'],
      dtype=object)

In [4]:
# Remove data from the following figures...
# Extended Data Fig 1: This data is from testing the response of tumor samples to estrogen.
# 3h: This data is an experiment involving retreatment related to drug resistance.
# 6d, vehicle lines only: also removed in the cells below (presumably duplicated in the 6f sheets -- TODO confirm).
# '7d mid right', vehicle lines only: Data duplicated in Figure 6
# 7e, vehicle & birinapant lines: These are repeated in 7c
# 8: No drug overlap with in-vitro drugs. Fig 8 also covers real-time tests for a single patient,
# so it is somewhat different from the other figures.

In [5]:
# Drop every row that came from the Extended Data Fig 1 workbook or from the
# Fig 8 workbook, in a single pass over 'source_file'.
extended_data_fig1_fn = '43018_2022_337_MOESM11_ESM.xlsx'
fig8_fn = '43018_2022_337_MOESM10_ESM.xlsx'
excluded_files = [extended_data_fig1_fn, fig8_fn]
df = df.loc[~df['source_file'].isin(excluded_files)]

In [6]:
# Two exclusion rules, applied together:
#   * every row on sheets '3h left' / '3h right'
#   * Vehicle rows on any of the four 6d sheets
sheet_6d = ['6d left', '6d mid left', '6d mid', '6d mid right']
on_3h_sheet = df['excel_sheet'].isin(['3h left', '3h right'])
vehicle_on_6d = df['excel_sheet'].isin(sheet_6d) & df['Drug'].eq('Vehicle')
df = df.loc[~(on_3h_sheet | vehicle_on_6d)]

In [7]:
# Two exclusion rules, applied together:
#   * Vehicle rows on sheet '7d mid right'
#   * Vehicle and Birinapant rows on the three 7e sheets
sheet_7e = ['7e left', '7e mid', '7e right']
vehicle_on_7d_mid_right = df['excel_sheet'].eq('7d mid right') & df['Drug'].eq('Vehicle')
repeated_on_7e = df['excel_sheet'].isin(sheet_7e) & df['Drug'].isin(['Birinapant', 'Vehicle'])
df = df.loc[~vehicle_on_7d_mid_right & ~repeated_on_7e]

In [8]:
# Sanity check: an exact (Sample, Drug, Replicate Number, Day, volume) record
# must come from exactly one workbook and exactly one sheet.
cols = ['Sample', 'Drug', 'Replicate Number', 'Day', 'Tumor Volume mm3']
for provenance_col in ('source_file', 'excel_sheet'):
    assert df.groupby(cols)[provenance_col].nunique().max() == 1

In [9]:
# Row count after all of the filters above.
n_rows_clean = len(df)
print('Number of rows: ' + str(n_rows_clean))

Number of rows: 2660


In [10]:
# Write the filtered table to disk, leaving out the pandas index column.
clean_csv_path = 'data/welm_pdx_clean.csv'
df.to_csv(clean_csv_path, index=False)

### Exploring duplicate volume measurements

In [None]:
# Row count before vs. after dropping exact repeats of (Sample, Drug, Day, volume).
measurement_cols = ['Sample', 'Drug', 'Day', 'Tumor Volume mm3']
measurements = df[measurement_cols]
print(len(measurements))
print(len(measurements.drop_duplicates()))

In [None]:
# For each (Sample, Drug, Day, volume) count the replicate rows that share it,
# keep the measurements seen more than once, and show the largest volumes first.
c = ['Sample', 'Drug', 'Day', 'Tumor Volume mm3']
replicate_counts = df.groupby(c)['Replicate Number'].count()
g = replicate_counts.reset_index(name='nreplicates')
grep = g[g['nreplicates'].gt(1)].sort_values(by='Tumor Volume mm3', ascending=False)
grep.head(20)

In [None]:
# Number of distinct days with a repeated measurement, per (Sample, Drug).
repeat_days = grep.groupby(['Sample', 'Drug'])['Day'].nunique()
gg = repeat_days.reset_index(name='nprs')
gg

In [None]:
# Pull the rows behind one repeated measurement to inspect them by eye.
sample = 'HCI-019'
drug = 'Birinapant'
day = 22
vol = 503.3815

is_match = (
    df['Sample'].eq(sample)
    & df['Drug'].eq(drug)
    & df['Day'].eq(day)
    & df['Tumor Volume mm3'].eq(vol)
)
df.loc[is_match]

In [None]:
# Same inspection for a second repeated measurement (HCI-011, Vehicle, day 15).
is_hci011_match = (
    df['Sample'].eq('HCI-011')
    & df['Drug'].eq('Vehicle')
    & df['Day'].eq(15)
    & df['Tumor Volume mm3'].eq(550.525)
)
df.loc[is_hci011_match]

In [None]:
# now we know that there are no duplicates left

In [None]:
# Within one sheet, a (Sample, Drug, Replicate Number) on a given day should
# carry exactly one tumor volume. 'Day' has to be part of the key: without it
# the volumes from different days of the same growth curve are counted as
# distinct values and the check fails.
cols = ['Sample', 'Drug', 'Replicate Number', 'Day', 'excel_sheet']
volumes_per_key = df.groupby(cols)['Tumor Volume mm3'].nunique()
assert volumes_per_key.max() == 1, (
    'some (Sample, Drug, Replicate Number, Day, excel_sheet) keys have more than one volume'
)

In [None]:
# The `g` frames built elsewhere in this notebook have no 'sheet_ct' column, so
# the per-mouse sheet count is rebuilt here instead of relying on leftover
# kernel state.
# NOTE(review): grouping by (Sample, Drug, Replicate Number) is an assumption
# based on the section that follows -- confirm this was the intended key.
sheet_counts = (
    df.groupby(['Sample', 'Drug', 'Replicate Number'])['excel_sheet']
    .nunique()
    .reset_index(name='sheet_ct')
)
# Samples with at least one (Sample, Drug, Replicate Number) present on several sheets.
sheet_counts.loc[sheet_counts['sheet_ct'] > 1, 'Sample'].unique()

In [None]:
# NOTE(review): neither definition of `g` visible in this notebook has an
# 'excel_sheet' column (one has Sample/Drug/Day/Tumor Volume mm3/nreplicates,
# the other the t2 keys plus id_list), so this line depends on a `g` from a
# cell that is no longer present. TODO: restore that definition or drop this cell.
g.excel_sheet.unique()

In [None]:
# I really only want to keep the 5 drugs that overlap between the two datasets, plus 'Vehicle':
# 'Navitoclax', 'Birinapant', 'Docetaxel', 'RO4929097', 'Fulvestrant', 'Vehicle'

In [None]:
# Drugs recorded in the Fig 8 workbook. Rows from this file are filtered out of
# df near the top of the notebook, so on a top-to-bottom run this is empty.
from_fig8 = df['source_file'] == '43018_2022_337_MOESM10_ESM.xlsx'
df.loc[from_fig8, 'Drug'].unique()

In [None]:
# Sheets that still contribute rows to df.
df['excel_sheet'].unique()

### Grouping by (Sample, Drug, Replicate Number)  does not uniquely identify a mouse 

Let's select for Sample: HCI-015, Drug: Vehicle, Replicate Number: M0. Note that (1) the measurements for excel_sheet "6d mid" and "6f top right" are duplicates and (2) the measurements from "7d left" are different.

In [None]:
# All rows for Sample HCI-015 / Vehicle / replicate M0, across every sheet.
is_hci015_vehicle_m0 = (
    df['Sample'].eq('HCI-015')
    & df['Drug'].eq('Vehicle')
    & df['Replicate Number'].eq('M0')
)
s = df.loc[is_hci015_vehicle_m0]
s

### Data that's hard to identify as duplicated
From looking through the excel sheets, I also found that sheet '3f left' and '3h left' have some data that is duplicated, but which is hard to identify as duplicated because the drugs are named differently. The duplication makes sense for the context: '3h left' is a long (100+ day) experiment and '3f left' shows the data for the first weeks of the experiment. However in '3h left' the drug is labeled as 'Fulvestrant' and in '3f left' the drug is labeled as 'Fulvestrant (200 mg/kg)'. For example, in the following two dataframes, the volume measurements are the same through day 26.

I think my main question here is how to check for other data that is duplicated, but which doesn't match exactly on one of the main values like Sample, Drug, or Replicate Number. I'm not sure if there is other duplicated data like this, but it seems like it would be good to check for.

In [None]:
# HCI-003 / M0 as recorded on sheet '3f left', labelled 'Fulvestrant (200 mg/kg)'.
on_3f_left = (
    df['Sample'].eq('HCI-003')
    & df['Replicate Number'].eq('M0')
    & df['excel_sheet'].eq('3f left')
    & df['Drug'].eq('Fulvestrant (200 mg/kg)')
)
ex1 = df.loc[on_3f_left]
ex1.head(15)

In [None]:
# HCI-003 / M0 as recorded on sheet '3h left', labelled 'Fulvestrant'.
# The '3h left' rows are removed from df in an earlier cell, so on a
# top-to-bottom run this selection is empty.
on_3h_left = (
    df['Sample'].eq('HCI-003')
    & df['Replicate Number'].eq('M0')
    & df['excel_sheet'].eq('3h left')
    & df['Drug'].eq('Fulvestrant')
)
ex2 = df.loc[on_3h_left]
ex2.head(15)

### Part 2: An attempt at assigning unique identifiers to mice (work in progress)

To start, let's look at each ('Sample', 'Drug', 'Replicate Number', 'Day', 'excel_sheet') tuple and count the number of associated unique volume measurements. 

In [None]:
# Distribution of the number of distinct volumes per
# (Sample, Drug, Replicate Number, Day, excel_sheet) tuple in df.
tuple_key = ['Sample', 'Drug', 'Replicate Number', 'Day', 'excel_sheet']
distinct_volumes = df.groupby(tuple_key)['Tumor Volume mm3'].nunique()
distinct_volumes.describe()

Some tuples have multiple volume measurements, even in the same excel sheet. This is because the Extended Data Figure (ED1) includes some mice who were given multiple tumors. For now, let's remove the ED1 sheets, to make things simpler.

In [None]:
# Remove data corresponding to ED1 sheets
ed_sheets = ['ED1c', 'ED1d left', 'ED1d right', 'ED1e left', 'ED1e right', 'ED1f left', 'ED1f right', 
             'ED1g left', 'ED1g right']
d = df.loc[~df.excel_sheet.isin(ed_sheets)]

In [None]:
# Same distribution as before, now on d (ED1 sheets excluded).
tuple_key_d = ['Sample', 'Drug', 'Replicate Number', 'Day', 'excel_sheet']
distinct_volumes_d = d.groupby(tuple_key_d)['Tumor Volume mm3'].nunique()
distinct_volumes_d.describe()

From the above we see that each ('Sample', 'Drug', 'Replicate Number', 'Day', 'excel_sheet') tuple now has exactly 1 volume measurement. To start, we'll assign an ID to each ('Sample', 'Drug', 'Replicate Number', 'excel_sheet') tuple. Some sheets have duplicate data, so this will likely be overcounting mice, but we can collapse duplicated data later on.

In [None]:
# Assign an MID to each (Sample, Drug, Replicate Number, excel_sheet) tuple
t1 = ['Sample', 'Drug', 'Replicate Number', 'excel_sheet']
old_len = len(d)
d = d.merge(d.groupby(t1).apply(lambda x: x.name).reset_index(name='ID'), 
              on=t1, 
              validate='many_to_one')
assert len(d) == old_len

In [None]:
# Preview the first five rows of d.
d.head(n=5)

In [None]:
# How many distinct IDs are present in d.
n_unique_ids = d['ID'].nunique()
print('Number of unique IDs: ' + str(n_unique_ids))

As described before, it seems likely that multiple IDs correspond to the same mouse. So we need a way to collapse these IDs. Here I am going to assume that if two ('Sample', 'Drug', 'Replicate Number', 'Day', 'Tumor Volume mm3') tuples are identical, they correspond to the same mouse. Next we'll groupby said tuple and examine the associated IDs. We assume that all the IDs associated with a unique tuple correspond to the same mouse, and should be collapsed.

In [None]:
# For every exact (Sample, Drug, Replicate Number, Day, volume) measurement,
# collect the distinct IDs that carry it.
t2 = ['Sample', 'Drug', 'Replicate Number', 'Day', 'Tumor Volume mm3']
ids_by_measurement = d.groupby(t2)['ID'].agg(lambda ids: ids.unique().tolist())
g = ids_by_measurement.reset_index(name='id_list')
g

We can look up the example tuple from Part 1 ('HCI-015', 'Vehicle', 'M0') in dataframe g. Note that the duplicated measurements from Part 1 are now collapsed. The cases where we had multiple measurements on the same day can be separated out by their different file names.

In [None]:
# Rows of g for the HCI-015 / Vehicle / M0 example.
is_example_row = (
    g['Sample'].eq('HCI-015')
    & g['Drug'].eq('Vehicle')
    & g['Replicate Number'].eq('M0')
)
g.loc[is_example_row]

Next we'd like to map each ID to an MID so that if two IDs appear in the same id_list, they are mapped to the same MID. This should result in a one-to-one mapping between mice and MIDs. (I haven't finished this yet...)

In [None]:
# get a list of unique id_list's, in order of first appearance.
# Each id_list is converted to a tuple so it can serve as a dict key;
# dict.fromkeys keeps the first occurrence of every distinct key in insertion
# order, which replaces the quadratic "compare against everything seen so far"
# scan. The keys are converted back to lists for the cells that follow.
unique_lists = [
    list(id_key)
    for id_key in dict.fromkeys(tuple(id_list) for id_list in g.id_list)
]

In [None]:
# are there any id's that appear in multiple lists?
# First tally, in one pass over unique_lists, how often each ID occurs.
occurrences = {}
for id_list in unique_lists:
    for elt in id_list:
        occurrences[elt] = occurrences.get(elt, 0) + 1

# Then look the tally up for every ID present in d (0 if it occurs in no list).
# d.ID repeats each ID once per row; the dict keeps one entry per ID, ordered
# by first appearance in d.
id_counts = {id_name: occurrences.get(id_name, 0) for id_name in d.ID}

In [None]:
# Print every ID that occurs more than once across the unique id_lists.
for id_name, n_occurrences in id_counts.items():
    if n_occurrences > 1:
        print(id_name)

### Next steps
My next goal is to create sets of IDs that all belong together (because they appear in id_lists together) and assign each set an MID. Then I would map each ID in the set to its respective MID. These MIDs will (hopefully) uniquely identify mice.