In [1]:
import numpy as np
import pandas as pd

In [2]:
data_fn = '../results/2023-06-12/clean_and_split_data/welm_pdx_clean_mid_volume.csv'
df = pd.read_csv(data_fn)
df.head()

Unnamed: 0,MID,Sample,Drug,log(V_V0)
0,21,HCI-023,Docetaxel,5.306245
1,22,HCI-023,Docetaxel,5.306245
2,23,HCI-023,Docetaxel,5.306245
3,24,HCI-023,Vehicle,5.306245
4,25,HCI-023,Vehicle,5.306245


In [3]:
df.Drug.unique()

array(['Docetaxel', 'Vehicle', 'RO4929097', 'Fulvestrant (40 mg/kg)',
       'Fulvestrant (200 mg/kg)', 'Navitoclax', 'Birinapant',
       'Irinotecan', 'Birinapant + Irinotecan'], dtype=object)

In [4]:
# Collapse Fulvestrant drugs
drug_map = {'Fulvestrant (200 mg/kg)': 'Fulvestrant', 'Fulvestrant (40 mg/kg)': 'Fulvestrant'}
df['Drug'] = df['Drug'].replace(drug_map)

In [5]:
df.head()

Unnamed: 0,MID,Sample,Drug,log(V_V0)
0,21,HCI-023,Docetaxel,5.306245
1,22,HCI-023,Docetaxel,5.306245
2,23,HCI-023,Docetaxel,5.306245
3,24,HCI-023,Vehicle,5.306245
4,25,HCI-023,Vehicle,5.306245


### Samples

In [11]:
samples = list(df.Sample.unique())
n_samples = len(samples)
print('Number of samples in dataset: ' + str(n_samples))
print(samples)

Number of samples in dataset: 13
['HCI-023', 'HCI-002', 'HCI-003', 'HCI-010', 'HCI-024', 'HCI-015', 'HCI-027', 'HCI-019', 'HCI-016', 'HCI-001', 'HCI-012', 'HCI-011', 'HCI-017']


### Drugs

In [9]:
drugs = list(df.Drug.unique())
n_drugs = len(drugs)
print('Number of drugs in dataset, including vehicle: ' + str(n_drugs))
print(drugs)

Number of drugs in dataset, including vehicle: 8
['Docetaxel', 'Vehicle', 'RO4929097', 'Fulvestrant', 'Navitoclax', 'Birinapant', 'Irinotecan', 'Birinapant + Irinotecan']


### Sample-Drug Pairs

In [13]:
df['pair'] = df[['Sample', 'Drug']].apply(tuple, axis=1)
pairs = list(df.pair.unique())
n_pairs = len(pairs)
print('Number of sample-drug pairs in dataset: ' + str(n_pairs))
print(pairs)

Number of sample-drug pairs in dataset: 47
[('HCI-023', 'Docetaxel'), ('HCI-023', 'Vehicle'), ('HCI-002', 'Docetaxel'), ('HCI-002', 'Vehicle'), ('HCI-002', 'RO4929097'), ('HCI-003', 'Fulvestrant'), ('HCI-003', 'Vehicle'), ('HCI-010', 'Navitoclax'), ('HCI-024', 'Navitoclax'), ('HCI-015', 'Navitoclax'), ('HCI-027', 'Navitoclax'), ('HCI-002', 'Navitoclax'), ('HCI-015', 'Docetaxel'), ('HCI-015', 'Vehicle'), ('HCI-019', 'Docetaxel'), ('HCI-019', 'Vehicle'), ('HCI-016', 'Docetaxel'), ('HCI-016', 'Vehicle'), ('HCI-027', 'Docetaxel'), ('HCI-027', 'Vehicle'), ('HCI-010', 'Docetaxel'), ('HCI-010', 'Vehicle'), ('HCI-024', 'Docetaxel'), ('HCI-024', 'Vehicle'), ('HCI-001', 'Docetaxel'), ('HCI-001', 'Vehicle'), ('HCI-027', 'Birinapant'), ('HCI-015', 'Birinapant'), ('HCI-001', 'Birinapant'), ('HCI-002', 'Birinapant'), ('HCI-019', 'Birinapant'), ('HCI-012', 'Birinapant'), ('HCI-023', 'Birinapant'), ('HCI-012', 'Vehicle'), ('HCI-015', 'RO4929097'), ('HCI-027', 'RO4929097'), ('HCI-010', 'RO4929097'), ('

First we need to find the number of pairs each drug appears in. This is a clear lower bound on the number of folds possible.

In [17]:
d = df[['Sample', 'Drug', 'pair']].drop_duplicates()
d = d.groupby('Drug')['pair'].count()
d

Drug
Birinapant                  7
Birinapant + Irinotecan     3
Docetaxel                   9
Fulvestrant                 3
Irinotecan                  3
Navitoclax                  5
RO4929097                   4
Vehicle                    13
Name: pair, dtype: int64

So it seems at best we could have 3 distinct test sets covering all drugs. Let's see if this is possible...

In [18]:
m = pd.merge(d, on='Drug', )

Drug
Birinapant                 7
Birinapant + Irinotecan    3
Docetaxel                  9
Fulvestrant                3
Irinotecan                 3
Name: pair, dtype: int64

In [7]:
# Constraints:
# (0) No (sample, drug) pair in the test appears in the training set.
# (1) Every drug in the test set appears in the training set
# (2) Every sample in the test set appears in the training set
# (3) Test set includes 20-25% of the sample, drug pairs
# --------------------
# (3) Fulvestrant dosages are collapsed to one drug
# (4) What about Birinapant, Irinotecan, Birinapant + Irinotecan?

In [None]:
# Options for train-test splits:
# (A) Hold out a block of samples, drugs: What is the max test set size feasible?
# (B) K-fold: requires K *disjoint* test sets ---> How many disjoint test sets are feasible?
# (C) Random train-test splits: how much overlap will they have?