In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the train and val RNA CSVs; the first column of each file becomes the
# row index of the resulting frame.
trainRNA, valRNA = (
    pd.read_csv(csvPath, index_col=0)
    for csvPath in (
        '../../data/processed/RNA_train_cancergenes.csv',
        '../../data/processed/RNA_val_cancergenes.csv',
    )
)

In [3]:
# Pool the row labels of both RNA frames (train first, then val) into one list.
allCellLines = [*trainRNA.index, *valRNA.index]

In [4]:
# Keep only the cell-line ID and cancer-type columns of the drug/cell-line
# pairs table; the shape is printed after every reduction step.
cellLineDF = pd.read_csv('../../data/processed/drugCellLinePairsData.csv').loc[:, ['DepMap_ID', 'cancer_type']]
print(cellLineDF.shape)
# Collapse repeated (DepMap_ID, cancer_type) rows. The result is reassigned
# instead of using inplace=True: the frame comes from a .loc selection, and
# in-place mutation of such a frame can trigger SettingWithCopyWarning.
cellLineDF = cellLineDF.drop_duplicates(keep='first')
print(cellLineDF.shape)
# Restrict to IDs that are present in the index of the train/val RNA tables.
cellLineDF = cellLineDF[cellLineDF.DepMap_ID.isin(allCellLines)]
print(cellLineDF.shape)

(67789, 2)
(473, 2)
(300, 2)


In [5]:
# Shuffle all rows; the fixed seed keeps the resulting order reproducible.
cellLineDF = cellLineDF.sample(
    frac=1.0,
    random_state=12345,
)

In [6]:
# Container for the cross-validation folds, keyed by fold name.
folds = dict()

In [7]:
# Draw 20% of the rows within each cancer type (fixed seed) as the first fold.
fold0 = (
    cellLineDF
    .groupby(by='cancer_type')
    .sample(frac=0.2, random_state=12345)
)

In [8]:
# Rows per cancer type in fold0; print the total, then display the breakdown.
cancerCounts = fold0['cancer_type'].value_counts()
print(cancerCounts.sum())
cancerCounts

59


Lung Cancer                   15
Skin Cancer                    6
Brain Cancer                   5
Ovarian Cancer                 5
Pancreatic Cancer              5
Colon/Colorectal Cancer        4
Esophageal Cancer              4
Bladder Cancer                 3
Breast Cancer                  3
Endometrial/Uterine Cancer     3
Head and Neck Cancer           3
Liver Cancer                   3
Name: cancer_type, dtype: int64

In [9]:
# Record the IDs sampled into fold0 as a plain list.
folds['fold0'] = list(fold0['DepMap_ID'])

In [10]:
# Drop the rows whose ID was assigned to fold0, printing the shape before
# and after the removal.
print(cellLineDF.shape)
inFold0 = cellLineDF['DepMap_ID'].isin(folds['fold0'])
cellLineDF = cellLineDF[~inFold0]
print(cellLineDF.shape)

(300, 2)
(241, 2)


In [11]:
# Build fold1..fold3: for every cancer type, draw as many of the remaining
# rows as fold0 contains for that type, then remove them from the pool.
for i in range(1, 4):
    fold = []
    for ct, subdf in cellLineDF.groupby('cancer_type'):
        # A cancer type that contributed no rows to fold0 is absent from
        # cancerCounts; treat that as a quota of 0 instead of raising KeyError.
        # The quota is also capped at the rows still available, because
        # sample(n=...) without replacement raises ValueError when n exceeds
        # the group size.
        nSamp = min(int(cancerCounts.get(ct, 0)), len(subdf))
        if nSamp == 0:
            continue
        fold.extend(subdf.sample(n=nSamp, random_state=12345).DepMap_ID)

    folds[f'fold{i}'] = fold
    # Take the IDs just assigned out of the pool so later folds cannot reuse them.
    cellLineDF = cellLineDF[~cellLineDF.DepMap_ID.isin(fold)]

In [12]:
# Every ID still left in cellLineDF goes into the last fold.
folds['fold4'] = list(cellLineDF['DepMap_ID'])

In [13]:
# Print each fold name next to the number of IDs it holds.
for f, members in folds.items():
    print(f, len(members))

fold0 59
fold1 59
fold2 59
fold3 59
fold4 64


In [14]:
# Size of the last fold. The key is spelled out rather than read from a loop
# variable left over from another cell, so this cell no longer depends on
# which loop happened to run last.
len(folds['fold4'])

64

In [17]:
# Bring every fold up to the length of the longest one by repeating its own
# final ID, so all lists in `folds` end up the same length.
longest = max(len(lst) for lst in folds.values())

for f in folds.keys():
    lastCL = folds[f][-1]
    shortfall = longest - len(folds[f])
    folds[f].extend([lastCL] * shortfall)

In [18]:
# Turn the fold dict into a table with one column per fold, then display it.
foldDF = pd.DataFrame(data=folds)
foldDF

Unnamed: 0,fold0,fold1,fold2,fold3,fold4
0,ACH-000741,ACH-000026,ACH-000720,ACH-000566,ACH-000890
1,ACH-000396,ACH-000753,ACH-000018,ACH-000011,ACH-000845
2,ACH-000839,ACH-000973,ACH-000905,ACH-000142,ACH-000875
3,ACH-000273,ACH-000231,ACH-000558,ACH-000756,ACH-000869
4,ACH-000323,ACH-000137,ACH-000469,ACH-000232,ACH-000572
...,...,...,...,...,...
59,ACH-000968,ACH-000458,ACH-000822,ACH-000425,ACH-000296
60,ACH-000968,ACH-000458,ACH-000822,ACH-000425,ACH-000619
61,ACH-000968,ACH-000458,ACH-000822,ACH-000425,ACH-000802
62,ACH-000968,ACH-000458,ACH-000822,ACH-000425,ACH-000862


In [19]:
# Write the fold table to disk, leaving out the row index.
foldDF.to_csv(
    '../../data/processed/cellLinesForCV.csv',
    index=False,
)