In [2]:
import pandas as pd
import numpy as np
import scipy as sp
from collections import Counter
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

### Load datasets

In [3]:
df_RPKM = pd.read_csv('CCLE_RNAseq_RPKM.gct',sep = '\t', header=0, skiprows = range(0,2)) 
#skip rows first then set header, so header = 0
df_GR = pd.read_csv('gcsi_drug_response.csv')

In [4]:
df_mut = pd.read_csv('CCLE_mutation_data.txt',sep = '\t',low_memory = False)

### Broad ID of cell lines and the corresponding number of genes with mutation status

In [12]:
ID = list(df_mut['Broad_ID'])
counts = dict(Counter(ID))
df_mc = pd.DataFrame(counts,index = [0]).T
df_mc= df_mc.reset_index(inplace=False)
df_mc.columns = ['Broad ID','Number of genes with mut status recorded for this cell line']
df_mc = df_mc.sort_values(by = ['Broad ID'],ascending= True) 
#Broad ID of cell lines and the corresponding number of genes with mut status"
#df_mc.to_csv('Cell lines and corresponding number of genes with variant classification.csv',sep=',')

In [5]:
df_mc[0:20]

Unnamed: 0,Broad ID,Number of genes with mut status recorded for this cell line
0,ACH-000001,303
1,ACH-000002,195
2,ACH-000003,225
3,ACH-000004,296
4,ACH-000005,315
5,ACH-000006,220
6,ACH-000007,535
7,ACH-000008,363
8,ACH-000009,439
9,ACH-000010,201


It is pretty obvious that the cell lines don't have mutations status for all the genes, and different cell lines have differet number of genes with recorded mutation status.

### Generate dict of cell name and corresponding broad ID in RNAseq expression data

In [7]:
rpkm_cells = list(df_RPKM.columns)
rpkm_cells.remove('Name')
rpkm_cells.remove('Description')

In [8]:
cell_ID = dict()
for item in rpkm_cells:
    pair = item.split(' (')
    value = pair[0].split('_')[0]
    key = pair[1][:-1]
    cell_ID[key]=value
#dict of cell name and corresponding broad ID

### Intersection of all three datasets

In [9]:
GR_cell = set(list(df_GR['Cell_Line']))
RPKM_cell = set(cell_ID.values())

In [10]:
cell_GR_RPKM = GR_cell.intersection(RPKM_cell)

In [13]:
#convert broad ID to cell line name for cell lines with variant classification
VC_cell = []
VC_cell_broad = set(df_mc['Broad ID'])
VC_cell_int = VC_cell_broad.intersection(set(cell_ID.keys()))
for item in VC_cell_int:
    VC_cell.append(cell_ID[item])
VC_cell = set(VC_cell)

In [14]:
cell_GR_RPKM_VC = VC_cell.intersection(cell_GR_RPKM)

In [93]:
len(cell_GR_RPKM_VC)

355

In [17]:
#cell_GR_RPKM_VC

The total number of cell lines found in all three dataset is 355.

### Processing of the gcsi dataset

In [24]:
df_GR_truncated = pd.concat([df_GR['Cell_Line'],df_GR['Perturbagen'],df_GR['GRinf'],df_GR['GR_AOC'],df_GR['Tissue']],axis =1)
df = pd.DataFrame()
for item in cell_GR_RPKM_VC:
    df=df.append(df_GR_truncated[df_GR_truncated['Cell_Line']==item])
df.reset_index(inplace=True,drop=True)

In [25]:
response = []
for index, row in df.iterrows():
    if row['GR_AOC']>0.5 and row['GRinf'] <0:
        response.append('Yes')
    else:
        response.append('No')
df['Response']=response

In [26]:
df[0:20]

Unnamed: 0,Cell_Line,Perturbagen,GRinf,GR_AOC,Tissue,Response
0,U2OS,bid1870,0.85864,0.046695,Bone,No
1,U2OS,bortezomib,-0.83761,0.55488,Bone,Yes
2,U2OS,crizotinib,0.43104,-0.001386,Bone,No
3,U2OS,docetaxel,0.089875,0.35513,Bone,No
4,U2OS,doxorubicin,-0.37088,0.46794,Bone,No
5,U2OS,erlotinib,0.64766,0.11387,Bone,No
6,U2OS,gdc0941,0.66665,0.20431,Bone,No
7,U2OS,gemcitabine,0.71421,0.1346,Bone,No
8,U2OS,irinotecan,-0.96577,0.087327,Bone,No
9,U2OS,lapatinib,-1.0,0.20381,Bone,No


In [27]:
df.to_csv('drug_response_processed.csv',sep =',')

In [37]:
tissue = list(df.Tissue)

In [33]:
graoc = df.set_index('Cell_Line')['GR_AOC'].to_dict()

In [35]:
len(graoc)

355

In [57]:
tissue_counts= pd.DataFrame(dict(Counter(tissue)),index = [0]).T
tissue_counts= tissue_counts.reset_index(inplace=False)
tissue_counts.columns = ['Tissue','counts']
tissue_counts= tissue_counts.sort_values(by=['counts'],ascending=False)

In [60]:
tissue_counts.counts =tissue_counts.counts/16

In [61]:
tissue_counts.counts.sum()

350.25

Unnamed: 0,Tissue,counts
7,Lung,59.5625
5,Blood,29.0
11,Breast,28.3125
6,Skin,27.9375
3,Pancreas,27.0
2,Brain,24.9375
8,Ovary,24.0
12,Colon,21.9375
9,Liver,17.5
4,Stomach,16.3125


In [66]:
tissue_dict = df.set_index('Cell_Line')['Tissue'].to_dict()

In [67]:
tissue_dict

{'2313287': 'Stomach',
 '769P': 'Kidney',
 '786O': 'Kidney',
 '8305C': 'Thyroid',
 '8505C': 'Thyroid',
 'A172': 'Brain',
 'A2058': 'Skin',
 'A2780': 'Ovary',
 'A375': 'Skin',
 'A498': 'Kidney',
 'A549': 'Lung',
 'A673': 'Muscle',
 'ABC1': 'Lung',
 'ACHN': 'Kidney',
 'AGS': 'Stomach',
 'AMO1': 'Blood',
 'AN3CA': 'Uterus',
 'ASPC1': 'Pancreas',
 'AU565': 'Breast',
 'BCPAP': 'Thyroid',
 'BEN': 'Lung',
 'BFTC909': 'Kidney',
 'BHY': 'Head and Neck',
 'BT20': 'Breast',
 'BT474': 'Breast',
 'BT549': 'Breast',
 'BXPC3': 'Pancreas',
 'C2BBE1': 'Colon',
 'C32': 'Skin',
 'C33A': 'Cervix',
 'CA46': 'Blood',
 'CAKI1': 'Kidney',
 'CAKI2': 'Kidney',
 'CAL120': 'Breast',
 'CAL12T': 'Lung',
 'CAL148': 'Breast',
 'CAL27': 'Head and Neck',
 'CAL51': 'Breast',
 'CAL54': 'Kidney',
 'CALU3': 'Lung',
 'CAMA1': 'Breast',
 'CAPAN1': 'Pancreas',
 'CAPAN2': 'Pancreas',
 'CASKI': 'Cervix',
 'CCFSTTG1': 'Brain',
 'CFPAC1': 'Pancreas',
 'CHAGOK1': 'Lung',
 'CHP212': 'Brain',
 'CI1': 'Lymphoid',
 'COLO201': 'Colon',

In [90]:
df_t = pd.DataFrame(tissue_dict,index = [0]).T
df_t = df_t.reset_index(inplace= False)
df_t.columns = ['Cell Line','Tissue']
df_t = df_t.sort_values(by = ['Tissue'],ascending = True)
df_t = df_t.reset_index(inplace= False,drop=True)

In [91]:
df_t.to_csv('cell_line_tissue.csv',sep = ',')