In [1]:
import os
import numpy as np
import pandas as pd

## Read data

In [2]:
# Directory holding the input CSVs read by the cells below.
# Set the MLHC_DATA_DIR environment variable to point at a different location;
# when it is unset, the original hard-coded path is used unchanged.
data_dir = os.environ.get(
    'MLHC_DATA_DIR',
    '/Users/eshikasaxena/Documents/MLHC/mm-cell_lines/data/',
)

In [3]:
# Load CCLE_expression.csv from data_dir and rename the column pandas parsed
# as 'Unnamed: 0' to DepMap_ID, the key the later cells join on.
rnaseq = (
    pd.read_csv(os.path.join(data_dir, 'CCLE_expression.csv'))
    .rename(columns={'Unnamed: 0': 'DepMap_ID'})
)

In [4]:
# Load Achilles_gene_effect.csv from data_dir. Later cells read its DepMap_ID
# column and pick gene columns by name (e.g. 'PSMB1 (5689)').
crispr = pd.read_csv(os.path.join(data_dir,'Achilles_gene_effect.csv'))

In [5]:
# Load sanger-dose-response.csv from data_dir and rename its ARXSPAN_ID column
# to DepMap_ID so it shares a key name with the other tables.
doseresponse = (
    pd.read_csv(os.path.join(data_dir, 'sanger-dose-response.csv'))
    .rename(columns={'ARXSPAN_ID': 'DepMap_ID'})
)

In [6]:
# Load sample_info.csv from data_dir.
# NOTE(review): sample_info is not referenced by any other cell visible here —
# confirm whether it is still needed.
sample_info = pd.read_csv(os.path.join(data_dir,'sample_info.csv'))

In [7]:
# Split out the dose-response rows for each drug by matching on DRUG_NAME.
# na=False makes rows with a missing drug name count as non-matches.
doseresponse_bortez = doseresponse.loc[
    doseresponse['DRUG_NAME'].str.contains("BORTEZOMIB", na=False)
]
doseresponse_lenal = doseresponse.loc[
    doseresponse['DRUG_NAME'].str.contains("LENALIDOMIDE", na=False)
]

## Restrict data to DepMap IDs present in the CRISPR knockout data, the RNA-seq data, and the dose-response data for each of the two chosen drugs

In [8]:
# DepMap IDs that have both a CRISPR profile and an expression profile.
ids = list(set(crispr['DepMap_ID']) & set(rnaseq['DepMap_ID']))

# Narrow further to IDs that also have dose-response rows for each drug.
bortez_ids = list(set(ids) & set(doseresponse_bortez['DepMap_ID']))
lenal_ids = list(set(ids) & set(doseresponse_lenal['DepMap_ID']))

print(len(ids), len(bortez_ids), len(lenal_ids))

800 366 405


In [9]:
# genes suggested by Romanos
# Selection is by substring match on the CRISPR column names, so every column
# containing 'PSMB' (resp. 'IKZF1' or 'IKZF3') is picked up.
bortez_genes = [name for name in crispr.columns if 'PSMB' in name]
lenal_genes = [
    name for name in crispr.columns
    if any(tag in name for tag in ('IKZF1', 'IKZF3'))
]

print(bortez_genes, lenal_genes, sep='\n')

['PSMB1 (5689)', 'PSMB10 (5699)', 'PSMB11 (122706)', 'PSMB2 (5690)', 'PSMB3 (5691)', 'PSMB4 (5692)', 'PSMB5 (5693)', 'PSMB6 (5694)', 'PSMB7 (5695)', 'PSMB8 (5696)', 'PSMB9 (5698)']
['IKZF1 (10320)', 'IKZF3 (22806)']


In [10]:
# For each drug, keep only the CRISPR rows for that drug's DepMap IDs and only
# the selected gene columns plus the DepMap_ID key.
crispr_bortez = crispr.loc[
    crispr['DepMap_ID'].isin(bortez_ids),
    bortez_genes + ['DepMap_ID'],
]
crispr_lenal = crispr.loc[
    crispr['DepMap_ID'].isin(lenal_ids),
    lenal_genes + ['DepMap_ID'],
]

In [11]:
# Restrict the expression table to DepMap_ID plus the columns listed in the
# HGNC_ID column of the mapping file (read from the working directory, not
# from data_dir).
mapping = pd.read_csv('Ensembl_HGNC_map_042421.csv')
cols_to_keep = ['DepMap_ID', *mapping['HGNC_ID']]
rnaseq = rnaseq.loc[:, cols_to_keep]

# Per-drug expression subsets, filtered to the IDs shared with each drug.
rnaseq_bortez = rnaseq.loc[rnaseq['DepMap_ID'].isin(bortez_ids)]
rnaseq_lenal = rnaseq.loc[rnaseq['DepMap_ID'].isin(lenal_ids)]

## Manually knock out the genes in the RNA-seq data and save the CRISPR response value

In [12]:
# Build one record per (expression row, gene) pair for bortezomib: a copy of
# the expression row with that gene's entry set to 0, the gene name stored
# under 'Knockout', and the CRISPR value for that gene and DepMap_ID stored
# under 'Response'.
#
# NOTE(review): this assumes every name in bortez_genes is already a column of
# rnaseq_bortez. If one is not, `new_row[gene] = 0` adds a new key instead of
# zeroing an existing expression value — confirm against the HGNC mapping.

# Index the CRISPR values by DepMap_ID once, instead of re-scanning the whole
# frame with a boolean mask for every (row, gene) pair. drop_duplicates keeps
# the first row per ID, matching the original `.values[0]` lookup.
bortez_response = (
    crispr_bortez
    .drop_duplicates(subset='DepMap_ID')
    .set_index('DepMap_ID')
)

bortez_combined = []

for index, row in rnaseq_bortez.iterrows():
    depmap_id = row['DepMap_ID']
    for gene in bortez_genes:
        new_row = row.copy()
        new_row[gene] = 0
        new_row['Knockout'] = gene
        new_row['Response'] = bortez_response.at[depmap_id, gene]
        bortez_combined.append(new_row.to_dict())

In [13]:
# Build one record per (expression row, gene) pair for lenalidomide: a copy of
# the expression row with that gene's entry set to 0, the gene name stored
# under 'Knockout', and the CRISPR value for that gene and DepMap_ID stored
# under 'Response'.
#
# NOTE(review): this assumes every name in lenal_genes is already a column of
# rnaseq_lenal. If one is not, `new_row[gene] = 0` adds a new key instead of
# zeroing an existing expression value — confirm against the HGNC mapping.

# Index the CRISPR values by DepMap_ID once, instead of re-scanning the whole
# frame with a boolean mask for every (row, gene) pair. drop_duplicates keeps
# the first row per ID, matching the original `.values[0]` lookup.
lenal_response = (
    crispr_lenal
    .drop_duplicates(subset='DepMap_ID')
    .set_index('DepMap_ID')
)

lenal_combined = []

for index, row in rnaseq_lenal.iterrows():
    depmap_id = row['DepMap_ID']
    for gene in lenal_genes:
        new_row = row.copy()
        new_row[gene] = 0
        new_row['Knockout'] = gene
        new_row['Response'] = lenal_response.at[depmap_id, gene]
        lenal_combined.append(new_row.to_dict())

In [14]:
# Turn each list of per-knockout record dicts into a DataFrame (one row per
# record).
bortez_df, lenal_df = (
    pd.DataFrame(records) for records in (bortez_combined, lenal_combined)
)

In [15]:
# Row counts of the two knockout tables (bortezomib, then lenalidomide).
print(bortez_df.shape[0], lenal_df.shape[0])

4026 810


In [16]:
# Write both knockout tables to CSV in the working directory, without the
# DataFrame index column.
bortez_df.to_csv(path_or_buf='bortezomib_crispr_knockout.csv', index=False)
lenal_df.to_csv(path_or_buf='lenalidomide_crispr_knockout.csv', index=False)