# Simulating RNA-seq profiles to represent the perturbed cell line transcriptome

* Author: Eshika Saxena
* Objective: Simulate RNA-seq profiles to represent the perturbed cell line transcriptome by setting the RNA-seq value of the perturbed gene to 0

## Load libraries

In [1]:
import os
import numpy as np
import pandas as pd

## Read data

In [2]:
# Directory that the input CSVs are read from and the output CSVs are written
# to. Defaults to the original author's local path; set the
# MM_CELL_LINES_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
data_dir = os.environ.get(
    'MM_CELL_LINES_DATA_DIR',
    '/Users/eshikasaxena/Documents/MLHC/mm-cell_lines/data/',
)

In [3]:
# CCLE expression table; the column pandas labels 'Unnamed: 0' is renamed to
# 'DepMap_ID' so it can be matched against the other tables.
rnaseq = pd.read_csv(
    os.path.join(data_dir, 'CCLE_expression.csv')
).rename(columns={'Unnamed: 0': 'DepMap_ID'})

In [4]:
# Achilles CRISPR gene-effect table, read as-is (no columns renamed).
crispr = pd.read_csv(
    os.path.join(data_dir, 'Achilles_gene_effect.csv'),
)

In [5]:
# Sanger dose-response table; 'ARXSPAN_ID' is renamed to 'DepMap_ID' so the
# cell-line key has the same name as in the other tables.
doseresponse = pd.read_csv(
    os.path.join(data_dir, 'sanger-dose-response.csv')
).rename(columns={'ARXSPAN_ID': 'DepMap_ID'})

In [6]:
# Keep only the dose-response rows whose DRUG_NAME contains the drug's name
# (case-sensitive). na=False drops rows where DRUG_NAME is missing.
doseresponse_bortez = doseresponse.loc[
    doseresponse['DRUG_NAME'].str.contains("BORTEZOMIB", na=False)
]
doseresponse_lenal = doseresponse.loc[
    doseresponse['DRUG_NAME'].str.contains("LENALIDOMIDE", na=False)
]

## Restrict data to cell-line IDs present in the CRISPR knockout, RNA-seq, and dose-response data for the 2 chosen drugs

In [7]:
# Cell lines present in both the CRISPR and RNA-seq tables, then narrowed per
# drug to those that also appear in that drug's dose-response rows.
ids = list(set(crispr['DepMap_ID']) & set(rnaseq['DepMap_ID']))
bortez_ids = list(set(ids) & set(doseresponse_bortez['DepMap_ID']))
lenal_ids = list(set(ids) & set(doseresponse_lenal['DepMap_ID']))

# Report how many cell lines survive each restriction.
print(len(ids), len(bortez_ids), len(lenal_ids))

800 366 405


In [8]:
# Genes of interest, selected by substring match on the CRISPR column labels:
# every label containing 'PSMB' for bortezomib, and every label containing
# 'IKZF1' or 'IKZF3' for lenalidomide.
bortez_genes = [name for name in crispr.columns if 'PSMB' in name]
lenal_genes = [
    name for name in crispr.columns
    if any(symbol in name for symbol in ('IKZF1', 'IKZF3'))
]

print(bortez_genes)
print(lenal_genes)

['PSMB1 (5689)', 'PSMB10 (5699)', 'PSMB11 (122706)', 'PSMB2 (5690)', 'PSMB3 (5691)', 'PSMB4 (5692)', 'PSMB5 (5693)', 'PSMB6 (5694)', 'PSMB7 (5695)', 'PSMB8 (5696)', 'PSMB9 (5698)']
['IKZF1 (10320)', 'IKZF3 (22806)']


In [9]:
# Per-drug CRISPR tables: rows for the drug's cell lines, columns limited to
# the drug's genes of interest plus the DepMap_ID key.
crispr_bortez = crispr.loc[
    crispr['DepMap_ID'].isin(bortez_ids),
    [*bortez_genes, 'DepMap_ID'],
]
crispr_lenal = crispr.loc[
    crispr['DepMap_ID'].isin(lenal_ids),
    [*lenal_genes, 'DepMap_ID'],
]

In [10]:
# Limit the expression table to DepMap_ID plus the columns named in the
# HGNC_ID column of the Ensembl/HGNC mapping file.
mapping = pd.read_csv('../utils/Ensembl_HGNC_map_042421.csv')
cols_to_keep = ['DepMap_ID', *mapping['HGNC_ID']]
rnaseq = rnaseq.loc[:, cols_to_keep]

# Per-drug expression tables, restricted to each drug's cell lines.
rnaseq_bortez = rnaseq.loc[rnaseq['DepMap_ID'].isin(bortez_ids)]
rnaseq_lenal = rnaseq.loc[rnaseq['DepMap_ID'].isin(lenal_ids)]

## Manually knock out the genes in the RNA-seq profiles and save the corresponding cell viability score

In [11]:
# Build one simulated knockout record per (cell line, bortezomib gene) pair:
# the cell line's RNA-seq row with that gene's value set to 0, plus a
# 'Knockout' entry naming the gene and a 'Response' entry holding the CRISPR
# value for that gene in that cell line.

# Fail loudly if a target gene has no expression column: assigning to a missing
# label would append a new entry instead of zeroing an existing one.
missing_bortez_genes = [g for g in bortez_genes if g not in rnaseq_bortez.columns]
if missing_bortez_genes:
    raise KeyError(
        f'Genes absent from rnaseq_bortez columns: {missing_bortez_genes}'
    )

# Index the CRISPR values by cell line once, rather than re-filtering the whole
# frame for every (cell line, gene) pair. Keeping the first row per ID means a
# duplicated ID resolves to its first occurrence.
bortez_response = (
    crispr_bortez.drop_duplicates(subset='DepMap_ID').set_index('DepMap_ID')
)

bortez_combined = []

for _, row in rnaseq_bortez.iterrows():
    cell_line = row['DepMap_ID']
    for gene in bortez_genes:
        new_row = row.copy()  # leave the source profile untouched
        new_row[gene] = 0
        new_row['Knockout'] = gene
        new_row['Response'] = bortez_response.at[cell_line, gene]
        bortez_combined.append(new_row.to_dict())

In [12]:
# Build one simulated knockout record per (cell line, lenalidomide gene) pair:
# the cell line's RNA-seq row with that gene's value set to 0, plus a
# 'Knockout' entry naming the gene and a 'Response' entry holding the CRISPR
# value for that gene in that cell line.

# Fail loudly if a target gene has no expression column: assigning to a missing
# label would append a new entry instead of zeroing an existing one.
missing_lenal_genes = [g for g in lenal_genes if g not in rnaseq_lenal.columns]
if missing_lenal_genes:
    raise KeyError(
        f'Genes absent from rnaseq_lenal columns: {missing_lenal_genes}'
    )

# Index the CRISPR values by cell line once, rather than re-filtering the whole
# frame for every (cell line, gene) pair. Keeping the first row per ID means a
# duplicated ID resolves to its first occurrence.
lenal_response = (
    crispr_lenal.drop_duplicates(subset='DepMap_ID').set_index('DepMap_ID')
)

lenal_combined = []

for _, row in rnaseq_lenal.iterrows():
    cell_line = row['DepMap_ID']
    for gene in lenal_genes:
        new_row = row.copy()  # leave the source profile untouched
        new_row[gene] = 0
        new_row['Knockout'] = gene
        new_row['Response'] = lenal_response.at[cell_line, gene]
        lenal_combined.append(new_row.to_dict())

## Save data to csv files

In [13]:
# Turn each drug's list of knockout record dicts into a DataFrame.
bortez_df, lenal_df = (
    pd.DataFrame(records) for records in (bortez_combined, lenal_combined)
)

In [14]:
# Number of simulated knockout rows per drug.
print(bortez_df.shape[0], lenal_df.shape[0])

4026 810


In [15]:
# Preview the first five simulated bortezomib knockout rows.
bortez_df.head(n=5)

Unnamed: 0,DepMap_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,PADI6 (353238),CCL3L3 (414062),OR13C2 (392376),DUX4 (100288687),LYPD8 (646627),CCL15 (6359),MTRNR2L12 (100462981),ZNF8 (7554),Knockout,Response
0,ACH-000233,0.097611,0.0,5.919102,3.983678,3.733354,0.028569,6.11124,2.963474,3.415488,...,0.0,1.427606,0.0,0.028569,0.028569,0.0,1.124328,2.042644,PSMB1 (5689),-0.921257
1,ACH-000233,0.097611,0.0,5.919102,3.983678,3.733354,0.028569,6.11124,2.963474,3.415488,...,0.0,1.427606,0.0,0.028569,0.028569,0.0,1.124328,2.042644,PSMB10 (5699),-0.124671
2,ACH-000233,0.097611,0.0,5.919102,3.983678,3.733354,0.028569,6.11124,2.963474,3.415488,...,0.0,1.427606,0.0,0.028569,0.028569,0.0,1.124328,2.042644,PSMB11 (122706),0.194359
3,ACH-000233,0.097611,0.0,5.919102,3.983678,3.733354,0.028569,6.11124,2.963474,3.415488,...,0.0,1.427606,0.0,0.028569,0.028569,0.0,1.124328,2.042644,PSMB2 (5690),-1.266868
4,ACH-000233,0.097611,0.0,5.919102,3.983678,3.733354,0.028569,6.11124,2.963474,3.415488,...,0.0,1.427606,0.0,0.028569,0.028569,0.0,1.124328,2.042644,PSMB3 (5691),-2.250744


In [16]:
# Opt-in export: set save_data to True to write both tables into data_dir as
# CSV files without the DataFrame index.
save_data = False
if save_data:
    for frame, filename in (
        (bortez_df, 'bortezomib_crispr_knockout.csv'),
        (lenal_df, 'lenalidomide_crispr_knockout.csv'),
    ):
        frame.to_csv(os.path.join(data_dir, filename), index=False)