# Preprocessing CCLE RNA-seq and CRISPR knockout data

* Author: Eshika Saxena
* Objective: Combine RNA-seq and CRISPR knockout data by joining on shared DepMap IDs

## Load libraries

In [1]:
import os
import numpy as np
import pandas as pd

## Read data

In [2]:
# Directory holding the raw input CSVs read below. Set the CCLE_DATA_DIR
# environment variable to point the notebook at another location; when it is
# unset or empty, the original author's local path is used, so existing setups
# behave exactly as before.
data_dir = os.environ.get('CCLE_DATA_DIR') or '/Users/eshikasaxena/Documents/MLHC/mm-cell_lines/data/'

In [3]:
# Load the expression table and give its ID column a proper name: the column
# arrives labelled 'Unnamed: 0' and is renamed to 'DepMap_ID' so it can be
# matched against the CRISPR table later on.
rnaseq = (
    pd.read_csv(os.path.join(data_dir, 'CCLE_expression.csv'))
    .rename(columns={'Unnamed: 0': 'DepMap_ID'})
)

In [4]:
# Load the CRISPR knockout gene-effect table from the same data directory.
crispr = pd.read_csv(
    os.path.join(data_dir, 'Achilles_gene_effect.csv'),
)

## Data selection

* Only keep DepMap IDs that are present in both the RNA-seq and the CRISPR knockout data.
* Restrict knockout cell viability scores to the 13 genes of interest (IKZF1, IKZF3, and the PSMB genes).
* Only keep RNA-seq columns that are listed in the Ensembl–HGNC mapping file.

In [5]:
# DepMap IDs that occur in both tables. The list is built from a set, so its
# order is arbitrary; it is only used for membership tests and counting.
ids = list(
    set(crispr.DepMap_ID) & set(rnaseq.DepMap_ID)
)

print(len(ids))

800


In [6]:
# genes of interest
# Genes of interest: every CRISPR column whose label contains one of these
# markers (substring match, checked in the order listed).
genes = [
    col
    for col in crispr.columns
    if any(marker in col for marker in ('PSMB', 'IKZF1', 'IKZF3'))
]

print(len(genes))
print(genes)

13
['IKZF1 (10320)', 'IKZF3 (22806)', 'PSMB1 (5689)', 'PSMB10 (5699)', 'PSMB11 (122706)', 'PSMB2 (5690)', 'PSMB3 (5691)', 'PSMB4 (5692)', 'PSMB5 (5693)', 'PSMB6 (5694)', 'PSMB7 (5695)', 'PSMB8 (5696)', 'PSMB9 (5698)']


In [7]:
# Keep only the shared DepMap IDs (rows) and the genes of interest plus the
# ID column (columns), in a single label-based selection.
crispr = crispr.loc[crispr.DepMap_ID.isin(ids), genes + ['DepMap_ID']]

# Row count, then column count.
print(crispr.shape[0])
print(crispr.shape[1])

800
14


In [8]:
# Preview the first five rows of the filtered knockout table.
crispr.head(n=5)

Unnamed: 0,IKZF1 (10320),IKZF3 (22806),PSMB1 (5689),PSMB10 (5699),PSMB11 (122706),PSMB2 (5690),PSMB3 (5691),PSMB4 (5692),PSMB5 (5693),PSMB6 (5694),PSMB7 (5695),PSMB8 (5696),PSMB9 (5698),DepMap_ID
0,-0.251064,-0.071323,-1.155305,0.057432,-0.000702,-1.222269,-2.260595,-1.577801,-1.332704,-1.689503,-0.887485,-0.108344,-0.165439,ACH-000004
1,0.063878,-0.403692,-1.594858,-0.192526,-0.308517,-1.016299,-2.368922,-2.043022,-1.087012,-1.978912,-0.666326,-0.141511,-0.293103,ACH-000005
2,0.187426,-0.295669,-0.843872,0.091649,-0.218583,-0.899423,-1.983529,-1.728158,-0.681205,-1.328054,-0.526499,0.03364,0.092887,ACH-000007
3,0.130305,-0.204921,-0.893719,-0.172479,-0.203111,-1.693056,-1.993529,-2.051607,-1.910243,-2.209431,-0.83308,0.168518,-0.168452,ACH-000009
4,0.120217,-0.11119,-1.257408,-0.281493,-0.08527,-1.309566,-2.289617,-1.635516,-1.174024,-1.981024,-0.730417,0.12016,-0.063422,ACH-000011


In [9]:
# The mapping file's HGNC_ID values are used directly as RNA-seq column
# labels; only those columns (plus the ID column) are kept.
mapping = pd.read_csv('../utils/Ensembl_HGNC_map_042421.csv')
cols_to_keep = ['DepMap_ID', *mapping.HGNC_ID]

# Select the columns first, then the rows whose DepMap ID is shared with the
# CRISPR table.
rnaseq = rnaseq.loc[:, cols_to_keep]
rnaseq = rnaseq[rnaseq['DepMap_ID'].isin(ids)]

print(len(rnaseq))

800


In [10]:
rnaseq.head()

Unnamed: 0,DepMap_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,CCL3L3 (414062),OR13C2 (392376),ZNF727 (442319),DUX4 (100288687),LYPD8 (646627),CCL15 (6359),POLR2J2 (246721),MTRNR2L12 (100462981),UGT2A1 (10941),ZNF8 (7554)
1,ACH-001289,5.209843,0.545968,7.070604,2.538538,3.510962,0.0,0.176323,3.836934,4.20085,...,0.0,0.0,1.831877,0.028569,0.042644,0.0,0.286881,0.31034,0.0,1.618239
3,ACH-001538,5.726831,0.0,7.086189,2.543496,3.102658,0.0,5.914565,6.099716,4.475733,...,0.0,0.0,0.475085,0.0,0.863938,0.056584,0.275007,0.298658,0.0,2.111031
7,ACH-000233,0.097611,0.0,5.919102,3.983678,3.733354,0.028569,6.11124,2.963474,3.415488,...,1.427606,0.0,0.0,0.028569,0.028569,0.0,2.395063,1.124328,0.070389,2.042644
8,ACH-000461,4.712596,0.0,6.406333,2.247928,3.032101,0.028569,0.097611,5.528571,6.383704,...,0.042644,0.0,0.014355,0.042644,0.014355,0.0,1.786596,0.545968,0.0,1.427606
10,ACH-001794,4.463361,0.042644,6.714658,1.847997,3.171527,0.62293,6.975905,6.993788,4.234961,...,0.084064,0.0,0.0,0.0,2.056584,0.0,0.084064,0.555816,0.0,1.627607


## Merge and save RNA-seq and knockout data

In [11]:
# Inner join of the two tables on the shared ID column. Column labels that
# exist in both frames are disambiguated with the pandas default suffixes:
# '_x' for the left (RNA-seq) frame and '_y' for the right (CRISPR) frame.
merged = pd.merge(
    rnaseq,
    crispr,
    how='inner',
    on='DepMap_ID',
    suffixes=('_x', '_y'),
)
print(len(merged))

800


In [12]:
# Preview the first five rows of the merged table.
merged.head(n=5)

Unnamed: 0,DepMap_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,PSMB10 (5699)_y,PSMB11 (122706)_y,PSMB2 (5690)_y,PSMB3 (5691)_y,PSMB4 (5692)_y,PSMB5 (5693)_y,PSMB6 (5694)_y,PSMB7 (5695)_y,PSMB8 (5696)_y,PSMB9 (5698)_y
0,ACH-001289,5.209843,0.545968,7.070604,2.538538,3.510962,0.0,0.176323,3.836934,4.20085,...,-0.451711,-0.10644,-1.402389,-2.247205,-1.71851,-1.606852,-1.916276,-0.73549,0.206366,-0.242669
1,ACH-001538,5.726831,0.0,7.086189,2.543496,3.102658,0.0,5.914565,6.099716,4.475733,...,-0.165351,-0.17798,-1.072225,-1.723389,-1.688879,-1.182487,-1.565613,-1.011669,0.125924,-0.200086
2,ACH-000233,0.097611,0.0,5.919102,3.983678,3.733354,0.028569,6.11124,2.963474,3.415488,...,-0.124671,0.194359,-1.266868,-2.250744,-1.739661,-0.856249,-1.614195,-0.91198,-0.047837,0.022769
3,ACH-000461,4.712596,0.0,6.406333,2.247928,3.032101,0.028569,0.097611,5.528571,6.383704,...,-0.144834,-0.043003,-1.228835,-2.13586,-1.899289,-1.952942,-2.377131,-1.012483,0.062852,-0.196778
4,ACH-001794,4.463361,0.042644,6.714658,1.847997,3.171527,0.62293,6.975905,6.993788,4.234961,...,-0.276815,-0.112101,-1.378883,-1.709262,-1.62316,-0.604302,-1.740776,-1.193669,0.171775,-0.142581


In [13]:
# Writing is off by default; set save_data to True to export the merged table
# (without the row index) into the data directory.
save_data = False
if save_data:
    merged.to_csv(
        os.path.join(data_dir, 'rnaseq_crispr_merged.csv'),
        index=False,
    )