In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

# CCL name cleaning GDSC

In [2]:
# GDSC drug-response metrics; the cell-line column is renamed to 'CCL'.
gdsc_dr = pd.read_csv('data/GDSC/metrics.csv')
gdsc_dr = gdsc_dr.rename(columns={'Cell_line': 'CCL'})

# GDSC expression table (tab-separated). Its unnamed first column is renamed
# to 'CCL' and the table is then transposed.
gdsc_ge = pd.read_csv('data/GDSC/ge.csv', sep='\t')
gdsc_ge = gdsc_ge.rename(columns={'Unnamed: 0': 'CCL'}).transpose()


In [3]:
def clean(s):
    """Return normalised copies of the names in ``s``.

    Every character that is not an ASCII letter or digit is removed from each
    name, and the result is upper-cased.

    Parameters
    ----------
    s : iterable of str
        Names to normalise.

    Returns
    -------
    list of str
        Normalised names, in the same order as the input.
    """
    non_alnum = re.compile("[^A-Za-z0-9]")
    return [non_alnum.sub('', name).upper() for name in s]

In [4]:
# Raw names: unique cell lines in the drug-response table, and the row labels
# of the transposed expression table.
poz_r = gdsc_dr['CCL'].unique().tolist()
cell_r = gdsc_ge.index.to_series().tolist()

# Normalised (punctuation-free, upper-case) versions used for matching.
cell = clean(cell_r)
poz = clean(poz_r)

In [5]:
# Normalised drug-response names with no match among the normalised
# expression names.
miss1 = [ccl for ccl in poz if ccl not in cell]
print(len(miss1))
miss1

8


['786O', 'COLO320', 'EOL1', 'KNS81', 'NCIH510', 'TTTHYROID', 'U266B1', 'UO31']

In [6]:
# Hand-curated aliases for the normalised drug-response names that have no
# direct match in the expression table (normalised DR name -> expression name).
# An explicit dict keeps each pair together; pairing two lists by position
# breaks silently if the order or content of `miss1` ever changes.
# NOTE(review): '7860' and 'U031' use the digit zero where the DR names use the
# letter O - presumably this mirrors the spelling in the expression file; verify.
gdsc_alias = {
    '786O': '7860',
    'COLO320': 'COLO320HSR',
    'EOL1': 'EOL1CELL',
    'KNS81': 'KNS81FD',
    'NCIH510': 'NCIH510A',
    'TTTHYROID': 'TT',
    'U266B1': 'U266',
    'UO31': 'U031',
}
# Kept so the name stays available to any code that still reads it.
miss1_sub = list(gdsc_alias.values())

# Map every *raw* drug-response name to the name used in the expression table.
# Keys must be raw names because `poz2` is applied to the raw 'CCL' column:
# keying the aliases by normalised name would leave raw names that contain
# punctuation (e.g. a hyphen) unrenamed.
poz2 = {}
for k in poz_r:
    cleaned = re.sub("[^A-Za-z0-9]", '', k).upper()
    poz2[k] = gdsc_alias.get(cleaned, cleaned)

In [7]:
# Rename the drug-response cell lines via the `poz2` mapping; names that are
# not keys of `poz2` are left as they are.
gdsc_dr['CCL'] = gdsc_dr['CCL'].replace(poz2)

In [8]:
# Save the renamed GDSC drug-response table without the row index.
gdsc_dr.to_csv('data/Processed/gdsc_poz_dr.csv', index =False)

In [9]:
# The three steps below are order-dependent.
# 1) Move the row labels of the transposed table into an ordinary column.
gdsc_ge = gdsc_ge.reset_index()
# 2) Use the first row as the column header (this is what yields the 'CCL'
#    column that the following cells rely on).
gdsc_ge.columns = gdsc_ge.iloc[0]
# 3) Discard that first row now that it serves as the header.
gdsc_ge = gdsc_ge[1:]

In [10]:
# Build raw -> normalised names for the expression table (non-alphanumerics
# stripped, upper-cased) and apply the mapping to the 'CCL' column.
cell_sub = {}
for raw_name in gdsc_ge['CCL']:
    cell_sub[raw_name] = re.sub("[^A-Za-z0-9]", '', raw_name).upper()
gdsc_ge['CCL'] = gdsc_ge['CCL'].replace(cell_sub)

In [11]:
# Cell lines present in the drug-response table. Built once as a set: the
# previous loop recomputed `unique()` and scanned the array for every row.
gdsc_dr_names = set(gdsc_dr['CCL'].unique())

# Expression rows whose cell line has no drug-response data are discarded.
# The list is seeded with 'index' as before, so a row whose 'CCL' value is
# literally 'index' would also be removed.
drop = ['index']
drop.extend(name for name in gdsc_ge['CCL'] if name not in gdsc_dr_names)

# reset_index() keeps the old row labels as a new column; the save step
# removes that column again.
gdsc_ge = gdsc_ge[~gdsc_ge['CCL'].isin(drop)].reset_index()

In [12]:
# Drop the 'index' column left behind by the earlier reset_index() call, then
# save the filtered GDSC expression table without the row index.
gdsc_ge.drop('index', axis=1).to_csv('data/Processed/gdsc_cell_ge.csv', index=False)

# CCL name cleaning CCLE

In [13]:
# CCLE expression table (tab-separated). Its unnamed first column is renamed
# to 'CCL' and the table is then transposed.
ccle_ge = pd.read_csv('data/CCLE/ge.csv', sep='\t')
ccle_ge = ccle_ge.rename(columns={'Unnamed: 0': 'CCL'}).transpose()

# CCLE drug-response metrics; the cell-line column is renamed to 'CCL'.
ccle_dr = pd.read_csv('data/CCLE/metrics.csv')
ccle_dr = ccle_dr.rename(columns={'Cell_line': 'CCL'})

In [14]:
# Unique cell lines in the CCLE drug-response table.
poz = ccle_dr['CCL'].unique().tolist()
# Upper-cased row labels of the transposed CCLE expression table.
cell = ccle_ge.index.to_series().str.upper().tolist()

In [15]:
# Normalise both name lists with clean() so they can be compared.
poz = clean(poz)
# NOTE: `cell` (normalised CCLE expression names) is read again later, in the
# CTRP section, when looking for unmatched CTRP names.
cell = clean(cell)

In [16]:
# Normalised CCLE drug-response names with no match among the normalised
# expression names.
miss2 = [ccl for ccl in poz if ccl not in cell]
miss2

['BGC823',
 'GLC82',
 'KP1N',
 'KP1NL',
 'M059J',
 'MB157',
 'OVCAR3',
 'SF8657',
 'SNUC2B']

In [17]:
# Normalised names of drug-response cell lines that have no expression
# profile. 'OVCAR3' is deliberately absent: the expression table lists it as
# 'NIH:OVCAR-3', which is renamed to 'OVCAR3' when that table is cleaned.
ccle_unmatched = ['BGC823', 'GLC82', 'KP1N', 'KP1NL', 'M059J', 'SF8657', 'SNUC2B', 'MB157']

# Normalise the names (strip non-alphanumerics, upper-case) before filtering.
# The exclusion list holds normalised names, so testing the raw column against
# it would miss any raw name containing punctuation or lower-case letters.
# Saving the normalised names also keeps this file consistent with the
# cleaned names written to the expression file.
# `assign` returns a new frame; `ccle_dr` itself is left untouched.
ccle_dr_2 = ccle_dr.assign(
    CCL=ccle_dr['CCL'].map(lambda name: re.sub("[^A-Za-z0-9]", '', name).upper())
)
ccle_dr_2 = ccle_dr_2[~ccle_dr_2['CCL'].isin(ccle_unmatched)]
ccle_dr_2.to_csv('data/Processed/ccle_poz_dr.csv', index =False)

In [18]:
# The three steps below are order-dependent.
# 1) Move the row labels of the transposed table into an ordinary column.
ccle_ge = ccle_ge.reset_index()
# 2) Use the first row as the column header (this is what yields the 'CCL'
#    column that the following cells rely on).
ccle_ge.columns = ccle_ge.iloc[0]
# 3) Discard that first row now that it serves as the header.
ccle_ge = ccle_ge[1:]

In [19]:
# Build raw -> normalised names for the CCLE expression table
# (non-alphanumerics stripped, upper-cased).
cell_sub = {}
for raw_name in ccle_ge['CCL'].unique():
    cell_sub[raw_name] = re.sub("[^A-Za-z0-9]", '', raw_name).upper()

# Manual override: this line is renamed to 'OVCAR3' rather than to its
# normalised form.
cell_sub['NIH:OVCAR-3'] = 'OVCAR3'

ccle_ge['CCL'] = ccle_ge['CCL'].replace(cell_sub)

In [20]:
# Normalised names of the cell lines in the drug-response table. The
# expression names were normalised above, so they must be compared with
# normalised drug-response names; comparing with the raw column would discard
# every line whose raw name contains punctuation or lower-case letters.
# Building the set once also avoids recomputing `unique()` for every row.
ccle_dr_names = {
    re.sub("[^A-Za-z0-9]", '', name).upper() for name in ccle_dr['CCL'].unique()
}

# Expression rows whose cell line has no drug-response data are discarded.
# The list is seeded with 'index' as before, so a row whose 'CCL' value is
# literally 'index' would also be removed.
drop = ['index']
drop.extend(name for name in ccle_ge['CCL'] if name not in ccle_dr_names)

# reset_index() keeps the old row labels as a new column; the save step
# removes that column again.
ccle_ge = ccle_ge[~ccle_ge['CCL'].isin(drop)].reset_index()

In [21]:
# Drop the 'index' column left behind by the earlier reset_index() call, then
# save the filtered CCLE expression table without the row index.
ccle_ge.drop('index', axis=1).to_csv('data/Processed/ccle_cell_ge.csv', index =False)

# CCL name cleaning CTRP

In [22]:
# CTRP expression table (tab-separated), first column renamed to 'CCL', then
# transposed. NOTE: `ctrp_ge` is reassigned from `ccle_ge` further down, so the
# frame loaded here is not the one that ends up being saved.
ctrp_ge = pd.read_csv('data/CTRP/cell_ge.csv', sep='\t')
ctrp_ge = ctrp_ge.rename(columns={'Unnamed: 0': 'CCL'}).transpose()

# CTRP drug-response metrics; the cell-line column is renamed to 'CCL'.
ctrp_dr = pd.read_csv('data/CTRP/metrics.csv')
ctrp_dr = ctrp_dr.rename(columns={'Cell_line': 'CCL'})

In [23]:
# `re` is already imported in the first cell; this repeat import is harmless.
import re

# Raw and normalised names of the unique cell lines in the CTRP drug-response table.
poz_r = list(ctrp_dr['CCL'].unique())
poz = clean(poz_r)

In [24]:
# Normalised CTRP names with no match in `cell`. `cell` is not rebuilt in this
# section: it still holds the normalised CCLE expression names from above.
miss = [ccl for ccl in poz if ccl not in cell]
len(miss)

49

In [25]:
# Display the unmatched CTRP names for manual inspection.
miss

['2004',
 'ASKA',
 'BT112',
 'BT131',
 'BT139',
 'BT145',
 'BT147',
 'BT159',
 'BT16',
 'BT164',
 'BT172',
 'BT179',
 'BT216',
 'BT224',
 'BT228',
 'BT231',
 'BT232',
 'BT239',
 'BT245',
 'BT248',
 'BT271',
 'BT286',
 'BT320',
 'BT328',
 'BT330',
 'BT333',
 'BT340',
 'BT359',
 'BT416',
 'BT422',
 'BT428',
 'BT440',
 'BT444',
 'BT482',
 'BT498',
 'BT504',
 'H292',
 'HS578BST',
 'HS888LU',
 'HSTS',
 'KRIJ',
 'PSTS',
 'SW982',
 'SYO1',
 'TM8716',
 'TTTHYROID',
 'UCH1',
 'YAMATO',
 'BT187']

We will drop all CCLs in the BT series, along with some others that are not found on CellMinerCDB.

In [26]:
# Unmatched names that start with 'BT' are marked for removal.
drop = [name for name in miss if name.startswith('BT')]

In [27]:
# Further unmatched names to remove, listed by hand (normalised spelling).
drop.extend(['2004', 'ASKA', 'HSTS', 'KRIJ', 'PSTS', 'SW982', 'SYO1', 'TM8716', 'UCH1', 'YAMATO'])

In [28]:
# Hand-curated aliases for the unmatched names that are kept rather than
# dropped (normalised CTRP name -> expression-table name). An explicit dict
# keeps each pair together; pairing two lists by position breaks silently if
# the order or content of `miss` ever changes.
ctrp_alias = {
    'H292': 'NCIH292',
    'HS578BST': 'HS578T',
    'HS888LU': 'HS888T',
    'TTTHYROID': 'TT',
}
# Kept so the name stays available to any code that still reads it.
miss1_sub = list(ctrp_alias.values())

# Map every *raw* CTRP name that is kept to the name used in the expression
# table. Two details matter here:
#  * keys are raw names, because `poz2` is applied to the raw 'CCL' column;
#    keying the aliases by normalised name would miss raw names that contain
#    punctuation;
#  * values are upper-cased, since the expression names they must match were
#    upper-cased when that table was cleaned.
poz2 = {}
for k in poz_r:
    cleaned = re.sub("[^A-Za-z0-9]", '', k).upper()
    if cleaned in drop:
        # Dropped lines are removed from the table and need no mapping.
        continue
    poz2[k] = ctrp_alias.get(cleaned, cleaned)

# As before, `miss` ends up holding only the unmatched names that are kept.
miss = [v for v in miss if v not in drop]

In [29]:
# `drop` holds normalised names, so the rows to remove are found by comparing
# against normalised copies of the raw names; testing the raw column directly
# would miss any raw name that contains punctuation or lower-case letters.
ctrp_clean = ctrp_dr['CCL'].map(lambda name: re.sub("[^A-Za-z0-9]", '', name).upper())

# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the original (SettingWithCopyWarning).
ctrp_dr = ctrp_dr[~ctrp_clean.isin(drop)].copy()

# Rename the remaining cell lines via `poz2`; names that are not keys of
# `poz2` are left as they are.
ctrp_dr['CCL'] = ctrp_dr['CCL'].replace(poz2)

In [None]:
# Save the filtered and renamed CTRP drug-response table without the row index.
ctrp_dr.to_csv('data/Processed/ctrp_poz_dr.csv', index =False)

In [None]:
# The CTRP expression data are taken from the processed CCLE expression frame,
# replacing the `ctrp_ge` frame loaded at the start of this section.
# NOTE(review): at this point `ccle_ge` has already been restricted to cell
# lines present in the CCLE drug-response table, so CTRP lines outside that
# set cannot appear here - confirm this is intended.
ctrp_ge = ccle_ge

# Cell lines present in the CTRP drug-response table. Built once as a set: the
# previous loop recomputed `unique()` and scanned the array for every row.
ctrp_dr_names = set(ctrp_dr['CCL'].unique())

# Expression rows whose cell line has no CTRP drug-response data are
# discarded. The list is seeded with 'index' as before, so a row whose 'CCL'
# value is literally 'index' would also be removed.
drop = ['index']
drop.extend(name for name in ctrp_ge['CCL'] if name not in ctrp_dr_names)

# reset_index() keeps the old row labels as a new column; the save step
# removes the helper columns again.
ctrp_ge = ctrp_ge[~ctrp_ge['CCL'].isin(drop)].reset_index()

In [None]:
# Display the filtered CTRP expression frame for a visual check.
ctrp_ge

In [None]:
# Remove the helper columns 'index' and 'level_0' before saving; presumably
# these are the columns added by the two reset_index() calls this frame has
# been through (one in the CCLE section, one in this section) - verify.
ctrp_ge.drop(['index', 'level_0'], axis=1).to_csv('data/Processed/ctrp_cell_ge.csv', index =False)