# Multiomic CCLE analysis using the Network Zoo
Marouen Ben Guebila <sup>1</sup>

<sup>1</sup> Harvard T.H. Chan School of Public Health, Boston, MA, USA.

# Introduction
The Cancer Cell Line Encyclopedia (CCLE) has collected various omic data for more than a thousand cancer cell lines, representative of many lineages and tissue types. In this analysis, we will first use DRAGON to find associations between multiomic data types, and second, we will use PANDA-LIONESS-MONSTER to model a transition from primary to metastatic melanoma and identify drivers of this transition.
# Import packages

In [None]:
import numpy as np
from scipy.stats import skew
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns

We set the plotting parameters and the strategy used to impute missing values.

In [None]:
# Imputation strategy passed to the processing functions below; they replace
# missing values with 0 when this is 'zero'.
imputationMissing='zero'
# Font family used by matplotlib for all figure text.
plt.rcParams["font.family"] = "arial"

Next, we define the data path on the netbooks server.

In [None]:
# Directory holding the CCLE input files. File names are appended to this
# string directly, so it must end with a path separator.
ppath = '/opt/data/netZooPy/ccle/'

Then, we define a set of functions to import and process CCLE data.

In [None]:
def processdepdata(imputationMissing):
    """Load the Achilles gene-effect table from ``ppath``.

    The first CSV column is used as the row index.

    Parameters
    ----------
    imputationMissing : str
        When equal to ``'zero'``, missing values are replaced by 0;
        any other value leaves them untouched.

    Returns
    -------
    pandas.DataFrame
        The gene-effect table, optionally zero-imputed.
    """
    gene_effect_file = ppath + 'Achilles_gene_effect.csv'
    gene_effect = pd.read_csv(gene_effect_file, index_col=0)
    if imputationMissing != 'zero':
        return gene_effect
    gene_effect = gene_effect.replace(to_replace=np.nan, value=0)
    return gene_effect

In [None]:
def processmirnadata(imputationMissing,cellNames):
    """Load the CCLE miRNA table and return it transposed.

    Parameters
    ----------
    imputationMissing : str
        Not used by this function; accepted so the signature matches the
        other ``process*`` loaders.
    cellNames : pandas.DataFrame
        Sample metadata forwarded to ``convertToDepMap``.

    Returns
    -------
    pandas.DataFrame
        Transposed output of ``convertToDepMap``.
    """
    gct_file = ppath + 'CCLE_miRNA_20181103.gct'
    # Tab-separated file: '#' lines are comments, the first two rows are
    # skipped, and the second column becomes the row index.
    table = pd.read_csv(gct_file, sep='\t', comment='#', skiprows=2, index_col=1)
    # Drop the first remaining column, which is not needed downstream.
    table = table.iloc[:, 1:]
    # Convert cell names to DepMap IDs, then flip the orientation.
    return convertToDepMap(table, cellNames).transpose()

In [None]:
def processDrugs(imputationMissing):
    """Load the primary-screen log-fold-change table with drug-name columns.

    Parameters
    ----------
    imputationMissing : str
        When equal to ``'zero'``, missing values are replaced by 0.

    Returns
    -------
    pandas.DataFrame
        Log-fold-change table restricted to rows without an underscore in
        their index label and to columns listed in the treatment-info file,
        with those columns relabelled by the ``name`` field.
    """
    viability = pd.read_csv(
        ppath + 'primary-screen-replicate-collapsed-logfold-change.csv', index_col=0
    )
    treatment_info = pd.read_csv(
        ppath + 'primary-screen-replicate-collapsed-treatment-info.csv'
    )
    # Failed drug experiments are recognised by an underscore in the row
    # label; keep only the positions of the remaining rows.
    valid_rows = [
        pos for pos, label in enumerate(viability.index) if '_' not in label
    ]
    viability = viability.iloc[valid_rows, :]
    # Positions of the shared identifiers in the table columns and in the
    # metadata 'column_name' field, respectively.
    _, col_pos, meta_pos = np.intersect1d(
        viability.columns, treatment_info['column_name'], return_indices=True
    )
    # Reorder the columns to the intersection order first ...
    viability = viability.iloc[:, col_pos]
    # ... then swap the identifiers for the drug names.
    viability.columns = treatment_info.loc[meta_pos, 'name']
    if imputationMissing == 'zero':
        viability = viability.replace(to_replace=np.nan, value=0)
    return viability

In [None]:
def processPPI(imputationMissing,cellNames):
    """Load the normalized protein quantification table.

    Parameters
    ----------
    imputationMissing : str
        When equal to ``'zero'``, missing values are replaced by 0.
    cellNames : pandas.DataFrame
        Sample metadata forwarded to ``convertToDepMap``.

    Returns
    -------
    pandas.DataFrame
        Transposed output of ``convertToDepMap`` applied to the cleaned table.
    """
    ppi = pd.read_csv(ppath+'Table_S2_Protein_Quant_Normalized.csv')
    # remove extra columns manually: only the first 426 columns are kept
    ppi = ppi.iloc[:, :426]
    # Keep SW948_LARGE_INTESTINE_TenPx20, CAL120_BREAST_TenPx28, and HCT15_LARGE_INTESTINE_TenPx18
    # according to https://www.biorxiv.org/content/10.1101/2020.02.03.932384v1
    # NOTE(review): positions 132, 64 and 338 are presumably the other
    # replicate of each of these three cell lines in this exact file layout —
    # confirm if the input table changes.
    ppi = ppi.drop(labels=ppi.columns[[132, 64, 338]], axis=1)
    # The second column supplies the row labels; the first 49 columns are
    # then discarded as metadata.
    ppiindex = ppi.iloc[:, 1]
    ppi = ppi.iloc[:, 49:]
    ppi.index = ppiindex
    if imputationMissing == 'zero':
        ppi = ppi.fillna(0)
    # rename columns: keep only the first two underscore-separated fields
    ppi.columns = ['_'.join(name.split('_')[0:2]) for name in ppi.columns]
    # Remove rows whose label is NaN. A boolean mask is used rather than a
    # label lookup because selecting by a list of labels repeats rows when
    # the same label occurs more than once in the index.
    ppi = ppi.loc[ppi.index.notna()]
    ppi = convertToDepMap(ppi, cellNames)
    ppi = ppi.transpose()
    return ppi

In [None]:
def processmetabolism():
    """Load the CCLE metabolomics table.

    The second CSV column is used as the row index, the first remaining
    column is dropped, and rows whose index label is missing are removed.

    Returns
    -------
    pandas.DataFrame
        The cleaned metabolomics table.
    """
    metabolism = pd.read_csv(ppath+'CCLE_metabolomics_20190502.csv', index_col=1)
    # remove extra column in metabolism
    metabolism = metabolism.iloc[:, 1:]
    # Remove rows with a NaN label. A boolean mask is used rather than a
    # label lookup because selecting by a list of labels repeats rows when
    # the same label occurs more than once in the index.
    metabolism = metabolism.loc[metabolism.index.notna()]
    return metabolism

# 1. DRAGON multiomic CCLE network
First, we load the metadata.

In [None]:
# Sample metadata; passed to the loaders that call convertToDepMap.
cellNames=pd.read_csv(ppath+'sample_info.csv')
# Treatment metadata for the primary drug screen (processDrugs reads the same
# file again on its own).
drugMeta=pd.read_csv(ppath+'primary-screen-replicate-collapsed-treatment-info.csv')

## 1.1. Correlations between miRNA and gene dependency

We compute correlations between miRNA levels and gene dependency.

In [None]:
def estimatemirnadep(imputationMissing,cellNames):
    """Estimate miRNA / gene-dependency associations.

    Parameters
    ----------
    imputationMissing : str
        Imputation strategy forwarded to the data loaders.
    cellNames : pandas.DataFrame
        Sample metadata forwarded to ``processmirnadata``.

    Returns
    -------
    The object produced by ``createVisNet`` for the 'mir' and 'dep' layers.
    """
    # Load both layers (dependency first, then miRNA) and align them.
    dependency, mirna_levels = alignDF(
        processdepdata(imputationMissing),
        processmirnadata(imputationMissing, cellNames),
    )
    # Scale the raw value matrices before estimation.
    scaled_mirna = Scale(mirna_levels.values)
    scaled_dep = Scale(dependency.values)
    # Only the first output is used; p-value computation is switched off.
    assoc, _adj_p_vals, _p_vals = estimateDragonValues(
        scaled_mirna, scaled_dep, pval=False
    )
    # nedges=0 is passed through unchanged to createVisNet.
    return createVisNet(mirna_levels, dependency, assoc, 'mir', 'dep', nedges=0)

In [None]:
mir_dep_edges=estimatemirnadep(imputationMissing,cellNames)
# Flatten all edge values and sort them from largest to smallest.
sortedarray = np.sort(mir_dep_edges.stack().values)[::-1]
#indsort = np.argwhere(np.abs(sortedarray) > 0.003)
# Plot the sorted edge values as hollow, mostly transparent markers.
plt.plot(sortedarray,'o',mfc='none', alpha=0.1, color='slategrey')
plt.xticks([])
# NOTE(review): absolute path on the author's machine, unlike ppath — this
# call will presumably fail elsewhere unless the directory exists.
plt.savefig('/Users/mab8354/netzoopap/figures/figure2/figure2a.png',format='png',dpi=1200)

# Rank every cell of the edge table by value and recover (row, column) positions.
c=np.argsort(mir_dep_edges.values, axis=None)#small to large
tdindices=np.unravel_index(c, mir_dep_edges.shape)
# Position 2 of the ascending order, i.e. the third-smallest edge value.
numindex=tdindices[0][2]
numcol=tdindices[1][2]
# Print that value followed by its row label and column label.
print(mir_dep_edges.iloc[numindex,numcol])
print(mir_dep_edges.index[numindex])
print(mir_dep_edges.columns[numcol])
#for -1:-1 http://mirdb.org/cgi-bin/search.cgi
#for 0:0
# mirdb: http://mirdb.org/cgi-bin/search.cgi?searchType=miRNA&searchBox=hsa-miR-664a-3p&full=1
# targetscan: http://www.targetscan.org/cgi-bin/targetscan/vert_71/targetscan.cgi?mirg=hsa-miR-664a-3p

## 1.2. Correlations between drug cell viability and gene dependency
We compute DRAGON partial correlations between drug cell viability and gene dependency.

In [None]:
def estimatedepdrug(imputationMissing):
    """Estimate gene-dependency / drug-viability associations.

    Parameters
    ----------
    imputationMissing : str
        Imputation strategy forwarded to the data loaders.

    Returns
    -------
    The object produced by ``createVisNet`` for the 'dep' and 'drugs' layers.
    """
    print('Dep-Drug')
    # Load both layers (dependency first, then drugs) and align them.
    dependency, drug_response = alignDF(
        processdepdata(imputationMissing),
        processDrugs(imputationMissing),
    )
    # Scale the raw value matrices before estimation.
    # TODO: replace Scale by dragon.scale
    scaled_dep = Scale(dependency.values)
    scaled_drugs = Scale(drug_response.values)
    # Only the first output is used downstream.
    assoc, _adj_p_vals, _p_vals = estimateDragonValues(scaled_dep, scaled_drugs)
    # nedges=0 is passed through unchanged to createVisNet.
    return createVisNet(dependency, drug_response, assoc, 'dep', 'drugs', nedges=0)

In [None]:

dep_drugs_edges = estimatedepdrug(imputationMissing)

# NOTE(review): oncdrugindex is not defined in this part of the notebook —
# confirm it is created before this cell runs.
oncdep_drugs_edges=dep_drugs_edges[oncdrugindex]
# The 20 largest values per drug (ascending sort, last 20 entries). These bare
# expressions are neither assigned nor printed.
oncdep_drugs_edges['dabrafenib'].sort_values()[-20:]
oncdep_drugs_edges['gemcitabine'].sort_values()[-20:]
oncdep_drugs_edges['trametinib'].sort_values()[-20:]#interesting RAF, EGFR, kRAS
oncdep_drugs_edges['irinotecan'].sort_values()[-20:]
oncdep_drugs_edges['vinblastine'].sort_values()[-20:]
oncdep_drugs_edges['topotecan'].sort_values()[-20:]
# 'methotrexate' is followed by .iloc[:, k], so it presumably selects several
# columns sharing that name — TODO confirm.
oncdep_drugs_edges['methotrexate'].iloc[:,0].sort_values()[-20:]
oncdep_drugs_edges['methotrexate'].iloc[:,1].sort_values()[-20:]
oncdep_drugs_edges['methotrexate'].iloc[:,2].sort_values()[-20:]
oncdep_drugs_edges['methotrexate'].iloc[:,3].sort_values()[-20:]
#dep_drugs_edges['ceritinib'].sort_values()[-20:]
# Outlier marker style for the box plot: small light-grey circles.
flierprops = dict(markerfacecolor='0.75', markersize=5,
              linestyle='none',marker='o')
# Vertical box plot of the dabrafenib values.
sns_plot = sns.boxplot(oncdep_drugs_edges['dabrafenib'], orient='v',width=.6,flierprops=flierprops)
# NOTE(review): absolute path on the author's machine, unlike ppath — this
# call will presumably fail elsewhere unless the directory exists.
plt.savefig('/Users/mab8354/netzoopap/figures/figure2/figure2c.eps',format='eps')

## 1.3. Correlations between LDH protein levels and metabolite levels
We compute correlations between LDH protein levels and metabolite levels.

In [None]:
def estimateprotmet(cellNames):
    """Estimate protein / metabolite associations.

    The imputation strategy is read from the module-level
    ``imputationMissing`` variable, not from a parameter.

    Parameters
    ----------
    cellNames : pandas.DataFrame
        Sample metadata forwarded to ``processPPI``.

    Returns
    -------
    The object produced by ``createVisNet`` for the 'prot' and 'met' layers.
    """
    # IV. Protein-metabolome
    print('Prot-met')
    # Load both layers (proteins first, then metabolites).
    proteins = processPPI(imputationMissing, cellNames)
    metabolites = processmetabolism()
    # alignDF receives the metabolite table first and returns it first.
    metabolites, proteins = alignDF(metabolites, proteins)
    # Scale the raw value matrices before estimation.
    # TODO: replace Scale by dragon.scale
    scaled_proteins = Scale(proteins.values)
    scaled_metabolites = Scale(metabolites.values)
    # Only the first output is used downstream.
    assoc, _adj_p_vals, _p_vals = estimateDragonValues(
        scaled_proteins, scaled_metabolites
    )
    # nedges=0 is passed through unchanged to createVisNet.
    return createVisNet(proteins, metabolites, assoc, 'prot', 'met', nedges=0)

In [None]:
ppi_met_edges=estimateprotmet(cellNames)
# Values in the LDHA and LDHB rows of the edge table, each sorted ascending
# on its own (so position i of c and of d need not be the same metabolite).
c=ppi_met_edges.loc['LDHA',].sort_values()
d=ppi_met_edges.loc['LDHB',].sort_values()
##warburg effect: neg corr g3p, PEP with LDHA and low corr with fumarate/maleate shows that TCA is not used
# The two scaled vectors are passed through data= so they are drawn as two
# separate swarms.
sns_plot = sns.swarmplot(data=[Scale(c.values),Scale(d.values)], orient='v')
plt.savefig('/Users/mab8354/netzoopap/figures/figure3/figure3_ldha_ldhb.eps',format='eps')

# Same comparison built from a two-column data frame.
f = {'LDHA': c.values, 'LDHB': d.values}
dff=pd.DataFrame(data=f)
sns_plot = sns.swarmplot(data=Scale(dff), orient='v')

##warburg effect: neg corr g3p, PEP with LDHA and low corr with fumarate/maleate shows that TCA is not used
# NOTE(review): same output file as the savefig above, so the earlier figure
# is overwritten; the path is an absolute path on the author's machine.
plt.savefig('/Users/mab8354/netzoopap/figures/figure3/figure3_ldha_ldhb.eps',format='eps')

# Indicator vector (1.0 at lactate, 0.0 elsewhere) in the sorted LDHB order,
# derived from the labels instead of a hard-coded length and position.
ll=(d.index=='lactate').astype(float)
f = {'LDHA': c.values, 'LDHB': d.values, 'lactate':ll}
dff=pd.DataFrame(data=f)
# Single swarm of the LDHB values, coloured by the lactate indicator.
sns_plot = sns.swarmplot(data=Scale(dff), y='LDHB', x=np.ones(len(dff)), hue="lactate")

# NOTE(review): ddd is not defined in this part of the notebook — confirm it
# exists before this line runs, or remove the line.
ddd[0]=[3.7052299080735225e-05]

## 1.4. Correlations between TF targeting scores and metabolite levels
We first load TF and gene targeting scores for all CCLE cell lines.

In [None]:
# Gene and TF targeting-score tables; the first CSV column is the row index.
genetar = pd.read_csv(ppath+'CCLE_genetar.csv',index_col=0)
tftar   = pd.read_csv(ppath+'CCLE_tftar.csv',index_col=0)

We compute correlations between TF targeting scores and metabolite levels.

In [None]:
def estimatetftarmet(cellNames, tftar):
    """Estimate TF-targeting / metabolite associations.

    Parameters
    ----------
    cellNames : pandas.DataFrame
        Not used by this function; kept in the signature for callers.
    tftar : pandas.DataFrame
        TF targeting scores; transposed before alignment.

    Returns
    -------
    The object produced by ``createVisNet`` for the 'tftar' and 'met' layers.
    """
    print('tftar-met')
    targeting = tftar.transpose()
    metabolites = processmetabolism()
    # alignDF receives the metabolite table first and returns it first.
    metabolites, targeting = alignDF(metabolites, targeting)
    # Scale the raw value matrices before estimation.
    # TODO: replace Scale by dragon.scale
    scaled_targeting = Scale(targeting.values)
    scaled_metabolites = Scale(metabolites.values)
    # Only the first output is used downstream.
    assoc, _adj_p_vals, _p_vals = estimateDragonValues(
        scaled_targeting, scaled_metabolites
    )
    # nedges=0 is passed through unchanged to createVisNet.
    return createVisNet(targeting, metabolites, assoc, 'tftar', 'met', nedges=0)

In [None]:
tftar_met_edges = estimatetftarmet(cellNames, tftar)
# Values of the '2-hydroxyglutarate' entry, sorted ascending.
c=tftar_met_edges['2-hydroxyglutarate'].sort_values()
# Box plot of the scaled values.
sns_plot = sns.boxplot(data=Scale(c.values))
# NOTE(review): absolute path on the author's machine, unlike ppath — this
# call will presumably fail elsewhere unless the directory exists.
plt.savefig('/Users/mab8354/netzoopap/figures/figure3/figure3_oncomet.eps',format='eps')

# 2. MONSTER transition analysis in melanoma