In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt


In [2]:
# ACH-000768	MDAMB231_BREAST	MDA-MB-231	905960	1013	Breast Cancer	Basal	Female	ATC


In [3]:
CELLLINE='ACH-000768'
CELLINENAME='MDA-MB-231'
CELLTISSUE='BREAST'

In [4]:
CELLLINE='ACH-000837'
CELLINENAME='NCI-H322M'
CELLTISSUE='LUNG'

In [5]:
CELLLINE='ACH-000510'
CELLINENAME='NCI-H1299'
CELLTISSUE='LUNG'

In [6]:
#ACH-000788	A2058_SKIN	A2058	906792
CELLLINE='ACH-000788'
CELLINENAME='A2058'
CELLTISSUE='SKIN'

In [7]:
#ACH-001190	SKMEL2_SKIN	SK-MEL2	905955	1191	Skin Cancer	malignant_melanoma	Male	
CELLLINE='ACH-001190'
CELLINENAME='SK-MEL-2'
CELLTISSUE='SKIN'

In [8]:
CELLLINE='ACH-000496'
CELLINENAME='NCI-H1792'
CELLTISSUE='LUNG'

In [22]:
CELLLINE='ACH-000667'
CELLINENAME='NCI-HCC44'
CELLTISSUE='LUNG'

### Data Function

In [23]:
def getRNAExpression(celline):    
    # https://depmap.org/portal/data_page/?tab=customDownloads - Expression_Public_24Q4_subsetted.csv
    df = pd.read_csv('data/Expression_Public_24Q4_subsetted.csv', index_col=0)
    filtered_df = df.loc[celline]
    transformed_df = filtered_df.reset_index()   
    transformed_df = transformed_df.rename(columns={filtered_df.name: 'rnaexp'})      
    transformed_df = transformed_df.rename(columns={'index': 'gene'})     
    print("Public RNA Expression data for 'cellline:"+celline,transformed_df.shape)
    print(transformed_df.head(5))
    return transformed_df


In [24]:
def getCN(celline):    
    # https://depmap.org/portal/data_page/?tab=customDownloads - Copy_Number_Public_24Q4_(Log2_transformed)_subsetted.csv
    df = pd.read_csv('data/Copy_Number_Public_24Q4_(Log2_transformed)_subsetted.csv', index_col=0)
    filtered_df = df.loc[celline]
    transformed_df = filtered_df.reset_index()   
    transformed_df = transformed_df.rename(columns={filtered_df.name: 'copynumber'})      
    transformed_df = transformed_df.rename(columns={'index': 'gene'})     
    print("Copy Number data for 'cellline:"+celline,transformed_df.shape)
    print(transformed_df.head(5))
    return transformed_df


In [25]:
def getCRISPRCas9(celline):    
    # CRISPR-Cas9 gene   knockout dependency
    # https://depmap.org/portal/data_page/?tab=customDownloads - CRISPR_(DepMap_Public_24Q4+Score,_Chronos)_subsetted.csv
    df = pd.read_csv('data/CRISPR_(DepMap_Public_24Q4+Score,_Chronos)_subsetted.csv', index_col=0)
    filtered_df = df.loc[celline]
    transformed_df = filtered_df.reset_index()   
    transformed_df = transformed_df.rename(columns={filtered_df.name: 'crisprcas9'})      
    transformed_df = transformed_df.rename(columns={'index': 'gene'})     
    print("CRISPR-Cas9 gene knockout dependency data for 'cellline:"+celline,transformed_df.shape)
    print(transformed_df.head(5))
    return transformed_df


In [26]:

def getRNAi(celline):    
    # https://depmap.org/portal/data_page/?tab=customDownloads - RNAi_(Achilles+DRIVE+Marcotte,_DEMETER2)_subsetted.csv
    df = pd.read_csv('data/RNAi_(Achilles+DRIVE+Marcotte,_DEMETER2)_subsetted.csv', index_col=0)
    filtered_df = df.loc[celline]
    transformed_df = filtered_df.reset_index()   
    transformed_df = transformed_df.rename(columns={filtered_df.name: 'rnai'})      
    transformed_df = transformed_df.rename(columns={'index': 'gene'})     
    print("Public RNAi data for 'cellline:"+celline,transformed_df.shape)
    print(transformed_df.head(5))
    return transformed_df


In [27]:
def getCCLExpression(celline):    
    # RNAseq TPM gene expression data for all genes using RSEM. Log2 tra
    # https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap+Public+20Q1&filename=CCLE_expression_full.csv - CCLE_expression_full.csv
    df = pd.read_csv('data/CCLE_expression_full.csv', index_col=0)
    filtered_df = df.loc[celline]
    transformed_df = filtered_df.reset_index()   
    transformed_df = transformed_df.rename(columns={filtered_df.name: 'ccleexp'})      
    transformed_df = transformed_df.rename(columns={'index': 'gene'})     
    transformed_df['gene'] = transformed_df['gene'].str.split('(').str[0].str.strip()
    print("RNAseq TPM gene expression  data for 'cellline:"+celline,transformed_df.shape)
    print(transformed_df.head(5))
    return transformed_df



In [28]:
def getMethylation(celline):    
    if celline=='ACH-000510':
        celline='ACH-000511'
    # RNAseq TPM gene expression data for all genes using RSEM. Log2 tra
    # https://depmap.org/portal/data_page/?tab=customDownloads Methylation_(1kb_upstream_TSS)_subsetted_NAsdropped.csv
    df = pd.read_csv('data/Methylation_(1kb_upstream_TSS)_subsetted_NAsdropped.csv', index_col=0)
    filtered_df = df.loc[celline]
    transformed_df = filtered_df.reset_index()   
    transformed_df = transformed_df.rename(columns={filtered_df.name: 'methylation'})      
    transformed_df = transformed_df.rename(columns={'index': 'gene'})     
    transformed_df['gene'] = transformed_df['gene'].str.split('_1_').str[0]
    print("Methylation data for 'cellline:"+celline,transformed_df.shape)
    print(transformed_df.head(5))
    return transformed_df



In [29]:
def getProteinGygi(celline):    
    #The Harmonized MS CCLE Gygi protein expression dataset refers to a collection of protein expression data obtained from the CCLE
    # which has been harmonized and generated using mass spectrometry (MS) methods, specifically following the Gygi lab's protocols.
    # https://depmap.org/portal/data_page/?tab=customDownloads - Harmonized_MS_CCLE_Gygi_subsetted.csv
    df = pd.read_csv('data/Harmonized_MS_CCLE_Gygi_subsetted.csv', index_col=0)
    filtered_df = df.loc[celline]
    transformed_df = filtered_df.reset_index()   
    transformed_df = transformed_df.rename(columns={filtered_df.name: 'proteingygi'})      
    transformed_df = transformed_df.rename(columns={'index': 'gene'})     
    transformed_df['gene'] = transformed_df['gene'].str.extract(r'\((.*?)\)', expand=False)
    print("Harmonized_MS_CCLE_Gygi_subsetted  data for 'cellline:"+celline,transformed_df.shape)
    print(transformed_df.head(5))
    return transformed_df



In [30]:
def getProteinRPPA(celline):    
    # TReverse Phase Protein Array (RPPA) data
    # https://depmap.org/portal/data_page/?tab=customDownloads - Harmonized_RPPA_CCLE_subsetted.csv
    df = pd.read_csv('data/Harmonized_RPPA_CCLE_subsetted.csv', index_col=0)
    filtered_df = df.loc[celline]
    transformed_df = filtered_df.reset_index()   
    transformed_df = transformed_df.rename(columns={filtered_df.name: 'proteinrpna'})      
    transformed_df = transformed_df.rename(columns={'index': 'gene'})     
    transformed_df['gene'] = transformed_df['gene'].str.extract(r'\((.*?)\)', expand=False)
    print("Harmonized_RPPA_CCLE_subsetted  data for 'cellline:"+celline,transformed_df.shape)
    print(transformed_df.head(5))
    return transformed_df



In [31]:
#https://github.com/DepMap-Analytics/CoRe/blob/master/notebooks/CoRe_Benchmarking.ipynb
#gene dependency data derived from CRISPR screens, which are used to identify which genes are essential for the survival of cancer cells.
#These essential genes are essentially fitness genes or CFGs.

def getSangerCFGs(celline):    
    df = pd.read_csv('data/Sanger_scaled_Behan2019.csv', index_col=0)
    transformed_df = df.reset_index()   
    transformed_df = transformed_df.rename(columns={'index': 'gene'})   
    transformed_df = transformed_df.rename(columns={'gene_name': 'gene'})     
    transformed_df = transformed_df.rename(columns={CELLINENAME: 'sangercfgs'})     
    transformed_df = transformed_df[['gene','sangercfgs']]
    print("Sanger_scaled_Behan2019 CFGs for cellline:"+celline,transformed_df.shape)
    print(transformed_df.head(5))
    return transformed_df



In [32]:
def getGCContent():    
    gccontent=pd.read_csv('genedata/gccontent.csv') 
    gccontent=gccontent.dropna()
    print(gccontent.shape)
    gccontent=gccontent[['hgnc_symbol','percentage_gene_gc_content']]
    gccontent = gccontent.rename(columns={'hgnc_symbol': 'gene'})
    print('gc content :',gccontent.shape)
    print(gccontent.head())
    return gccontent

In [33]:
def getGOgenes():    
    dfgo_genes = pd.read_csv('genedata/go_genes.csv') 
    dfgo_genes=dfgo_genes.dropna()
    dfgo_genes.shape
    dfgo_genes=dfgo_genes[['hgnc_symbol','go_id']]
    dfgo_genes = dfgo_genes.rename(columns={'hgnc_symbol': 'gene'})
    print('GO Ids:',dfgo_genes.shape)
    print(dfgo_genes.head())
    return dfgo_genes

### Extract data now

In [34]:
df_final = getRNAExpression(CELLLINE)
df_final.drop_duplicates(subset='gene', keep='first')
print('getRNAExpression ',df_final.shape)

dftemp1 = getCN(CELLLINE)
dftemp1.drop_duplicates(subset='gene', keep='first')
df_final = pd.merge(df_final, dftemp1, how='left', on='gene')
df_final = df_final.fillna(0)
print('getCN ',df_final.shape)

dftemp1 = getCRISPRCas9(CELLLINE)
dftemp1.drop_duplicates(subset='gene', keep='first')
df_final = pd.merge(df_final, dftemp1, how='left', on='gene')
df_final = df_final.fillna(0)
print('getCRISPRCas9 ',df_final.shape)


dftemp1 = getRNAi(CELLLINE)
dftemp1.drop_duplicates(subset='gene', keep='first')
df_final = pd.merge(df_final, dftemp1, how='left', on='gene')
df_final = df_final.fillna(0)
print('getRNAi ',df_final.shape)

dftemp1 = getCCLExpression(CELLLINE)
dftemp1.drop_duplicates(subset='gene', keep='first')
df_final = pd.merge(df_final, dftemp1, how='left', on='gene')
df_final = df_final.fillna(0)
print('getCCLExpression ',df_final.shape)

dftemp1 = getMethylation(CELLLINE)
dftemp1.drop_duplicates(subset='gene', keep='first')
df_final = pd.merge(df_final, dftemp1, how='left', on='gene')
df_final = df_final.fillna(0)
print('getMethylation ',df_final.shape)

try:
    dftemp1 = getProteinGygi(CELLLINE)
    dftemp1.drop_duplicates(subset='gene', keep='first')
    df_final = pd.merge(df_final, dftemp1, how='left', on='gene')
    df_final = df_final.fillna(0)
    print('getProteinGygi ',df_final.shape)
except:
    df_final['proteingygi']=0
    
try:
    dftemp1 = getProteinRPPA(CELLLINE)
    dftemp1.drop_duplicates(subset='gene', keep='first')
    df_final = pd.merge(df_final, dftemp1, how='left', on='gene')
    df_final = df_final.fillna(0)
    print('getProteinRPPA ',df_final.shape)
except:
    df_final['proteinrpna']=0


try:
    dftemp1 = getSangerCFGs(CELLLINE)
    dftemp1.drop_duplicates(subset='gene', keep='first')
    df_final = pd.merge(df_final, dftemp1, how='left', on='gene')
    df_final = df_final.fillna(0)
    print('getSangerCFGs ',df_final.shape)
except:
    print("no sanger")



dftemp1 = getGCContent()
dftemp1.drop_duplicates(subset='gene', keep='first')
df_final = pd.merge(df_final, dftemp1, how='left', on='gene')
df_final = df_final.fillna(0)
print('getGCContent ',df_final.shape)


dftemp1 = getGOgenes()
dftemp1 = dftemp1.groupby('gene')['go_id'].agg(' '.join).reset_index()
dftemp1.drop_duplicates(subset='gene', keep='first')
df_final = pd.merge(df_final, dftemp1, how='left', on='gene')
df_final = df_final.fillna(0)
print('getGOgenes ',df_final.shape)

df_final.to_csv(CELLINENAME+'.csv',index=False)
df_final.head()

Public RNA Expression data for 'cellline:ACH-000667 (19153, 2)
     gene    rnaexp
0  TSPAN6  3.475085
1    TNMD  0.000000
2    DPM1  5.632850
3   SCYL3  1.786596
4   FIRRM  2.797013
getRNAExpression  (19153, 2)
Copy Number data for 'cellline:ACH-000667 (38589, 2)
        gene  copynumber
0       RHEB    1.228123
1      TIPIN    0.818824
2     OR4A47    0.957604
3     NUP133    1.068772
4  CPNE8-AS1    1.073707
getCN  (19153, 3)
CRISPR-Cas9 gene knockout dependency data for 'cellline:ACH-000667 (17916, 2)
      gene  crisprcas9
0     A1BG   -0.032698
1     A1CF   -0.115469
2      A2M    0.098039
3    A2ML1    0.168418
4  A3GALT2    0.029768
getCRISPRCas9  (19153, 4)
Public RNAi data for 'cellline:ACH-000667 (16836, 2)
   gene      rnai
0  A1BG -0.122757
1  NAT2  0.154515
2   ADA -0.250882
3  CDH2  0.124039
4  AKT3  0.203598
getRNAi  (19153, 5)
RNAseq TPM gene expression  data for 'cellline:ACH-000667 (58676, 2)
       gene   ccleexp
0    TSPAN6  4.221877
1      TNMD  0.000000
2      DP

Unnamed: 0,gene,rnaexp,copynumber,crisprcas9,rnai,ccleexp,methylation,proteingygi,proteinrpna,percentage_gene_gc_content,go_id
0,TSPAN6,3.475085,0.846132,-0.064129,0.003667,4.221877,0.0,-0.582833,0.0,40.4,GO:0016020 GO:0005515 GO:0043123 GO:0070062 GO...
1,TNMD,0.0,0.846132,0.088048,0.0,0.0,0.0,0.0,0.0,40.78,GO:0005634 GO:0005737 GO:0016020 GO:0005515 GO...
2,DPM1,5.63285,0.846061,-0.031131,0.297415,5.512227,0.0,-0.219692,0.0,40.2,GO:0005783 GO:0016740 GO:0016757 GO:0006486 GO...
3,SCYL3,1.786596,1.069042,-0.090572,-0.166058,1.794936,0.0,0.377583,0.0,40.14,GO:0005524 GO:0006468 GO:0004672 GO:0005737 GO...
4,FIRRM,2.797013,1.069042,0.026379,0.0,0.0,0.0,0.0,0.0,39.22,GO:0005634 GO:0005634 GO:0005737 GO:0005856 GO...


In [None]:
dfgo = getGOgenes()
dfgo = dfgo.groupby('gene')['go_id'].agg(' '.join).reset_index()

dfgo=dfgo.head(10)
print(dfgo)
G = nx.Graph()
for _, row in dfgo.iterrows():
    gene = row['gene']
    gos=row['go_id'].split(' ')
    for go_id in gos:
        G.add_edge(gene,go_id,)  
 
plt.figure(figsize=(6, 4))
pos = nx.kamada_kawai_layout(G)  
nx.draw(G, pos, with_labels=True, node_size=500, node_color='pink', font_size=10, edge_color='darkblue')
plt.title("Go term - Gene Network")
plt.show()



In [None]:
dfgo=dfgo.head(200)
G = nx.Graph()
for _, row in dfgo.iterrows():
    gene = row['gene_name']
    go_id = row['go_id']
    G.add_edge(gene,go_id,)  # Adding an edge between the gene and its associated GO term
 
 
plt.figure(figsize=(6, 4))
pos = nx.kamada_kawai_layout(G)  # Positions for nodes

nx.draw(G, pos, with_labels=True, node_size=500, node_color='pink', font_size=10, edge_color='darkblue')
plt.title("Go term - Gene Network")
plt.show()



In [None]:
G = nx.erdos_renyi_graph(10,0.3)
for _, row in dfgo.iterrows():
    gene = row['gene_name']
    go_id = row['go_id']
    G.add_edge(gene, go_id)  # Adding an edge between the gene and its associated GO term
 
pos = nx.kamada_kawai_layout(G)
plt.figure(figsize=(8, 6))
nx.draw(G, pos, with_labels=True, node_size=700, node_color='lightblue', font_size=12)
plt.title("Circular Layout Example")
plt.show()
