In [35]:
%%sh
# TODO(MC): delete this cell at the end; it is only used to interface with the underlying Linux environment.
# Lists the contents of the `report` directory.
ls report

Montagud2022_Prostate_Cancer.bnd
Montagud2022_Prostate_Cancer.cfg
Montagud2022_Prostate_Cancer.zginml
Montagud2022_interactions_sources.xlsx
Montagud2022_nodes in pathways.xlsx
elife-72626-v2.pdf
supplementary_files


# Introduction to data analysis for natural and social sciences
This notebook constitutes the first part of the exam.

Here the steps of the article "Patient-specific Boolean models of signalling networks guide personalised treatments" are retraced and its results are reproduced.

## Imports and global settings

In [36]:
import numpy as np
import pandas as pd

In [37]:
# Directory from which the Excel workbooks are read.
PATH_REPORT = "report"

# File extensions, without the leading dot: Excel inputs and tab-separated outputs.
EXT_EXCEL = "xlsx"
EXT_TAB = "tsv"

# Prostate Boolean model construction
The Boolean model is constructed starting from information available in the literature. Further pathways are then identified using the ROMA and pypath software tools, and they are added to the existing network.

The authors collected all the data regarding the network, such as the nodes, their roles and their logical rules, in the following two Excel files:

In [38]:
# File names of the two Excel workbooks; they are opened relative to PATH_REPORT.
# Workbook read into the two columns "node" and "pathway".
f_nodes_pathways = "Montagud2022_nodes in pathways.xlsx"
# Workbook from which the sheets "Nodes" and "Nodes_unique" are read.
f_nodes_network = "Montagud2022_interactions_sources.xlsx"

The data are loaded into pandas DataFrames to ease their manipulation.

In [39]:
# Sheet names of the interactions workbook; they are reused later to name the exported files.
sheet_interactions = "Nodes"
sheet_unique = "Nodes_unique"

# Node/pathway table: header=None means no row is used as header, so the
# column names are given explicitly.
df_nodes_pathways = pd.read_excel(
    f"{PATH_REPORT}/{f_nodes_pathways}", header=None, names=["node", "pathway"]
)

# Interactions table: the column labels are taken from the second row (header=1).
df_nodes_interactions = pd.read_excel(
    f"{PATH_REPORT}/{f_nodes_network}",
    sheet_name=sheet_interactions,
    header=1,
    # Remove a useless line break in a cell.
    converters={"Reference: PMID": lambda cell: np.str_(cell).strip()},
)

# Second sheet of the same workbook, read with the default header row.
df_nodes_unique = pd.read_excel(f"{PATH_REPORT}/{f_nodes_network}", sheet_name=sheet_unique)

In [40]:
# MC debug.
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df_nodes_pathways.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   node     133 non-null    object
 1   pathway  133 non-null    object
dtypes: object(2)
memory usage: 2.2+ KB
None


In [41]:
# MC debug.
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df_nodes_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Target node       462 non-null    object
 1   HUGO names        454 non-null    object
 2   Interaction type  462 non-null    object
 3   Source            462 non-null    object
 4   Description       462 non-null    object
 5   Reference: PMID   453 non-null    object
 6   Logical rule      462 non-null    object
dtypes: object(7)
memory usage: 25.4+ KB
None


In [42]:
# Sanity check on the logical rules: count the rows of every (target node, logical rule) pair.
# If each node has exactly one rule, the result has one row per node
# (the original author's note reports exactly 133 rows).
rule_counts = df_nodes_interactions.groupby(by=["Target node", "Logical rule"]).count()
display(rule_counts)
# The table is only needed for this inspection.
del rule_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,HUGO names,Interaction type,Source,Description,Reference: PMID
Target node,Logical rule,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AKT,((HSPs | (PDK1 & PIP3) | PIP3 | (SHH & PIP3)) & !PTCH1),5,5,5,5,5
AMPK,(ATR | HIF1 | AMP_ATP | ATM) & !FGFR3,6,6,6,6,6
AMP_ATP,(!Nutrients),1,1,1,1,1
APAF1,((Caspase8 | BAX | p53 | Bak | HSPs) & !Bcl_XL & !BCL2 & !AKT),8,8,8,8,8
AR,((GLI | EP300 | HSPs | NKX3_1 | EZH2 | NCOA3 | PKC | SMAD | Androgen) & !PTEN & !NCOR1 & !NCOR2 & !MDM2),13,13,13,13,13
...,...,...,...,...,...,...
p21,((p53 | SMAD | HIF1 | ZBTB17) & !TERT & !MYC_MAX & !MDM2 & !AKT & !ERK),9,9,9,9,9
p38,(MAP3K1_3 & !ERK & !GADD45),3,3,3,3,3
p53,((Acidosis | CHK1_2 | p38 | HIF1) & !BCL2 & !MDM2 & !HSPs & !Snail),9,9,9,9,9
p70S6kab,(mTORC2 | PDK1),2,2,2,2,2


In [43]:
# MC debug.
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df_nodes_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Node          121 non-null    object
 1   HGNC symbols  121 non-null    object
 2   unique        121 non-null    object
 3   comments      39 non-null     object
dtypes: object(4)
memory usage: 3.9+ KB
None


In [44]:
# df_nodes_unique contains only 121 nodes and not 133 as df_nodes_pathways.
# In other words, df_nodes_unique["Node"] should be a subset of df_nodes_pathways["node"]. Why?
# Both tables are indexed by node name so that they can be compared by label.
df = df_nodes_pathways.set_index("node")
df_subset = df_nodes_unique.set_index("Node")

# The input nodes should be all removed, since they are not regulated by authors' choice,
# hence they are not part of any pathway. But this is not what happens.
# First table: the nodes labelled "Input". Second table: the nodes of df_nodes_pathways
# that are left after dropping every node listed in df_nodes_unique
# (errors="ignore" skips labels that are missing from df).
display(df.loc[df["pathway"] == "Input"])
display(df.drop(labels=df_subset.index, errors="ignore"))

# Moreover, in df_nodes_unique there is a node called MAX which is not part of the nodes
# considered for the final network: the strict drop below fails because that label
# is missing from df_nodes_pathways.
# (MC also notes that MAX is missing from df_nodes_interactions; that is not checked here.)
try:
    df.drop(labels=df_subset.index)
except KeyError as e:
    # Only the expected "label not found" error is reported; any other error
    # would be a real problem and is allowed to propagate.
    print(e)
finally:
    # The two indexed copies are only needed for this inspection.
    del df
    del df_subset

# In conclusion, it seems that the choice of nodes from the Excel files can not be deduced directly
# just from the observation of the content of the files.
# In particular, I should use data in df_nodes_unique with caution
# since their relation with the other data is not straightforward.

Unnamed: 0_level_0,pathway
node,Unnamed: 1_level_1
Acidosis,Input
Androgen,Input
Carcinogen,Input
EGF,Input
FGF,Input
fused_event,Input
Hypoxia,Input
Nutrients,Input
SPOP,Input
TGFb,Input


Unnamed: 0_level_0,pathway
node,Unnamed: 1_level_1
Acidosis,Input
Androgen,Input
Apoptosis,Output
Carcinogen,Input
DNA_Damage,DNA repair pathw
DNA_Repair,Output
EMT,Invasion pathw
Hypoxia,Input
Invasion,Output
Metastasis,Output


"['MAX'] not found in axis"


The data about the nodes are then exported to files in tab-separated values (TSV) format, so that they can be imported into Cytoscape later.

In [49]:
# Base names of the exported files: the Excel file names without their extension.
excel_suffix = f".{EXT_EXCEL}"
name_nodes_pathways = f_nodes_pathways.removesuffix(excel_suffix)
name_nodes_network = f_nodes_network.removesuffix(excel_suffix)

# One tab-separated file per dataframe, written to the working directory
# without the index column. Sheet-based tables get the sheet name as suffix.
for tsv_stem, frame in (
    (name_nodes_pathways, df_nodes_pathways),
    (f"{name_nodes_network}_{sheet_interactions}", df_nodes_interactions),
    (f"{name_nodes_network}_{sheet_unique}", df_nodes_unique),
):
    frame.to_csv(f"{tsv_stem}.{EXT_TAB}", sep="\t", index=False)

To create the network, a single data file containing both the interactions and the pathways can be used.

In [68]:
# Pathway label of each node, indexed by node name, with the trailing " pathw" removed.
df_other = (
    df_nodes_pathways
    .set_index("node")["pathway"]
    .apply(lambda label: np.str_(label).removesuffix(" pathw"))
)

# Attach to every interaction row the pathway of its target node.
df_cytoscape = df_nodes_interactions.join(df_other, on="Target node")

# Single tab-separated file, written to the working directory without the index column.
df_cytoscape.to_csv(f"cytoscape_data.{EXT_TAB}", sep="\t", index=False)

After importing the file into Cytoscape, the node "0/1" is hidden, because it is generated by the software only as the source node of the input nodes.

In [None]:
# MC: interactions whose type is not "input", exported to their own tab-separated file.
df = df_nodes_interactions[df_nodes_interactions["Interaction type"].ne("input")]
df.to_csv(
    f"{name_nodes_network}_{sheet_interactions}_noinput.{EXT_TAB}",
    sep="\t",
    index=False,
)

In [48]:
# MC debug: rows whose source is "0/1", followed by rows whose interaction type is "input".
display(
    df_nodes_interactions[df_nodes_interactions["Source"].eq("0/1")],
    df_nodes_interactions[df_nodes_interactions["Interaction type"].eq("input")],
)
# Further ad-hoc checks, currently disabled:
#display(df_nodes_interactions.loc[df_nodes_interactions["Source"] == "Proliferation"])
#display(df_nodes_interactions.loc[df_nodes_interactions["Target node"] == "0/1"])
#display(df_nodes_interactions.loc[df_nodes_interactions["Logical rule"] == "((!TGFBR & !FRS2 & !EGFR & !PKC & (ERK | Androgen | EGF)) | (!TGFBR & !FRS2 & (Androgen | EGF)))"])

Unnamed: 0,Target node,HUGO names,Interaction type,Source,Description,Reference: PMID,Logical rule
0,Acidosis,input,input,0/1,Input of the model,,(Acidosis)
13,Androgen,input,input,0/1,Input of the model,,(Androgen)
88,Carcinogen,input,input,0/1,"From Fumia et al, 2013",23922675,(Carcinogen)
150,EGF,input,input,0/1,Input of the model,,(EGF)
180,FGF,"FGF1, FGF2, FGF3, FGF4, FGF5, FGF6, FGF7, FGF8...",input,0/1,Input of the model,,(FGF)
197,fused_event,TMPRSS2,input,0/1,TMPRSS2-Ets gene fusions were identified in pr...,"23264855, 20118910",(fused_event)
198,fused_event,SLC45A3,input,0/1,TMPRSS2 and SLC45A3 were the only 5' partner i...,20118910,(fused_event)
199,fused_event,NDRG1,input,0/1,ERG gene rearrangements and mechanism of rearr...,20118910,(fused_event)
232,Hypoxia,input,input,0/1,Input of the model,,(Hypoxia)
327,Nutrients,input,input,0/1,Input of the model,,(Nutrients)


Unnamed: 0,Target node,HUGO names,Interaction type,Source,Description,Reference: PMID,Logical rule
0,Acidosis,input,input,0/1,Input of the model,,(Acidosis)
13,Androgen,input,input,0/1,Input of the model,,(Androgen)
88,Carcinogen,input,input,0/1,"From Fumia et al, 2013",23922675,(Carcinogen)
150,EGF,input,input,0/1,Input of the model,,(EGF)
180,FGF,"FGF1, FGF2, FGF3, FGF4, FGF5, FGF6, FGF7, FGF8...",input,0/1,Input of the model,,(FGF)
197,fused_event,TMPRSS2,input,0/1,TMPRSS2-Ets gene fusions were identified in pr...,"23264855, 20118910",(fused_event)
198,fused_event,SLC45A3,input,0/1,TMPRSS2 and SLC45A3 were the only 5' partner i...,20118910,(fused_event)
199,fused_event,NDRG1,input,0/1,ERG gene rearrangements and mechanism of rearr...,20118910,(fused_event)
232,Hypoxia,input,input,0/1,Input of the model,,(Hypoxia)
327,Nutrients,input,input,0/1,Input of the model,,(Nutrients)


# Prostate Boolean model simulation