**Computational Health Laboratory Project, A.Y. 2021/2022**

**Authors:** Niko Dalla Noce, Alessandro Ristori, Andrea Zuppolini

# **Colab setup**
Takes care of the project setup on Colab.

In [2]:
if 'google.colab' in str(get_ipython()):
    import subprocess
    from google.colab import drive
    out_clone = subprocess.run(["git", "clone", "https://github.com/nikodallanoce/ComputationalHealthLaboratory"], text=True, capture_output=True)
    print("{0}{1}".format(out_clone.stdout, out_clone.stderr))
    %pip install -U PyYAML
    drive.mount("/content/drive/")
    %cp "/content/drive/Shareddrives/CHL/config.yml" "/content/ComputationalHealthLaboratory"
    %cd ComputationalHealthLaboratory

fatal: destination path 'ComputationalHealthLaboratory' already exists and is not an empty directory.

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/ComputationalHealthLaboratory


# **Obtain all the genes that interact with the starting one**


In [3]:
import requests
import json
import pandas as pd
import numpy as np
from config import ACCESS_KEY, BASE_URL

In [4]:
# Read the starting interactions; the file uses ";" as field separator
genes = pd.read_csv("datasets/geneset.csv", sep=";")
# Upper-case the symbols in InteractorA, then discard rows that became identical
genes = genes.assign(InteractorA=genes["InteractorA"].str.upper()).drop_duplicates()
# Plain Python list of the InteractorA symbols
proteins_list = genes["InteractorA"].tolist()

In [5]:
# Display the starting interactions loaded from the gene set file
genes

Unnamed: 0,InteractorA,InteractorB
0,YWHAG,SON
1,YWHAB,SON
3,SIRT7,SON
4,TCF3,SON
5,SF3B1,SON
...,...,...
149,NSP8,SON
150,NSP9,SON
151,ORF6,SON
152,ORF8,SON


# **Expand the network**

In [13]:
request_url = BASE_URL + "/interactions"
# Accumulates every interaction returned by the service, keyed by interaction identifier
data = {}

# Number of gene symbols sent with each request
step = 146
for i in range(0, len(proteins_list), step):
    # List of genes to search for; slicing clamps at the end of the list,
    # so the last (possibly shorter) chunk needs no special handling
    geneList = proteins_list[i:i + step]

    params = {
        "accesskey": ACCESS_KEY,
        "format": "json",  # Return results as JSON
        "geneList": "|".join(geneList),  # Must be | separated
        "searchNames": "true",  # Search against official names
        "includeInteractors": "false",  # Set to true to get any interaction involving EITHER gene, set to false to get interactions between genes
        "includeInteractorInteractions": "false",  # Set to true to get interactions between the geneList’s first order interactors
        "includeEvidence": "false",  # If false "evidenceList" is evidence to exclude, if true "evidenceList" is evidence to show
        "selfInteractionsExcluded": "true", # If true no self-interactions will be included
    }

    # Fail fast on network stalls and on HTTP errors instead of trying to parse an error page
    r = requests.get(request_url, params=params, timeout=60)
    r.raise_for_status()
    interactions = r.json()

    # A response holding 10000 interactions is treated as truncated
    # (presumably the per-request limit of the service - confirm against the API documentation)
    if len(interactions) >= 10000:
        raise RuntimeError(
            "The request for genes {0}-{1} returned {2} interactions: the result is probably "
            "truncated, lower `step` or paginate the request".format(i, i + len(geneList) - 1, len(interactions))
        )

    # Merge the results by interaction identifier
    data.update(interactions)

In [14]:
# Build a frame with one row per interaction identifier, keeping only the two
# official-symbol columns and giving them the names used by the gene set
columns = ["OFFICIAL_SYMBOL_A", "OFFICIAL_SYMBOL_B"]
dataset = (
    pd.DataFrame.from_dict(data, orient="index")
    .loc[:, columns]
    .rename(columns=dict(zip(columns, ["InteractorA", "InteractorB"])))
)

# Upper-case every symbol in both columns
for symbol_column in ("InteractorA", "InteractorB"):
    dataset[symbol_column] = dataset[symbol_column].str.upper()

# Show the resulting frame
dataset

Unnamed: 0,InteractorA,InteractorB
17282,SFPQ,NONO
22627,EZH2,EED
119679,SRPK2,U2AF2
120105,SRSF6,RNPS1
120300,U2AF2,PUF60
...,...,...
3324902,BRD4,HIST1H4A
3324964,BRD3,NFIA
3324983,NSP10,NSP16
3325359,SFPQ,NONO


In [33]:
# Find repeated interactions regardless of direction: sorting the two symbols
# of every row makes A-B and B-A compare as the same pair
unordered_pairs = np.sort(dataset[["InteractorA", "InteractorB"]].to_numpy(), axis=1)
duplicated_interactions = pd.DataFrame(unordered_pairs).duplicated()
print("Duplicated interactions:\n{0}".format(duplicated_interactions.value_counts()))

# Keep only the first occurrence of each pair
dataset = dataset.loc[~duplicated_interactions.to_numpy()]

Duplicated interactions:
False    2769
True      146
dtype: int64


In [34]:
# Look for interactions where both proteins are the same.
# The two columns are compared directly: counting unique values per row would also
# flag rows where one of the two symbols is missing, because missing values are not counted.
same_proteins_interactions = pd.DataFrame(dataset["InteractorA"] == dataset["InteractorB"])
print("Useless interactions:\n{0}".format(same_proteins_interactions.value_counts()))

# Delete such interactions from the dataset (1-D boolean mask, one entry per row)
dataset = dataset[~same_proteins_interactions.iloc[:, 0].to_numpy()]

Useless interactions:
False    2769
dtype: int64


In [31]:
# Add the starting interactions to the ones retrieved from the service.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# the original row labels of both frames are kept.
dataset = pd.concat([dataset, genes])
dataset

Unnamed: 0,InteractorA,InteractorB
17282,SFPQ,NONO
22627,EZH2,EED
119679,SRPK2,U2AF2
120105,SRSF6,RNPS1
120300,U2AF2,PUF60
...,...,...
149,NSP8,SON
150,NSP9,SON
151,ORF6,SON
152,ORF8,SON


In [35]:
# Every distinct symbol appearing on either side of an interaction is a node.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
nodes = pd.concat([dataset["InteractorA"], dataset["InteractorB"]]).unique()
print("Number of nodes: {0}".format(len(nodes)))

Number of nodes: 147


## **Draw the network**

In [40]:
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib import cm

In [36]:
# Graph with one node per protein and one (unweighted) edge per interaction
protein_graph = nx.Graph(name='Protein Interaction Graph')
interactions = np.array(dataset)
# The first two columns of every row are the two endpoints of an edge
protein_graph.add_edges_from(zip(interactions[:, 0], interactions[:, 1]))

In [46]:
def rescale(l, newmin, newmax):
    """Linearly rescale a collection of numbers to the range [newmin, newmax].

    The smallest input value is mapped to ``newmin`` and the largest to ``newmax``.

    Parameters
    ----------
    l : iterable of numbers
        Values to rescale; it is consumed once and may be any iterable.
    newmin, newmax : number
        Bounds of the target range.

    Returns
    -------
    list
        Rescaled values, in the same order as the input. An empty input gives an
        empty list. When all the input values are equal there is no spread to
        map, so every value is mapped to ``newmin`` instead of dividing by zero.
    """
    arr = list(l)
    if not arr:
        return []
    # Compute the bounds once instead of once per element
    lo, hi = min(arr), max(arr)
    span = hi - lo
    if span == 0:
        return [newmin for _ in arr]
    return [(x - lo) / span * (newmax - newmin) + newmin for x in arr]
# use the matplotlib plasma colormap, resampled to 12 colors
# (pyplot.get_cmap is used because cm.get_cmap is deprecated in recent Matplotlib releases)
graph_colormap = plt.get_cmap('plasma', 12)
# node color varies with degree - degrees are mapped to [0.0,0.9] before the colormap lookup
c = rescale([protein_graph.degree(v) for v in protein_graph],0.0,0.9)
c = [graph_colormap(i) for i in c]
# node size varies with betweenness centrality - map to range [1500,7000]
bc = nx.betweenness_centrality(protein_graph) # betweenness centrality of every node
s = rescale(list(bc.values()),1500,7000)

In [None]:
# The spring layout starts from random positions: fix the seed so that the
# figure is the same every time the notebook is executed
pos = nx.spring_layout(protein_graph, seed=42)
plt.figure(figsize=(59,59),facecolor=[0.7,0.7,0.7,0.4])
# Node colors and sizes come from the lists c and s computed for the same graph
nx.draw_networkx(protein_graph, pos=pos, with_labels=True, node_color=c, node_size=s,font_color='white',font_weight='bold',font_size=9)
plt.axis('off')
plt.show()

## **Pathway analysis**

In [39]:
!pip install reactome2py



In [49]:
from reactome2py import content, analysis