**Computational Health Laboratory Project, A.Y. 2021/2022**

**Authors:** Niko Dalla Noce, Alessandro Ristori, Andrea Zuppolini

# **Colab setup**
Takes care of the project setup on Colab.

In [1]:
# Run the setup below only when the notebook is executing inside Google Colab
if 'google.colab' in str(get_ipython()):
    import subprocess
    from google.colab import drive
    # Clone the project repository and echo whatever git wrote to stdout/stderr.
    # NOTE(review): subprocess.run is called without check=True, so a failed clone
    # (e.g. the directory already exists on a re-run) does not raise here.
    out_clone = subprocess.run(["git", "clone", "https://github.com/nikodallanoce/ComputationalHealthLaboratory"], text=True, capture_output=True)
    print("{0}{1}".format(out_clone.stdout, out_clone.stderr))
    # Upgrade PyYAML in the kernel's environment (no version is pinned)
    %pip install -U PyYAML
    # Mount Google Drive and copy config.yml from the shared drive into the cloned repository
    # (presumably the file the `config` module reads later - TODO confirm)
    drive.mount("/content/drive/")
    %cp "/content/drive/Shareddrives/CHL/config.yml" "/content/ComputationalHealthLaboratory"
    # Make the cloned repository the working directory for the remaining cells
    %cd ComputationalHealthLaboratory

Cloning into 'ComputationalHealthLaboratory'...

Collecting PyYAML
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 5.1 MB/s 
[?25hInstalling collected packages: PyYAML
  Attempting uninstall: PyYAML
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed PyYAML-6.0
Mounted at /content/drive/
/content/ComputationalHealthLaboratory


# **Obtain all the genes that interact with the starting one**


In [2]:
import requests
import json
import pandas as pd
import numpy as np
from config import ACCESS_KEY, BASE_URL

In [3]:
# Read the starting interactions, normalise the first interactor to upper case
# and discard rows that become exact duplicates after the normalisation.
genes = (
    pd.read_csv("datasets/geneset.csv", sep=";")
    .assign(InteractorA=lambda frame: frame["InteractorA"].str.upper())
    .drop_duplicates()
)

# Names in the InteractorA column, used to query the interaction service
proteins_list = genes["InteractorA"].tolist()

In [4]:
# Display the starting interactions left after normalisation and de-duplication
genes

Unnamed: 0,InteractorA,InteractorB
0,YWHAG,SON
1,YWHAB,SON
3,SIRT7,SON
4,TCF3,SON
5,SF3B1,SON
...,...,...
149,NSP8,SON
150,NSP9,SON
151,ORF6,SON
152,ORF8,SON


# **Expand the network**

In [5]:
# Query the interactions endpoint in small batches of genes and collect every
# returned interaction in `data`, keyed by its interaction identifier.
# NOTE(review): the parameter names look like the BioGRID REST API - confirm against config.
request_url = BASE_URL + "/interactions"
data = {}

# Number of genes sent with each request
step = 5
# A response holding this many interactions is treated as having hit the allowed maximum
max_interactions = 10000
# Seconds to wait for the service, so a stalled connection cannot hang the notebook
request_timeout = 60

for i in range(0, len(proteins_list), step):
    # List of genes to search for; slicing clamps at the end of the list,
    # so the last batch is simply shorter
    geneList = proteins_list[i:i + step]

    params = {
        "accesskey": ACCESS_KEY,
        "format": "json",  # Return results as JSON keyed by interaction identifier
        "geneList": "|".join(geneList),  # Must be | separated
        "searchNames": "true",  # Search against official names
        "includeInteractors": "true",  # Set to true to get any interaction involving EITHER gene, set to false to get interactions between genes
        "includeInteractorInteractions": "false",  # Set to true to get interactions between the geneList's first order interactors
        "includeEvidence": "false",  # If false "evidenceList" is evidence to exclude, if true "evidenceList" is evidence to show
    }

    r = requests.get(request_url, params=params, timeout=request_timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON
    r.raise_for_status()
    interactions = r.json()

    # Stop if the response reached the allowed number of interactions: results may
    # have been cut off, and silently continuing would produce an incomplete network
    if len(interactions) >= max_interactions:
        raise RuntimeError(
            "The request for genes {0} returned {1} interactions, reaching the limit of {2}; "
            "reduce `step` so that no result is truncated".format(
                geneList, len(interactions), max_interactions
            )
        )

    # Merge the batch into the overall mapping (later batches overwrite equal identifiers)
    data.update(interactions)

In [6]:
# Symbol columns kept from the raw interaction records
columns = ["OFFICIAL_SYMBOL_A", "OFFICIAL_SYMBOL_B"]

# Build one row per interaction identifier, keep only the two symbol columns,
# give them the same names used by the starting gene set and upper-case the values
dataset = (
    pd.DataFrame.from_dict(data, orient="index")[columns]
    .rename(columns={"OFFICIAL_SYMBOL_A":"InteractorA", "OFFICIAL_SYMBOL_B":"InteractorB"})
    .assign(
        InteractorA=lambda frame: frame["InteractorA"].str.upper(),
        InteractorB=lambda frame: frame["InteractorB"].str.upper(),
    )
)

# Show the resulting dataframe
dataset

Unnamed: 0,InteractorA,InteractorB
8289,TCF3,HAND2
8324,TCF3,ID3
31348,VAP-33B,SIRT7
31539,SIRT7,CKIIBETA
37873,SIRT7,POLO
...,...,...
3305885,CCNF,ZBTB1
3305886,CCNF,ZGPAT
3305887,CCNF,ZNF638
3305888,CCNF,ZNF687


In [7]:
# Sort the two names inside every row so that (A, B) and (B, A) compare equal,
# then flag every repetition of an already seen pair
sorted_pairs = np.sort(dataset[["InteractorA", "InteractorB"]].to_numpy(), axis=1)
duplicated_interactions = pd.DataFrame(sorted_pairs).duplicated()

duplicate_counts = duplicated_interactions.value_counts()
print("Duplicated interactions:\n{0}".format(duplicate_counts))

# Keep only the first occurrence of each unordered pair; the mask is positional
# because the helper frame above does not carry the dataset's index
keep_rows = ~duplicated_interactions.to_numpy()
dataset = dataset.loc[keep_rows]

Duplicated interactions:
False    79380
True     25574
dtype: int64


In [8]:
# Flag the rows whose two interactors collapse to a single distinct value,
# i.e. interactions of a protein with itself
distinct_per_row = dataset[["InteractorA", "InteractorB"]].nunique(axis=1)
same_proteins_interactions = pd.DataFrame(distinct_per_row == 1)

useless_counts = same_proteins_interactions.value_counts()
print("Useless interactions:\n{0}".format(useless_counts))

# Drop the flagged rows.
# NOTE(review): the mask comes from a one-column DataFrame, so it is a 2-D (n, 1)
# array; it is kept as is to preserve the current row selection
keep_rows = ~same_proteins_interactions.to_numpy()
dataset = dataset[keep_rows]

Useless interactions:
False    79283
True        97
dtype: int64


In [9]:
# Append the interactions of the starting gene to the expanded genes dataframe.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the original index labels of both frames are kept.
dataset = pd.concat([dataset, genes])
dataset

Unnamed: 0,InteractorA,InteractorB
8289,TCF3,HAND2
8324,TCF3,ID3
31348,VAP-33B,SIRT7
31539,SIRT7,CKIIBETA
37873,SIRT7,POLO
...,...,...
149,NSP8,SON
150,NSP9,SON
151,ORF6,SON
152,ORF8,SON


In [10]:
# Every distinct protein that appears on either side of an interaction is a node.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
nodes = pd.concat([dataset["InteractorA"], dataset["InteractorB"]]).unique()
print("Number of nodes: {0}".format(len(nodes)))

Number of nodes: 13010
