**Computational Health Laboratory Project, A.Y. 2021/2022**

**Authors:** Niko Dalla Noce, Alessandro Ristori, Andrea Zuppolini

# **Colab setup**

In [17]:
!git clone "https://github.com/nikodallanoce/ComputationalHealthLaboratory"
!pip install -U PyYAML
%cd ComputationalHealthLaboratory

fatal: destination path 'ComputationalHealthLaboratory' already exists and is not an empty directory.
/content/ComputationalHealthLaboratory


# **Obtain all the genes that interact with the starting one**


In [18]:
import requests
import json
import pandas as pd
import numpy as np
from config import ACCESS_KEY, BASE_URL

In [14]:
# Read the starting gene set, normalise the symbols to upper case and drop
# fully duplicated rows, all in a single chained expression.
genes = (
    pd.read_csv("datasets/geneset.csv", sep=";")
    .assign(InteractorA=lambda frame: frame["InteractorA"].str.upper())
    .drop_duplicates()
)

# Plain Python list of the gene symbols, used to build the query
proteins_list = genes["InteractorA"].tolist()

# **Expand the network**

In [8]:
# Endpoint used to fetch the interactions
request_url = BASE_URL + "/interactions"

# List of genes to search for
geneList = proteins_list  # e.g. ["SRPK2"] to query a single gene

params = {
    "accesskey": ACCESS_KEY,
    "format": "json",  # Return results as JSON (parsed below with r.json())
    "geneList": "|".join(geneList),  # Must be | separated
    "searchNames": "true",  # Search against official names
    "includeInteractors": "true",  # Set to true to get any interaction involving EITHER gene, set to false to get interactions between genes
    "includeInteractorInteractions": "true",  # Set to true to get interactions between the geneList's first order interactors
    "includeEvidence": "false",  # If false "evidenceList" is evidence to exclude, if true "evidenceList" is evidence to show
}

# NOTE(review): the duplicate check further down reports exactly 10000 rows
# (6874 + 3126), which looks like a per-request result cap on the service side.
# If so, the network is truncated and the query should be paginated - TODO confirm.
r = requests.get(request_url, params=params)

# Fail loudly on an HTTP error (bad access key, malformed query, server issue)
# instead of trying to parse an error page as interaction data.
r.raise_for_status()
interactions = r.json()

In [9]:
# Create a hash of results by interaction identifier.
# The parsed response is already such a mapping, so a shallow copy is enough.
data = dict(interactions)

# Load the data into a pandas dataframe (one row per interaction identifier)
dataset = pd.DataFrame.from_dict(data, orient="index")

# Select only the columns we want to keep: the two interactor symbols
columns = [
    "OFFICIAL_SYMBOL_A",
    "OFFICIAL_SYMBOL_B",
]

# Take an explicit copy: assigning into a bare column selection can trigger
# pandas' SettingWithCopyWarning.
dataset = dataset[columns].copy()

# Normalise the symbols to upper case so that later comparisons between
# interactors do not depend on letter case.
dataset["OFFICIAL_SYMBOL_A"] = dataset["OFFICIAL_SYMBOL_A"].str.upper()
dataset["OFFICIAL_SYMBOL_B"] = dataset["OFFICIAL_SYMBOL_B"].str.upper()

# Display the resulting table
dataset

Unnamed: 0,OFFICIAL_SYMBOL_A,OFFICIAL_SYMBOL_B
117,MYPN,ACTN2
278,GATA2,PML
418,RPA2,STAT3
663,XRN1,ALDOA
866,APP,APPBP2
...,...,...
245352,NBN,ATM
245353,RAD50,ATM
245354,RAD51,ATM
245355,FANCA,ATP5B


In [11]:
# Look for duplicated interactions.
# Sorting the two symbols within each row makes (A, B) and (B, A) identical,
# so both orientations of the same pair are detected as duplicates.
symbol_pairs = dataset[["OFFICIAL_SYMBOL_A", "OFFICIAL_SYMBOL_B"]].values
ordered_pairs = pd.DataFrame(np.sort(symbol_pairs, axis=1))
duplicated_interactions = ordered_pairs.duplicated()
print("Duplicated interactions:\n{0}".format(duplicated_interactions.value_counts()))

# Keep only the first occurrence of every pair
is_first_occurrence = ~duplicated_interactions.values
dataset = dataset[is_first_occurrence]

Duplicated interactions:
False    6874
True     3126
dtype: int64


In [12]:
# Look for interactions where both proteins are the same:
# a row with a single distinct symbol across the two columns is a self-interaction.
distinct_symbols_per_row = dataset[["OFFICIAL_SYMBOL_A", "OFFICIAL_SYMBOL_B"]].nunique(axis=1)
same_proteins_interactions = pd.DataFrame(distinct_symbols_per_row == 1)
print("Useless interactions:\n{0}".format(same_proteins_interactions.value_counts()))

# Delete such interactions from the dataset
is_self_interaction = same_proteins_interactions.values
dataset = dataset[~is_self_interaction]

Useless interactions:
False    6552
True      322
dtype: int64


In [13]:
# Show the interaction table as it stands after the filtering cells above
dataset

Unnamed: 0,OFFICIAL_SYMBOL_A,OFFICIAL_SYMBOL_B
117,MYPN,ACTN2
278,GATA2,PML
418,RPA2,STAT3
663,XRN1,ALDOA
866,APP,APPBP2
...,...,...
245350,MSH2,ATM
245351,MSH6,ATM
245353,RAD50,ATM
245354,RAD51,ATM
