In [None]:
#API_KEY = "874fc67d-cee6-4294-b799-18e2b4e63fb8"  #ez a mienk
API_KEY = "ad6669df-65b6-45f9-8e02-7ba74e788acd"  #ez nem a mienk

Below you can find Python and R code snippets showing how to programmatically interact with the REST API by means of YOUR_API_KEY.

⇒ Query the REST API with your API key:  the code below shows how to retrieve gene-disease associations related to the gene APP amyloid beta precursor protein with NCBI ID equal to 351 (gene_ncbi_id parameter equal to 351). To this purpose we call the gene-disease associations endpoint (i.e. gda endpoint, see the REST API Interactive console for more details). We retrieve the top-100 gene-disease associations involving such gene, ordered by descending gene-disease score, by asking for the first page of results (page_number parameter equal to 0).

In [None]:
import requests
import json
import time

# Specify query parameters by means of a dictionary 
params = {}
# ...retrieve disease associated to gene with NCBI ID 
#params["gene_ncbi_id"] = "351"
# lung cancer, breast cancer, prostate cancer, colorectal cancer, ...
params["disease"] = """DO_1324, DO_1612, DO_10286, DO_9256, DO_10652,
DO_14330, DO_1339, DO_1485, DO_12858, DO_11723,  DO_9744,
DO_9352, DO_1459, DO_9255, DO_7148, DO_9074, DO_1826, DO_332, 
DO_13833, DO_12356, DO_5419, DO_3312, DO_1287, DO_1985, DO_12858"""

# Create a dictionary with the HTTP headers of your API call 
HTTPheadersDict = {}
# Set the 'Authorization' HTTP header equal to API_KEY (your API key) 
HTTPheadersDict['Authorization'] = API_KEY
# Set the 'accept' HTTP header to specify the response format: one of 'application/json', 'application/xml', 'application/csv' 
HTTPheadersDict['accept'] = 'application/json'

all_payloads = []
# Loop through pages 0 to 99
for page_number in range(100):
    # Update the params dictionary with the current page number
    params['page_number'] = page_number

    # Query the gda summary endpoint 
    response = requests.get("https://api.disgenet.com/api/v1/gda/summary",
                            params=params, headers=HTTPheadersDict, verify=False)

    # If the status code of response is 429, it means you have reached one of your query limits 
    # You can retrieve the time you need to wait until doing a new query in the response headers 
    if not response.ok:
        if response.status_code == 429:
            while response.ok is False:
                print("You have reached a query limit for your user. Please wait {} seconds until next query".format(
                    response.headers['x-rate-limit-retry-after-seconds']))
                time.sleep(int(response.headers['x-rate-limit-retry-after-seconds']))
                print("Your rate limit is now restored")

                # Repeat your query
                response = requests.get("https://api.disgenet.com/api/v1/gda/summary",
                                        params=params, headers=HTTPheadersDict, verify=False)
                if response.ok is True:
                    break
                else:
                    continue

    # Parse response content in JSON format since we set 'accept:application/json' as HTTP header 
    response_parsed = json.loads(response.text)
    all_payloads.extend(response_parsed.get("payload",[]))
    #print('STATUS: {}'.format(response_parsed["status"]))
    #print('TOTAL NUMBER OF RESULTS: {}'.format(response_parsed["paging"]["totalElements"]))
    #print('NUMBER OF RESULTS RETRIEVED BY CURRENT CALL (PAGE NUMBER {}): {}'.format(
          #response_parsed["paging"]["currentPageNumber"], response_parsed["paging"]["totalElementsInPage"]))

In [None]:
import pandas as pd

print(len(all_payloads))
df = pd.DataFrame(all_payloads)
df.to_csv("disgenet_gda_summary.csv", index=False)




In [None]:
# Check for duplicates in the 'assocID' column
num_duplicates = df['assocID'].duplicated().sum()

# Print the number of duplicates
print(f"Number of duplicate assocID values: {num_duplicates}")

In [None]:
print(df.columns)

In [None]:
# Keep only the specified columns
df_filtered = df[[ 'geneNcbiID', 'geneDSI', 'geneDPI', 'diseaseName','score']]

# Display the first few rows of the filtered dataframe
print(df_filtered.head())

In [None]:
# Print every different occurrence of diseaseClasses_HPO
from itertools import chain

# Print every different occurrence of diseaseClasses_HPO
unique_disease_classes = df_filtered['diseaseName'].unique()
print("Different occurrences of diseaseName:")
for disease_class in unique_disease_classes:
    print(disease_class)

# Count the number of different occurrences
count_unique_disease_classes = len(unique_disease_classes)
print(f"Total number of different occurrences of diseaseName: {count_unique_disease_classes}")

In [None]:
import pandas as pd

# Create the mapping dictionary
disease_mapping = {
    # Alzheimer's Disease
    "Alzheimer's Disease": "Alzheimer's Disease",
    "Familial Alzheimer's Disease": "Alzheimer's Disease",
    
    # Parkinson's Disease
    "Parkinson's Disease": "Parkinson's Disease",
    "Hemiparkinsonism": "Parkinson's Disease",
    "Primary Parkinsonism": "Parkinson's Disease",
    "Idiopathic Parkinsonism": "Parkinson's Disease",
    "Amyotrophic Lateral Sclerosis-Parkinsonism/Dementia Complex 1": "Parkinson's Disease",
    
    # Huntington's Disease
    "Huntington's Disease": "Huntington's Disease",
    "Chorea, Huntington": "Huntington's Disease",
    "Dementia in Huntington's Disease": "Huntington's Disease",
    
    # Duchenne Muscular Dystrophy
    "Duchenne Muscular Dystrophy": "Duchenne Muscular Dystrophy",
    "Duchenne Becker Muscular Dystrophy": "Duchenne Muscular Dystrophy",
    "Muscular Dystrophy, Duchenne": "Duchenne Muscular Dystrophy",
    
    # Sickle Cell Anemia
    "Sickle Cell Anemia": "Sickle Cell Anemia",
    
    # Cystic Fibrosis
    "Cystic Fibrosis": "Cystic Fibrosis",
    
    # Down Syndrome
    "Down Syndrome": "Down Syndrome",
    
    # Marfan Syndrome
    "Marfan Syndrome": "Marfan Syndrome",
    
    # Fabry Disease
    "Fabry Disease": "Fabry Disease",
    
    # Breast Cancer
    "Breast Neoplasms": "Breast Cancer",
    "Cancer, Breast": "Breast Cancer",
    "Carcinoma of Breast": "Breast Cancer",
    "Breast Cancer": "Breast Cancer",
    
    # Lung Cancer
    "Cancer, Lung": "Lung Cancer",
    "Malignant Neoplasm of Bronchus or Lung": "Lung Cancer",
    "Lung Cancer": "Lung Cancer",
    
    # Prostate Cancer
    "Carcinoma of Prostate": "Prostate Cancer",
    "Prostate Cancer": "Prostate Cancer",
    
    # Colorectal Cancer
    "Colorectal Neoplasm": "Colorectal Cancer",
    "Malignant Neoplasm of Colon": "Colorectal Cancer",
    "CRC (Colorectal Cancer)": "Colorectal Cancer",
    "Malignant Neoplasm of Large Intestine": "Colorectal Cancer",
    "Colorectal Cancer": "Colorectal Cancer",
    
    # Type 1 Diabetes
    "Type 1 Diabetes": "Type 1 Diabetes",
    
    # Type 2 Diabetes
    "Adult-Onset Diabetes Mellitus": "Type 2 Diabetes",
    "Insulin-Resistant Diabetes Mellitus": "Type 2 Diabetes",
    "Type 2 Diabetes": "Type 2 Diabetes",
    
    # Hemophilia
    "Hemophilia": "Hemophilia",
    
    # Polycystic Kidney Disease
    "Polycystic Kidney Disease": "Polycystic Kidney Disease",
    
    # Systemic Lupus Erythematosus
    "Lupus": "Systemic Lupus Erythematosus",
    "Systemic Lupus Erythematosus 16": "Systemic Lupus Erythematosus",
    
    # Rheumatoid Arthritis
    "Rheumatoid Arthritis": "Rheumatoid Arthritis",
    
    # Autoimmune Diseases
    "Autoimmune Diseases (e.g., Rheumatoid Arthritis)": "Autoimmune Diseases",
    
    # Schizophrenia
    "Schizophrenia": "Schizophrenia",
    
    # Bipolar Disorder
    "Bipolar Disorders": "Bipolar Disorder",
    "Bipolar I Disorder": "Bipolar Disorder",
    "Bipolar II Disorder": "Bipolar Disorder",
    "Bipolar Depression": "Bipolar Disorder",
    "Mixed Bipolar Disorder": "Bipolar Disorder",
    "NOS": "Bipolar Disorder",
    
    # Tourette Syndrome
    "Tourette Syndrome": "Tourette Syndrome",
    
    # Porphyria
    "Porphyria": "Porphyria",
    
    # Amyotrophic Lateral Sclerosis (ALS)
    "Amyotrophic Lateral Sclerosis": "Amyotrophic Lateral Sclerosis (ALS)",
    
    # Cardiovascular Diseases
    "Cardiovascular Disease": "Cardiovascular Diseases",
    "Cardiac Disease": "Cardiovascular Diseases",
    "Cardiovascular Diseases": "Cardiovascular Diseases"
}

# Apply the mapping to the 'diseaseName' column
df_filtered['diseaseName'] = df_filtered['diseaseName'].map(disease_mapping).fillna(df_filtered['diseaseName'])

#df_filtered.loc[:, 'diseaseName'] = df_filtered['diseaseName'].map(disease_mapping).fillna(df_filtered['diseaseName'])

# Print the unique occurrences of the standardized disease names
unique_diseases = df_filtered['diseaseName'].unique()


# Optionally, count the number of different occurrences
print(f"Total number of standardized disease names: {len(unique_diseases)}")

print(df_filtered.head(10))

In [67]:

print(df_filtered.head(10))
df_filtered.to_csv("disgenet_gda_summary_filtered.csv", index=False)

   geneNcbiID  geneDSI  geneDPI                           diseaseName  score
0        3630    0.254    0.957                       Type 2 Diabetes    1.0
1        2099    0.298    0.957                         Breast Cancer    1.0
2        2099    0.298    0.957                         Breast Cancer    1.0
3        2064    0.298    0.957                         Breast Cancer    1.0
4        3630    0.254    0.957  Diabetes Mellitus, Insulin-Dependent    1.0
5        4137    0.351    0.957                     Alzheimer Disease    1.0
6        6622    0.318    0.957                     Parkinson Disease    1.0
7         351    0.381    0.957                     Alzheimer Disease    1.0
8         348    0.326    0.957                     Alzheimer Disease    1.0
9        1080    0.381    0.913                       Cystic Fibrosis    1.0


In [None]:
from torch_geometric.data import HeteroData
import torch
import numpy as np

# Initialize HeteroData object
data = HeteroData()

# Add gene nodes
gene_ids = df['geneNcbiID'].unique()
gene_features = torch.tensor(
    df.groupby('geneNcbiID')[['geneDSI', 'geneDPI']].first().values, dtype=torch.float
)
data['gene'].x = gene_features
data['gene'].node_ids = torch.tensor(gene_ids)

# Add disease nodes
disease_names = df['diseaseName'].unique()
disease_idx = {name: i for i, name in enumerate(disease_names)}
disease_features = torch.eye(len(disease_names))  # One-hot encoding for simplicity
data['disease'].x = disease_features
data['disease'].node_ids = torch.tensor([disease_idx[name] for name in disease_names])

# Add edges with attributes between gene and disease nodes
gene_edge_indices = []
disease_edge_indices = []
edge_attr = []

# Define mapping for disease indices
disease_mapping = {name: i for i, name in enumerate(disease_names)}

# Process edges
for _, row in df.iterrows():
    gene_idx = int(np.where(gene_ids == row['geneNcbiID'])[0][0])  # Get gene index
    disease_idx = disease_mapping[row['diseaseName']]  # Use mapping for disease index
    gene_edge_indices.append(gene_idx)
    disease_edge_indices.append(disease_idx)

    # Combine DSI, DPI, and score into edge attributes
    edge_attr.append([row['geneDSI'], row['geneDPI'], row['score']])

# Convert lists to tensors and add them to the HeteroData structure
data['gene', 'interacts_with', 'disease'].edge_index = torch.tensor(
    [gene_edge_indices, disease_edge_indices], dtype=torch.long
)
data['gene', 'interacts_with', 'disease'].edge_attr = torch.tensor(edge_attr, dtype=torch.float)

# Convert edge indices and attributes to tensors if not already
gene_edge_indices = torch.tensor(gene_edge_indices, dtype=torch.long)
disease_edge_indices = torch.tensor(disease_edge_indices, dtype=torch.long)
edge_attr_tensor = torch.tensor(edge_attr, dtype=torch.float)

# Add reverse edges
data['disease', 'rev_interacts_with', 'gene'].edge_index = torch.stack([disease_edge_indices, gene_edge_indices])
data['disease', 'rev_interacts_with', 'gene'].edge_attr = edge_attr_tensor  # Use the same attributes or create new ones
edge_attr_dict = {
    ('gene', 'interacts_with', 'disease'): data['gene', 'interacts_with', 'disease'].edge_attr,
    ('disease', 'rev_interacts_with', 'gene'): data['disease', 'rev_interacts_with', 'gene'].edge_attr
}

In [65]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, SAGEConv

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # Define HeteroConv with separate layers for each node type
        self.conv1 = HeteroConv({
            ('gene', 'interacts_with', 'disease'): SAGEConv((-1, -1), hidden_channels),
            ('disease', 'rev_interacts_with', 'gene'): SAGEConv((-1, -1), hidden_channels)
        }, aggr='mean')

        self.conv2 = HeteroConv({
            ('gene', 'interacts_with', 'disease'): SAGEConv((-1, -1), out_channels),
            ('disease', 'rev_interacts_with', 'gene'): SAGEConv((-1, -1), out_channels)
        }, aggr='mean')

    def forward(self, x_dict, edge_index_dict, edge_attr_dict):
        # Pass through the first layer
        x_dict = self.conv1(x_dict, edge_index_dict, edge_attr_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}

        # Pass through the second layer
        x_dict = self.conv2(x_dict, edge_index_dict, edge_attr_dict)
        return x_dict


In [66]:
import torch.optim as optim

# Define the model, optimizer, and loss function
model = HeteroGNN(hidden_channels=32, out_channels=1)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

# Example training loop
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    out = model(data.x_dict, data.edge_index_dict, edge_attr_dict)

    # Compute loss (replace this with link prediction loss if needed)
    # target here should represent the labels of gene-disease edges
    loss = criterion(out['gene', 'interacts_with', 'disease'], target)
    
    # Backward pass
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch}, Loss: {loss.item()}')

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [None]:
# Add reverse edges to HeteroData
data['disease', 'rev_interacts_with', 'gene'].edge_index = torch.stack(
    [disease_edge_indices, gene_edge_indices], dim=0
)
data['disease', 'rev_interacts_with', 'gene'].edge_attr = torch.tensor(reverse_edge_attr, dtype=torch.float)