### Find matched ids for drugs and side effects in ChSe-Decagon_monopharmacy.csv.gz

Our knowledge graph is based on the DrugBank database. Before we use the drug-sideeffect file to annotate our predicted results, we need to find the drug bank ids for all drugs and the MESH/SYMP ids for all side effects in the file.

### Dependencies

In [3]:
import pandas as pd
import requests
import json


def convert_pubchem_to_drugbank(pubchem_ids):
    # mychem.info API URL
    url = "https://mychem.info/v1/query"

    # Dictionary to hold DrugBank to MeSH ID mappings
    mapping = {}

    for i in range(0, len(pubchem_ids), 100):
        # Prepare the query
        q = ",".join(pubchem_ids[i : i + 100])
        params = {
            "q": q,
            "fields": "drugbank.id,drugcentral.xrefs.drugbank_id,pharmgkb.xrefs.drugbank,unichem.drugbank",
            "scopes": "pubchem.cid",
        }

        # Send the request
        response = requests.post(url, params=params)

        # Check if the response is valid
        print(response.status_code, response.text)
        results = response.json()
        for result in results:
            if result.get("drugbank"):
                mapping[result["query"]] = result["drugbank"]["id"]
            else:
                mapping[result["query"]] = None

    return mapping


def convert_id_to_umls(id, id_type, api_key):
    """
    Convert a ID to UMLS ID using BioPortal's REST API.

    :param id: The ID to convert.
    :param id_type: The type of ID to convert. Must be one of MESH, SNOMEDCT, SYMP, MEDDRA.
    :param api_key: Your BioPortal API key.
    :return: The corresponding UMLS ID, if found.
    """
    base_url = "http://data.bioontology.org"
    headers = {"Authorization": f"apikey token={api_key}"}

    # More details on the API here: https://data.bioontology.org/documentation#Class
    # You can get the related UMLS ids for SYMP from the downloaded file here: https://bioportal.bioontology.org/ontologies/SYMP?p=summary
    if id_type not in ["MESH", "SNOMEDCT", "MEDDRA"]:
        print(
            f"Error: {id_type} is not a valid ID type, must be one of MESH, SNOMEDCT, MEDDRA"
        )
        return None

    if id_type in ["MESH", "SNOMEDCT", "MEDDRA"]:
        path = f"http%3A%2F%2Fpurl.bioontology.org%2Fontology%2F{id_type}%2F{id}"

    url = f"{base_url}/ontologies/{id_type}/classes/{path}"
    print("The URL is: ", url)

    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        print(json.dumps(data, indent=2))
        mappings = data.get("cui", [])
        if len(mappings) > 0:
            return mappings[0]
        else:
            print(f"Error: No mappings found for {id}")
            return None
    else:
        print(f"Error: {response.status_code}")
        return None

### Convert the pubchem ids to drugbank ids

#### Input

In [2]:
import pandas as pd

data = pd.read_csv(
    "./ChSe-Decagon_monopharmacy.csv.gz",
    compression="gzip",
)
output_file = "./ChSe-Decagon_monopharmacy_drugbank.csv"

#### Output

In [3]:
data = data.rename(
    columns={
        "# STITCH": "pubchem_id",
        "Individual Side Effect": "ulms_id",
        "Side Effect Name": "side_effect_name",
    }
)

In [4]:
pubchem_ids = data["pubchem_id"].unique().tolist()
formatted_pubchem_ids = [x.replace("CID", "").strip("0") for x in pubchem_ids]
id_map = dict(zip(pubchem_ids, formatted_pubchem_ids))
mapping = convert_pubchem_to_drugbank(formatted_pubchem_ids)

drugbank_ids = []
for pubchem_id in data["pubchem_id"]:
    drugbank_id = mapping.get(id_map.get(pubchem_id))
    drugbank_ids.append(drugbank_id)
data["drugbank_id"] = drugbank_ids

200 [{"query":"3062316","_id":"ZBNZXTGUTAYRHI-UHFFFAOYSA-N","_score":17.083454,"drugbank":{"_license":"https://bit.ly/3Hikpvm","id":"DB01254"}},{"query":"3117","_id":"AUZONCFQVSMFAP-UHFFFAOYSA-N","_score":17.083454,"drugbank":{"_license":"https://bit.ly/3Hikpvm","id":"DB00822"}},{"query":"3114","_id":"UVTNFZQICZKOEM-UHFFFAOYSA-N","_score":17.083544,"drugbank":{"_license":"https://bit.ly/3Hikpvm","id":"DB00280"}},{"query":"373","notfound":true},{"query":"3736","_id":"DGAIEPBNLOQYER-UHFFFAOYSA-N","_score":17.083454,"drugbank":{"_license":"https://bit.ly/3Hikpvm","id":"DB09156"}},{"query":"3734","_id":"XQZXYNRDCRIARQ-UHFFFAOYSA-N","_score":17.083544},{"query":"2646","_id":"WDLWHQDACQUCJR-UHFFFAOYSA-N","_score":17.08357},{"query":"28112","_id":"OCZDCIYGECBNKL-UHFFFAOYSA-N","_score":17.083454},{"query":"4183806","_id":"OJLOPKGSLYJEMD-UHFFFAOYSA-N","_score":17.083454},{"query":"2462","_id":"VOVIALXJUBGFJZ-UHFFFAOYSA-N","_score":17.083454},{"query":"5381","_id":"OGQICQVSFDPSEI-UHFFFAOYSA-N","

In [6]:
data.head()

Unnamed: 0,pubchem_id,ulms_id,side_effect_name,drugbank_id
0,CID003062316,C1096328,central nervous system mass,DB01254
1,CID003062316,C0162830,Photosensitivity reaction,DB01254
2,CID003062316,C1611725,leukaemic infiltration brain,DB01254
3,CID003062316,C0541767,platelet adhesiveness abnormal,DB01254
4,CID003062316,C0242973,Ventricular dysfunction,DB01254


In [5]:
data.to_csv(output_file, index=False)

### Format the data to match the biomedgps format

More details on the data format can be found [here](https://open-prophetdb.github.io/biomedgps-data/graph_data_index/#knowledge-graph-file).

Examples:

| relation_type                  | resource | source_id | source_type | target_id   | target_type | source_name                    | target_name |
|--------------------------------|----------|-----------|-------------|-------------|-------------|--------------------------------|-------------|
| DGIDB::INHIBITOR::Gene:Compound| DGIDB    | ENTREZ:4311 | Gene        | MESH:D015244| Compound    | membrane metalloendopeptidase  | Thiorphan   |
| DGIDB::INHIBITOR::Gene:Compound| DGIDB    | ENTREZ:4311 | Gene        | MESH:C097292| Compound    | membrane metalloendopeptidase  | aladotrilat |



In [3]:
import pandas as pd

input_file = "./ChSe-Decagon_monopharmacy_drugbank.csv"
output_file = "./formatted_biosnap_compound_sideeffect.csv"
formatted_data = pd.read_csv(input_file, sep=",")

# Currently, we don't have enough information to determine the target type. Disease or Symptom? BioMedGPS::SideEffect::Compound:Symptom or BioMedGPS::SideEffect::Compound:Disease
formatted_data["relation_type"] = ""
formatted_data["source_id"] = "DrugBank:" + formatted_data["drugbank_id"]
formatted_data["source_type"] = "Compound"
formatted_data["source_name"] = ""
formatted_data["target_id"] = "UMLS:" + formatted_data["ulms_id"]
formatted_data["target_type"] = ""
formatted_data["target_name"] = formatted_data["side_effect_name"]
formatted_data["resource"] = "BioSNAP"

formatted_data = formatted_data[
    [
        "source_id",
        "source_type",
        "source_name",
        "target_id",
        "target_type",
        "target_name",
        "relation_type",
        "resource",
    ]
]

formatted_data.to_csv(output_file, index=False, sep="\t")

#### Determine the target_type for the side effects, map the side effect ids to the disease ids

In [None]:
disease_

relation_type = "BioMedGPS::SideEffect::Compound:Disease"
target_type = "Disease"

#### Determine the target_type for the side effects, map the side effect ids to the symptom ids

In [None]:
relation_type = "BioMedGPS::SideEffect::Compound:Symptom"
target_type = "Symptom"