In [None]:
import requests
import json
import pandas as pd
import re
from io import StringIO
import os

In [None]:
# Directory (relative to the working directory) that holds the per-database CSV
# files read and rewritten by the sections below.
data_path = 'data'

In [None]:
def extract_identifier(names, pattern):
    """Return the first substring of ``names`` that matches ``pattern``.

    Parameters
    ----------
    names : str or missing value
        Text to search (the notebook passes a compound's synonym string).
        Missing values (``None``/``NaN``) yield ``None``. Any other
        non-string value is converted with ``str()`` before searching so a
        numeric cell does not raise ``TypeError``.
    pattern : str or compiled pattern
        Regular expression handed to ``re.search``.

    Returns
    -------
    str or None
        The whole matched text (group 0) of the first match, or ``None``
        when ``names`` is missing or nothing matches.
    """
    if pd.isna(names):
        return None
    text = names if isinstance(names, str) else str(names)
    match = re.search(pattern, text)
    return match.group(0) if match else None

In [None]:
# Endpoint that every query cell below posts to, and the headers sent with each
# of those requests.
url = "https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi"
headers = {'Content-Type': 'application/json'}

In [None]:
# Shared request template. It is serialised with json.dumps() and sent as the
# "query" parameter; each database section overwrites query['where']['ands']
# with its own search terms before posting.
query = dict(
    download=["cid", "cmpdsynonym"],
    collection="compound",
    order=["relevancescore,desc"],
    start=1,
    limit=10000000,
    downloadfilename="compounds",
    where={"ands": []},
)

# BindingDB

In [None]:
# Location of the BindingDB table; the same path is read and later overwritten.
bdb_path = os.path.join(data_path, 'BindingDB.csv')

In [None]:
# Load the local BindingDB table that will be annotated with PubChem CIDs.
bdb = pd.read_csv(bdb_path)

In [None]:
# Fetch every PubChem compound matching the search term "bindingdb" and build a
# lookup table of (CID, BindingDB ID) parsed from each compound's synonym string.
query['where']['ands'] = [{"*": "bindingdb"}]
# \b anchors the ID on a word boundary. The previous \B (NOT a word boundary)
# could only match inside a word, so "BDBM123" was captured as "DBM123".
# NOTE(review): assumes 'BindingDB MonomerID' in BindingDB.csv is stored in the
# same "BDBM<digits>" form - confirm against the file.
pattern = r'\bBDBM\d+\b'
identifier_column = 'BindingDB MonomerID'

# Define parameters in a dictionary/json format
parameters = {
    "infmt": "json",
    "outfmt": "csv",
    "query": json.dumps(query)
}

response = requests.post(url, params=parameters, headers=headers)
# Fail here on an HTTP error instead of trying to parse an error body as CSV.
response.raise_for_status()
csv_file_like = StringIO(response.text)
# Read CSV data into a pandas DataFrame
data = pd.read_csv(csv_file_like)
data = data.rename(columns={' cid': 'CID', 'cmpdsynonym': 'synonyms'})
# Extract bdb ID
data[identifier_column] = data['synonyms'].apply(lambda x: extract_identifier(x, pattern))
data = data.drop(columns='synonyms')
# Compounds with no recognisable ID cannot be joined. They are removed because
# pandas matches null merge keys to each other, which would attach unrelated
# CIDs to any left-hand row whose ID is missing.
data = data.dropna(subset=[identifier_column])

In [None]:
# Attach PubChem CIDs to the BindingDB rows; the left join keeps every BindingDB row.
# Columns that `data` is about to add are dropped from the left side first, so
# re-running on a CSV that was already annotated replaces the old values
# instead of producing suffixed duplicates (e.g. CID_x / CID_y).
bdb = pd.merge(
    bdb.drop(columns=data.columns.difference([identifier_column]), errors='ignore'),
    data,
    on=identifier_column,
    how='left',
)

In [None]:
# Write the merged table back over the input CSV (row index omitted).
bdb.to_csv(bdb_path, index=False)

# ChEMBL

In [None]:
# Location of the ChEMBL table; the same path is read and later overwritten.
chembl_path = os.path.join(data_path, 'ChEMBL.csv')

In [None]:
# Load the local ChEMBL table that will be annotated with PubChem CIDs.
chembl = pd.read_csv(chembl_path)

In [None]:
# Fetch every PubChem compound matching the search term "chembl" and build a
# lookup table of (CID, ChEMBL ID) parsed from each compound's synonym string.
query['where']['ands'] = [{"*": "chembl"}]
pattern = r'\bCHEMBL\d+\b'
identifier_column = 'Molecule ChEMBL ID'

# Define parameters in a dictionary/json format
parameters = {
    "infmt": "json",
    "outfmt": "csv",
    "query": json.dumps(query)
}

response = requests.post(url, params=parameters, headers=headers)
# Fail here on an HTTP error instead of trying to parse an error body as CSV.
response.raise_for_status()
csv_file_like = StringIO(response.text)
# Read CSV data into a pandas DataFrame
data = pd.read_csv(csv_file_like)
data = data.rename(columns={' cid': 'CID', 'cmpdsynonym': 'synonyms'})
# Extract chembl ID
data[identifier_column] = data['synonyms'].apply(lambda x: extract_identifier(x, pattern))
data = data.drop(columns='synonyms')
# Compounds with no recognisable ID cannot be joined. They are removed because
# pandas matches null merge keys to each other, which would attach unrelated
# CIDs to any left-hand row whose ID is missing.
data = data.dropna(subset=[identifier_column])

In [None]:
# Attach PubChem CIDs to the ChEMBL rows; the left join keeps every ChEMBL row.
# Columns that `data` is about to add are dropped from the left side first, so
# re-running on a CSV that was already annotated replaces the old values
# instead of producing suffixed duplicates (e.g. CID_x / CID_y).
chembl = pd.merge(
    chembl.drop(columns=data.columns.difference([identifier_column]), errors='ignore'),
    data,
    on=identifier_column,
    how='left',
)

In [None]:
# Write the merged table back over the input CSV (row index omitted).
chembl.to_csv(chembl_path, index=False)

# Drug Target Commons

In [None]:
# Location of the Drug Target Commons table; the same path is read and later overwritten.
dtc_path = os.path.join(data_path, 'DTC.csv')

In [None]:
# Load the local Drug Target Commons table that will be annotated with PubChem CIDs.
dtc = pd.read_csv(dtc_path)

DTC uses ChEMBL IDs as well, so the ChEMBL ID to CID lookup table built in the ChEMBL section is reused here.

In [None]:
# DTC keeps its ChEMBL IDs in 'compound_id'. Reuse the lookup table left in
# `data` by the ChEMBL section, renaming its key column to match.
# (Previously the assignment went to a misspelled name and the rename result
# was discarded, so the merge below still used 'Molecule ChEMBL ID'.)
identifier_column = 'compound_id'
data = data.rename(columns={'Molecule ChEMBL ID': identifier_column})

In [None]:
# Attach PubChem CIDs to the DTC rows; the left join keeps every DTC row.
# Columns that `data` is about to add are dropped from the left side first, so
# re-running on a CSV that was already annotated replaces the old values
# instead of producing suffixed duplicates (e.g. CID_x / CID_y).
dtc = pd.merge(
    dtc.drop(columns=data.columns.difference([identifier_column]), errors='ignore'),
    data,
    on=identifier_column,
    how='left',
)

In [None]:
# Write the merged table back over the input CSV (row index omitted).
dtc.to_csv(dtc_path, index=False)

# Comparative Toxicogenomics Database

In [None]:
# Location of the CTD table; the same path is read and later overwritten.
ctd_path = os.path.join(data_path, 'CTD.csv')

In [None]:
# Load the local CTD table that will be annotated with PubChem CIDs.
ctd = pd.read_csv(ctd_path)

In [None]:
# Fetch every PubChem compound matching all four CTD search terms and build a
# lookup table of (CID, CTD chemical ID) parsed from each compound's synonym string.
query['where']['ands'] = [{"*": "ctd"}, {"*": "comparative"}, {"*": "toxicogenomics"}, {"*": "database"}]
# Matches a standalone token made of "C" or "D" followed by digits.
pattern = r'\b(C|D)\d+\b'
identifier_column = 'ChemicalID'

# Define parameters in a dictionary/json format
parameters = {
    "infmt": "json",
    "outfmt": "csv",
    "query": json.dumps(query)
}

response = requests.post(url, params=parameters, headers=headers)
# Fail here on an HTTP error instead of trying to parse an error body as CSV.
response.raise_for_status()
csv_file_like = StringIO(response.text)
# Read CSV data into a pandas DataFrame
data = pd.read_csv(csv_file_like)
data = data.rename(columns={' cid': 'CID', 'cmpdsynonym': 'synonyms'})
# Extract ctd ID
data[identifier_column] = data['synonyms'].apply(lambda x: extract_identifier(x, pattern))
data = data.drop(columns='synonyms')
# Compounds with no recognisable ID cannot be joined. They are removed because
# pandas matches null merge keys to each other, which would attach unrelated
# CIDs to any left-hand row whose ID is missing.
data = data.dropna(subset=[identifier_column])

In [None]:
# Attach PubChem CIDs to the CTD rows; the left join keeps every CTD row.
# Columns that `data` is about to add are dropped from the left side first, so
# re-running on a CSV that was already annotated replaces the old values
# instead of producing suffixed duplicates (e.g. CID_x / CID_y).
ctd = pd.merge(
    ctd.drop(columns=data.columns.difference([identifier_column]), errors='ignore'),
    data,
    on=identifier_column,
    how='left',
)

In [None]:
# Write the merged table back over the input CSV (row index omitted).
ctd.to_csv(ctd_path, index=False)

# DrugBank

In [None]:
# Location of the DrugBank table, which is read in the next cell.
db_path = os.path.join(data_path, 'DB.csv')

In [None]:
# Load the local DrugBank table that will be annotated with PubChem CIDs.
db = pd.read_csv(db_path)

In [None]:
# Fetch every PubChem compound matching the search term "drugbank" and build a
# lookup table of (CID, DrugBank ID) parsed from each compound's synonym string.
query['where']['ands'] = [{"*": "drugbank"}]
# \b anchors the ID on a word boundary. The previous \D (any non-digit
# character) swallowed the character before "DB" into the match and could not
# match an ID at the very start of the string.
pattern = r'\bDB\d+\b'
identifier_column = 'drugbank-id'

# Define parameters in a dictionary/json format
parameters = {
    "infmt": "json",
    "outfmt": "csv",
    "query": json.dumps(query)
}

response = requests.post(url, params=parameters, headers=headers)
# Fail here on an HTTP error instead of trying to parse an error body as CSV.
response.raise_for_status()
csv_file_like = StringIO(response.text)
# Read CSV data into a pandas DataFrame
data = pd.read_csv(csv_file_like)
data = data.rename(columns={' cid': 'CID', 'cmpdsynonym': 'synonyms'})
# Extract db ID
data[identifier_column] = data['synonyms'].apply(lambda x: extract_identifier(x, pattern))
data = data.drop(columns='synonyms')
# Compounds with no recognisable ID cannot be joined. They are removed because
# pandas matches null merge keys to each other, which would attach unrelated
# CIDs to any left-hand row whose ID is missing.
data = data.dropna(subset=[identifier_column])

In [None]:
# Attach PubChem CIDs to the DrugBank rows; the left join keeps every DrugBank row.
# Columns that `data` is about to add are dropped from the left side first, so
# re-running on a CSV that was already annotated replaces the old values
# instead of producing suffixed duplicates (e.g. CID_x / CID_y).
db = pd.merge(
    db.drop(columns=data.columns.difference([identifier_column]), errors='ignore'),
    data,
    on=identifier_column,
    how='left',
)

In [None]:
# Write the merged table back over the input CSV (row index omitted).
# The target must be the file path; the DataFrame itself was being passed here.
db.to_csv(db_path, index=False)

# DrugCentral

In [None]:
# Location of the DrugCentral table; the same path is read and later overwritten.
dc_path = os.path.join(data_path, 'DC.csv')

In [None]:
# Load the local DrugCentral table that will be annotated with PubChem CIDs.
dc = pd.read_csv(dc_path)

In [None]:
# InChIKey is available in DC, so this section downloads (cid, inchikey) pairs
# and joins on InChIKey directly instead of parsing an ID out of the synonyms.
query_dc = {
    "download": ["cid", "inchikey"],
    "collection": "compound",
    "order": ["relevancescore,desc"],
    "start": 1,
    "limit": 10000000,
    "downloadfilename": "compounds",
    "where": {
        "ands": [{"*": "drugcentral"}]
    }
}

identifier_column = 'InChIKey'

# Define parameters in a dictionary/json format
parameters = {
    "infmt": "json",
    "outfmt": "csv",
    "query": json.dumps(query_dc)
}

response = requests.post(url, params=parameters, headers=headers)
# Fail here on an HTTP error instead of trying to parse an error body as CSV.
response.raise_for_status()
csv_file_like = StringIO(response.text)
# Read CSV data into a pandas DataFrame
data = pd.read_csv(csv_file_like)
# Rename the downloaded columns so the key matches `identifier_column`.
data = data.rename(columns={' cid': 'CID', 'inchikey': 'InChIKey'})

In [None]:
# Attach PubChem CIDs to the DrugCentral rows; the left join keeps every DrugCentral row.
# Columns that `data` is about to add are dropped from the left side first, so
# re-running on a CSV that was already annotated replaces the old values
# instead of producing suffixed duplicates (e.g. CID_x / CID_y).
dc = pd.merge(
    dc.drop(columns=data.columns.difference([identifier_column]), errors='ignore'),
    data,
    on=identifier_column,
    how='left',
)

In [None]:
# Write the merged table back over the input CSV (row index omitted).
dc.to_csv(dc_path, index=False)