In [13]:
import json
from pprint import pprint

import requests
from bs4 import BeautifulSoup

In [7]:
# DrugBank accession numbers of the drugs whose pages will be scraped.
drug_identifiers = """
    DB00619 DB01048 DB14093 DB00173 DB00734
    DB00218 DB05196 DB09095 DB01053 DB00274
""".split()

# Root of every drug page; a DrugBank ID is appended to form the full URL.
base_url = "https://www.drugbank.ca/drugs/"

In [8]:
# Seed one record per drug, holding only its DrugBank ID for now; the
# scraping loop later in the notebook adds the remaining fields.
drugs = [{'drug_bank_id': drug_bank_id} for drug_bank_id in drug_identifiers]

print(drugs)

[{'drug_bank_id': 'DB00619'}, {'drug_bank_id': 'DB01048'}, {'drug_bank_id': 'DB14093'}, {'drug_bank_id': 'DB00173'}, {'drug_bank_id': 'DB00734'}, {'drug_bank_id': 'DB00218'}, {'drug_bank_id': 'DB05196'}, {'drug_bank_id': 'DB09095'}, {'drug_bank_id': 'DB01053'}, {'drug_bank_id': 'DB00274'}]


In [9]:
# Scrape each drug's DrugBank page and attach its SMILES string, targets, and
# alternative identifiers to the matching record in `drugs` (records are
# updated in place). Every field is best-effort: when a field cannot be
# located, a message is printed and the key is simply left off the record.

# Upper bound, in seconds, on how long a single page request may take;
# without it a stalled connection would block the loop indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

for drug in drugs:
    drug_id = drug['drug_bank_id']

    # Build the URL for the current drug and get its document. A failed
    # request (network error, timeout, or HTTP error status) is reported and
    # the drug is skipped, so an error page is never parsed as a drug page.
    url = base_url + drug_id
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"{drug_id}: request failed ({error})")
        continue

    # Parse document; using html5lib to ensure tags such as dl, dt, and dd
    # are parsed correctly
    soup = BeautifulSoup(response.text, 'html5lib')

    # SMILES - located in the text of the tag after the tag with id 'smiles'.
    # A missing tag makes find() return None, hence the AttributeError.
    try:
        drug['smiles'] = soup.find(id='smiles').next_sibling.text
    except AttributeError:
        print(f"{drug_id}: SMILES not found")

    # TARGETS - located in a list of cards wrapped by a tag with id 'targets'
    try:
        targets = soup.find(id='targets')
        bond_list = targets.find('div', class_='bond-list')
        bond_cards = bond_list.find_all('div', class_='bond card')
    except AttributeError:
        print(f"{drug_id}: No targets found")
    else:
        drug['targets'] = []

        # Iterate through all the cards in the list found here
        for bond_card in bond_cards:
            target = {}

            # The name is only used in the messages below. It is looked up
            # in its own try block so that one card with an unexpected header
            # does not abort the remaining cards or get misreported as
            # "No targets found".
            try:
                header = bond_card.find(class_='card-header')
                target_name = header.find('strong').find('a').text
            except AttributeError:
                target_name = 'unnamed target'

            # ACTIONS - located within the bond-card in a list of elements in
            # the tag following a tag with id 'actions'
            try:
                actions = bond_card.find(id='actions').next_sibling.find_all()
                target['actions'] = [action.text for action in actions]
            except AttributeError:
                print(f"{drug_id}: {target_name}: no actions found")

            # GENE NAME - located within the bond-card in the text of the tag
            # following a tag with id 'gene-name'
            try:
                target['gene_name'] = bond_card.find(id='gene-name').next_sibling.text
            except AttributeError:
                print(f"{drug_id}: {target_name}: gene name not found")

            # Keep the target only if at least one field was scraped for it
            if target:
                drug['targets'].append(target)

    # ALTERNATIVE IDENTIFIERS - located within a list in the tag following a
    # tag with id 'external-links'. The list is contained within a 'dl' tag
    # that has a series of alternating 'dt' and 'dd' tags that contain each
    # external link's name and value, respectively
    try:
        alt_ids = soup.find(id='external-links').next_sibling
        dts = alt_ids.find('dl').find_all('dt')
    except AttributeError:
        print(f"{drug_id}: no alternative identifiers found")
    else:
        drug['alternative_identifiers'] = {}

        for dt in dts:
            try:
                drug['alternative_identifiers'][dt.text] = dt.next_sibling.text
            except AttributeError:
                print(f"{drug_id}: error scraping alternative identifier")
            

DB01048: No targets found
DB14093: No targets found
DB00173: Adenine phosphoribosyltransferase: no actions found
DB00173: 5'-methylthioadenosine/S-adenosylhomocysteine nucleosidase: no actions found
DB00173: Acetyl-CoA carboxylase 2: no actions found
DB00173: Low molecular weight phosphotyrosine protein phosphatase: no actions found
DB00173: Peroxisomal trans-2-enoyl-CoA reductase: no actions found
DB00173: A/G-specific adenine glycosylase: no actions found
DB00173: Nucleoside deoxyribosyltransferase-I: no actions found
DB00173: SRSF protein kinase 2: no actions found
DB00173: Holliday junction ATP-dependent DNA helicase RuvB: no actions found
DB00173: DNA: no actions found
DB00173: DNA: gene name not found
DB00173: S-methyl-5'-thioadenosine phosphorylase: no actions found
DB00173: Nicotinate-nucleotide--dimethylbenzimidazole phosphoribosyltransferase: no actions found
DB00734: No targets found
DB05196: No targets found


In [10]:
# Pretty-print the drug records to inspect what was collected for each drug.
pprint(drugs)

[{'alternative_identifiers': {'BindingDB': '13530',
                              'ChEBI': '45783',
                              'ChEMBL': 'CHEMBL941',
                              'ChemSpider': '5101',
                              'Drugs.com': 'Drugs.com Drug Page',
                              'Human Metabolome Database': 'HMDB0014757',
                              'KEGG Drug': 'D01441',
                              'PDBe Ligand': 'STI',
                              'PharmGKB': 'PA10804',
                              'PubChem Compound': '5291',
                              'PubChem Substance': '46505055',
                              'RxList': 'RxList Drug Page',
                              'RxNav': '282388',
                              'Therapeutic Targets Database': 'DNC001383',
                              'Wikipedia': 'Imatinib',
                              'ZINC': 'ZINC000019632618'},
  'drug_bank_id': 'DB00619',
  'smiles': 'CN1CCN(CC2=CC=C(C=C2)C(=O)NC2=CC(NC3

In [16]:
# Persist the drug records to drugs.json in the current working directory.
# `json` is imported in the import cell at the top of the notebook. The
# encoding is pinned so the file does not depend on the platform default.
with open("drugs.json", 'w', encoding='utf-8') as json_file:
    json.dump(drugs, json_file)