https://github.com/dhimmel/drugbank/blob/gh-pages/parse.ipynb

# Parse the DrugBank XML and extract TSVs

Run using Python 3 to avoid a non-ascii character error when writing to file with the csv module.

In [15]:
import xml.etree.ElementTree as ET
import time
from tqdm import tqdm
import pandas as pd

In [2]:
from lxml import etree

# Create a parser
parser = etree.XMLParser(recover=True)

parsed_file = etree.parse('full database.xml', parser=parser)

In [3]:
root = parsed_file.getroot()

In [9]:
drugs = list(root)

In [67]:
for i in drugs[6]:
    if 'external-identifiers' in str(i): # drug's categories
        drug_categories = '|'.join([cat[0].text \
                                    for cat in list(i)])
        print(drug_categories)
        print(list(i))
    #print(i)

Drugs Product Database (DPD)|Drugs Product Database (DPD)|ChEBI|PubChem Substance|KEGG Compound|KEGG Drug|ChemSpider|BindingDB|PharmGKB|Therapeutic Targets Database|Wikipedia|ChEMBL|RxCUI
[<Element {http://www.drugbank.ca}external-identifier at 0x2767d158980>, <Element {http://www.drugbank.ca}external-identifier at 0x2767d159600>, <Element {http://www.drugbank.ca}external-identifier at 0x2767d15a000>, <Element {http://www.drugbank.ca}external-identifier at 0x2767d1597c0>, <Element {http://www.drugbank.ca}external-identifier at 0x2767d15ae80>, <Element {http://www.drugbank.ca}external-identifier at 0x2767d15a9c0>, <Element {http://www.drugbank.ca}external-identifier at 0x2767d159dc0>, <Element {http://www.drugbank.ca}external-identifier at 0x2767d159fc0>, <Element {http://www.drugbank.ca}external-identifier at 0x2767d144580>, <Element {http://www.drugbank.ca}external-identifier at 0x2767d144940>, <Element {http://www.drugbank.ca}external-identifier at 0x2767d190a00>, <Element {http://ww

In [75]:
parsed_drugs = []

for i in tqdm(range(len(drugs))):
    drug = drugs[i]

    for idx,feature in enumerate(drug):
        featureName = feature.tag

        if 'name' in featureName: # drug name
            drug_name = feature.text

        if 'synonyms' in featureName: # drug's synonyms
            drug_synm = '|'.join([synm.text for synm in list(feature)])
            
        if 'toxicity' in featureName: # drug's toxicity
            drug_toxicity = feature.text

        if 'unii' in featureName: # drug's UNII
            drug_unii = feature.text

        if 'categories' in featureName: # drug's categories
            drug_categories = '|'.join([cat[0].text for cat in list(feature)])

        if 'classification' in featureName: #type of drug
            classifications = list(feature)
            drug_class_kingdom = classifications[2].text
            drug_class_superclass = classifications[3].text

        if 'drug-interactions' in featureName: #interaction other drugs
            drug_interaction = '|'.join([di[0].text
                                        for di in list(feature)])
            
        if 'patents' in featureName:
            patents_list = list(feature)
            if len(patents_list) > 0:
                drug_patent_approved = '|'.join([cat[2].text for cat in patents_list])

        if 'calculated-properties' in featureName: # drug's categories
            for calc_prop in list(feature):
                prop_name = calc_prop[0].text
                if 'SMILES' in prop_name:
                    drug_SMILE = calc_prop[1].text

                if 'InChI' in prop_name:
                    drugInChI = calc_prop[1].text

        if 'external-identifiers' in featureName: #other drug's IDs
            drug_external_ids_resources = '|'.join([cat[0].text for cat in list(feature)])
            drug_external_ids_ids = '|'.join([cat[1].text for cat in list(feature)])
                                  
        # if 'pathways' in featureName: #related pathways
        #     drug_pathway = ';'.join([pathway[1].text \
        #                             for pathway in list(feature)])

        # if 'targets' in featureName: #if polypeptide, drug's targets
        #     targets = list(feature)

    idDB = drug[0].text # Drug Bank ID
    parsed_drugs.append([idDB, drug_name, drug_synm, drug_toxicity, drug_unii, drug_categories, drug_class_kingdom, drug_class_superclass, drug_interaction, drug_patent_approved, drug_SMILE, drugInChI, drug_external_ids_resources, drug_external_ids_ids])


  0%|          | 0/16581 [00:00<?, ?it/s]

100%|██████████| 16581/16581 [00:03<00:00, 5133.40it/s]


In [77]:
df = pd.DataFrame(parsed_drugs)
df.columns = ['idDB', 'drug_name', 'drug_synm', 'drug_toxicity', 'drug_unii', 'drug_categories', 'drug_class_kingdom', 'drug_class_superclass', 'drug_interaction', 'drug_patent_approved', 'drug_SMILE', 'drugInChI', 'drug_external_ids_resources', 'drug_external_ids_ids']
df.head(3)

Unnamed: 0,idDB,drug_name,drug_synm,drug_toxicity,drug_unii,drug_categories,drug_class_kingdom,drug_class_superclass,drug_interaction,drug_patent_approved,drug_SMILE,drugInChI,drug_external_ids_resources,drug_external_ids_ids
0,DB00001,Lepirudin,"[Leu1, Thr2]-63-desulfohirudin|Desulfatohirudi...",The acute toxicity of intravenous lepirudin wa...,Y43GF64R34,"Amino Acids, Peptides, and Proteins|Anticoagul...",Organic Compounds,Organic Acids,DB06605|DB06695|DB01254|DB01609|DB01586|DB0212...,1993-01-19,[H][C@@]12CC(=O)N1[C@@H](C([O-])=O)[C@](C)(CN1...,HFZITXBUTWITPT-YWVKMMECSA-N,Drugs Product Database (DPD)|PubChem Substance...,11916|46507011|D06880|PA450195|P01050|DAP00054...
1,DB00002,Cetuximab,Cetuximab|Cétuximab|Cetuximabum,The intravenous LD<sub>50</sub> is > 300 mg/kg...,PQX0D8J21J,"Amino Acids, Peptides, and Proteins|Antibodies...",Organic Compounds,Organic Acids,DB00255|DB00269|DB00286|DB00655|DB00783|DB0089...,1999-03-02,[H][C@@]12CC(=O)N1[C@@H](C([O-])=O)[C@](C)(CN1...,HFZITXBUTWITPT-YWVKMMECSA-N,Drugs Product Database (DPD)|PubChem Substance...,13175|46507042|D03455|J00228|PA10040|DNC000788...
2,DB00003,Dornase alfa,Deoxyribonuclease (human clone 18-1 protein mo...,Adverse reactions occur at a frequency of < 1/...,953A26OA1Y,"Amino Acids, Peptides, and Proteins|Cough and ...",Organic Compounds,Organic Acids,,2005-02-22|2004-10-26,[H][C@@]12CC(=O)N1[C@@H](C([O-])=O)[C@](C)(CN1...,HFZITXBUTWITPT-YWVKMMECSA-N,Drugs Product Database (DPD)|PubChem Substance...,650|46507792|M55983|PA10318|P24855|DAP000981|D...
