# Parse the DrugBank XML and export as CSV

In [15]:
from pathlib import Path

import pandas as pd
from lxml import etree
from tqdm import tqdm

In [2]:
# Location of the DrugBank XML dump, relative to the notebook's working directory.
xml_path = 'full database.xml'

# recover=True asks lxml to keep parsing past malformed markup instead of aborting.
parser = etree.XMLParser(recover=True)

# Read the whole document into an element tree.
parsed_file = etree.parse(xml_path, parser=parser)

In [3]:
# Top-level element of the parsed document.
root = parsed_file.getroot()

# Materialise the root's direct children (one entry per record) so they can be indexed.
drugs = [entry for entry in root]

### Testing the XML parsing

In [80]:
# Spot-check one record: list the external identifiers attached to drugs[6].
# Maps the resource name found in the XML to the label printed next to its identifier.
printed_labels = {
    'ChEMBL': 'CHEMBL',
    'ChEBI': 'CHEBI',
    'PubChem Substance': 'PubChem Substance',
    'BindingDB': 'BindingDB',
}

for child in drugs[6]:
    # Only the external-identifiers container is of interest here.
    if 'external-identifiers' not in str(child):
        continue

    entries = list(child)
    # The first sub-element of each entry holds the name of the external resource.
    resource_names = '|'.join([entry[0].text for entry in entries])

    for entry in entries:
        label = printed_labels.get(str(entry[0].text))
        if label is not None:
            # The second sub-element holds the identifier within that resource.
            print(label, entry[1].text)

    print(resource_names)
    print(entries)

CHEBI 6427
PubChem Substance 46507635
BindingDB 50369395
CHEMBL CHEMBL1201199
Drugs Product Database (DPD)|Drugs Product Database (DPD)|ChEBI|PubChem Substance|KEGG Compound|KEGG Drug|ChemSpider|BindingDB|PharmGKB|Therapeutic Targets Database|Wikipedia|ChEMBL|RxCUI
[<Element {http://www.drugbank.ca}external-identifier at 0x278da66d840>, <Element {http://www.drugbank.ca}external-identifier at 0x278da66e7c0>, <Element {http://www.drugbank.ca}external-identifier at 0x278da66ebc0>, <Element {http://www.drugbank.ca}external-identifier at 0x278da66df40>, <Element {http://www.drugbank.ca}external-identifier at 0x278da66e180>, <Element {http://www.drugbank.ca}external-identifier at 0x278da66d800>, <Element {http://www.drugbank.ca}external-identifier at 0x278da66f500>, <Element {http://www.drugbank.ca}external-identifier at 0x278da66f780>, <Element {http://www.drugbank.ca}external-identifier at 0x278da66d040>, <Element {http://www.drugbank.ca}external-identifier at 0x278da66fd40>, <Element {htt

### Parsing

In [86]:
# External resource name (as written in the XML) -> output column that stores its identifier.
EXTERNAL_ID_COLUMNS = {
    'ChEMBL': 'chembl',
    'ChEBI': 'chebi',
    'PubChem Substance': 'pubchem',
    'BindingDB': 'bindingdb',
}


def _local_name(element):
    """Return the element's tag without its ``{namespace}`` prefix.

    Returns None when the tag is not a string (e.g. comment nodes), so callers
    can skip such nodes instead of failing on them.
    """
    tag = element.tag
    if not isinstance(tag, str):
        return None
    return tag.rsplit('}', 1)[-1]


def _text_at(element, position):
    """Return the text of the child at ``position``, or None if there is no such child."""
    return element[position].text if len(element) > position else None


def _join_texts(values):
    """Join the given texts with '|', skipping missing (None) entries."""
    return '|'.join(value for value in values if value is not None)


def _parse_drug(drug):
    """Flatten one drug element into a dict of the properties exported to CSV.

    Children are visited once, in document order, so the keys are inserted in
    the order the corresponding elements appear in the record.

    Parameters
    ----------
    drug : element
        A drug record; its first child is taken as the DrugBank ID.

    Returns
    -------
    dict
        Always contains 'id'. Other keys ('name', 'synonyms', 'toxicity',
        'unii', 'categories', 'class_kingdom', 'class_superclass',
        'interaction', 'patent_approved', 'SMILES', 'InChI', 'chembl', 'chebi',
        'pubchem', 'bindingdb') are present only when the matching element is.
    """
    drug_properties = {'id': drug[0].text}  # Drug Bank ID

    for feature in drug:
        # Compare the exact local tag name rather than a substring of the
        # namespaced tag, so unrelated tags can never match by accident.
        feature_name = _local_name(feature)

        if feature_name == 'name':  # drug name
            drug_properties['name'] = feature.text

        elif feature_name == 'synonyms':  # drug's synonyms
            drug_properties['synonyms'] = _join_texts(synm.text for synm in feature)

        elif feature_name == 'toxicity':  # drug's toxicity
            drug_properties['toxicity'] = feature.text

        elif feature_name == 'unii':  # drug's UNII
            drug_properties['unii'] = feature.text

        elif feature_name == 'categories':  # drug's categories
            # First child of each entry is used as the category label.
            drug_properties['categories'] = _join_texts(
                _text_at(cat, 0) for cat in feature
            )

        elif feature_name == 'classification':  # type of drug
            # Positions 2 and 3 are read as kingdom and superclass.
            drug_properties['class_kingdom'] = _text_at(feature, 2)
            drug_properties['class_superclass'] = _text_at(feature, 3)

        elif feature_name == 'drug-interactions':  # interaction with other drugs
            # First child of each entry is the ID of the interacting drug.
            drug_properties['interaction'] = _join_texts(
                _text_at(di, 0) for di in feature
            )

        elif feature_name == 'patents':
            patents_list = list(feature)
            if patents_list:
                # Position 2 of each patent is exported as the approval date.
                drug_properties['patent_approved'] = _join_texts(
                    _text_at(patent, 2) for patent in patents_list
                )

        elif feature_name == 'calculated-properties':  # computed chemical descriptors
            for calc_prop in feature:
                prop_name = _text_at(calc_prop, 0)
                # Exact match on purpose: a substring test for 'InChI' is also
                # true for 'InChIKey', which would overwrite the InChI string
                # with the key whenever both properties are present.
                if prop_name in ('SMILES', 'InChI'):
                    drug_properties[prop_name] = _text_at(calc_prop, 1)

        elif feature_name == 'external-identifiers':  # other drug's IDs
            for ext in feature:
                column = EXTERNAL_ID_COLUMNS.get(_text_at(ext, 0))
                if column is not None:
                    drug_properties[column] = _text_at(ext, 1)

        # TODO: pathways and targets are not extracted yet.

    return drug_properties


parsed_drugs = []

for drug in tqdm(drugs):
    # Skip anything that is not a populated element (e.g. a stray comment node).
    if _local_name(drug) is None or len(drug) == 0:
        continue
    parsed_drugs.append(_parse_drug(drug))

100%|██████████| 16581/16581 [00:03<00:00, 4792.61it/s]


In [87]:
# One row per parsed drug; keys missing from a record become NaN in that row.
df = pd.DataFrame(data=parsed_drugs)

In [89]:
# Preview the first three rows of the parsed table.
df.head(n=3)

Unnamed: 0,id,name,unii,toxicity,class_kingdom,class_superclass,synonyms,categories,patent_approved,interaction,pubchem,chembl,SMILES,InChI,chebi,bindingdb
0,DB00001,Lepirudin,Y43GF64R34,The acute toxicity of intravenous lepirudin wa...,Organic Compounds,Organic Acids,"[Leu1, Thr2]-63-desulfohirudin|Desulfatohirudi...","Amino Acids, Peptides, and Proteins|Anticoagul...",1993-01-19,DB06605|DB06695|DB01254|DB01609|DB01586|DB0212...,46507011,CHEMBL1201666,,,,
1,DB00002,Cetuximab,PQX0D8J21J,The intravenous LD<sub>50</sub> is > 300 mg/kg...,Organic Compounds,Organic Acids,Cetuximab|Cétuximab|Cetuximabum,"Amino Acids, Peptides, and Proteins|Antibodies...",1999-03-02,DB00255|DB00269|DB00286|DB00655|DB00783|DB0089...,46507042,CHEMBL1201577,,,,
2,DB00003,Dornase alfa,953A26OA1Y,Adverse reactions occur at a frequency of < 1/...,Organic Compounds,Organic Acids,Deoxyribonuclease (human clone 18-1 protein mo...,"Amino Acids, Peptides, and Proteins|Cough and ...",2005-02-22|2004-10-26,,46507792,CHEMBL1201431,,,,


In [90]:
# Write the parsed table to disk. The output folder is created first so the
# export does not fail when 'data/' does not exist yet.
output_path = Path('data/parsed_DrugBank.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False, encoding='utf-8')