# Parse the DrugBank XML and export as CSV

In [1]:
from tqdm import tqdm
import pandas as pd
from lxml import etree

In [2]:
# Create a parser
parser = etree.XMLParser(recover=True)

parsed_file = etree.parse('full database.xml', parser=parser)

In [3]:
root = parsed_file.getroot()
drugs = list(root)

### Testing the XML parsing

In [10]:
proteins = []

for drug in tqdm(drugs):
    for i in drug:
        if 'targets' in str(i): # drug's categories
            target_features = list(i)
            for features in target_features:
                for feature in features:
                    if 'polypeptide' in str(feature):
                        proteins_attributes = {}
                        proteins_attributes['id'] = feature.get('id')
                        for attribute in list(feature):
                            if 'name' in str(attribute):
                                proteins_attributes['name'] = attribute.text
                            if 'general-function' in str(attribute):
                                proteins_attributes['general-function'] = attribute.text
                            if 'specific-function' in str(attribute):
                                proteins_attributes['specific-function'] = attribute.text
                            if 'organism' in str(attribute):
                                proteins_attributes['organism'] = attribute.text

                        proteins.append(proteins_attributes)

100%|██████████| 16581/16581 [00:01<00:00, 8548.71it/s] 


### Parsing

In [11]:
df = pd.DataFrame(proteins)

In [12]:
df.head(3)

Unnamed: 0,id,name,general-function,specific-function,organism
0,P00734,F2,Thrombospondin receptor activity,"Thrombin, which cleaves bonds after Arg and Ly...",Humans
1,P00533,EGFR,Ubiquitin protein ligase binding,Receptor tyrosine kinase binding ligands of th...,Humans
2,O75015,FCGR3B,,Receptor for the Fc region of immunoglobulins ...,Humans


In [19]:
len(df), len(df['id'].unique())

(22414, 4906)

In [20]:
df['id'].value_counts()

id
P14867    163
P34903    144
P47869    144
P31644    142
P24941    136
         ... 
P34024      1
P52045      1
Q92876      1
P00775      1
E0W492      1
Name: count, Length: 4906, dtype: int64

In [21]:
df.drop_duplicates(subset='id', inplace=True)

In [22]:
df.to_csv('data/parsed_proteins_DrugBank.csv', index=False, encoding='utf-8')