In [None]:
# Get the DrugBank XML database
!pip install gdown # downloading files from google drive.
!gdown --folder https://drive.google.com/drive/folders/1hZa_Vc9dZf_oyNjQoCsO2eKVAOzz-78e
!unzip drug-drug/drugbank_all_full_database.xml.zip

In [None]:
# Decompress the graph structure descriptor TSV file
!gzip -dk drug-drug/ChCh-Miner_durgbank-chem-chem.tsv.gz # unzip, keep original .gz file as well 

The first step is to parse the .xml database, so it will be useable.
The ElementTree.parse() function creates an ElementTree from the given xml file.
To further process the tree, we save the root first. Its not neccesary in this case.
In this tree, each row from the databse will be a direct node from the root. Further node-s will be the nested properties in the xml file.

In [1]:
import pandas as pd
import numpy as np
import xml.etree.cElementTree as et
from lxml import etree

tree = etree.parse('drug-drug/drugbank_all_full_database.xml/full database.xml')
root = tree.getroot()

# All the tags start with this
firm = "{http://www.drugbank.ca}"

Porcessing the data from the tree

In the database there were a ot of properties for each drug and we had to pick out some, based on two principle:
1. The properties value can be represented with a vector of numbers
2. The property could have meaningful data about the interactions between drugs
After thorough consideration we choose: type, state, avarage_mass, monoisotopic-mass, classification, melting points, molecular formula.

The code:

The for cycle iterates through all the nodes connected to the root.
Each node represents a drug. 
For each node the tags represent the properties and the tags texts are the values of the properties.
For each node a list named "feature" is made and its filled with the values of the above mentioned properties. If a tag doesnt exist or is empty the corresponding value will be 0.
The categorycal properties are changed to numbers using one-hot encoding. 
From the molecular formulas only the number of atom which are Carbon, Hydrogen, Nitrogen or Oxygen are added.
At the end of each cycle the feature list is added to a dictionary with the drugs ID as key.

In [195]:
import re

all_features = {}

for drug in root:
     
    iden = ""
    features = []
    
    if (drug[0] is None):
        iden = "nn"
    else: 
        iden = drug[0].text
        
    if (drug.attrib['type'] == "biotech"):
        features.append(1)
    else:
        features.append(2)
     
    if (drug.find(firm + "state") is None):
        features.append(0)
    else: 
        if (drug.findtext(firm + "state") == "gas"):
            features.append(1)
        else:
            if drug.findtext(firm + "state") == "liquid":
                features.append(2)
            else:
                features.append(3)
            
    
    if (drug.find(firm + "average-mass") is None):
        features.append(0)
    else: 
        features.append(float(drug.findtext(firm + "average-mass")))

    if (drug.find(firm + "monoisotopic-mass") is None):
        features.append(0)
    else: 
        features.append(float(drug.findtext(firm + "monoisotopic-mass")))
       
    if (drug.find(firm + "classification") is None):
        features.append(0)
    else:        
        if drug.find(firm + "classification").findtext(firm + "kingdom") == "Inorganic compounds":
            features.append(2)
        else:
            features.append(1)
#     if (drug.find(firm + "affected-organisms") is None):
#         AfftectedOrganism.append("nn")
#     else: 
#         if drug.find(firm + "affected-organisms").find(firm + "affected-organism") is None:
#             AfftectedOrganism.append("nn")
#         else:
#             AfftectedOrganism.append(drug.find(firm + "affected-organisms").findtext(firm + "affected-organism"))
    
    if drug.find(firm + 'experimental-properties') is None:
        for i in range(0,4):
            features.append(0)   
    else:
        expdata = []
        regexNum = re.compile('[0-9]+')
        mp = ""
        mf = ""
        for p in drug.find(firm + 'experimental-properties').iterfind(firm + 'property'):
            if p is not None:
                if p.findtext(firm + 'kind') == "Melting Point" and mp == "":
                    mp = p.findtext(firm + 'value')
                if p.findtext(firm + 'kind') == "Molecular Formula" and mf == "":
                    mf = p.findtext(firm + 'value')
        match = regexNum.search(mp)
        if match is None:
            expdata.append(0)
        else:
            expdata.append(float(match.group()))

        for s in ['C', 'H', 'N', 'O']:
            regex = re.compile(s + '\d+\D')
            match = regex.search(mf)
            if match is None:
                expdata.append(0)
            else:
                matchnum = regexNum.search(match.group())
                expdata.append(float(matchnum.group()))

        features.extend(expdata)
    all_features[iden] = features

In [193]:
all_features

{'DB00001': [0, 2, 0, 0, 1, 65.0, 287.0, 440.0, 80.0, 110.0],
 'DB00002': [0, 2, 0, 0, 1, 61.0, 6484.0, 10042.0, 1732.0, 2023.0],
 'DB00003': [0, 2, 0, 0, 1, 67.0, 1321.0, 1999.0, 339.0, 396.0],
 'DB00004': [0, 2, 0, 0, 1, 0, 2560.0, 4042.0, 678.0, 799.0],
 'DB00005': [0, 2, 0, 0, 1, 71.0, 2224.0, 3475.0, 621.0, 698.0],
 'DB00006': [1, 3, 2180.2853, 2178.985813062, 1, 0, 0, 0, 0, 0],
 'DB00007': [1, 3, 1209.3983, 1208.645462232, 0, 150.0, 0, 0, 0, 0],
 'DB00008': [0, 2, 0, 0, 1, 61.0, 0, 0, 0, 0],
 'DB00009': [0, 2, 0, 0, 1, 60.0, 2569.0, 3928.0, 746.0, 781.0],
 'DB00010': [0, 2, 0, 0, 1, 0, 149.0, 246.0, 44.0, 42.0],
 'DB00011': [0, 2, 0, 0, 1, 61.0, 860.0, 1353.0, 227.0, 255.0],
 'DB00012': [0, 2, 0, 0, 1, 53.0, 815.0, 1317.0, 233.0, 241.0],
 'DB00013': [0, 2, 0, 0, 1, 76.0, 1376.0, 2145.0, 383.0, 406.0],
 'DB00014': [1, 3, 1269.4105, 1268.641439486, 1, 0, 0, 0, 0, 0],
 'DB00015': [0, 2, 0, 0, 1, 60.0, 1736.0, 2671.0, 499.0, 522.0],
 'DB00016': [0, 2, 0, 0, 1, 53.0, 815.0, 1317.0, 23

The dictionary is made into a DataFrame, and the DataFrame is written into a csv file.

In [196]:
features = pd.DataFrame.from_dict(all_features, orient='index')
features

features.to_csv("features.csv", header = False)