## Insight for the medical dataset

#### To create the dataframes and files needed for our analysis, make sure that:

You have [full_database.xml](https://drive.google.com/file/d/1149kYVkazq67e0vuv-_4APyqVX6yyh2p) in `data/clean` folder, which will represent the XML version of the DrugBank
    You have [BindingDB_All.tsv](https://www.bindingdb.org/bind/downloads/BindingDB_All_202411_tsv.zip) in `data/raw` folder, which will represent the tsv version of the BindingDB

### Imports

In [19]:
import os
import gc
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd

from drugbank_XML_drugparser import DrugParser
from drugbank_bindingdb_merger import DrugBank_BindingDB_Merger
from preprocessing import Preprocessing, CleanNumericAtrributesStrategy, ColumnClean
from imports import *

%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



def keep_just_numeric(value):
    if type(value) != str:
        return pd.NA

    ## One or more non-digit charachters should be replaced
    cleaned_val = re.sub(r'[^\d.]+','', str(value)) ## There are random float / str in the dataset?? -> convert to str
    if(cleaned_val == ''): # It didn't contain any number?
        return new_class
    return float(cleaned_val)
    
bdb_preprocessor = Preprocessing(
    [
        ColumnClean('Ki (nM)', 'ki', clean=keep_just_numeric),
        # ColumnClean('pH', 'ph'),
        ColumnClean('Temp (C)', 'temp'),
        ColumnClean('IC50 (nM)', 'ic50'),
        ColumnClean('Kd (nM)', 'kd'),
        ColumnClean('kon (M-1-s-1)', 'kon'),
        ColumnClean('Article DOI', 'doi'),

        # For the merge

        ColumnClean('PubChem CID', 'pubchem_cid'),
        ColumnClean('ChEBI ID of Ligand', 'chebi_id'),
        ColumnClean('ChEMBL ID of Ligand', 'chembl_id'),
        ColumnClean('DrugBank ID of Ligand', 'drugbank_id'),
        ColumnClean('KEGG ID of Ligand', 'kegg_id'),
        ColumnClean('ZINC ID of Ligand', 'zinc_id'),
        columnclean('ligand smiles', 'smiles'),
        columnclean('ligand inchi key', 'inchi_key'),
        ColumnClean('BindingDB MonomerID', 'bindingdb_id'),
        ColumnClean('PubChem SID','pubchem_sid'),
        ColumnClean('UniProt (SwissProt) Primary ID of Target Chain.1', 'swissprot_protein_id'),
    ]
)




print("Lodaing Binding DB...")
raw_binding_df = load_BindingDB(BINDINGDB_RAW, preprocessor.get_used_old_columns())

print("Cleaning Binding DB...")
clean_binding_df = preprocessor.transform(raw_binding_df)
clean_binding_df.to_pickle(BINDINGDB_CLEAN)

### Loading the data

In [20]:

if os.path.exists(MERGED):
    print("Merged dataset exists.\n Loading...")

    merged_df = pd.read_pickle(MERGED)

    print("Merged dataset loaded")

else:
    print("Merged dataset doesn't exists.\n Creating it...")

    if os.path.exists(DRUGBANK_LIGAND_PARSED) and os.path.exists(DRUGBANK_PROTEIN_PARSED):
        print("parsed_Drugbank exists...")
        print("Loading...")

        drugbank_ligand = pd.read_pickle(DRUGBANK_LIGAND_PARSED)
        drugbank_proteins = pd.read_pickle(DRUGBANK_PROTEIN_PARSED)
    else:
        print("parsed_Drugbank doesn't exists...")
        print("Parsing DrugBank XML...")

        drugparser = DrugParser(DRUGBANK_XML)
        drugparser.parse_drugs()
        drugparser.parse_proteins()
        drugbank_ligand, drugbank_proteins = drugparser.save_parsed_drugs(DRUGBANK_LIGAND_PARSED, DRUGBANK_PROTEIN_PARSED, return_df = True)

        print("DrugBank XML is parsed. \n Loading Bind ...")
    if os.path.exists(BINDINGDB_CLEAN):
        print("BindingDB clean exists...")
        clean_binding_df = pd.read_pickle(BINDINGDB_CLEAN)
    else:
        def load_BindingDB(file_path, cols):
            return pd.read_csv(file_path, sep='\t', header=0, usecols=cols)

        def keep_just_numeric(value):
            if type(value) != str:
                return pd.NA

            ## One or more non-digit charachters should be replaced
            cleaned_val = re.sub(r'[^\d.]+','', str(value)) ## There are random float / str in the dataset?? -> convert to str
            if(cleaned_val == ''): # It didn't contain any number?
                return pd.NA
            
            return float(cleaned_val)

        def parse_int(value):
            try:
                return int(value)
            except:
                return pd.NA

        bdb_preprocessor = Preprocessing(
            [
                ColumnClean('Ki (nM)', 'ki', clean=keep_just_numeric),
                # ColumnClean('pH', 'ph'),
                ColumnClean('Temp (C)', 'temp'),
                ColumnClean('IC50 (nM)', 'ic50'),
                ColumnClean('Kd (nM)', 'kd'),
                ColumnClean('kon (M-1-s-1)', 'kon'),
                ColumnClean('Article DOI', 'doi'),

                # For the merge

                ColumnClean('PubChem CID', 'pubchem_cid'),
                ColumnClean('ChEBI ID of Ligand', 'chebi_id'),
                ColumnClean('ChEMBL ID of Ligand', 'chembl_id'),
                ColumnClean('DrugBank ID of Ligand', 'drugbank_id'),
                ColumnClean('KEGG ID of Ligand', 'kegg_id'),
                ColumnClean('ZINC ID of Ligand', 'zinc_id'),
                ColumnClean('Ligand SMILES', 'smiles'),
                ColumnClean('Ligand InChI Key', 'inchi_key'),
                ColumnClean('BindingDB MonomerID', 'bindingdb_id', clean=parse_int),
                ColumnClean('PubChem CID','pubchem_cid'),
                ColumnClean('UniProt (SwissProt) Primary ID of Target Chain.1', 'swissprot_protein_id'),
            ]
        )

        print("Lodaing Binding DB...")
        raw_binding_df = load_BindingDB(BINDINGDB_RAW, bdb_preprocessor.get_used_old_columns())

        print("Cleaning Binding DB...")
        clean_binding_df = bdb_preprocessor.transform(raw_binding_df)
        clean_binding_df.to_pickle(BINDINGDB_CLEAN)

        assert len(clean_binding_df) == len(raw_binding_df)
        del raw_binding_df
        gc.collect()


Merged dataset exists.
 Loading...
Merged dataset loaded


In [21]:
clean_binding_df['bindingdb_id'].isna().sum()

0

In [22]:

clean_binding_df.sample(20)

Unnamed: 0,ki,temp,ic50,kd,kon,doi,pubchem_cid,chebi_id,chembl_id,drugbank_id,kegg_id,zinc_id,smiles,inchi_key,bindingdb_id,swissprot_protein_id,Unique_ID
2852996,,,<1.000000,,,,44259.0,15738.0,CHEMBL388978,,C02079,ZINC03814434,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,HKSZLNNOFSGOKW-FYTWVXJKSA-N,2579,,2852996
491444,0.98,,,,,,86342295.0,,,,,,O=C(Cc1cccs1)N[C@H]1CC[C@H](CCN2CCN(CC2)c2nsc3...,DGUPLZWUXDQFTF-MXVIHJGJSA-N,263383,,491444
1127610,,,,,,,130310210.0,,,,,,CS(=O)(=O)N[C@H]1CCCN([C@H]1CO[C@H]1CC[C@H](CC...,HVGLNIHLBTZQEY-HPAIXVDQSA-N,386189,,1127610
173117,2.1,,,,,10.1007/bf02245606,2818.0,,,,,ZINC19796155,CN1CCN(CC1)C1=c2ccccc2=Nc2ccc(Cl)cc2N1,ZUXABONWMNSFBN-UHFFFAOYSA-N,22869,,173117
1620812,23.7,,,,,10.1021/jm020994z,11825129.0,,CHEMBL155861,,,ZINC13493245,COc1ccccc1N1CCN(CCCCCC(=O)c2ccc(O)cc2)CC1,QVYMFVMRXWDCCS-UHFFFAOYSA-N,50123706,,1620812
111772,,,,,,,1627409.0,,,,,ZINC59827486,Nc1ccc(cc1)S(=O)(=O)NNC(=O)C(=O)NNc1ccccn1,IQVYAJWAUYGRPF-UHFFFAOYSA-N,53388,,111772
696113,,,291,,,,118916614.0,,,,,,COc1cccc(c1)-c1ccc(CNC(=O)Cc2cccc(c2)C(F)(F)F)...,GRHRLHMSOBKXLE-UHFFFAOYSA-N,360489,,696113
2564188,,,>5000,,,10.1074/jbc.m608274200,3415.0,127780.0,CHEMBL666,DB00529,C06456,ZINC24629762,OC(=O)P(O)(O)=O,ZJAOAACCNHFJAH-UHFFFAOYSA-N,50011181,,2564188
495963,,,5.00,,,,68020537.0,,,,,,CC(C)Oc1ccc(cc1C)\N=c1/[nH]c(=O)n(CC(CO)CO)c(=...,WWFZLKJKCWJRSR-UHFFFAOYSA-N,265531,,495963
1454951,650.0,,,,,10.1021/jm00046a010,10107314.0,,CHEMBL109268,,,ZINC13738705,CCCN(CCC)C1CCc2ccc3CCNc3c2C1,OTEIKDZGZUYQLL-UHFFFAOYSA-N,50037275,,1454951


In [23]:
for col in clean_binding_df.columns:
    i = clean_binding_df[col].isna().mean()
    print(f"NA ratio in {col}: {i:.2f}")

NA ratio in ki: 0.80
NA ratio in temp: 0.93
NA ratio in ic50: 0.33
NA ratio in kd: 0.96
NA ratio in kon: 1.00
NA ratio in doi: 0.47
NA ratio in pubchem_cid: 0.02
NA ratio in chebi_id: 0.97
NA ratio in chembl_id: 0.62
NA ratio in drugbank_id: 0.97
NA ratio in kegg_id: 0.97
NA ratio in zinc_id: 0.60
NA ratio in smiles: 0.00
NA ratio in inchi_key: 0.04
NA ratio in bindingdb_id: 0.00
NA ratio in swissprot_protein_id: 0.95
NA ratio in Unique_ID: 0.00


In [24]:

print("Creating merged dataset")
drugbank_binding_merger = DrugBank_BindingDB_Merger()
merged_df = drugbank_binding_merger.merge(drugbank_ligand, clean_binding_df)



Creating merged dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  binding_df_id[identifier] = binding_df_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_df_id[identifier] = drugbank_df_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  binding_df_id[identifier] = binding_df_id[identifier].astype(str)
A val

run


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  binding_df_id[identifier] = binding_df_id[identifier].astype(str)


run


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_df_id[identifier] = drugbank_df_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_df_id[identifier] = drugbank_df_id[identifier].astype(str)


run


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  binding_df_id[identifier] = binding_df_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_df_id[identifier] = drugbank_df_id[identifier].astype(str)


run


100%|██████████| 7/7 [01:07<00:00,  9.59s/it]
  return_df =  pd.read_csv(output_file)


In [25]:
merged_df['swissprot_protein_id']

0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
          ... 
2927604    NaN
2927605    NaN
2927606    NaN
2927607    NaN
2927608    NaN
Name: swissprot_protein_id, Length: 2927609, dtype: object

In [26]:
drugbank_ligand

Unnamed: 0,drugbank_id,name,unii,toxicity,class_kingdom,class_superclass,synonyms,categories,patent_approved,interaction,pubchem_cid,chembl_id,smiles,inchi_key,chebi_id,bindingdb_id
0,DB00001,Lepirudin,Y43GF64R34,The acute toxicity of intravenous lepirudin wa...,Organic Compounds,Organic Acids,"[Leu1, Thr2]-63-desulfohirudin|Desulfatohirudi...","Amino Acids, Peptides, and Proteins|Anticoagul...",1993-01-19,DB06605|DB06695|DB01254|DB01609|DB01586|DB0212...,46507011,CHEMBL1201666,,,,
1,DB00002,Cetuximab,PQX0D8J21J,The intravenous LD<sub>50</sub> is > 300 mg/kg...,Organic Compounds,Organic Acids,Cetuximab|Cétuximab|Cetuximabum,"Amino Acids, Peptides, and Proteins|Antibodies...",1999-03-02,DB00255|DB00269|DB00286|DB00655|DB00783|DB0089...,46507042,CHEMBL1201577,,,,
2,DB00003,Dornase alfa,953A26OA1Y,Adverse reactions occur at a frequency of < 1/...,Organic Compounds,Organic Acids,Deoxyribonuclease (human clone 18-1 protein mo...,"Amino Acids, Peptides, and Proteins|Cough and ...",2005-02-22|2004-10-26,,46507792,CHEMBL1201431,,,,
3,DB00004,Denileukin diftitox,25E79B5CTM,,Organic Compounds,Organic Acids,Denileukin|Denileukin diftitox|Interleukin-2/d...,"ADP Ribose Transferases|Amino Acids, Peptides,...",,DB00012|DB00016|DB08894|DB09107|DB00281|DB0029...,46506950,CHEMBL1201550,,,,
4,DB00005,Etanercept,OP401G7OJC,,Organic Compounds,Organic Acids,Etanercept|etanercept-szzs|etanercept-ykro|Rec...,"Agents reducing cytokine levels|Amino Acids, P...",2009-06-16|2000-03-14|2007-10-02,DB08879|DB00531|DB06643|DB00065|DB00008|DB0001...,46506732,CHEMBL1201572,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16576,DB18713,Recombinant stabilized RSV A prefusion F antigen,,,,,RSV subgroup A stabilized prefusion F protein,Respiratory syncytial virus (RSV) vaccines|Vac...,,,,,,,,
16577,DB18714,Recombinant stabilized RSV B prefusion F antigen,,,,,RSV subgroup B stabilized prefusion F protein,Respiratory syncytial virus (RSV) vaccines|Vac...,,,,,,,,
16578,DB18715,Tolebrutinib,8CZ82ZYY9X,,,,"2h-imidazo(4,5-c)pyridin-2-one, 4-amino-1,3-di...",,,,,CHEMBL4650323,NC1=C2N(C(=O)N([C@@H]3CCCN(C3)C(=O)C=C)C2=CC=N...,KOEUOFPEZFUWRF-LJQANCHMSA-N,,
16579,DB18716,Enmetazobactam,80VUN7L00C,There is limited information on the acute toxi...,,,"(2s,3s,5r)-3-methyl-3-((3-methyltriazol-3-ium-...",Anti-Bacterial Agents|Anti-Infective Agents|Az...,,DB12768|DB14022|DB00266|DB00498|DB00682|DB0094...,,CHEMBL4458276,[H][C@@]12CC(=O)N1[C@@H](C([O-])=O)[C@](C)(CN1...,HFZITXBUTWITPT-YWVKMMECSA-N,,


In [27]:
merged_df['Matched_On'].value_counts()

Matched_On
chembl_id      96639
drugbank_id    26669
inchi_key      25597
smiles          1248
Name: count, dtype: int64

In [28]:
complete_merged_df = pd.merge(merged_df, drugbank_proteins, how='left', on='swissprot_protein_id')

complete_merged_df.to_pickle(MERGED)

print("Merged dataset is loaded and saved.")

Merged dataset is loaded and saved.


In [29]:
len(complete_merged_df), len(merged_df), len(drugbank_ligand), len(drugbank_proteins), len(clean_binding_df)

(4613464, 2927609, 16581, 22414, 2927609)

In [18]:
merged_df['Matched_On'].value_counts()

Matched_On
chembl_id      96639
drugbank_id    26669
inchi_key      25597
smiles          1248
Name: count, dtype: int64