## Insight for the medical dataset

#### To create the dataframes and files needed for our analysis, make sure that:

- You have [full_database.xml](https://drive.google.com/file/d/1149kYVkazq67e0vuv-_4APyqVX6yyh2p) in the `data/clean` folder; this is the XML version of DrugBank.
- You have [BindingDB_All.tsv](https://www.bindingdb.org/bind/downloads/BindingDB_All_202411_tsv.zip) in the `data/raw` folder; this is the TSV version of BindingDB.

### Imports

In [23]:
import os
import gc
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd

from drugbank_XML_drugparser import DrugParser
from drugbank_bindingdb_merger import DrugBank_BindingDB_Merger
from preprocessing import Preprocessing, CleanNumericAtrributesStrategy, ColumnClean
from imports import *

%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



def keep_just_numeric(value):
    """Extract the numeric part of a string cell as a float.

    Every character that is not a digit or a dot is removed before parsing,
    so ``">14"`` becomes ``14.0`` and ``" 1,000 "`` becomes ``1000.0``.
    Note that signs and exponent markers are stripped as well.

    Args:
        value: Raw cell value.

    Returns:
        The parsed ``float``, or ``pd.NA`` when ``value`` is not a string,
        contains no digits, or what is left after stripping is not a valid
        number (e.g. ``"."`` or ``"1.2.3"``).
    """
    # Non-string cells (including missing values) are treated as missing.
    if not isinstance(value, str):
        return pd.NA

    # Drop every run of characters that is neither a digit nor a dot.
    cleaned_val = re.sub(r'[^\d.]+', '', value)

    # float('') and malformed residues raise ValueError -> report as missing
    # instead of crashing the whole column transform.
    try:
        return float(cleaned_val)
    except ValueError:
        return pd.NA
    
# Column selection / renaming plan for the raw BindingDB table.
# Each ColumnClean is given a raw BindingDB header, a short output name and,
# optionally, a per-value cleaner (presumably applied by Preprocessing.transform
# -- confirm against the `preprocessing` module).
bdb_preprocessor = Preprocessing(
    [
        ColumnClean('Ki (nM)', 'ki', clean=keep_just_numeric),
        # ColumnClean('pH', 'ph'),
        ColumnClean('Temp (C)', 'temp'),
        ColumnClean('IC50 (nM)', 'ic50'),
        ColumnClean('Kd (nM)', 'kd'),
        ColumnClean('kon (M-1-s-1)', 'kon'),
        ColumnClean('Article DOI', 'doi'),

        # For the merge

        ColumnClean('PubChem CID', 'pubchem_cid'),
        ColumnClean('ChEBI ID of Ligand', 'chebi_id'),
        ColumnClean('ChEMBL ID of Ligand', 'chembl_id'),
        ColumnClean('DrugBank ID of Ligand', 'drugbank_id'),
        ColumnClean('KEGG ID of Ligand', 'kegg_id'),
        ColumnClean('ZINC ID of Ligand', 'zinc_id'),
        # Fixed: the class is `ColumnClean` (the lower-case name is undefined)
        # and the headers are spelled with the same casing as the other
        # preprocessor definition in this notebook.
        ColumnClean('Ligand SMILES', 'smiles'),
        ColumnClean('Ligand InChI Key', 'inchi_key'),
        ColumnClean('BindingDB MonomerID', 'bindingdb_id'),
        ColumnClean('PubChem SID','pubchem_sid'),
        ColumnClean('UniProt (SwissProt) Primary ID of Target Chain.1', 'swissprot_protein_id'),
    ]
)




# Read only the columns the preprocessor needs from the raw BindingDB file,
# clean them, and cache the cleaned frame as a pickle.
# Fixed: the preprocessor built in this cell is named `bdb_preprocessor`;
# the bare name `preprocessor` is not defined anywhere in the visible notebook.
# NOTE(review): `load_BindingDB` is not defined above this point in the
# visible notebook -- confirm it is provided by `from imports import *`.
print("Lodaing Binding DB...")
raw_binding_df = load_BindingDB(BINDINGDB_RAW, bdb_preprocessor.get_used_old_columns())

print("Cleaning Binding DB...")
clean_binding_df = bdb_preprocessor.transform(raw_binding_df)
clean_binding_df.to_pickle(BINDINGDB_CLEAN)

### Loading the data

In [55]:

# Load the cached merged dataset when it exists; otherwise load (or build)
# the two inputs needed to create it: the parsed DrugBank table and the
# cleaned BindingDB table.
# NOTE(review): `drugbank` and `clean_binding_df` are only assigned in the
# `else` branch, so code that uses them requires the merged pickle to be
# absent (or the names to already exist in the session).
if os.path.exists(MERGED):
    print("Merged dataset exists.\n Loading...")

    merged_df = pd.read_pickle(MERGED)

    print("Merged dataset loaded")

else:
    print("Merged dataset doesn't exists.\n Creating it...")

    # --- DrugBank: reuse the parsed CSV if present, otherwise parse the XML.
    if os.path.exists(DRUGBANK_CSV):
        print("parsed_Drugbank exists...")
        print("Loading...")

        drugbank = pd.read_csv(DRUGBANK_CSV, encoding='utf-8')
    else:
        print("parsed_Drugbank doesn't exists...")
        print("Creating parsed_Drugbank.csv")

        drugparser = DrugParser(DRUGBANK_XML)
        drugparser.parse_drugs()
        drugbank = drugparser.save_parsed_drugs(DRUGBANK_CSV, return_df = True)

        print("parsed_Drugbank.csv is created")
        print("DrugBank XML is parsed. \n Loading it ...")

    # --- BindingDB: reuse the cleaned pickle if present, otherwise clean the
    # raw TSV and cache the result.
    if os.path.exists(BINDINGDB_CLEAN):
        print("BindingDB clean exists...")
        clean_binding_df = pd.read_pickle(BINDINGDB_CLEAN)
    else:
        def load_BindingDB(file_path, cols):
            """Read the tab-separated file, keeping only the columns in `cols`."""
            return pd.read_csv(file_path, sep='\t', header=0, usecols=cols)

        def keep_just_numeric(value):
            """Return the digits/dots of a string as a float, else ``pd.NA``.

            Non-string values, strings without digits, and strings whose
            stripped form is not a valid number all yield ``pd.NA``.
            """
            if not isinstance(value, str):
                return pd.NA

            # Drop every run of characters that is neither a digit nor a dot.
            cleaned_val = re.sub(r'[^\d.]+', '', value)

            # Fixed: the empty case used to return the undefined name
            # `new_class`; float('') and malformed residues now map to pd.NA.
            try:
                return float(cleaned_val)
            except ValueError:
                return pd.NA

        def parse_int(value):
            """Convert `value` with ``int()``; return ``pd.NA`` if it cannot be converted."""
            try:
                return int(value)
            # Only conversion failures are swallowed (bad type, bad text,
            # NaN/inf); a bare `except:` would also hide KeyboardInterrupt.
            except (TypeError, ValueError, OverflowError):
                return pd.NA

        bdb_preprocessor = Preprocessing(
            [
                ColumnClean('Ki (nM)', 'ki', clean=keep_just_numeric),
                # ColumnClean('pH', 'ph'),
                ColumnClean('Temp (C)', 'temp'),
                ColumnClean('IC50 (nM)', 'ic50'),
                ColumnClean('Kd (nM)', 'kd'),
                ColumnClean('kon (M-1-s-1)', 'kon'),
                ColumnClean('Article DOI', 'doi'),

                # For the merge

                ColumnClean('PubChem CID', 'pubchem_cid'),
                ColumnClean('ChEBI ID of Ligand', 'chebi_id'),
                ColumnClean('ChEMBL ID of Ligand', 'chembl_id'),
                ColumnClean('DrugBank ID of Ligand', 'drugbank_id'),
                ColumnClean('KEGG ID of Ligand', 'kegg_id'),
                ColumnClean('ZINC ID of Ligand', 'zinc_id'),
                ColumnClean('Ligand SMILES', 'smiles'),
                ColumnClean('Ligand InChI Key', 'inchi_key'),
                ColumnClean('BindingDB MonomerID', 'bindingdb_id', clean=parse_int),
                # NOTE(review): 'PubChem CID' is already mapped a few entries
                # above; this second entry looks redundant -- confirm.
                ColumnClean('PubChem CID','pubchem_cid'),
                ColumnClean('UniProt (SwissProt) Primary ID of Target Chain.1', 'swissprot_protein_id'),
            ]
        )

        print("Lodaing Binding DB...")
        raw_binding_df = load_BindingDB(BINDINGDB_RAW, bdb_preprocessor.get_used_old_columns())

        print("Cleaning Binding DB...")
        clean_binding_df = bdb_preprocessor.transform(raw_binding_df)

        # Check the row count BEFORE caching, so a frame that lost or gained
        # rows is never written to disk and silently reloaded on later runs.
        assert len(clean_binding_df) == len(raw_binding_df)
        clean_binding_df.to_pickle(BINDINGDB_CLEAN)

        # The raw frame is no longer needed; release it explicitly.
        del raw_binding_df
        gc.collect()


Merged dataset exists.
 Loading...
Merged dataset loaded


In [56]:
# Number of rows whose `bindingdb_id` is missing in the cleaned BindingDB frame.
clean_binding_df.loc[:, 'bindingdb_id'].isna().sum()

np.int64(0)

In [57]:

# Eyeball 20 randomly chosen rows of the cleaned BindingDB frame.
clean_binding_df.sample(n=20)

Unnamed: 0,ki,temp,ic50,kd,kon,doi,pubchem_cid,chebi_id,chembl_id,drugbank_id,kegg_id,zinc_id,smiles,inchi_key,bindingdb_id,swissprot_protein_id,Unique_ID
2019238,,,58000,,,10.1016/j.bmcl.2010.12.060,53321495.0,,CHEMBL1683108,,,ZINC66076543,CN([C@H]1CC[C@@H](CC1)[C@H](N)Cc1cc(F)ccc1F)C(...,GZZJGNXXRICIGM-LSBZLQRGSA-N,50338474,,2019238
1450942,,,3.2,,,10.1021/jm00084a007,14789979.0,,CHEMBL1169546,,,,Cn1cc(C2=NCC3(CN4CCC3CC4)O2)c2ccccc12,MMHYDCNZBJGVJX-UHFFFAOYSA-N,50455920,Q9JJ16,1450942
2172597,,,>14,,,10.1021/jm950747d,5994.0,17026.0,CHEMBL103,DB00396,C00410,ZINC04428529,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,RJKFOVLPORLFTN-LEKSSAKUSA-N,8903,,2172597
2751936,,,8200,,,10.1021/acsmedchemlett.0c00573,49846217.0,,,,,,Cc1nc(NC(=O)NCCOc2ccccc2Cl)sc1C#Cc1ccncc1,OQKRWTOIBYHOIY-UHFFFAOYSA-N,50549830,,2751936
365606,10185.0,4.00 C,,,,,16735274.0,,,,,,CCCCC[C@@H](C)NC[C@H](O)c1cc(O)cc(O)c1,DFGXCGQPCWUBML-ABAIWWIYSA-N,206775,,365606
2645374,,,0.536000,,,10.1016/j.ejmech.2019.04.053,155556300.0,,,,,,CC(C)N1Cc2cc(O[C@@H](C)CNC(=O)c3cnn4cc1cnc34)c...,JHUWBOAVDAIILQ-LBPRGKRZSA-N,50512442,,2645374
1571023,1400.0,,,,,10.1021/jm0004998,44289054.0,,CHEMBL40680,,,ZINC29343353,F[C@@H]1CCNCC1c1c([nH]c2cc(Cl)ccc12)-c1ccccc1,KVZUKTSYJONTHS-OEMAIJDKSA-N,50099257,,1571023
1388035,,,,,,,,,,,,,CC(Cc1ccccc1)NCCCc1ccc(cc1)[N+]([O-])=O,POEMFIVXIOWWMZ-UHFFFAOYSA-N,679358,,1388035
1650626,,,>50000,,,10.1021/jm031011g,11351150.0,,CHEMBL282592,,,ZINC13559646,C(Cc1ccccc1)N1CCN(CC1)c1ncnc2c(c3CCCCn3c12)-c1...,XIKLXZWNLSQCAS-UHFFFAOYSA-N,50140801,,1650626
692884,,,31.0,,,,57943198.0,,,,C01614,,CCc1csc(n1)[C@H](Cc1ccc(NS(O)(=O)=O)cc1)NC(=O)...,WUZNCUBOAURWQF-URXFXBBRSA-N,359093,,692884


In [63]:
# Print, for every column, the fraction of rows that are missing.
na_ratios = clean_binding_df.isna().mean()
for col, ratio in na_ratios.items():
    print(f"NA ratio in {col}: {ratio:.2f}")

NA ratio in ki: 0.80
NA ratio in temp: 0.93
NA ratio in ic50: 0.33
NA ratio in kd: 0.96
NA ratio in kon: 1.00
NA ratio in doi: 0.47
NA ratio in pubchem_cid: 0.02
NA ratio in chebi_id: 0.97
NA ratio in chembl_id: 0.62
NA ratio in drugbank_id: 0.97
NA ratio in kegg_id: 0.97
NA ratio in zinc_id: 0.60
NA ratio in smiles: 0.00
NA ratio in inchi_key: 0.04
NA ratio in bindingdb_id: 0.00
NA ratio in swissprot_protein_id: 0.95
NA ratio in Unique_ID: 0.00


In [None]:

# Merge the DrugBank table with the cleaned BindingDB table and cache the
# result as a pickle at MERGED.
print("Creating merged dataset")

drugbank_binding_merger = DrugBank_BindingDB_Merger()
merged_df = drugbank_binding_merger.merge(
    drugbank,
    clean_binding_df,
)
merged_df.to_pickle(MERGED)

print("Merged dataset is loaded and saved.")


Creating merged dataset


  0%|          | 0/7 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  binding_df_id[identifier] = binding_df_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_df_id[identifier] = drugbank_df_id[identifier].astype(str)
 14%|█▍        | 1/7 [00:02<00:14,  2.34s/it]

run


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  binding_df_id[identifier] = binding_df_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_df_id[identifier] = drugbank_df_id[identifier].astype(str)


run


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  binding_df_id[identifier] = binding_df_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_df_id[identifier] = drugbank_df_id[identifier].astype(str)


run


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  binding_df_id[identifier] = binding_df_id[identifier].astype(str)


run


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_df_id[identifier] = drugbank_df_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_df_id[identifier] = drugbank_df_id[identifier].astype(str)


run


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  binding_df_id[identifier] = binding_df_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_df_id[identifier] = drugbank_df_id[identifier].astype(str)


run


100%|██████████| 7/7 [00:43<00:00,  6.19s/it]
  return_df =  pd.read_csv(output_file)


Merged dataset is loaded and saved.


In [54]:
# Row counts of the merged, DrugBank and cleaned BindingDB frames, in that order.
tuple(len(frame) for frame in (merged_df, drugbank, clean_binding_df))

(2923143, 16581, 2923143)

In [48]:
# How many merged rows were matched on each value of `Matched_On`.
merged_df.loc[:, 'Matched_On'].value_counts()

Matched_On
chebi_id       74903
chembl_id      40543
inchi_key      20056
drugbank_id    15134
smiles           792
pubchem_cid        9
Name: count, dtype: int64