## Insight for the medical dataset

#### To create the dataframes and files needed for our analysis, make sure that:

- You have [full_database.xml](https://drive.google.com/file/d/1149kYVkazq67e0vuv-_4APyqVX6yyh2p) in the `data/clean` folder; this is the XML version of DrugBank.
- You have [BindingDB_All.tsv](https://www.bindingdb.org/bind/downloads/BindingDB_All_202411_tsv.zip) in the `data/raw` folder; this is the TSV version of BindingDB.

### Imports

In [None]:
import os
import gc
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd

from drugbank_XML_drugparser import DrugParser
from drugbank_bindingdb_merger import DrugBank_BindingDB_Merger
from preprocessing import Preprocessing, CleanNumericAtrributesStrategy, ColumnClean
from imports import *

%matplotlib inline

%load_ext autoreload
%autoreload 2


def keep_just_numeric(value):
    """Extract the numeric part of a raw string cell and return it as a float.

    Args:
        value: A raw cell value. Only ``str`` inputs are parsed.

    Returns:
        The float built from the digits and dots found in ``value``, or
        ``pd.NA`` when ``value`` is not a string or contains no digit/dot.

    Raises:
        ValueError: If the remaining characters do not form a valid float
            (for example several dots, or a lone ``'.'``).
    """
    # The column mixes strings with stray non-string values; those are treated as missing.
    if not isinstance(value, str):
        return pd.NA

    # Remove every run of characters that is neither a digit nor a dot
    # (comparison signs such as '>' or '<', spaces, units, ...).
    # NOTE: a minus sign or an exponent marker is stripped as well,
    # so '1e3' would become 13.0.
    cleaned_val = re.sub(r'[^\d.]+', '', value)
    if cleaned_val == '':
        # Nothing numeric was left: report the cell as missing.
        return pd.NA
    return float(cleaned_val)
    
# Column mapping used to clean BindingDB: each ColumnClean pairs a raw column
# header with the short name used afterwards, plus an optional cleaning callable.
bdb_preprocessor = Preprocessing(
    [
        # Measurement / metadata columns
        ColumnClean('Ki (nM)', 'ki', clean=keep_just_numeric),
        # ColumnClean('pH', 'ph'),
        ColumnClean('Temp (C)', 'temp'),
        ColumnClean('IC50 (nM)', 'ic50'),
        ColumnClean('Kd (nM)', 'kd'),
        ColumnClean('kon (M-1-s-1)', 'kon'),
        ColumnClean('Article DOI', 'doi'),

        # For the merge

        ColumnClean('PubChem CID', 'pubchem_cid'),
        ColumnClean('ChEBI ID of Ligand', 'chebi_id'),
        ColumnClean('ChEMBL ID of Ligand', 'chembl_id'),
        ColumnClean('DrugBank ID of Ligand', 'drugbank_id'),
        ColumnClean('KEGG ID of Ligand', 'kegg_id'),
        ColumnClean('ZINC ID of Ligand', 'zinc_id'),
        # The class is `ColumnClean` (as imported), and the raw headers are
        # case-sensitive: 'Ligand SMILES' / 'Ligand InChI Key'.
        ColumnClean('Ligand SMILES', 'smiles'),
        ColumnClean('Ligand InChI Key', 'inchi_key'),
        ColumnClean('BindingDB MonomerID', 'bindingdb_id'),
        ColumnClean('PubChem SID', 'pubchem_sid'),
        ColumnClean('UniProt (SwissProt) Primary ID of Target Chain.1', 'swissprot_protein_id'),
    ]
)




# Read only the raw BindingDB columns the preprocessor needs, clean them and
# cache the result as a pickle.
# BINDINGDB_RAW / BINDINGDB_CLEAN are presumably paths provided by
# `from imports import *` -- TODO confirm.
print("Loading Binding DB...")
# The TSV is read directly here: the `load_BindingDB` helper is only defined
# later in the notebook, so calling it at this point would raise NameError.
raw_binding_df = pd.read_csv(
    BINDINGDB_RAW,
    sep='\t',
    header=0,
    usecols=bdb_preprocessor.get_used_old_columns(),
)

print("Cleaning Binding DB...")
clean_binding_df = bdb_preprocessor.transform(raw_binding_df)
clean_binding_df.to_pickle(BINDINGDB_CLEAN)

### Loading the data

In [None]:

# Load the cached merged dataset when it exists; otherwise load (or build and
# cache) the parsed DrugBank frames and the cleaned BindingDB frame that the
# merge needs.
# NOTE: when the merged pickle exists, only `merged_df` is defined by this cell.
if os.path.exists(MERGED):
    print("Merged dataset exists.\n Loading...")

    merged_df = pd.read_pickle(MERGED)

    print("Merged dataset loaded")

else:
    print("Merged dataset doesn't exists.\n Creating it...")

    # --- DrugBank: reuse the parsed pickles when both are present ---
    if os.path.exists(DRUGBANK_LIGAND_PARSED) and os.path.exists(DRUGBANK_PROTEIN_PARSED):
        print("parsed_Drugbank exists...")
        print("Loading...")

        drugbank_ligand = pd.read_pickle(DRUGBANK_LIGAND_PARSED)
        drugbank_proteins = pd.read_pickle(DRUGBANK_PROTEIN_PARSED)
    else:
        print("parsed_Drugbank doesn't exists...")
        print("Parsing DrugBank XML...")

        drugparser = DrugParser(DRUGBANK_XML)
        drugparser.parse_drugs()
        drugparser.parse_proteins()
        drugbank_ligand, drugbank_proteins = drugparser.save_parsed_drugs(DRUGBANK_LIGAND_PARSED, DRUGBANK_PROTEIN_PARSED, return_df = True)

        print("DrugBank XML is parsed. \n Loading Bind ...")

    # --- BindingDB: reuse the cleaned pickle when present ---
    if os.path.exists(BINDINGDB_CLEAN):
        print("BindingDB clean exists...")
        clean_binding_df = pd.read_pickle(BINDINGDB_CLEAN)
    else:
        def load_BindingDB(file_path, cols):
            """Read the tab-separated BindingDB file, keeping only `cols`."""
            return pd.read_csv(file_path, sep='\t', header=0, usecols=cols)

        def keep_just_numeric(value):
            """Return the digits/dots of a string cell as a float, else pd.NA."""
            # Non-string cells are treated as missing.
            if not isinstance(value, str):
                return pd.NA

            # Remove every run of characters that is neither a digit nor a dot.
            cleaned_val = re.sub(r'[^\d.]+', '', value)
            if cleaned_val == '':  # It didn't contain any number
                return pd.NA

            return float(cleaned_val)

        def parse_int(value):
            """Convert `value` to int, returning pd.NA when it cannot be converted."""
            try:
                return int(value)
            # Only conversion failures mean "missing"; a bare `except` would
            # also swallow KeyboardInterrupt / SystemExit.
            except (TypeError, ValueError, OverflowError):
                return pd.NA

        bdb_preprocessor = Preprocessing(
            [
                ColumnClean('Ki (nM)', 'ki', clean=keep_just_numeric),
                # ColumnClean('pH', 'ph'),
                ColumnClean('Temp (C)', 'temp'),
                ColumnClean('IC50 (nM)', 'ic50'),
                ColumnClean('Kd (nM)', 'kd'),
                ColumnClean('kon (M-1-s-1)', 'kon'),
                ColumnClean('Article DOI', 'doi'),

                # For the merge

                ColumnClean('PubChem CID', 'pubchem_cid'),
                ColumnClean('ChEBI ID of Ligand', 'chebi_id'),
                ColumnClean('ChEMBL ID of Ligand', 'chembl_id'),
                ColumnClean('DrugBank ID of Ligand', 'drugbank_id'),
                ColumnClean('KEGG ID of Ligand', 'kegg_id'),
                ColumnClean('ZINC ID of Ligand', 'zinc_id'),
                ColumnClean('Ligand SMILES', 'smiles'),
                ColumnClean('Ligand InChI Key', 'inchi_key'),
                ColumnClean('BindingDB MonomerID', 'bindingdb_id', clean=parse_int),
                # 'PubChem CID' is already mapped above; this entry is the
                # substance id (SID), matching the earlier preprocessing cell.
                ColumnClean('PubChem SID', 'pubchem_sid'),
                ColumnClean('UniProt (SwissProt) Primary ID of Target Chain.1', 'swissprot_protein_id'),
            ]
        )

        print("Loading Binding DB...")
        raw_binding_df = load_BindingDB(BINDINGDB_RAW, bdb_preprocessor.get_used_old_columns())

        print("Cleaning Binding DB...")
        clean_binding_df = bdb_preprocessor.transform(raw_binding_df)

        # Check that cleaning kept every row *before* caching, so a frame that
        # fails the check is never written to disk.
        assert len(clean_binding_df) == len(raw_binding_df)
        clean_binding_df.to_pickle(BINDINGDB_CLEAN)

        # The raw frame is no longer needed; release it before the merge.
        del raw_binding_df
        gc.collect()


In [None]:
# Number of rows whose 'bindingdb_id' is missing after cleaning.
pd.isna(clean_binding_df['bindingdb_id']).sum()

In [None]:

# Eyeball 20 randomly drawn rows of the cleaned BindingDB frame.
clean_binding_df.sample(n=20)

In [None]:
# Print, for every column, the fraction of missing entries (two decimals).
na_ratios = clean_binding_df.isna().mean()
for column_name, na_ratio in na_ratios.items():
    print(f"NA ratio in {column_name}: {na_ratio:.2f}")

In [None]:
# Build (or reload) the DrugBank x BindingDB dataset enriched with protein data.
if os.path.exists(MERGED):
    # When the cache exists, the loading cell only reads the pickle and never
    # defines drugbank_ligand / drugbank_proteins / clean_binding_df, so
    # re-merging here would raise NameError. The pickle written below holds the
    # protein-enriched frame, so it is loaded as `complete_merged_df`.
    print("Merged dataset exists.\n Loading...")
    complete_merged_df = pd.read_pickle(MERGED)
    print("Merged dataset loaded")
else:
    print("Creating merged dataset")
    drugbank_binding_merger = DrugBank_BindingDB_Merger()
    merged_df = drugbank_binding_merger.merge(drugbank_ligand, clean_binding_df)

    # Left join: keep every merged row, attaching protein data where the
    # 'swissprot_protein_id' matches.
    complete_merged_df = pd.merge(merged_df, drugbank_proteins, how='left', on='swissprot_protein_id')
    complete_merged_df.to_pickle(MERGED)

    print("Merged dataset is loaded and saved.")

In [None]:
# Frequency of each value in the 'Matched_On' column of the ligand-level merge.
merged_df.loc[:, 'Matched_On'].value_counts()

In [None]:
# Same frequency table, computed after the protein information was joined in.
complete_merged_df.loc[:, 'Matched_On'].value_counts()

In [None]:
# Row counts of every frame involved in the merge, in a fixed order:
# complete merge, ligand-level merge, DrugBank ligands, DrugBank proteins, BindingDB.
tuple(
    len(frame)
    for frame in (
        complete_merged_df,
        merged_df,
        drugbank_ligand,
        drugbank_proteins,
        clean_binding_df,
    )
)