## Insight for the medical dataset

#### To create the dataframes and files needed for our analysis, make sure that:
- `full_database.xml` (the XML version of DrugBank) is in the `data/medical/` folder.
- `BindingDB_All.tsv` (the TSV version of BindingDB) is in the `data/medical/` folder.

### Imports

In [1]:
import gc
import os
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from drugbank_XML_drugparser import DrugParser
from drugbank_bindingdb_merger import DrugBank_BindingDB_Merger
from preprocessing import Preprocessing, CleanNumericAtrributesStrategy

%matplotlib inline

%load_ext autoreload
%autoreload 2

### Paths

In [2]:
# Every input and derived file of this notebook lives under one data directory.
DATA_PATH = 'data/medical/'

# Raw inputs the user has to provide (see the instructions at the top).
BINDINGDB = f'{DATA_PATH}BindingDB_All.tsv'
DRUGBANK_XML = f'{DATA_PATH}full_database.xml'

# Derived files: the parsed DrugBank table and the merged dataset
# (CSV and parquet variants of the merged dataset path).
DRUGBANK_CSV = f'{DATA_PATH}parsed_DrugBank.csv'
MERGED_CSV = f'{DATA_PATH}Merged_Binding_DrugBank_LEFT.csv'
MERGED_PARQUET = f'{DATA_PATH}merged_dataframe.parquet'

### Loading the data

In [3]:
def load_BindingDB(file_path):
    """Read a tab-separated BindingDB export into a DataFrame.

    The file is read twice: a one-row probe determines how many columns the
    header declares, then the full read is restricted to exactly those
    column positions via ``usecols``.
    (Presumably this guards against rows carrying extra trailing fields --
    TODO confirm against the actual BindingDB file.)

    Parameters
    ----------
    file_path : str or path-like
        Location of the TSV file; its first line is used as the header.

    Returns
    -------
    pandas.DataFrame
        All rows of the file, limited to the columns named in the header.
    """
    # Cheap probe: header plus a single data row is enough to count the columns.
    header_probe = pd.read_csv(file_path, sep='\t', nrows=1)
    n_header_columns = header_probe.shape[1]

    # Full read, keeping only the first `n_header_columns` column positions.
    return pd.read_csv(
        file_path,
        sep='\t',
        header=0,
        usecols=range(n_header_columns),
    )

In [None]:
# Obtain `merged_df`: rebuild it from the raw sources when the parquet copy is
# missing, otherwise simply read the parquet copy back.
if not os.path.exists(MERGED_PARQUET):
    print("Merged dataset doesn't exists.\n Creating it...")

    # DrugBank side: parse the XML only when the parsed CSV is not on disk yet.
    if not os.path.exists(DRUGBANK_CSV):
        print("parsed_Drugbank doesn't exists...")
        print("Creating parsed_Drugbank.csv")

        drugparser = DrugParser(DRUGBANK_XML)
        drugparser.parse_drugs()
        drugbank = drugparser.save_parsed_drugs(DRUGBANK_CSV, return_df=True)

        print("parsed_Drugbank.csv is created")
        print("DrugBank XML is parsed. \n Loading it ...")
    else:
        print("parsed_Drugbank exists...")
        print("Loading...")

        drugbank = pd.read_csv(DRUGBANK_CSV, encoding='utf-8')

    # BindingDB side: load the raw TSV.
    print("Load Binding DB...")
    bindingDB_df = load_BindingDB(BINDINGDB)

    # Apply the numeric-attribute cleaning strategy to the BindingDB frame.
    print("Cleaning Binding DB...")
    preprocessor = Preprocessing([CleanNumericAtrributesStrategy()])
    preprocessed_df = preprocessor.transform(bindingDB_df)

    # Merge both sources, then hand the parquet path to the merger's save step.
    print("Creating merged dataset")
    drugbank_binding_merger = DrugBank_BindingDB_Merger()
    merged_df = drugbank_binding_merger.merge(drugbank, preprocessed_df)
    drugbank_binding_merger.save_merged(MERGED_PARQUET)

    print("Done.")

else:
    print("Merged dataset exists.\n Loading...")

    merged_df = pd.read_parquet(MERGED_PARQUET)

    print("Merged dataset loaded")


In [None]:
2

In [None]:
# Run the numeric-attribute cleaning strategy over the merged frame and
# preview the first rows of the result.
preprocessor = Preprocessing([CleanNumericAtrributesStrategy()])
preprocessed_df = preprocessor.transform(merged_df)
preprocessed_df.head()

In [9]:
len(merged_df.columns)


216

parsed_Drugbank exists...
Loading parsed_Drugbank.csv...


In [7]:
#print(f"Number of rows: {len(df)}")
#print(f"Number of cols: {len(df.columns)}")

In [None]:
# Release the two large source frames to free kernel memory.
# `pop(..., None)` is used instead of `del`: when the merged dataset was loaded
# from the parquet cache, `bindingDB_df` and `drugbank` were never created and
# `del` would raise NameError.
# `gc` comes from the notebook's import cell.
globals().pop('bindingDB_df', None)
globals().pop('drugbank', None)
gc.collect()

'data/medical/Merged_Binding_DrugBank_LEFT.csv'

In [13]:
# Reuse the CSV copy of the merged dataset when it is already on disk;
# otherwise build it from the two source frames and save it.
# The path is taken from the MERGED_CSV constant of the "Paths" cell: the name
# `output_file_merged` used here before is not assigned in any visible cell,
# so this cell raised NameError on a fresh kernel.
if os.path.exists(MERGED_CSV):
    print("Merged_Binding_DrugBank exists...")
    print("Loading Merged_Binding_DrugBank_LEFT.csv...")
    merged_df = pd.read_csv(MERGED_CSV, encoding='utf-8')
else:
    print("Merged_Binding_DrugBank_LEFT doesn't exists...")
    print("Creating Merged_Binding_DrugBank_LEFT.csv")
    # NOTE(review): this branch needs `drugbank` and `bindingDB_df` to still be
    # in memory -- confirm no earlier cell has deleted them.
    drugbank_binding_merger = DrugBank_BindingDB_Merger()
    merged_df = drugbank_binding_merger.merge(drugbank, bindingDB_df)
    drugbank_binding_merger.save_merged(MERGED_CSV)
    print("Merged_Binding_DrugBank_LEFT.csv is created")

Merged_Binding_DrugBank exists...
Loading Merged_Binding_DrugBank_LEFT.csv...


  merged_df = pd.read_csv(output_file_merged, encoding='utf-8')


In [1]:
preprocessed_df.head()

NameError: name 'preprocessed_df' is not defined

Unnamed: 0,BindingDB Reactant_set_id,SMILES,Ligand InChI,InChI_Key,BindingDB_ID,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),...,synonyms,categories,patent_approved,interaction,ChEMBL_ID_DrugBank,SMILES_DrugBank,InChI_Key_DrugBank,ChEBI_ID_DrugBank,BindingDB_ID_DrugBank,Matched_On
0,1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,InChI=1S/C31H42N2O7/c34-27(35)17-9-3-11-19-32-...,XGEGDSLAQZJGCW-HHGOQMMWSA-N,608734,"6-[(4R,5S,6S,7R)-4,7-dibenzyl-3-(5-carboxypent...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.24,,...,,,,,,,,,,
1,2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,UZLMEAPBHYEHAC-UNTBESQGSA-N,22,"(4R,5S,6S,7R)-4,7-dibenzyl-5,6-dihydroxy-1,3-b...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.25,,...,,,,,,,,,,
2,3,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,HYNYUFZPPJMPOB-UTWJFGBXSA-N,23,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.41,,...,,,,,,,,,,
3,4,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,YXVAZXDWVZTGGD-VIJSPRBVSA-N,24,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.8,,...,,,,,,,,,,
4,5,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,WWTSWTPNILRSJX-XDZXDJIYSA-N,25,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.99,,...,,,,,,,,,,


In [8]:
len(merged_df)

2927609

Unnamed: 0,BindingDB Reactant_set_id,Ligand SMILES,Ligand InChI,Ligand InChI Key,BindingDB MonomerID,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),...,UniProt (SwissProt) Recommended Name of Target Chain.12,UniProt (SwissProt) Entry Name of Target Chain.12,UniProt (SwissProt) Primary ID of Target Chain.12,UniProt (SwissProt) Secondary ID(s) of Target Chain.12,UniProt (SwissProt) Alternative ID(s) of Target Chain.12,UniProt (TrEMBL) Submitted Name of Target Chain.12,UniProt (TrEMBL) Entry Name of Target Chain.12,UniProt (TrEMBL) Primary ID of Target Chain.12,UniProt (TrEMBL) Secondary ID(s) of Target Chain.12,UniProt (TrEMBL) Alternative ID(s) of Target Chain.12
0,1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,InChI=1S/C31H42N2O7/c34-27(35)17-9-3-11-19-32-...,XGEGDSLAQZJGCW-HHGOQMMWSA-N,608734,"6-[(4R,5S,6S,7R)-4,7-dibenzyl-3-(5-carboxypent...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.24,-1.0,...,,,,,,,,,,
1,2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,UZLMEAPBHYEHAC-UNTBESQGSA-N,22,"(4R,5S,6S,7R)-4,7-dibenzyl-5,6-dihydroxy-1,3-b...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.25,-1.0,...,,,,,,,,,,
2,3,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,HYNYUFZPPJMPOB-UTWJFGBXSA-N,23,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.41,-1.0,...,,,,,,,,,,
3,4,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,YXVAZXDWVZTGGD-VIJSPRBVSA-N,24,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.8,-1.0,...,,,,,,,,,,
4,5,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,WWTSWTPNILRSJX-XDZXDJIYSA-N,25,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.99,-1.0,...,,,,,,,,,,


## Link to Drug Bank

MemoryError: Unable to allocate 22.3 MiB for an array with shape (2927609,) and data type object

In [None]:
len(filtered_df['DrugBank ID of Ligand'].dropna())

87465

In [None]:
# Inner join: keep only the rows whose BindingDB 'Ligand SMILES' string is
# exactly equal to a DrugBank 'SMILES' string, then count the matches.
smile_merged = pd.merge(
    filtered_df,
    drugbank,
    how='inner',
    left_on='Ligand SMILES',
    right_on='SMILES',
)
len(smile_merged)

6191

Processing identifier: PubChem_CID


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_id[identifier] = df1_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


Processing identifier: ChEBI_ID


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_id[identifier] = df1_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


Processing identifier: ChEMBL_ID


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_id[identifier] = df1_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


Processing identifier: BindingDB_ID


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


Processing identifier: SMILES


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


Processing identifier: InChI_Key


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_id[identifier] = df1_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


  merged_df = pd.read_csv(output_file)


In [None]:
merged_df['Matched_On'].value_counts()

Matched_On
InChI_Key      147426
ChEMBL_ID       96639
ChEBI_ID        74904
SMILES           6191
PubChem_CID         9
Name: count, dtype: int64

In [None]:
len(merged_df)

325169

In [None]:
merged_df['Unique_ID'].nunique()

150783

In [None]:
# Keep the first row for every Unique_ID. The name is rebound to the
# de-duplicated frame rather than mutating the existing object in place, so
# any other reference to the original frame is left unchanged.
merged_df = merged_df.drop_duplicates(subset=['Unique_ID'])

In [None]:
binding_readded.head(2)

Unnamed: 0,BindingDB Reactant_set_id,SMILES,Ligand InChI,InChI_Key,BindingDB_ID,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),...,synonyms,categories,patent_approved,interaction,ChEMBL_ID_DrugBank,SMILES_DrugBank,InChI_Key_DrugBank,ChEBI_ID_DrugBank,BindingDB_ID_DrugBank,Matched_On
0,1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,InChI=1S/C31H42N2O7/c34-27(35)17-9-3-11-19-32-...,XGEGDSLAQZJGCW-HHGOQMMWSA-N,608734,"6-[(4R,5S,6S,7R)-4,7-dibenzyl-3-(5-carboxypent...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.24,-1.0,...,,,,,,,,,,
1,2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,UZLMEAPBHYEHAC-UNTBESQGSA-N,22,"(4R,5S,6S,7R)-4,7-dibenzyl-5,6-dihydroxy-1,3-b...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.25,-1.0,...,,,,,,,,,,


In [None]:
# Left join binding db with merged_df (on Unique_ID), don't duplicate columns though
binding_readded = pd.merge(BindingDB, merged_df, on='Unique_ID', how='left', suffixes=('', '_y'))
all_cols = list(binding_readded.columns)
cols_to_keep = []

for col in all_cols:
    if not ('_y' in col and col.split('_y')[0] in all_cols):
        cols_to_keep.append(col)

In [None]:
# Left join binding db with merged_df (on Unique_ID), don't duplicate columns though
binding_readded = pd.merge(BindingDB, merged_df, on='Unique_ID', how='left', suffixes=('', '_y'))
all_cols = list(binding_readded.columns)
cols_to_keep = []

for col in all_cols:
    if not ('_y' in col and col.split('_y')[0] in all_cols):
        cols_to_keep.append(col)
# Keep only the columns in cols_to_keep
cleaned_readded = binding_readded[cols_to_keep]
cleaned_readded.to_csv('Merged Binding DrugBank.csv', index=False)

In [None]:
cleaned_readded.to_csv('Merged Binding DrugBank.csv', index=False)

NameError: name 'os' is not defined

In [None]:
# Delete the previous output file, if there is one. The removal is attempted
# directly and a missing file is tolerated, which avoids the gap between an
# existence check and the delete.
# `os` is provided by the notebook's import cell.
# NOTE(review): `output_file` is not assigned in any visible cell -- confirm
# where it is set.
try:
    os.remove(output_file)
except FileNotFoundError:
    pass