## Insight for the medical dataset

#### To create the dataframes and files needed for our analysis, make sure that:
- `full_database.xml`, the XML export of DrugBank, is in the `data/medical/` folder.
- `BindingDB_All.tsv`, the TSV export of BindingDB, is in the `data/medical/` folder.

### Imports

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd

from drugbank_XML_drugparser import DrugParser
from drugbank_bindingdb_merger import DrugBank_BindingDB_Merger
from preprocessing import Preprocessing, CleanNumericAtrributesStrategy

%matplotlib inline

%load_ext autoreload
%autoreload 2

### Paths

In [2]:
# Every input and cached artefact of this notebook lives under DATA_PATH.
DATA_PATH = 'data/medical/'

# Raw inputs that must be provided by the user.
BINDINGDB = os.path.join(DATA_PATH, 'BindingDB_All.tsv')
DRUGBANK_XML = os.path.join(DATA_PATH, 'full_database.xml')

# Files written by the notebook so later runs can skip the expensive steps.
DRUGBANK_CSV = os.path.join(DATA_PATH, 'parsed_DrugBank.csv')
MERGED_CSV = os.path.join(DATA_PATH, 'Merged_Binding_DrugBank_LEFT.csv')
MERGED_PARQUET = os.path.join(DATA_PATH, 'merged_dataframe.parquet')
MERGED_PICKLE = os.path.join(DATA_PATH, 'merged_dataframe.pkl')

### Loading the data

In [3]:
def load_BindingDB(file_path):
    """Read a tab-separated BindingDB dump into a DataFrame.

    The file is read twice: a one-row probe establishes how many columns
    pandas sees, and the full read is then restricted to exactly those
    column positions via ``usecols``.
    NOTE(review): presumably this guards against data rows that carry more
    fields than the header -- confirm against the actual dump.

    Parameters
    ----------
    file_path : str
        Path to the BindingDB ``.tsv`` file.

    Returns
    -------
    pandas.DataFrame
        The table, with the first line of the file used as the header.
    """
    # Probe: one data row is enough to learn the column count.
    header_probe = pd.read_csv(file_path, sep='\t', nrows=1)
    n_columns = header_probe.shape[1]

    # Full read, limited to the first ``n_columns`` column positions.
    return pd.read_csv(
        file_path,
        sep='\t',
        header=0,
        usecols=range(n_columns),
    )

In [4]:
import gc

# Build (or reload) the merged DrugBank + BindingDB table.
# The pickle at MERGED_PICKLE acts as a cache: when it exists it is loaded
# directly, otherwise the merge is recomputed from the raw inputs and saved.
if os.path.exists(MERGED_PICKLE):
    print("Merged dataset exists.\n Loading...")

    # NOTE(review): unpickling can execute arbitrary code. Only load a pickle
    # that was produced by this notebook or comes from a trusted source.
    merged_df = pd.read_pickle(MERGED_PICKLE)

    print("Merged dataset loaded")

else:
    print("Merged dataset doesn't exists.\n Creating it...")

    # Fail fast: check every raw input this run needs before starting any of
    # the expensive steps, so that a missing BindingDB dump is reported
    # immediately instead of only after the DrugBank XML has been parsed.
    required_inputs = [BINDINGDB]
    if not os.path.exists(DRUGBANK_CSV):
        # The XML is only needed when the parsed CSV is not cached yet.
        required_inputs.append(DRUGBANK_XML)
    missing_inputs = [path for path in required_inputs if not os.path.exists(path)]
    if missing_inputs:
        raise FileNotFoundError(
            "Missing required input file(s): " + ", ".join(missing_inputs)
        )

    if os.path.exists(DRUGBANK_CSV):
        print("parsed_Drugbank exists...")
        print("Loading...")

        drugbank = pd.read_csv(DRUGBANK_CSV, encoding='utf-8')
    else:
        print("parsed_Drugbank doesn't exists...")
        print("Creating parsed_Drugbank.csv")

        # Parse the DrugBank XML; save_parsed_drugs is given the CSV path and
        # its return value (requested with return_df=True) is used below.
        drugparser = DrugParser(DRUGBANK_XML)
        drugparser.parse_drugs()
        drugbank = drugparser.save_parsed_drugs(DRUGBANK_CSV, return_df = True)

        print("parsed_Drugbank.csv is created")
        print("DrugBank XML is parsed. \n Loading it ...")

    print("Load Binding DB...")
    bindingDB_df = load_BindingDB(BINDINGDB)

    print("Cleaning Binding DB...")
    preprocessor = Preprocessing(
        [
            CleanNumericAtrributesStrategy(),
        ]
    )
    preprocessed_df = preprocessor.transform(bindingDB_df)

    # Drop the reference to the raw table and request a collection before the
    # merge, which needs memory of its own.
    del bindingDB_df
    gc.collect()

    print("Creating merged dataset")
    drugbank_binding_merger = DrugBank_BindingDB_Merger()
    merged_df = drugbank_binding_merger.merge(drugbank, preprocessed_df)
    drugbank_binding_merger.save_merged(MERGED_PICKLE)

    print("Merged dataset is loaded and saved.")

Merged dataset exists.
 Loading...
Merged dataset loaded


In [5]:
# Preview the first five rows of the merged table.
merged_df.head(n=5)

Unnamed: 0,BindingDB Reactant_set_id,SMILES,Ligand InChI,InChI_Key,BindingDB_ID,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),...,synonyms,categories,patent_approved,interaction,ChEMBL_ID_DrugBank,SMILES_DrugBank,InChI_Key_DrugBank,ChEBI_ID_DrugBank,BindingDB_ID_DrugBank,Matched_On
0,1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,InChI=1S/C31H42N2O7/c34-27(35)17-9-3-11-19-32-...,XGEGDSLAQZJGCW-HHGOQMMWSA-N,608734,"6-[(4R,5S,6S,7R)-4,7-dibenzyl-3-(5-carboxypent...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.24,-1.0,...,,,,,,,,,,
1,2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,UZLMEAPBHYEHAC-UNTBESQGSA-N,22,"(4R,5S,6S,7R)-4,7-dibenzyl-5,6-dihydroxy-1,3-b...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.25,-1.0,...,,,,,,,,,,
2,3,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,HYNYUFZPPJMPOB-UTWJFGBXSA-N,23,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.41,-1.0,...,,,,,,,,,,
3,4,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,YXVAZXDWVZTGGD-VIJSPRBVSA-N,24,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.8,-1.0,...,,,,,,,,,,
4,5,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,WWTSWTPNILRSJX-XDZXDJIYSA-N,25,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.99,-1.0,...,,,,,,,,,,


### Initial cancer-related filtering

In [17]:
# Lower-case keywords; a row is kept when any of them occurs as a substring
# of one of the target-describing columns (case-insensitive).
cancer_keywords = [
    "cancer", "tumor", "carcinoma", "neoplasm", "malignancy", "metastasis", "oncology",
    "sarcoma", "adenocarcinoma", "lymphoma", "leukemia", "myeloma", "breast cancer",
    "lung cancer", "prostate cancer", "colon cancer", "rectal cancer", "colorectal cancer",
    "skin cancer", "melanoma", "bladder cancer", "kidney cancer", "renal cancer",
    "pancreatic cancer", "ovarian cancer", "cervical cancer", "uterine cancer", "thyroid cancer",
    "liver cancer", "gastric cancer", "stomach cancer", "esophageal cancer", "brain cancer",
    "glioma", "astrocytoma", "neuroblastoma", "endometrial cancer", "dysplasia", "hyperplasia",
    "leukoplakia", "adenoma", "lesion", "metastases", "carcinoma in situ", "anaplastic",
    "fibrosarcoma", "osteosarcoma", "mesothelioma", "chemotherapy", "radiotherapy",
    "immunotherapy", "biopsy", "staging", "grading", "metastasize", "malignant", "benign",
    "tumor suppressor", "oncogene", "palliative care", "remission", "recurrence"
]


def _mentions_cancer_keyword(value):
    """Return True when the lower-cased text of ``value`` contains any keyword."""
    text = str(value).lower()
    for keyword in cancer_keywords:
        if keyword in text:
            return True
    return False


# Columns inspected for keyword hits.
target_text_columns = ["Target Name", "UniProt (SwissProt) Entry Name of Target Chain"]

# One boolean per row: True when at least one inspected column matches.
cancer_row_mask = merged_df[target_text_columns].map(_mentions_cancer_keyword).any(axis=1)
filtered_df_with_target_name = merged_df[cancer_row_mask]

In [26]:
# Distinct target names among the keyword-filtered rows.
cancer_target_names = filtered_df_with_target_name["Target Name"]
unique_cancer_related_proteins = cancer_target_names.unique()
n_cancer_related_proteins = len(unique_cancer_related_proteins)
print(f"number of different cancer related proteins: {n_cancer_related_proteins}")

number of different cancer related proteins: 123


#### Keep only targets whose source organism is human

In [None]:
def _mentions_human(value):
    """Return True when the lower-cased text of ``value`` contains "human"."""
    return "human" in str(value).lower()


# A row is kept when either the curated source organism or the UniProt entry
# name of the target chain contains "human" (case-insensitive).
organism_columns = [
    "Target Source Organism According to Curator or DataSource",
    "UniProt (SwissProt) Entry Name of Target Chain",
]
human_row_mask = filtered_df_with_target_name[organism_columns].map(_mentions_human).any(axis=1)
filtered_df_human = filtered_df_with_target_name[human_row_mask]

In [22]:
# Number of rows that survive both the cancer-keyword and the human filter.
filtered_df_human.shape[0]

67650