## Insight for the medical dataset

### Imports

In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd
%matplotlib inline

In [1]:
2

2

### Paths

In [3]:
# Every file this notebook reads or writes lives under DATA_PATH (note the trailing slash).
DATA_PATH = 'data/medical/'

# Source files that are loaded / parsed.
BINDINGDB = f'{DATA_PATH}BindingDB_All.tsv'
DRUGBANK_XML = f'{DATA_PATH}full_database.xml'

# CSV files used as caches for the parsed and merged tables.
DRUGBANK_CSV = f'{DATA_PATH}parsed_DrugBank.csv'
MERGED_CSV = f'{DATA_PATH}Merged_Binding_DrugBank_LEFT.csv'

# Lower-case names referenced by later cells; `output_file` is the same path as DRUGBANK_CSV.
output_file = f'{DATA_PATH}parsed_DrugBank.csv'
output_file_merged = f'{DATA_PATH}Merged_NEW.csv'

### Loading the data

In [3]:
def load_BindingDB(file_path):
    """Read a tab-separated BindingDB export into a DataFrame.

    The file is read twice: a one-row probe determines how many columns the
    header declares, then the full read is restricted to exactly those column
    positions.
    NOTE(review): presumably this guards against data rows carrying more
    fields than the header -- confirm against the actual export.

    Args:
        file_path: Path of the TSV file.

    Returns:
        DataFrame holding the columns declared in the header row.
    """
    header_probe = pd.read_csv(file_path, sep='\t', nrows=1)
    declared_columns = range(header_probe.shape[1])

    return pd.read_csv(file_path, sep='\t', header=0, usecols=declared_columns)

In [4]:
# Read the BindingDB TSV at BINDINGDB into a DataFrame (the whole file is loaded into memory).
bindingDB_df = load_BindingDB(BINDINGDB)

In [None]:
from tqdm import tqdm
import pandas as pd
from lxml import etree

class DrugParser:
    """Flatten a DrugBank XML export into one dictionary of selected fields per drug.

    The document is parsed eagerly in the constructor. `parse_drugs` walks the
    direct children of the root element, and `save_parsed_drugs` writes the
    collected records to a CSV file.
    """

    def __init__(self, xml_path):
        """Parse `xml_path` and keep the root's direct children as the drug list.

        Args:
            xml_path: Path of the XML file to parse.
        """
        # recover=True asks lxml to keep going on malformed markup instead of raising.
        parser = etree.XMLParser(recover=True)
        parsed_file = etree.parse(xml_path, parser=parser)
        root = parsed_file.getroot()

        self.drugs = list(root)
        self.parsed_drugs = []

    def parse_drugs(self):
        """Extract the properties of every drug into `self.parsed_drugs`.

        Records are appended, so calling this twice on the same instance
        stores every drug twice.

        Returns:
            The list `self.parsed_drugs` (one dict per drug).
        """
        for drug in tqdm(self.drugs):
            self.parsed_drugs.append(self._parse_drug_properties(drug))
        # Return the collected records (the original returned the bound method by mistake).
        return self.parsed_drugs

    def _parse_drug_properties(self, drug):
        """Collect the fields of interest from one drug element.

        Child tags are matched by substring (`'name' in tag`), not by equality.
        NOTE(review): presumably so that a namespace prefix in `feature.tag`
        does not matter -- confirm no other child tag contains these words.

        Args:
            drug: Element whose first child holds the DrugBank ID and whose
                other children hold the individual features.

        Returns:
            Dict with key 'id' plus whichever of 'name', 'synonyms', 'toxicity',
            'unii', 'categories', 'class_kingdom', 'class_superclass',
            'interaction', 'patent_approved', 'SMILES', 'InChI', 'chembl',
            'chebi', 'pubchem', 'bindingdb' were found. List-valued fields are
            joined with '|'.
        """
        idDB = drug[0].text # Drug Bank ID
        drug_properties = {}
        drug_properties['id'] = idDB

        for feature in drug:
            feature_name = feature.tag

            if 'name' in feature_name: # drug name
                drug_properties['name'] = feature.text

            if 'synonyms' in feature_name: # drug's synonyms
                drug_synm = '|'.join([synm.text for synm in list(feature)])
                drug_properties['synonyms'] = drug_synm

            if 'toxicity' in feature_name: # drug's toxicity
                drug_properties['toxicity'] = feature.text

            if 'unii' in feature_name: # drug's UNII
                drug_properties['unii'] = feature.text

            if 'categories' in feature_name: # drug's categories
                # First child of each category entry is used as its label.
                drug_categories = '|'.join([cat[0].text for cat in list(feature)])
                drug_properties['categories'] = drug_categories

            if 'classification' in feature_name: #type of drug
                # Kingdom and superclass are read by position (3rd and 4th child).
                classifications = list(feature)
                drug_class_kingdom = classifications[2].text
                drug_class_superclass = classifications[3].text
                drug_properties['class_kingdom'] = drug_class_kingdom
                drug_properties['class_superclass'] = drug_class_superclass

            if 'drug-interactions' in feature_name: #interaction other drugs
                # First child of each interaction entry is stored.
                drug_interaction = '|'.join([di[0].text
                                            for di in list(feature)])
                drug_properties['interaction'] = drug_interaction

            if 'patents' in feature_name:
                patents_list = list(feature)
                if len(patents_list) > 0:
                    # Third child of each patent entry is stored under 'patent_approved'.
                    drug_patent_approved = '|'.join([cat[2].text for cat in patents_list])
                    drug_properties['patent_approved'] = drug_patent_approved

            if 'calculated-properties' in feature_name: # drug's calculated properties
                for calc_prop in list(feature):
                    # Each property is read as (kind, value) from its first two children.
                    prop_name = calc_prop[0].text
                    if 'SMILES' in prop_name:
                        drug_SMILE = calc_prop[1].text
                        drug_properties['SMILES'] = drug_SMILE

                    # Substring test: every property whose kind contains "InChI"
                    # (e.g. both "InChI" and "InChIKey") writes to the same key,
                    # so the last one in document order wins.
                    if 'InChI' in prop_name:
                        drugInChI = calc_prop[1].text
                        drug_properties['InChI'] = drugInChI

            if 'external-identifiers' in feature_name: #other drug's IDs
                feature_list = list(feature)

                for ext in feature_list:
                    # Each entry is read as (resource, identifier) from its first two children.
                    if str(ext[0].text) == 'ChEMBL':
                        drug_properties['chembl'] = ext[1].text
                    if str(ext[0].text) == 'ChEBI':
                        drug_properties['chebi'] = ext[1].text
                    if str(ext[0].text) == 'PubChem Substance':
                        drug_properties['pubchem'] = ext[1].text
                    if str(ext[0].text) == 'BindingDB':
                        drug_properties['bindingdb'] = ext[1].text

        return drug_properties

    def save_parsed_drugs(self, output_file, return_df = False):
        """Write the parsed records to `output_file` as UTF-8 CSV.

        Args:
            output_file: Destination CSV path.
            return_df: When True, also return the DataFrame that was written.

        Returns:
            The DataFrame built from `self.parsed_drugs` if `return_df` is
            True, otherwise None.
        """
        parsed_drugs_df = pd.DataFrame(self.parsed_drugs)
        parsed_drugs_df.to_csv(output_file, index=False, encoding='utf-8')

        if return_df:
            return parsed_drugs_df
        return None
    

In [6]:
# Parse the DrugBank XML only when no cached CSV is present; otherwise load the cache.
if not os.path.exists(DRUGBANK_CSV):
    print("parsed_Drugbank doesn't exists...")
    print("Creating parsed_Drugbank.csv")
    drugparser = DrugParser(DRUGBANK_XML)
    drugparser.parse_drugs()
    # save_parsed_drugs writes the CSV and, with return_df=True, hands back the frame.
    drugbank = drugparser.save_parsed_drugs(DRUGBANK_CSV, return_df = True)
    print("parsed_Drugbank.csv is created")
else:
    print("parsed_Drugbank exists...")
    print("Loading parsed_Drugbank.csv...")
    drugbank = pd.read_csv(DRUGBANK_CSV, encoding='utf-8')

parsed_Drugbank exists...
Loading parsed_Drugbank.csv...


In [7]:
#print(f"Number of rows: {len(df)}")
#print(f"Number of cols: {len(df.columns)}")

In [8]:
from typing import List
import pandas as pd
import os
import gc

"""
The main idea:  We want to maximize the amount of data remaining after merge between BindingDB and DrugBank
                We don't have an identifier that matches them clearly, so we defined a set of identifiers
                If any of those matches between the two databases then we consider them as a match and we merge

Since an immidiate left-join on every identifier will make the computation very slow we merge in two steps:

1)  We execute an inner join on every identifier and append the values under each other
    This will result in some duplicated rows (because a row is highly to match in more than 1 identifier)
2)  We execute a left join between the bindingDB and the merged dataframe => We don't loose data from bindingDB

    a) We need to remove the duplicated rows from point 1)
    b) After left joining we will have duplicated features as well, that should be removed. The duplicated features caused by
    the left-join will be marked with  *_duplicated*, so we know what features to remove
"""
class DrugBank_BindingDB_Merger:
    """Attach DrugBank records to BindingDB rows using several alternative identifiers.

    A BindingDB row counts as matched when at least one identifier column
    equals the same column of a DrugBank record. The work is done in two steps:

    1. One inner join per identifier. The per-identifier results are stacked,
       so a BindingDB row matching on several identifiers appears several times.
    2. The stacked matches are reduced to one row per BindingDB row and
       left-joined back onto BindingDB, so unmatched BindingDB rows are kept.
    """

    def __init__(self):
        self.drugbank_df = None
        self.binding_df = None
        self.merged_df = None

    def merge(self, drugbank_df : pd.DataFrame, binding_df : pd.DataFrame):
        """Merge DrugBank information into BindingDB.

        Both input frames are modified in place: their identifier columns are
        renamed and `binding_df` gains a 'Unique_ID' column.

        Args:
            drugbank_df: Parsed DrugBank table.
            binding_df: BindingDB table.

        Returns:
            The left-joined frame, also stored in `self.merged_df`. It has one
            row per BindingDB row.
        """
        self.drugbank_df = drugbank_df
        self.binding_df = binding_df

        #1)
        identifiers = self._rename_cols_and_get_identifiers()
        before_left_merge = self._merge_dataframes_on_identifiers(identifiers)

        #2)
        self._left_join(before_left_merge)

        return self.merged_df

    # Rename columns in BindingDB and in DrugBank to unify naming conventions
    def _rename_cols_and_get_identifiers(self) -> List[str]:
        """Give both frames common identifier column names and tag BindingDB rows.

        Returns:
            The identifier column names to try, in priority order. Names that
            end up missing from either frame are skipped later.
        """
        self.binding_df.rename(columns={
            'PubChem CID': 'PubChem_CID',
            'ChEBI ID of Ligand': 'ChEBI_ID',
            'ChEMBL ID of Ligand': 'ChEMBL_ID',
            'DrugBank ID of Ligand': 'DrugBank_ID',
            'KEGG ID of Ligand': 'KEGG_ID',
            'ZINC ID of Ligand': 'ZINC_ID',
            'Ligand SMILES': 'SMILES',
            'Ligand InChI Key': 'InChI_Key',
            'BindingDB MonomerID': 'BindingDB_ID',
        }, inplace=True)

        # NOTE(review): DrugParser fills 'pubchem' from the 'PubChem Substance'
        # external identifier, yet it is renamed to PubChem_CID here and compared
        # with BindingDB's 'PubChem CID' -- confirm both hold the same kind of ID.
        self.drugbank_df.rename(columns={
            'chebi': 'ChEBI_ID',
            'chembl': 'ChEMBL_ID',
            'pubchem': 'PubChem_CID',
            'PubChem Substance': 'PubChem_SID',
            'DrugBank IDs': 'DrugBank_ID',
            'bindingdb': 'BindingDB_ID',
            'ZINC': 'ZINC_ID',
            'SMILES': 'SMILES',
            'InChI': 'InChI_Key',
            'KEGG Compound': 'KEGG_ID'
        }, inplace=True)

        # Positional row id; used to de-duplicate matches and to join them back.
        self.binding_df['Unique_ID'] = np.arange(len(self.binding_df))

        # List of identifiers to merge on
        identifiers = [
            'PubChem_CID',
            'PubChem_SID',
            'ChEBI_ID',
            'ChEMBL_ID',
            'DrugBank_ID',
            'BindingDB_ID',
            'ZINC_ID',
            'SMILES',
            'InChI',
            'InChI_Key'
        ]

        return identifiers

    # Function to process and merge on each identifier individually
    def _merge_dataframes_on_identifiers(self, identifiers, output_file=None):
        """Inner-join BindingDB with DrugBank once per identifier and stack the matches.

        The per-identifier results have different column names, because the
        join key is the only shared column that gets no suffix. They are
        therefore combined with `pd.concat`, which aligns columns by name.
        Appending them to a CSV under the first frame's header would shift
        the DrugBank columns.

        Args:
            identifiers: Candidate key columns; those missing from either
                frame are skipped.
            output_file: Ignored. Accepted so that existing calls passing a
                scratch CSV path keep working; no file is written.

        Returns:
            DataFrame with every match plus a 'Matched_On' column naming the
            identifier that produced it. When nothing matches, an empty frame
            with only 'Unique_ID' and 'Matched_On' is returned.
        """
        matches = []
        for identifier in identifiers:
            if identifier not in self.binding_df.columns or identifier not in self.drugbank_df.columns:
                continue

            # Rows without the identifier cannot match; drop them first.
            binding_df_id = self.binding_df.dropna(subset=[identifier])
            drugbank_df_id = self.drugbank_df.dropna(subset=[identifier])

            # Compare keys as strings to avoid dtype mismatches. assign() builds
            # a new column instead of writing strings into a numeric column.
            binding_df_id = binding_df_id.assign(**{identifier: binding_df_id[identifier].astype(str)})
            drugbank_df_id = drugbank_df_id.assign(**{identifier: drugbank_df_id[identifier].astype(str)})

            merged_df = pd.merge(
                binding_df_id, drugbank_df_id,
                on=identifier,
                how='inner',
                suffixes=('_BindingDB', '_DrugBank')
            )

            if not merged_df.empty:
                # Record which identifier produced the match
                merged_df['Matched_On'] = identifier
                matches.append(merged_df)

        if not matches:
            # Keep the join key typed like BindingDB's so the left join still works.
            if 'Unique_ID' in self.binding_df.columns:
                id_dtype = self.binding_df['Unique_ID'].dtype
            else:
                id_dtype = 'int64'
            return pd.DataFrame({
                'Unique_ID': pd.Series(dtype=id_dtype),
                'Matched_On': pd.Series(dtype=object),
            })

        return pd.concat(matches, ignore_index=True, sort=False)


    def _left_join(self, merged_df: pd.DataFrame):
        """Join the matches back onto BindingDB and store the result in `self.merged_df`.

        Args:
            merged_df: Stacked matches from `_merge_dataframes_on_identifiers`;
                must contain 'Unique_ID'.
        """
        # One match per BindingDB row: keep the first occurrence, i.e. the match
        # found via the earliest identifier in the list. Without this the left
        # join would multiply BindingDB rows.
        unique_matches = merged_df.drop_duplicates(subset=['Unique_ID'])

        # Left join binding db with merged_df (on Unique_ID), don't duplicate columns though
        binding_readded = pd.merge(self.binding_df, unique_matches, on='Unique_ID', how='left', suffixes=('', '_duplicated'))
        all_cols = set(binding_readded.columns)

        # A '<name>_duplicated' column is dropped when '<name>' itself is present.
        cols_to_keep = [
            col for col in binding_readded.columns
            if not ('_duplicated' in col and col.split('_duplicated')[0] in all_cols)
        ]

        # Keep only the columns in cols_to_keep
        self.merged_df = binding_readded[cols_to_keep]

    def save_merged(self, output_file):
        """Write `self.merged_df` to `output_file` as CSV without the index."""
        self.merged_df.to_csv(output_file, index=False)

In [None]:
# Drop the two input frames and ask the garbage collector to reclaim their memory.
# NOTE(review): the merge cell below references `drugbank` and `bindingDB_df` when the
# merged CSV is missing; make sure they are (re)loaded there before use.
del bindingDB_df, drugbank
gc.collect()

'data/medical/Merged_Binding_DrugBank_LEFT.csv'

In [13]:
# Reuse the cached merge result when present; otherwise build it and cache it.
# Messages print the path actually used instead of a hard-coded file name.
if os.path.exists(output_file_merged):
    print(f"{output_file_merged} exists...")
    print(f"Loading {output_file_merged}...")
    merged_df = pd.read_csv(output_file_merged, encoding='utf-8')
else:
    print(f"{output_file_merged} doesn't exist...")
    print(f"Creating {output_file_merged}")
    # The clean-up cell above deletes both inputs, so reload them when they are gone;
    # otherwise a fresh run without the cached CSV fails with NameError.
    if 'drugbank' not in globals() or 'bindingDB_df' not in globals():
        drugbank = pd.read_csv(DRUGBANK_CSV, encoding='utf-8')
        bindingDB_df = load_BindingDB(BINDINGDB)
    drugbank_binding_merger = DrugBank_BindingDB_Merger()
    merged_df = drugbank_binding_merger.merge(drugbank, bindingDB_df)
    drugbank_binding_merger.save_merged(output_file_merged)
    print(f"{output_file_merged} is created")

Merged_Binding_DrugBank exists...
Loading Merged_Binding_DrugBank_LEFT.csv...


  merged_df = pd.read_csv(output_file_merged, encoding='utf-8')


In [16]:
from abc import ABC, abstractmethod
import re
from typing import List
import pandas as pd

class ColumnCleaningStrategy(ABC):
    """Interface of a single cleaning step that turns one DataFrame into another."""

    @abstractmethod
    def fill(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the cleaned version of `df`; concrete strategies must override this."""
        ...

class Preprocessing:
    """Applies an ordered list of cleaning strategies to a DataFrame."""

    def __init__(self, strategies: List[ColumnCleaningStrategy]) -> None:
        """Store the strategies; they are applied in list order by `transform`."""
        self.strategies = strategies

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run every strategy's `fill` in turn, starting from a copy of `df`.

        Returns:
            The result of the last strategy, or a plain copy of `df` when no
            strategies were given.
        """
        result = df.copy()
        for step in self.strategies:
            result = step.fill(result)
        return result
        

class CleanNumericAtrributesStrategy(ColumnCleaningStrategy):
    """Convert the binding measurement columns to plain floats.

    Cells may hold strings with extra characters (e.g. ' 0.24', '>10000'),
    values that are already numeric, or nothing at all. Every cell ends up
    either as a float or as the caller-supplied "missing" marker.
    """

    # Unsigned decimal with optional fraction and exponent: "10000", "0.24", ".5", "2.1E+5".
    _NUMBER_PATTERN = re.compile(r'(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?')

    def keep_just_numeric(self, value, new_class = -1):
        """Extract a float from one cell.

        Args:
            value: Raw cell content.
            new_class: Returned when no number can be extracted.

        Returns:
            - For a string: the first number it contains, as a float. Any
              sign or relation prefix ('>', '<', '-') is ignored, commas are
              removed first, and scientific notation is understood.
            - For an int or float that is not NaN: the value as a float.
            - Otherwise (None, NaN, bool, other types, strings without
              digits): `new_class`.
        """
        if isinstance(value, str):
            # Commas are dropped so "1,234.5" still reads as 1234.5.
            found = self._NUMBER_PATTERN.search(value.replace(',', ''))
            if found is None: # It didn't contain any number
                return new_class
            return float(found.group())

        # Mixed-type columns may already hold parsed numbers; keep them instead of discarding them.
        is_number = isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool)
        if is_number and not pd.isna(value):
            return float(value)

        return new_class

    def fill(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of `df` whose measurement columns contain floats or NaN.

        Raises:
            KeyError: if one of the six measurement columns is missing.
        """
        affinity_cols = ["Ki (nM)", "Kd (nM)"] ## Measures of binding affinity
        ec_ic = ["EC50 (nM)", "IC50 (nM)"] #### Measures of inhibitory (IC50) and effective concentrations (EC50)
        bind_unbind = ["kon (M-1-s-1)", "koff (s-1)"] ## Rates of binding / unbinding of ligands

        binding_ligand_efficency_cols = affinity_cols + ec_ic + bind_unbind
        filtered_df = df.copy()
        for af_col in binding_ligand_efficency_cols:
            # np.nan (lower case): the np.NaN alias no longer exists in NumPy 2.
            filtered_df[af_col] = filtered_df[af_col].apply(lambda x: self.keep_just_numeric(x, np.nan))

        return filtered_df
        
            


In [18]:
# Build the cleaning pipeline (numeric clean-up only) and apply it to the merged table.
preprocessor = Preprocessing([CleanNumericAtrributesStrategy()])
preprocessed_df = preprocessor.transform(merged_df)

# Preview the first rows of the cleaned frame.
preprocessed_df.head()

Unnamed: 0,BindingDB Reactant_set_id,SMILES,Ligand InChI,InChI_Key,BindingDB_ID,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),...,synonyms,categories,patent_approved,interaction,ChEMBL_ID_DrugBank,SMILES_DrugBank,InChI_Key_DrugBank,ChEBI_ID_DrugBank,BindingDB_ID_DrugBank,Matched_On
0,1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,InChI=1S/C31H42N2O7/c34-27(35)17-9-3-11-19-32-...,XGEGDSLAQZJGCW-HHGOQMMWSA-N,608734,"6-[(4R,5S,6S,7R)-4,7-dibenzyl-3-(5-carboxypent...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.24,,...,,,,,,,,,,
1,2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,UZLMEAPBHYEHAC-UNTBESQGSA-N,22,"(4R,5S,6S,7R)-4,7-dibenzyl-5,6-dihydroxy-1,3-b...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.25,,...,,,,,,,,,,
2,3,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,HYNYUFZPPJMPOB-UTWJFGBXSA-N,23,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.41,,...,,,,,,,,,,
3,4,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,YXVAZXDWVZTGGD-VIJSPRBVSA-N,24,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.8,,...,,,,,,,,,,
4,5,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,WWTSWTPNILRSJX-XDZXDJIYSA-N,25,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.99,,...,,,,,,,,,,


Unnamed: 0,BindingDB Reactant_set_id,Ligand SMILES,Ligand InChI,Ligand InChI Key,BindingDB MonomerID,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),...,UniProt (SwissProt) Recommended Name of Target Chain.12,UniProt (SwissProt) Entry Name of Target Chain.12,UniProt (SwissProt) Primary ID of Target Chain.12,UniProt (SwissProt) Secondary ID(s) of Target Chain.12,UniProt (SwissProt) Alternative ID(s) of Target Chain.12,UniProt (TrEMBL) Submitted Name of Target Chain.12,UniProt (TrEMBL) Entry Name of Target Chain.12,UniProt (TrEMBL) Primary ID of Target Chain.12,UniProt (TrEMBL) Secondary ID(s) of Target Chain.12,UniProt (TrEMBL) Alternative ID(s) of Target Chain.12
0,1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,InChI=1S/C31H42N2O7/c34-27(35)17-9-3-11-19-32-...,XGEGDSLAQZJGCW-HHGOQMMWSA-N,608734,"6-[(4R,5S,6S,7R)-4,7-dibenzyl-3-(5-carboxypent...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.24,-1.0,...,,,,,,,,,,
1,2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,UZLMEAPBHYEHAC-UNTBESQGSA-N,22,"(4R,5S,6S,7R)-4,7-dibenzyl-5,6-dihydroxy-1,3-b...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.25,-1.0,...,,,,,,,,,,
2,3,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,HYNYUFZPPJMPOB-UTWJFGBXSA-N,23,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.41,-1.0,...,,,,,,,,,,
3,4,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,YXVAZXDWVZTGGD-VIJSPRBVSA-N,24,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.8,-1.0,...,,,,,,,,,,
4,5,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,WWTSWTPNILRSJX-XDZXDJIYSA-N,25,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.99,-1.0,...,,,,,,,,,,


## Link to Drug Bank

In [4]:
# Load the merged table stored at MERGED_CSV in one go and preview its first rows.
# NOTE(review): the recorded output of this cell is a MemoryError.
merged = pd.read_csv(MERGED_CSV)
merged.head()

MemoryError: Unable to allocate 22.3 MiB for an array with shape (2927609,) and data type object

In [None]:
len(filtered_df['DrugBank ID of Ligand'].dropna())

87465

In [None]:
# Count how many rows match when joining on SMILES alone (default inner join).
# NOTE(review): `filtered_df` is not defined in the cells shown above; presumably it is
# left over from an earlier kernel session -- confirm or define it before this cell.
smile_merged = pd.merge(filtered_df, drugbank, left_on='Ligand SMILES', right_on='SMILES')
len(smile_merged)

6191

Processing identifier: PubChem_CID


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_id[identifier] = df1_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


Processing identifier: ChEBI_ID


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_id[identifier] = df1_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


Processing identifier: ChEMBL_ID


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_id[identifier] = df1_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


Processing identifier: BindingDB_ID


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


Processing identifier: SMILES


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


Processing identifier: InChI_Key


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_id[identifier] = df1_id[identifier].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_id[identifier] = df2_id[identifier].astype(str)


  merged_df = pd.read_csv(output_file)


In [None]:
merged_df['Matched_On'].value_counts()

Matched_On
InChI_Key      147426
ChEMBL_ID       96639
ChEBI_ID        74904
SMILES           6191
PubChem_CID         9
Name: count, dtype: int64

In [None]:
len(merged_df)

325169

In [None]:
merged_df['Unique_ID'].nunique()

150783

In [None]:
# Keep only the first row for each Unique_ID; this mutates `merged_df` in place.
merged_df.drop_duplicates(subset=['Unique_ID'], inplace=True)

In [None]:
binding_readded.head(2)

Unnamed: 0,BindingDB Reactant_set_id,SMILES,Ligand InChI,InChI_Key,BindingDB_ID,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),...,synonyms,categories,patent_approved,interaction,ChEMBL_ID_DrugBank,SMILES_DrugBank,InChI_Key_DrugBank,ChEBI_ID_DrugBank,BindingDB_ID_DrugBank,Matched_On
0,1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,InChI=1S/C31H42N2O7/c34-27(35)17-9-3-11-19-32-...,XGEGDSLAQZJGCW-HHGOQMMWSA-N,608734,"6-[(4R,5S,6S,7R)-4,7-dibenzyl-3-(5-carboxypent...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.24,-1.0,...,,,,,,,,,,
1,2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,UZLMEAPBHYEHAC-UNTBESQGSA-N,22,"(4R,5S,6S,7R)-4,7-dibenzyl-5,6-dihydroxy-1,3-b...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.25,-1.0,...,,,,,,,,,,


In [None]:
# Left join binding db with merged_df (on Unique_ID), don't duplicate columns though
# NOTE(review): `BindingDB` is not defined in the cells shown above, and the same steps
# exist as DrugBank_BindingDB_Merger._left_join (with the suffix '_duplicated').
binding_readded = pd.merge(BindingDB, merged_df, on='Unique_ID', how='left', suffixes=('', '_y'))
all_cols = list(binding_readded.columns)
cols_to_keep = []

# Drop a column containing '_y' when the text before '_y' is itself a column name.
for col in all_cols:
    if not ('_y' in col and col.split('_y')[0] in all_cols):
        cols_to_keep.append(col)

In [None]:
# Left join binding db with merged_df (on Unique_ID), don't duplicate columns though
# NOTE(review): this repeats the previous cell and then saves the result; `BindingDB`
# is not defined in the cells shown above.
binding_readded = pd.merge(BindingDB, merged_df, on='Unique_ID', how='left', suffixes=('', '_y'))
all_cols = list(binding_readded.columns)
cols_to_keep = []

# Drop a column containing '_y' when the text before '_y' is itself a column name.
for col in all_cols:
    if not ('_y' in col and col.split('_y')[0] in all_cols):
        cols_to_keep.append(col)
# Keep only the columns in cols_to_keep
cleaned_readded = binding_readded[cols_to_keep]
# Written relative to the current working directory, not under DATA_PATH.
cleaned_readded.to_csv('Merged Binding DrugBank.csv', index=False)

In [None]:
cleaned_readded.to_csv('Merged Binding DrugBank.csv', index=False)

NameError: name 'os' is not defined

In [None]:
import os
# Delete the file at `output_file` if it exists.
# NOTE(review): in the paths cell `output_file` is the parsed DrugBank CSV, so running
# this removes that cache -- confirm this is intended.
if os.path.exists(output_file):
    os.remove(output_file)