# In [9]:
import pandas as pd 
import numpy as np

def STITCHgetDBID(
    structure_links_path="structure_links.csv",
    cid_inchikey_path="resp_text_actions.txt",
    stitch_path="actions.v5.0.700.t.d.tsv",
    output_path="mapped_DB_STITCH_actions_first.tsv",
):
    """
    Function:
    Map STITCH chemical-protein interactions to DrugBank ID's and write the result to a tab-separated file.

    Two datasets are opened first. One is a dataset downloaded from DrugBank that lists every DrugBank ID together with
    other identifiers. The other is a list of CID's and InChIKeys. They are joined on the InChIKey, the identifier
    columns that are not needed are dropped, and the result is joined to the STITCH chemical-protein interactions on
    the CID. Rows with a missing InChIKey (or a missing CID in the CID list) are discarded before joining, because
    they cannot be mapped and would otherwise be matched to each other.

    Parameters:
    structure_links_path = path of the DrugBank structure links CSV file.
    cid_inchikey_path = path of the comma-separated file with the "CID" and "InChIKey" columns.
    stitch_path = path of the tab-separated STITCH actions file (needs an "item_id_a" column).
    output_path = path of the tab-separated file that is written.
    All four default to the file names this function has always used, so calling it without arguments is unchanged.

    Variables:
    structure_links = DrugBank ID's with their other ID's, without duplicates and without rows lacking an InChIKey.
    CID_InchiKey = CID / InChIKey pairs, without duplicates and without rows lacking either value.
    merged = structure_links and CID_InchiKey joined on the InChIKey.
    merged_filtered = merged without the identifier columns that are not needed.
    STITCH_data = the STITCH dataset, with "item_id_a" renamed to "CID" and reduced to the numeric part.
    mapped = merged_filtered and STITCH_data joined on the "CID" column.

    Returns:
    None. The mapped dataset is written to output_path (the DataFrame index is written as the first column).
    """
    structure_links = pd.read_csv(structure_links_path).drop_duplicates()
    CID_InchiKey = pd.read_csv(cid_inchikey_path).drop_duplicates()

    # Rows without a join key cannot be mapped. Left in, the missing keys on
    # both sides would be joined to each other and create false CID/DrugBank
    # pairs; a missing CID would additionally make the integer cast fail.
    structure_links = structure_links.dropna(subset=["InChIKey"]).copy()
    CID_InchiKey = CID_InchiKey.dropna(subset=["InChIKey", "CID"]).copy()

    # Make sure both key columns have the same dtype before joining.
    structure_links["InChIKey"] = structure_links["InChIKey"].astype(str)
    CID_InchiKey["InChIKey"] = CID_InchiKey["InChIKey"].astype(str)

    merged = pd.merge(CID_InchiKey, structure_links, on=["InChIKey"])
    merged_filtered = merged.drop(
        [
            "CAS Number",
            "InChI",
            "SMILES",
            "Formula",
            "KEGG Compound ID",
            "KEGG Drug ID",
            "PubChem Compound ID",
            "PubChem Substance ID",
            "ChEBI ID",
            "ChEMBL ID",
            "HET ID",
            "ChemSpider ID",
            "BindingDB ID",
        ],
        axis=1,
    )
    merged_filtered["CID"] = merged_filtered["CID"].astype(int)

    STITCH_data = pd.read_csv(stitch_path, sep="\t")
    STITCH_data = STITCH_data.rename(columns={"item_id_a": "CID"})
    # Remove the textual "CIDm" / "CIDs" prefix explicitly (anchored at the
    # start) instead of stripping a character set, then compare CIDs as ints.
    STITCH_data["CID"] = (
        STITCH_data["CID"].str.replace(r"^CID[ms]?", "", regex=True).astype(int)
    )

    mapped = pd.merge(merged_filtered, STITCH_data, on=["CID"])
    # NOTE: an optional filter on 'Drug Groups' (approved / investigational /
    # experimental) was disabled in the original code and is not applied here.
    mapped = mapped.drop(["mode", "action", "a_is_acting", "score"], axis=1)

    mapped.to_csv(output_path, sep="\t")
    
    
def main():
    """Script entry point: run the DrugBank/STITCH mapping step (STITCHgetDBID)."""
    STITCHgetDBID()


# Guarded so that importing this module does not read the input files and
# write the output file as a side effect; running it as a script (or as a
# notebook cell) still executes the pipeline.
if __name__ == "__main__":
    main()