This example notebook shows the local execution of Comp2Prot and of the individual databases.

In [3]:
#General packages
import pandas as pd
from tqdm import tqdm
import os
from cpiextract import Comp2Prot
import time

# Load in Required Datasets

In [2]:
# Directory that holds the locally downloaded database files
data_path = 'data/Databases/'

# BindingDB (downloaded 3/30/2023): only the listed columns are read; malformed rows are skipped
file_path = os.path.join(data_path, 'BindingDB.csv')
BDB_data = pd.read_csv(
    file_path,
    sep=',',
    usecols=[
        'CID',
        'Ligand SMILES',
        'Ligand InChI',
        'BindingDB MonomerID',
        'Ligand InChI Key',
        'BindingDB Ligand Name',
        'Target Name Assigned by Curator or DataSource',
        'Target Source Organism According to Curator or DataSource',
        'Ki (nM)',
        'IC50 (nM)',
        'Kd (nM)',
        'EC50 (nM)',
        'pH',
        'Temp (C)',
        'Curation/DataSource',
        'UniProt (SwissProt) Entry Name of Target Chain',
        'UniProt (SwissProt) Primary ID of Target Chain',
    ],
    on_bad_lines='skip',
)

# STITCH (downloaded 2/22/2023): tab-separated
file_path = os.path.join(data_path, 'STITCH.tsv')
sttch_data = pd.read_csv(file_path, sep='\t')

# ChEMBL (downloaded 2/01/2024)
file_path = os.path.join(data_path, 'ChEMBL.csv')
chembl_data = pd.read_csv(file_path, sep=',')

# CTD (the CTD-only cell further down records a 2/24/2023 download)
file_path = os.path.join(data_path, 'CTD.csv')
CTD_data = pd.read_csv(file_path, sep=',')

# DTC (downloaded 2/24/2023): only the listed columns are read
file_path = os.path.join(data_path, 'DTC.csv')
DTC_data = pd.read_csv(
    file_path,
    sep=',',
    usecols=[
        'CID',
        'compound_id',
        'standard_inchi_key',
        'target_id',
        'gene_names',
        'wildtype_or_mutant',
        'mutation_info',
        'standard_type',
        'standard_relation',
        'standard_value',
        'standard_units',
        'activity_comment',
        'pubmed_id',
        'doc_type',
    ],
)

# DrugBank (downloaded 3/2/2022)
file_path = os.path.join(data_path, 'DB.csv')
DB_data = pd.read_csv(file_path, sep=',')

# DrugCentral (downloaded 2/25/2024)
file_path = os.path.join(data_path, 'DrugCentral.csv')
DC_data = pd.read_csv(file_path, sep=',')


  BDB_data=pd.read_csv(file_path,sep=',',usecols=['CID', 'Ligand SMILES','Ligand InChI','BindingDB MonomerID','Ligand InChI Key','BindingDB Ligand Name','Target Name Assigned by Curator or DataSource','Target Source Organism According to Curator or DataSource','Ki (nM)','IC50 (nM)','Kd (nM)','EC50 (nM)','pH','Temp (C)','Curation/DataSource','UniProt (SwissProt) Entry Name of Target Chain','UniProt (SwissProt) Primary ID of Target Chain'],on_bad_lines='skip')
  chembl_data=pd.read_csv(file_path,sep=',')
  DTC_data=pd.read_csv(file_path,sep=',',usecols=['CID', 'compound_id','standard_inchi_key','target_id','gene_names','wildtype_or_mutant','mutation_info','standard_type','standard_relation','standard_value','standard_units','activity_comment','pubmed_id','doc_type'])


In [3]:
# Loaded database tables keyed by short name; this mapping is handed to Comp2Prot via `dbs=`.
dbs = dict(
    chembl=chembl_data,
    bdb=BDB_data,
    stitch=sttch_data,
    ctd=CTD_data,
    dtc=DTC_data,
    db=DB_data,
    dc=DC_data,
)

## Load in Compound List

In [7]:
# Compound list to process; the batch loop below reads integer IDs from its 'cid' column.
chem_dat=pd.read_csv('data/input/db_compounds.csv')

## Run Comp2Prot on Compound List

In [6]:
# Build the Comp2Prot pipeline in 'local' mode, passing it the already-loaded tables in `dbs`.
C2P = Comp2Prot('local', dbs=dbs)

In [8]:
# Filled by the batch loop: loop index -> wall-clock seconds of each successful comp_interactions call.
times = {}

In [9]:
# Directory that receives the checkpoint files and the final C2P.csv written by the batch loop.
output_path = 'data/output/'

In [None]:
# Batch run: call Comp2Prot for the first rows of `chem_dat`, collect the
# per-compound results, and periodically checkpoint them to `output_path`.
#
# Results left in the namespace for the following cells:
#   c2p_dat  - concatenation of every successful `comp_dat`
#   states   - concatenation of every successful `state`
#   c2p_fail - one row per compound ID whose call raised (column "failed_id")
#   times    - (defined in an earlier cell) loop index -> seconds per successful call

# A checkpoint CSV is written whenever the loop index is a multiple of this value.
CHECKPOINT_EVERY = 50

# Per-compound frames are collected in lists and concatenated only when needed,
# instead of re-copying a growing DataFrame on every iteration.
comp_frames = []
state_frames = []

c2p_dat = pd.DataFrame()
c2p_fail = pd.DataFrame()
states = pd.DataFrame()

# Make sure the checkpoint/final files have somewhere to go.
os.makedirs(output_path, exist_ok=True)

r = 0
for h in tqdm(range(0, 10)):

    # `except Exception` (not a bare `except`) keeps the best-effort behaviour
    # while still letting KeyboardInterrupt stop the loop.
    try:
        inputid = int(chem_dat['cid'].iloc[h])
    except Exception:
        # Row missing or ID not convertible to int: skip it.
        continue

    start = time.time()
    try:
        comp_dat, state = C2P.comp_interactions(inputid)
    except Exception:
        print(f'{h} failed')
        c2p_fail.loc[r, "failed_id"] = inputid  # Collects failed inputids
        r = r + 1
    else:
        times[h] = time.time() - start
        state_frames.append(state)
        comp_frames.append(comp_dat)
        print(f'{h} done')

    # Saves a file after a certain number of loops for insurance
    if h % CHECKPOINT_EVERY == 0:
        c2p_dat = pd.concat(comp_frames) if comp_frames else pd.DataFrame()
        filename = os.path.join(output_path, 'C2P_iter_' + str(h) + '.csv')
        c2p_dat.to_csv(filename, sep=',', index=False)

# Assemble the final frames once; pd.concat raises on an empty list, hence the guards.
c2p_dat = pd.concat(comp_frames) if comp_frames else pd.DataFrame()
states = pd.concat(state_frames) if state_frames else pd.DataFrame()

# Save completed dataframe to file
filename = os.path.join(output_path, 'C2P.csv')
c2p_dat.to_csv(filename, sep=',', index=False)

In [None]:
# Show the combined results accumulated by the batch loop above.
c2p_dat

In [None]:
# Show the `state` values collected from each successful call in the batch loop.
states

# Run Comp2Prot on Single Compound

## Comp2Prot with all Databases

### Test time required

In [None]:
# Alternative string identifier kept for reference (looks like an InChIKey — TODO confirm accepted input types).
#inputid='AAWZDTNXLSGCEK-WYWMIBKRSA-N'
# cProfile is used in the next cell to profile one pipeline call.
import cProfile
# Numeric compound ID to profile.
inputid=5350

In [None]:
# Profile one comp_interactions call. cProfile.run executes the statement string in the
# __main__ namespace, so `C2P` and `inputid` must already be defined by earlier cells.
cProfile.run('C2P.comp_interactions(inputid)')

### Run pipeline

In [None]:
# Compound ID for the single-compound run below (replaces the value used for profiling).
inputid = 317

In [None]:
# Fresh Comp2Prot instance in 'local' mode over the tables in `dbs`.
C2P = Comp2Prot('local', dbs=dbs)

In [None]:
# Run the full pipeline for one compound.
# Note: this rebinds `states`, replacing the frame built by the batch loop.
comp_dat, states = C2P.comp_interactions(input_id=inputid)

In [None]:
# Show the second value returned by comp_interactions for this compound.
states

In [None]:
# Show the first value returned by comp_interactions for this compound.
comp_dat

### Comp2Prot with select Databases

In [None]:
# Fresh Comp2Prot instance in 'local' mode over the tables in `dbs`.
C2P = Comp2Prot('local', dbs=dbs)

In [None]:
# Alternative string identifier kept for reference (looks like an InChIKey — TODO confirm accepted input types).
#inputid='AAWZDTNXLSGCEK-WYWMIBKRSA-N'
# Numeric compound ID for the select-database run below.
inputid=39929

In [None]:
# The second argument is a single string naming the databases to query;
# when more than one database is requested, separate the names with underscores.
# Note: this rebinds `comp_dat` and `states` from the previous run.
comp_dat, states = C2P.comp_interactions_select(inputid, 'ctd')

In [None]:
# Show the first value returned by comp_interactions_select (CTD only).
comp_dat

# Run each step/database separately (for troubleshooting)

In [None]:
# Alternative string identifier kept for reference (looks like an InChIKey — TODO confirm accepted input types).
#inp='AAWZDTNXLSGCEK-WYWMIBKRSA-N'
# pandas and os are imported again here, presumably so this section can run without the cells above — confirm.
import pandas as pd
import os
# NOTE(review): this wildcard import is the only visible source of the PubChem, ChEMBL,
# BindingDB, Stitch, CTD, DTC, OTP, DrugCentral and DB names used in the cells below;
# explicit imports would make that dependency easier to see.
from cpiextract.databases import *
from cpiextract.utils import compound_identifiers
# Numeric compound ID used by every single-database example in this section.
inp=2830389
# Computed once; each database's interactions() call below receives this as `input_comp`.
comp_ids=compound_identifiers(inp)

### Run PubChem only

In [None]:
# PubChem is constructed without a database table, unlike the file-backed sources below.
pubchem = PubChem()
# Returns three values, bound here as (pc_dat, pc_state, pc_raw); pChEMBL_thres=0 is passed as the threshold.
pc_dat, pc_state, pc_raw = pubchem.interactions(input_comp=comp_ids, pChEMBL_thres=0)

In [None]:
# Show the first value returned by pubchem.interactions.
pc_dat

In [None]:
# NOTE(review): this repeats the previous cell's output; possibly meant to show pc_raw — confirm.
pc_dat

In [None]:
# Show the second value returned by pubchem.interactions.
pc_state

### Run ChEMBL only

In [None]:
# Empty list passed as `chembl_ids` to chembl.interactions below.
chembl_ids = []

In [None]:
# ChEMBL table read from <cwd>/data/Databases/ChEMBL.csv.
# os.path.join supplies the separator between the working directory and 'data'
# (plain string concatenation dropped it) and works on every platform.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'ChEMBL.csv')
chembl_data = pd.read_csv(file_path, sep=',')

In [None]:
# Query ChEMBL alone for the compound; the call yields three values (data, state, raw).
chembl = ChEMBL(database=chembl_data)
chembl_dat, chembl_state, chembl_raw = chembl.interactions(
    input_comp=comp_ids,
    pChEMBL_thres=0,
    chembl_ids=chembl_ids,
)

In [None]:
# Show the first value returned by chembl.interactions.
chembl_dat

In [None]:
# Show the second value returned by chembl.interactions.
chembl_state

### Run BindingDB only

In [None]:
#Downloaded from BindingDB on 3/30/2023
# Tab-separated file; only the listed columns are read and malformed rows are skipped.
# The path is built with os.path.join so it also resolves on non-Windows systems.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'BindingDB_All.tsv')
BDB_data = pd.read_csv(
    file_path,
    sep='\t',
    usecols=[
        'Ligand SMILES',
        'Ligand InChI',
        'BindingDB MonomerID',
        'Ligand InChI Key',
        'BindingDB Ligand Name',
        'Target Name Assigned by Curator or DataSource',
        'Target Source Organism According to Curator or DataSource',
        'Ki (nM)',
        'IC50 (nM)',
        'Kd (nM)',
        'EC50 (nM)',
        'pH',
        'Temp (C)',
        'Curation/DataSource',
        'UniProt (SwissProt) Entry Name of Target Chain',
        'UniProt (SwissProt) Primary ID of Target Chain',
    ],
    on_bad_lines='skip',
)

In [None]:
# Query BindingDB alone for the compound; the call yields three values (data, state, raw).
bdb = BindingDB(database=BDB_data)
bdb_dat, bdb_state, bdb_raw = bdb.interactions(
    input_comp=comp_ids,
    pChEMBL_thres=0,
)

In [None]:
# Show the third value returned by bdb.interactions.
bdb_raw

In [None]:
# Show the second value returned by bdb.interactions.
bdb_state

In [None]:
# Show the first value returned by bdb.interactions.
bdb_dat

### Run STITCH only

In [None]:
#Downloaded from STITCH on 2/22/2023
# Tab-separated file; the path is built with os.path.join so it also resolves on non-Windows systems.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'STITCH.tsv')
sttch_data = pd.read_csv(file_path, sep='\t')

In [None]:
# Wrap the loaded STITCH table; queried in the next cell.
stitch = Stitch(database=sttch_data)

In [None]:
# Query STITCH alone for the compound (set_stereo=1); the call yields three values (data, state, raw).
sttch_dat, sttch_state, sttch_raw = stitch.interactions(
    input_comp=comp_ids,
    set_stereo=1,
)

In [None]:
# Show the first value returned by stitch.interactions.
sttch_dat

### Run CTD only

In [None]:
#Downloaded from CTD on 2/24/2023
# The path is built with os.path.join so it also resolves on non-Windows systems.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'CTD.csv')
CTD_data = pd.read_csv(file_path, sep=',')

In [None]:
# Query CTD alone for the compound; the call yields three values (data, state, raw).
ctd = CTD(database=CTD_data)
ctd_dat, ctd_state, ctd_raw = ctd.interactions(input_comp=comp_ids)

In [None]:
# Show the first value returned by ctd.interactions.
ctd_dat

### Run DTC only

In [None]:
#Downloaded from DTC on 2/24/2023
# Only the listed columns are read. The path is built with os.path.join so it
# also resolves on non-Windows systems.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'DTC.csv')
DTC_data = pd.read_csv(
    file_path,
    sep=',',
    usecols=[
        'compound_id',
        'standard_inchi_key',
        'target_id',
        'gene_names',
        'wildtype_or_mutant',
        'mutation_info',
        'standard_type',
        'standard_relation',
        'standard_value',
        'standard_units',
        'activity_comment',
        'pubmed_id',
        'doc_type',
    ],
)

In [None]:
# Query DTC alone for the compound; the call yields three values (data, state, raw).
dtc = DTC(database=DTC_data)
dtc_dat, dtc_state, dtc_raw = dtc.interactions(
    input_comp=comp_ids,
    chembl_ids=[],
    pChEMBL_thres=0,
)

In [None]:
# Show the first value returned by dtc.interactions.
dtc_dat

### Run OTP only

In [None]:
# OTP is constructed without a database table; the call yields three values (data, state, raw).
otp = OTP()
otp_dat, otp_state, otp_raw = otp.interactions(
    input_comp=comp_ids,
    chembl_ids=[],
    intqual=2,
)

In [None]:
# Show the first value returned by otp.interactions.
otp_dat

### Run DrugCentral only

In [None]:
# File obtained in the last cells of this notebook
# The path is built with os.path.join so it also resolves on non-Windows systems.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'DrugCentral.csv')
DC_data = pd.read_csv(file_path, sep=',')

In [None]:
# Query DrugCentral alone for the compound; the call yields three values (data, state, raw).
dc = DrugCentral(database=DC_data)
DC_dat, DC_state, DC_raw = dc.interactions(input_comp=comp_ids)

In [None]:
# Show the first value returned by dc.interactions.
DC_dat

In [None]:
# Show the second value returned by dc.interactions.
DC_state

### Run DrugBank only

In [None]:
#Downloaded from DrugBank on 3/2/2022
# The path is built with os.path.join so it also resolves on non-Windows systems.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'DB.csv')
DB_data = pd.read_csv(file_path, sep=',')

In [None]:
# Query DrugBank alone for the compound; the call yields three values (data, state, raw).
db = DB(database=DB_data)
DB_dat, DB_state, DB_raw = db.interactions(input_comp=comp_ids)

In [None]:
# Show the first value returned by db.interactions.
DB_dat

In [None]:
# Show the second value returned by db.interactions.
DB_state