This example notebook shows the local execution of Prot2Comp and of the individual databases.

In [1]:
#General packages
import pandas as pd
from tqdm import tqdm
import os
import time
from cpiextract import Prot2Comp

# Load in Required Datasets

In [None]:
# Folder that contains every locally downloaded database file
data_path = 'data/Databases/'

# --- BindingDB (downloaded 3/30/2023) ---
# Only the columns listed here are read; lines pandas cannot parse are skipped.
bdb_columns = [
    'CID',
    'Ligand SMILES',
    'Ligand InChI',
    'BindingDB MonomerID',
    'Ligand InChI Key',
    'BindingDB Ligand Name',
    'Target Name Assigned by Curator or DataSource',
    'Target Source Organism According to Curator or DataSource',
    'Ki (nM)',
    'IC50 (nM)',
    'Kd (nM)',
    'EC50 (nM)',
    'pH',
    'Temp (C)',
    'Curation/DataSource',
    'UniProt (SwissProt) Entry Name of Target Chain',
    'UniProt (SwissProt) Primary ID of Target Chain',
]
file_path = os.path.join(data_path, 'BindingDB.csv')
BDB_data = pd.read_csv(file_path, sep=',', usecols=bdb_columns, on_bad_lines='skip')

# --- STITCH (downloaded 2/22/2023), tab-separated ---
file_path = os.path.join(data_path, 'STITCH.tsv')
sttch_data = pd.read_csv(file_path, sep='\t')

# --- ChEMBL (downloaded 2/01/2024) ---
file_path = os.path.join(data_path, 'ChEMBL.csv')
chembl_data = pd.read_csv(file_path, sep=',')

# --- CTD (the CTD-only cell further down records a 2/24/2023 download date) ---
file_path = os.path.join(data_path, 'CTD.csv')
CTD_data = pd.read_csv(file_path, sep=',')

# --- DTC (downloaded 2/24/2023) ---
# Only the columns listed here are read.
dtc_columns = [
    'CID',
    'compound_id',
    'standard_inchi_key',
    'target_id',
    'gene_names',
    'wildtype_or_mutant',
    'mutation_info',
    'standard_type',
    'standard_relation',
    'standard_value',
    'standard_units',
    'activity_comment',
    'pubmed_id',
    'doc_type',
]
file_path = os.path.join(data_path, 'DTC.csv')
DTC_data = pd.read_csv(file_path, sep=',', usecols=dtc_columns)

# --- DrugBank (downloaded 3/2/2022) ---
file_path = os.path.join(data_path, 'DB.csv')
DB_data = pd.read_csv(file_path, sep=',')

# --- DrugCentral (downloaded 2/25/2024) ---
file_path = os.path.join(data_path, 'DrugCentral.csv')
DC_data = pd.read_csv(file_path, sep=',')


In [None]:
# Map each short database key to its loaded table; this mapping is what
# gets handed to Prot2Comp through its `dbs` argument.
dbs = dict(
    bdb=BDB_data,
    chembl=chembl_data,
    stitch=sttch_data,
    ctd=CTD_data,
    dtc=DTC_data,
    dc=DC_data,
    db=DB_data,
)

## Load in Protein List

In [None]:
# Protein list to process; the batch loop below reads its 'HGNC ID' column.
inputids = pd.read_csv('data/input/db_proteins.csv')

## Run Prot2Comp on Protein List

In [None]:
# Elapsed seconds of each successful prot_interactions call, keyed by loop index
# (filled in by the batch loop below).
times = {}

In [None]:
# Build the pipeline object in 'local' mode and hand it the loaded database tables.
P2C = Prot2Comp('local', dbs=dbs)

In [None]:
# Folder that receives the checkpoint files and the final P2C.csv.
output_path = 'data/output/'

In [None]:
# Batch run: query Prot2Comp for the first proteins of the input list and collect
# the interaction tables, the per-call `state` outputs and the ids that failed.
N_PROTEINS = 10        # how many rows of `inputids` this example processes
CHECKPOINT_EVERY = 50  # write an intermediate CSV whenever the loop index is a multiple of this

# Create the output folder up front so neither the checkpoint nor the final write
# fails on a fresh checkout.
os.makedirs(output_path, exist_ok=True)

comp_frames = []   # one interaction table per successful protein
state_frames = []  # one `state` output per successful protein
failed_ids = []    # input ids whose prot_interactions call raised


def _concat_all(frames):
    """Concatenate the collected frames once.

    The leading empty DataFrame keeps the result identical to growing an empty
    frame incrementally, and makes an empty `frames` list yield an empty frame.
    """
    return pd.concat([pd.DataFrame(), *frames])


# Never index past the end of the input list when it has fewer rows than N_PROTEINS.
for h in tqdm(range(min(N_PROTEINS, len(inputids)))):
    protid = inputids['HGNC ID'].iloc[h]
    start = time.time()
    try:
        comp_dat, state = P2C.prot_interactions(input_id=protid, pChEMBL_thres=0)
    except Exception as exc:
        # Record the failed id and keep going. Catching Exception (not a bare
        # except) lets Ctrl-C / kernel interrupts stop the loop.
        print(f'{h} failed ({protid}): {exc!r}')
        failed_ids.append(protid)
    else:
        times[h] = time.time() - start
        comp_frames.append(comp_dat)
        state_frames.append(state)
        print(f'{h} done')

    # Save progress periodically for insurance (also fires on the first iteration, h == 0).
    if h % CHECKPOINT_EVERY == 0:
        filename = os.path.join(output_path, 'P2C_iter_' + str(h) + '.csv')
        # index=False so checkpoints have the same layout as the final file.
        _concat_all(comp_frames).to_csv(filename, sep=',', index=False)

p2c_dat = _concat_all(comp_frames)
states = _concat_all(state_frames)
p2c_fail = pd.DataFrame({'failed_id': failed_ids})

# Save completed dataframe to file
filename = os.path.join(output_path, 'P2C.csv')
p2c_dat.to_csv(filename, sep=',', index=False)

In [None]:
# Display the interaction table accumulated by the batch loop.
p2c_dat

In [None]:
# Display the `state` outputs accumulated by the batch loop.
states

# Run Prot2Comp on Single Protein

## Prot2Comp with all Databases

### Test time required

In [None]:
# NOTE(review): the commented-out id below looks like an InChIKey (a compound
# identifier) rather than a protein id — confirm whether it can be removed.
#inputid='AAWZDTNXLSGCEK-WYWMIBKRSA-N'
import cProfile
# Protein used for the profiling run in the next cell.
protid='KLK1'

In [None]:
# Profile a single prot_interactions call; cProfile.run executes the statement
# string and prints the collected statistics. Unlike the other calls in this
# notebook, pChEMBL_thres is not passed here.
cProfile.run('P2C.prot_interactions(protid)')

### Run pipeline

In [None]:
# Identifier for the single-protein run.
# NOTE(review): looks like a UniProt accession — confirm.
inp = 'Q12809'

In [None]:
# Re-create the pipeline object in 'local' mode with the loaded database tables.
P2C = Prot2Comp('local', dbs=dbs)

In [None]:
# Query all databases for `inp`. Note that this rebinds `states`, replacing the
# table produced by the batch loop.
comp_dat, states = P2C.prot_interactions(input_id=inp, pChEMBL_thres=0)

In [None]:
# Display the second value returned by prot_interactions for `inp`.
states

In [None]:
# Display the first value returned by prot_interactions for `inp`.
comp_dat

### Prot2Comp with select Databases

In [None]:
# NOTE(review): the commented-out id below looks like an InChIKey (a compound
# identifier) rather than a protein id — confirm whether it can be removed.
#inputid='AAWZDTNXLSGCEK-WYWMIBKRSA-N'
# Protein queried against the selected databases below.
inputid='KLK1'

In [None]:
# Re-create the pipeline object in 'local' mode with the loaded database tables.
P2C = Prot2Comp('local', dbs=dbs)

In [None]:
#The second argument is a string listing the databases needed.
#Underscore is required to separate the databases.
# NOTE(review): the token 'sttch' differs from the 'stitch' key used in the `dbs`
# mapping — confirm which spelling prot_interactions_select expects.
[comp_dat,states]=P2C.prot_interactions_select(inputid,'pc_db_ctd_bdb_sttch')

In [None]:
# Display the first value returned by prot_interactions_select.
comp_dat

In [None]:
# Display the second value returned by prot_interactions_select.
states

# Run each step/database separately (for troubleshooting)

In [1]:
from cpiextract.utils.identifiers import *
import os
import pandas as pd
from cpiextract.databases import *
from cpiextract import protein_identifiers
# Protein used by every per-database cell below.
input_id = 'KLK1'
# Result of protein_identifiers for that input; it is passed to each database's compounds() call.
protids = protein_identifiers(input_id)

In [None]:
# Display what protein_identifiers returned for input_id.
protids

### Run PubChem only

In [None]:
# PubChem takes no local table; query it with the protein identifiers.
pubchem = PubChem()
pc_dat, pc_state, pc_raw = pubchem.compounds(protids)

In [None]:
# Display the first value returned by pubchem.compounds.
pc_dat

In [None]:
# Display the second value returned by pubchem.compounds.
pc_state

### Run ChEMBL only

In [None]:
#Downloaded from ChEMBL on 2/01/2024
# os.path.join keeps the path valid on every OS; literal '\\' separators only
# resolve on Windows.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'ChEMBL.csv')
chembl_data = pd.read_csv(file_path, sep=',')

In [None]:
# Wrap the ChEMBL table and query it with the protein identifiers.
chembl = ChEMBL(database=chembl_data)
chembl_dat, state, chembl_raw = chembl.compounds(protids)

In [None]:
# Display the first value returned by chembl.compounds.
chembl_dat

In [None]:
# Display the second value returned by chembl.compounds.
state

### Run BindingDB only

In [None]:
#Downloaded from BindingDB on 3/30/2023
# os.path.join keeps the path valid on every OS; literal '\\' separators only
# resolve on Windows.
# NOTE(review): the loader cell at the top of this notebook reads 'BindingDB.csv'
# (comma-separated) and also loads a 'CID' column, while this cell reads
# 'BindingDB.tsv' (tab-separated) without 'CID' — confirm which file and column
# set BindingDB expects.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'BindingDB.tsv')
bdb_columns = [
    'Ligand SMILES',
    'Ligand InChI',
    'BindingDB MonomerID',
    'Ligand InChI Key',
    'BindingDB Ligand Name',
    'Target Name Assigned by Curator or DataSource',
    'Target Source Organism According to Curator or DataSource',
    'Ki (nM)',
    'IC50 (nM)',
    'Kd (nM)',
    'EC50 (nM)',
    'pH',
    'Temp (C)',
    'Curation/DataSource',
    'UniProt (SwissProt) Entry Name of Target Chain',
    'UniProt (SwissProt) Primary ID of Target Chain',
]
# Read only the listed columns; lines pandas cannot parse are skipped.
BDB_data = pd.read_csv(file_path, sep='\t', usecols=bdb_columns, on_bad_lines='skip')

In [None]:
# Wrap the BindingDB table and query it with the protein identifiers.
# NOTE(review): the second positional argument (0) is presumably the pChEMBL
# threshold used elsewhere as pChEMBL_thres=0 — confirm against the BindingDB API.
bdb = BindingDB(database=BDB_data)
bdb_dat, bdb_state, bdb_raw = bdb.compounds(protids, 0)

In [None]:
# Display the third value returned by bdb.compounds.
bdb_raw

In [None]:
# Display the second value returned by bdb.compounds.
bdb_state

### Run STITCH only

In [None]:
#Downloaded from STITCH on 2/22/2023
# os.path.join keeps the path valid on every OS; literal '\\' separators only
# resolve on Windows.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'STITCH.tsv')
sttch_data = pd.read_csv(file_path, sep='\t')

In [None]:
# Wrap the STITCH table and query it with the protein identifiers.
stitch = Stitch(database=sttch_data)
stitch_dat, stitch_state, stitch_raw = stitch.compounds(protids)

In [None]:
# Display the third value returned by stitch.compounds.
stitch_raw

In [None]:
# Display the first value returned by stitch.compounds.
stitch_dat

In [None]:
# Display the second value returned by stitch.compounds.
stitch_state

### Run CTD only

In [None]:
#Downloaded from CTD on 2/24/2023
# os.path.join keeps the path valid on every OS; literal '\\' separators only
# resolve on Windows.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'CTD.csv')
CTD_data = pd.read_csv(file_path, sep=',')

In [None]:
# Wrap the CTD table and query it with the protein identifiers.
ctd = CTD(database=CTD_data)
ctd_dat, ctd_state, ctd_raw = ctd.compounds(protids)

In [None]:
# Display the first value returned by ctd.compounds.
ctd_dat

### Run DTC only

In [None]:
#Downloaded from DTC on 2/24/2023
# os.path.join keeps the path valid on every OS; literal '\\' separators only
# resolve on Windows.
# NOTE(review): the loader cell at the top of this notebook also reads a 'CID'
# column from DTC.csv; it is not requested here — confirm whether DTC needs it.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'DTC.csv')
dtc_columns = [
    'compound_id',
    'standard_inchi_key',
    'target_id',
    'gene_names',
    'wildtype_or_mutant',
    'mutation_info',
    'standard_type',
    'standard_relation',
    'standard_value',
    'standard_units',
    'activity_comment',
    'pubmed_id',
    'doc_type',
]
# Read only the listed columns.
DTC_data = pd.read_csv(file_path, sep=',', usecols=dtc_columns)

In [None]:
# Wrap the DTC table and query it with the protein identifiers.
# NOTE(review): the second positional argument (0) is presumably the pChEMBL
# threshold used elsewhere as pChEMBL_thres=0 — confirm against the DTC API.
dtc = DTC(database=DTC_data)
dtc_dat, dtc_state, dtc_raw = dtc.compounds(protids, 0)

In [None]:
# Display the second value returned by dtc.compounds.
dtc_state

In [None]:
# Display the first value returned by dtc.compounds.
dtc_dat

### Run OTP only

In [None]:
# OTP takes no local table; query it with the protein identifiers.
otp = OTP()
otp_dat, otp_state, otp_raw = otp.compounds(protids)

In [None]:
# Display the second value returned by otp.compounds.
otp_state

In [None]:
# Display the third value returned by otp.compounds.
otp_raw

In [None]:
# Display the first value returned by otp.compounds.
otp_dat

### Run DrugCentral only

In [None]:
#Downloaded from DrugCentral on 2/25/2024
# os.path.join keeps the path valid on every OS; literal '\\' separators only
# resolve on Windows. The file is read from data/Databases, the folder used by
# every other loader in this notebook (the 'Databases' component was missing here).
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'DrugCentral.csv')
DC_data = pd.read_csv(file_path, sep=',')

In [None]:
# Wrap the DrugCentral table and query it with the protein identifiers
# (passed by keyword as `input_protein`).
dc = DrugCentral(database=DC_data)
DC_dat, DC_state, DC_raw = dc.compounds(input_protein=protids)

In [None]:
# Display the first value returned by dc.compounds.
DC_dat

In [None]:
# Display the second value returned by dc.compounds.
DC_state

### Run DrugBank only

In [None]:
#Downloaded from DrugBank on 3/2/2022
# os.path.join keeps the path valid on every OS; literal '\\' separators only
# resolve on Windows.
file_path = os.path.join(os.getcwd(), 'data', 'Databases', 'DB.csv')
DB_data = pd.read_csv(file_path, sep=',')

In [None]:
# Wrap the DrugBank table and query it with the protein identifiers.
db = DB(database=DB_data)
DB_dat, DB_state, DB_raw = db.compounds(protids)

In [None]:
# Display the second value returned by db.compounds.
DB_state

In [None]:
# Display the first value returned by db.compounds.
DB_dat

In [None]:
# NOTE(review): this cell repeats the previous one; DB_raw may have been
# intended here — confirm or remove.
DB_dat