In [1]:
# RDKit chemistry toolkit. NOTE(review): `inchi` is not referenced in the visible cells.
from rdkit.Chem import inchi
from rdkit import Chem
import os
# NOTE(review): PYTHONWARNINGS is normally read at interpreter start-up, so setting it here
# presumably only affects Python processes launched afterwards — confirm it has the intended effect.
os.environ["PYTHONWARNINGS"] = "ignore"  # Suppress warnings

# pandarallel provides the `parallel_apply` method used on pandas Series in this notebook.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# Dimorphite-DL (protonation) and RDKit standardization helpers used by standardize_jumpcp.
# NOTE(review): `AddHs` is only referenced from commented-out code in the visible cells.
from dimorphite_dl.dimorphite_dl import DimorphiteDL
from rdkit.Chem import AddHs
from rdkit.Chem.MolStandardize import rdMolStandardize
import pandas as pd

def standardize_jumpcp(smiles):
    """Standardize a SMILES string and return the canonical-tautomer SMILES.

    The steps follow Greg Landrum's MolStandardize walkthrough
    https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
    (presented in https://www.youtube.com/watch?v=eWTApNX8dJQ):

      1. parse the SMILES,
      2. ``rdMolStandardize.Cleanup`` (removeHs, disconnect metal atoms,
         normalize and reionize the molecule),
      3. keep the fragment parent when several fragments are present,
      4. try to neutralize the molecule with ``Uncharger``,
      5. protonate with Dimorphite-DL at pH 7.4 and keep the first variant,
         see https://git.durrantlab.pitt.edu/jdurrant/dimorphite_dl/,
      6. canonicalize the tautomer.

    Parameters
    ----------
    smiles : str
        Input SMILES string.

    Returns
    -------
    str
        The standardized SMILES, or the sentinel ``"Cannot_do"`` when any
        step fails (unparsable input, no protonation variant, toolkit error).
    """
    failure = "Cannot_do"

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES with None rather than raising.
            return failure

        # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
        clean_mol = rdMolStandardize.Cleanup(mol)

        # if many fragments, get the "parent" (the actual mol we are interested in)
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # try to neutralize molecule; no convenience function exists, so an
        # Uncharger instance is needed
        uncharger = rdMolStandardize.Uncharger()
        uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)

        # re-protonate for pH 7.4
        dimorphite = DimorphiteDL(min_ph=7.4, max_ph=7.4, pka_precision=0)
        protonated_smiles = dimorphite.protonate(Chem.MolToSmiles(uncharged_parent_clean_mol))

        if not protonated_smiles:
            # Previously this case only "worked" through a NameError on an
            # unbound variable that the bare except swallowed.
            return failure

        protonated_mol = Chem.MolFromSmiles(protonated_smiles[0])
        if protonated_mol is None:
            return failure

        te = rdMolStandardize.TautomerEnumerator()
        taut_uncharged_parent_clean_mol = te.Canonicalize(protonated_mol)

        return Chem.MolToSmiles(taut_uncharged_parent_clean_mol)

    except Exception:
        # Best-effort by design: one bad structure must not abort a whole
        # parallel_apply run. KeyboardInterrupt/SystemExit are no longer swallowed.
        return failure

def inchi_from_standardised_smile(value):
    """Convert a SMILES string to an InChI string.

    Returns the sentinel "Cannot_do" when the conversion raises (for example
    when the SMILES cannot be parsed and the resulting None is rejected).
    """
    try:
        return Chem.MolToInchi(Chem.MolFromSmiles(value))
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
        return "Cannot_do"
    
def inchi_to_smiles(value):
    """Convert an InChI string to a SMILES string.

    Returns the sentinel "Cannot_do" when the conversion raises (for example
    when the InChI cannot be parsed and the resulting None is rejected).
    """
    try:
        return Chem.MolToSmiles(Chem.MolFromInchi(value))
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
        return "Cannot_do"

def process_data(data_path, smiles_variable):
    """Load a compound table, standardize its SMILES column and drop failures.

    Parameters
    ----------
    data_path : str
        Path of the CSV file; a path ending in '.gz' is read as gzip.
    smiles_variable : str
        Name of the column that holds the input SMILES.

    Returns
    -------
    pandas.DataFrame
        The table with 'Standardized_SMILES' and 'Standardized_InChI' columns
        added, restricted to rows where neither equals "Cannot_do", with a
        fresh 0..n-1 index.
    """
    # Only force gzip for '.gz' paths; otherwise use read_csv's defaults.
    read_options = {'compression': 'gzip'} if data_path.endswith('.gz') else {}
    table = pd.read_csv(data_path, **read_options)

    # Standardize every SMILES with standardize_jumpcp, then derive the InChI.
    table['Standardized_SMILES'] = table[smiles_variable].parallel_apply(standardize_jumpcp)
    table['Standardized_InChI'] = table['Standardized_SMILES'].parallel_apply(inchi_from_standardised_smile)

    # Keep rows where both conversions succeeded.
    smiles_ok = table['Standardized_SMILES'] != "Cannot_do"
    inchi_ok = table['Standardized_InChI'] != "Cannot_do"
    return table[smiles_ok & inchi_ok].reset_index(drop=True)

def save_data(df, save_path):
    """Write ``df`` to ``save_path`` as CSV (without the index).

    Missing parent directories are created first.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to write.
    save_path : str
        Destination file path; may be a bare filename in the current directory.
    """
    parent_dir = os.path.dirname(save_path)

    # dirname is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one to create.
    # exist_ok avoids the check-then-create race of a separate exists() test.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(save_path, index=False)

INFO: Pandarallel will run on 56 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [21]:
# Load the per-well metadata table (source, plate, well, JCP2022 compound id).
wells = pd.read_csv("well.csv.gz", compression="gzip")
wells

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022
0,source_1,UL000081,A02,JCP2022_033924
1,source_1,UL000081,A03,JCP2022_085227
2,source_1,UL000081,A04,JCP2022_033924
3,source_1,UL000081,A05,JCP2022_047857
4,source_1,UL000081,A06,JCP2022_072229
...,...,...,...,...
1096064,source_9,GR00004421,Z44,JCP2022_999999
1096065,source_9,GR00004421,Z45,JCP2022_999999
1096066,source_9,GR00004421,Z46,JCP2022_999999
1096067,source_9,GR00004421,Z47,JCP2022_033924


In [26]:
# Load the plate metadata and keep only plates whose type is TARGET2.
plate = (
    pd.read_csv("plate.csv.gz", compression="gzip")
    .loc[lambda frame: frame["Metadata_PlateType"] == "TARGET2"]
)
plate

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType
161,source_10,2021_08_03_U2OS_48_hr_run12,Dest210726-160150,TARGET2
180,source_10,2021_08_09_U2OS_48_hr_run13,Dest210727-153003,TARGET2
195,source_10,2021_08_12_U2OS_48_hr_run15,Dest210803-153958,TARGET2
213,source_10,2021_08_17_U2OS_48_hr_run16,Dest210809-134534,TARGET2
231,source_10,2021_08_20_U2OS_48_hr_run17,Dest210810-173723,TARGET2
...,...,...,...,...
2315,source_9,20210915-Run10,GR00003310,TARGET2
2324,source_9,20210918-Run11,GR00004371,TARGET2
2341,source_9,20211013-Run14,GR00003283,TARGET2
2354,source_9,20211102-Run15,GR00004395,TARGET2


In [86]:
# Unique JCP2022 compound ids found in wells that belong to TARGET2 plates.
# The result must be bound to `Target2_JCP`: this cell displays that name and
# later cells use it (len(...), .isin(...)). Binding it only to
# `Target2_JCP_list` raised NameError on a fresh kernel.
Target2_JCP = list(wells[wells.Metadata_Plate.isin(plate.Metadata_Plate.to_list())].Metadata_JCP2022.unique())
Target2_JCP_list = Target2_JCP  # previous name kept as an alias
Target2_JCP

['JCP2022_043547',
 'JCP2022_050797',
 'JCP2022_050997',
 'JCP2022_108326',
 'JCP2022_033924',
 'JCP2022_027911',
 'JCP2022_100264',
 'JCP2022_050861',
 'JCP2022_014367',
 'JCP2022_071429',
 'JCP2022_016288',
 'JCP2022_091373',
 'JCP2022_004940',
 'JCP2022_103217',
 'JCP2022_079562',
 'JCP2022_020163',
 'JCP2022_099471',
 'JCP2022_048971',
 'JCP2022_060649',
 'JCP2022_061421',
 'JCP2022_030713',
 'JCP2022_042261',
 'JCP2022_113600',
 'JCP2022_001890',
 'JCP2022_061654',
 'JCP2022_093289',
 'JCP2022_115963',
 'JCP2022_079617',
 'JCP2022_111730',
 'JCP2022_067887',
 'JCP2022_080150',
 'JCP2022_098688',
 'JCP2022_012146',
 'JCP2022_029951',
 'JCP2022_014114',
 'JCP2022_021678',
 'JCP2022_062326',
 'JCP2022_010382',
 'JCP2022_061437',
 'JCP2022_029365',
 'JCP2022_116188',
 'JCP2022_047559',
 'JCP2022_055397',
 'JCP2022_068660',
 'JCP2022_030049',
 'JCP2022_105456',
 'JCP2022_105621',
 'JCP2022_067432',
 'JCP2022_058046',
 'JCP2022_068838',
 'JCP2022_087474',
 'JCP2022_024601',
 'JCP2022_09

In [44]:
# Count the collected JCP2022 ids.
# NOTE(review): the preceding cell assigns `Target2_JCP_list`, not `Target2_JCP` — confirm where this name is bound.
len(Target2_JCP)

302

In [45]:
# Load the compound table and keep the compounds whose JCP2022 id is in Target2_JCP.
jumpcp = pd.read_csv("compound.csv.gz", compression='gzip')
# .copy() makes the filtered frame independent, so the column assignments below
# write to a real DataFrame instead of a slice (avoids SettingWithCopyWarning).
jumpcp = jumpcp[jumpcp.Metadata_JCP2022.isin(Target2_JCP)].copy()
# InChI -> SMILES, then run the SMILES through the standardization pipeline.
jumpcp['smiles'] = jumpcp["Metadata_InChI"].parallel_apply(inchi_to_smiles)
jumpcp['Standardized_SMILES'] = jumpcp["smiles"].parallel_apply(standardize_jumpcp)
jumpcp

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6), Label(value='0 / 6'))), HBox(c…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6), Label(value='0 / 6'))), HBox(c…

Unnamed: 0,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI,smiles,Standardized_SMILES
793,JCP2022_000794,AECDBHGVIIRMOI-UHFFFAOYSA-N,InChI=1S/C27H29N5O/c28-26-25-24(21-8-4-9-23(14...,Nc1ncnc2c1c(-c1cccc(OCc3ccccc3)c1)cn2C1CC(CN2C...,Nc1ncnc2c1c(-c1cccc(OCc3ccccc3)c1)cn2C1CC(C[NH...
1035,JCP2022_001036,AFJRDFWMXUECEW-UHFFFAOYSA-N,InChI=1S/C18H17Cl2FN4OS/c1-25-16(14(19)9-23-25...,Cn1ncc(Cl)c1-c1cc(C(=O)NC(CN)Cc2cccc(F)c2)sc1Cl,Cn1ncc(Cl)c1-c1cc(C(=O)NC(C[NH3+])Cc2cccc(F)c2...
1274,JCP2022_001275,AGNWVEJTZJIJIM-UHFFFAOYSA-N,InChI=1S/C28H25N5O4S/c1-16-13-17(2)26-22(14-16...,Cc1cc(C)c2nc(-c3ccccn3)cc(C(=O)Nc3ccc(S(=O)(=O...,Cc1cc(C)c2nc(-c3ccccn3)cc(C(=O)Nc3ccc(S(=O)(=O...
1889,JCP2022_001890,AJVXVYTVAAWZAP-UHFFFAOYSA-N,InChI=1S/C14H11NO3/c1-14(2)9-10(15-13(14)18)7-...,CC1(C)C(=O)N=C2c3ccccc3C(=O)C(=O)C21,CC1(C)C(=O)N=C2c3ccccc3C(=O)C(=O)C21
2117,JCP2022_002118,ALBKMJDFBZVHAK-UHFFFAOYSA-N,InChI=1S/C23H22N2O4/c1-3-28-23(26)22-18(14-27-...,CCOC(=O)c1ncc2[nH]c3ccc(OCc4ccccc4)cc3c2c1COC,CCOC(=O)c1ncc2[nH]c3ccc(OCc4ccccc4)cc3c2c1COC
...,...,...,...,...,...
115961,JCP2022_115963,ZVPDNRVYHLRXLX-UHFFFAOYSA-N,InChI=1S/C16H19N5/c1-10-5-7-11(8-6-10)13-12-14...,Cc1ccc(-c2nn(C(C)(C)C)c3ncnc(N)c23)cc1,Cc1ccc(-c2nn(C(C)(C)C)c3ncnc(N)c23)cc1
116186,JCP2022_116188,ZWVZORIKUNOTCS-UHFFFAOYSA-N,InChI=1S/C25H26ClN5O3/c1-15-11-18(31-7-9-34-10...,Cc1cc(N2CCOCC2)cc2nc(-c3c(NCC(O)c4cccc(Cl)c4)c...,Cc1cc(N2CCOCC2)cc2nc(-c3c(NCC(O)c4cccc(Cl)c4)c...
116435,JCP2022_116437,ZYGHJZDHTFUPRJ-UHFFFAOYSA-N,InChI=1S/C9H6O2/c10-9-6-5-7-3-1-2-4-8(7)11-9/h...,O=c1ccc2ccccc2o1,O=c1ccc2ccccc2o1
116558,JCP2022_116560,ZYVXTMKTGDARKR-UHFFFAOYSA-N,InChI=1S/C24H27N7O/c1-29-10-12-31(13-11-29)17-...,COc1cc(N2CCN(C)CC2)ccc1Nc1nccc(-c2cn(C)c3cnccc...,COc1cc(N2CC[NH+](C)CC2)ccc1Nc1nccc(-c2cn(C)c3c...


In [106]:
# Inner-join the Target-2 metadata with the JUMP compounds on the standardized
# SMILES and report how many rows match.
# NOTE(review): `target2` is defined in a cell further down the notebook, so this
# cell fails on a fresh top-to-bottom run — confirm the intended cell order.
df = target2.merge(jumpcp, on="Standardized_SMILES", how="inner")
df.shape[0]

155

In [34]:
# Load the JUMP-Target-2 compound metadata and add the standardized SMILES and
# the InChI derived from it (the second column depends on the first, so order matters).
target2 = pd.read_csv("JUMP-Target-2_compound_metadata.tsv", sep="\t").assign(
    Standardized_SMILES=lambda frame: frame["smiles"].parallel_apply(standardize_jumpcp),
    Standardized_InChI=lambda frame: frame["Standardized_SMILES"].parallel_apply(inchi_from_standardised_smile),
)
target2

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6), Label(value='0 / 6'))), HBox(c…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6), Label(value='0 / 6'))), HBox(c…

Unnamed: 0,broad_sample,InChIKey,pert_iname,pubchem_cid,target,pert_type,control_type,smiles,Standardized_SMILES,Standardized_InChI
0,BRD-K09338665-001-07-1,KBPLFHHGFOOTCA-UHFFFAOYSA-N,1-octanol,957.0,GJB4,trt,,CCCCCCCCO,CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3"
1,BRD-K48278478-001-01-2,LOUPRKONTZGTKE-AFHBHXEDSA-N,quinine,94175.0,KCNN4,trt,,COc1ccc2nccc([C@@H](O)[C@H]3C[C@@H]4CC[N@]3C[C...,C=C[C@H]1C[N@H+]2CC[C@H]1C[C@@H]2[C@H](O)c1ccn...,InChI=1S/C20H24N2O2/c1-3-13-12-22-9-7-14(13)10...
2,BRD-A85242401-001-12-3,KRGQEOSDQHTZMX-IGCYCDGOSA-N,ascorbic-acid,9888239.0,P3H1,trt,,OC[C@H](O)[C@H]1OC(=O)C(=O)C1O,OC[C@H](O)c1oc(O)c(O)c1O,InChI=1S/C6H8O6/c7-1-2(8)5-3(9)4(10)6(11)12-5/...
3,BRD-K93632104-001-17-2,YGSDEFSMJLZEOE-UHFFFAOYSA-N,salicylic-acid,118212070.0,AKR1C1,trt,,OC(=O)c1ccccc1O,O=C([O-])c1ccccc1[O-],"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8..."
4,BRD-K57313110-001-06-8,ODHCTXKNWHHXJC-VKHMYHEASA-N,pidolic-acid,7405.0,VEGFA,trt,,OC(=O)[C@@H]1CCC(=O)N1,O=C1CCC(C(=O)[O-])N1,"InChI=1S/C5H7NO3/c7-4-2-1-3(6-4)5(8)9/h3H,1-2H..."
...,...,...,...,...,...,...,...,...,...,...
302,BRD-A69636825-003-04-7,HSUGRBWQSSZJOP-UHFFFAOYSA-N,diltiazem,3076.0,CACNG1,trt,,COc1ccc(cc1)C1Sc2ccccc2N(CCN(C)C)C(=O)C1OC(C)=O,COc1ccc(C2Sc3ccccc3N(CC[NH+](C)C)C(=O)C2OC(C)=...,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...
303,BRD-K87782578-001-03-9,KXBDTLQSDKGAEB-UHFFFAOYSA-N,AVL-292,59174488.0,BTK,trt,,COCCOc1ccc(Nc2ncc(F)c(Nc3cccc(NC(=O)C=C)c3)n2)cc1,C=CC(=O)Nc1cccc(Nc2nc(Nc3ccc(OCCOC)cc3)ncc2F)c1,InChI=1S/C22H22FN5O3/c1-3-20(29)25-16-5-4-6-17...
304,BRD-K98763141-001-30-8,JZFPYUNJRRFVQU-UHFFFAOYSA-N,niflumic-acid,4488.0,UGT1A9,trt,,OC(=O)c1cccnc1Nc1cccc(c1)C(F)(F)F,O=C([O-])c1cccnc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-..."
305,BRD-K19975102-001-02-0,YYDUWLSETXNJJT-MTJSOVHGSA-N,GNF-5837,59397065.0,NTRK1,trt,,Cc1ccc(NC(=O)Nc2cc(ccc2F)C(F)(F)F)cc1Nc1ccc2c(...,Cc1ccc(NC(=O)Nc2cc(C(F)(F)F)ccc2F)cc1Nc1ccc2c(...,InChI=1S/C28H21F4N5O2/c1-15-4-6-19(35-27(39)37...


In [74]:
# Load the Target-2 platemap and discard wells that have no broad_sample.
jump_platemap = (
    pd.read_csv("JUMP-Target-2_compound_platemap.tsv", sep="\t")
    .dropna(subset=["broad_sample"])
)
jump_platemap

Unnamed: 0,well_position,broad_sample,solvent
0,A01,BRD-K09338665-001-07-1,DMSO
1,A02,BRD-K48278478-001-01-2,DMSO
2,A03,BRD-A85242401-001-12-3,DMSO
3,A04,BRD-K93632104-001-17-2,DMSO
5,A06,BRD-K57313110-001-06-8,DMSO
...,...,...,...
378,P19,BRD-K80970344-201-10-9,DMSO
379,P20,BRD-A69636825-003-04-7,DMSO
380,P21,BRD-K87782578-001-03-9,DMSO
381,P22,BRD-K98763141-001-30-8,DMSO


In [126]:
# Wells of plate BR00117035 from source_4.
source_4 = wells[
    (wells["Metadata_Source"] == "source_4")
    & (wells["Metadata_Plate"] == "BR00117035")
]
source_4

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022
497462,source_4,BR00117035,A01,JCP2022_905588
497463,source_4,BR00117035,A02,JCP2022_912241
497464,source_4,BR00117035,A03,JCP2022_900266
497465,source_4,BR00117035,A04,JCP2022_915129
497466,source_4,BR00117035,A05,JCP2022_907177
...,...,...,...,...
497841,source_4,BR00117035,P20,JCP2022_915132
497842,source_4,BR00117035,P21,JCP2022_999999
497843,source_4,BR00117035,P22,JCP2022_999999
497844,source_4,BR00117035,P23,JCP2022_999999


In [138]:
# Number of distinct JCP2022 ids on the selected plate (NaN, if any, counts as a value).
source_4["Metadata_JCP2022"].nunique(dropna=False)

348

In [127]:
# Pair each platemap broad_sample with the JCP2022 id found at the same well
# position on the selected plate, then drop repeated pairs.
source_4_brd = (
    jump_platemap
    .merge(source_4, left_on="well_position", right_on="Metadata_Well", how="inner")
    [["broad_sample", "Metadata_JCP2022"]]
    .drop_duplicates(keep="first")
)
source_4_brd


Unnamed: 0,broad_sample,Metadata_JCP2022
0,BRD-K09338665-001-07-1,JCP2022_905588
1,BRD-K48278478-001-01-2,JCP2022_912241
2,BRD-A85242401-001-12-3,JCP2022_900266
3,BRD-K93632104-001-17-2,JCP2022_915129
4,BRD-K57313110-001-06-8,JCP2022_904426
...,...,...
315,BRD-K80970344-201-10-9,JCP2022_915132
316,BRD-A69636825-003-04-7,JCP2022_915132
317,BRD-K87782578-001-03-9,JCP2022_999999
318,BRD-K98763141-001-30-8,JCP2022_999999


In [128]:
# Re-display the Target-2 compound metadata table for reference.
target2

Unnamed: 0,broad_sample,InChIKey,pert_iname,pubchem_cid,target,pert_type,control_type,smiles,Standardized_SMILES,Standardized_InChI
0,BRD-K09338665-001-07-1,KBPLFHHGFOOTCA-UHFFFAOYSA-N,1-octanol,957.0,GJB4,trt,,CCCCCCCCO,CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3"
1,BRD-K48278478-001-01-2,LOUPRKONTZGTKE-AFHBHXEDSA-N,quinine,94175.0,KCNN4,trt,,COc1ccc2nccc([C@@H](O)[C@H]3C[C@@H]4CC[N@]3C[C...,C=C[C@H]1C[N@H+]2CC[C@H]1C[C@@H]2[C@H](O)c1ccn...,InChI=1S/C20H24N2O2/c1-3-13-12-22-9-7-14(13)10...
2,BRD-A85242401-001-12-3,KRGQEOSDQHTZMX-IGCYCDGOSA-N,ascorbic-acid,9888239.0,P3H1,trt,,OC[C@H](O)[C@H]1OC(=O)C(=O)C1O,OC[C@H](O)c1oc(O)c(O)c1O,InChI=1S/C6H8O6/c7-1-2(8)5-3(9)4(10)6(11)12-5/...
3,BRD-K93632104-001-17-2,YGSDEFSMJLZEOE-UHFFFAOYSA-N,salicylic-acid,118212070.0,AKR1C1,trt,,OC(=O)c1ccccc1O,O=C([O-])c1ccccc1[O-],"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8..."
4,BRD-K57313110-001-06-8,ODHCTXKNWHHXJC-VKHMYHEASA-N,pidolic-acid,7405.0,VEGFA,trt,,OC(=O)[C@@H]1CCC(=O)N1,O=C1CCC(C(=O)[O-])N1,"InChI=1S/C5H7NO3/c7-4-2-1-3(6-4)5(8)9/h3H,1-2H..."
...,...,...,...,...,...,...,...,...,...,...
302,BRD-A69636825-003-04-7,HSUGRBWQSSZJOP-UHFFFAOYSA-N,diltiazem,3076.0,CACNG1,trt,,COc1ccc(cc1)C1Sc2ccccc2N(CCN(C)C)C(=O)C1OC(C)=O,COc1ccc(C2Sc3ccccc3N(CC[NH+](C)C)C(=O)C2OC(C)=...,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...
303,BRD-K87782578-001-03-9,KXBDTLQSDKGAEB-UHFFFAOYSA-N,AVL-292,59174488.0,BTK,trt,,COCCOc1ccc(Nc2ncc(F)c(Nc3cccc(NC(=O)C=C)c3)n2)cc1,C=CC(=O)Nc1cccc(Nc2nc(Nc3ccc(OCCOC)cc3)ncc2F)c1,InChI=1S/C22H22FN5O3/c1-3-20(29)25-16-5-4-6-17...
304,BRD-K98763141-001-30-8,JZFPYUNJRRFVQU-UHFFFAOYSA-N,niflumic-acid,4488.0,UGT1A9,trt,,OC(=O)c1cccnc1Nc1cccc(c1)C(F)(F)F,O=C([O-])c1cccnc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-..."
305,BRD-K19975102-001-02-0,YYDUWLSETXNJJT-MTJSOVHGSA-N,GNF-5837,59397065.0,NTRK1,trt,,Cc1ccc(NC(=O)Nc2cc(ccc2F)C(F)(F)F)cc1Nc1ccc2c(...,Cc1ccc(NC(=O)Nc2cc(C(F)(F)F)ccc2F)cc1Nc1ccc2c(...,InChI=1S/C28H21F4N5O2/c1-15-4-6-19(35-27(39)37...


In [129]:
# Distinct perturbation names (pert_iname) in the Target-2 metadata, and their count.
a = list(target2["pert_iname"].unique())
len(a)

303

In [130]:
# Distinct broad_sample ids in the Target-2 metadata, and their count.
target2_brd = list(target2["broad_sample"].unique())
len(target2_brd)

307

In [131]:
# Show the list of unique broad_sample ids taken from target2.
target2_brd

['BRD-K09338665-001-07-1',
 'BRD-K48278478-001-01-2',
 'BRD-A85242401-001-12-3',
 'BRD-K93632104-001-17-2',
 'BRD-K57313110-001-06-8',
 'BRD-K25244359-066-04-9',
 'BRD-K44067360-001-30-3',
 'BRD-K73276278-305-02-9',
 'BRD-K59058747-001-22-9',
 'BRD-K68982262-001-02-9',
 'BRD-A20697603-001-07-2',
 'BRD-A54139254-001-10-9',
 'BRD-K26521938-001-07-2',
 'BRD-K43330982-001-04-9',
 'BRD-K05878375-236-05-7',
 'BRD-K51485109-001-02-9',
 'BRD-A15397381-001-02-5',
 'BRD-K15502390-001-19-1',
 'BRD-A43005764-003-02-9',
 'BRD-K38512030-001-01-7',
 'BRD-K50720187-050-09-0',
 'BRD-K02554563-001-01-6',
 'BRD-K68567222-001-02-0',
 'BRD-K81801188-001-02-8',
 'BRD-K08953028-001-09-6',
 'BRD-K47598052-001-14-1',
 'BRD-K37602296-001-02-9',
 'BRD-K60659193-001-01-0',
 'BRD-A07207424-001-15-9',
 'BRD-K29895144-001-01-0',
 'BRD-K40109029-001-03-7',
 'BRD-A09722536-002-18-0',
 'BRD-K26439554-001-02-9',
 'BRD-K70586315-001-02-4',
 'BRD-K29151923-001-02-9',
 'BRD-K17849083-001-32-9',
 'BRD-K70463136-001-02-9',
 

In [132]:
# Re-display the de-duplicated broad_sample / Metadata_JCP2022 pairs for plate BR00117035.
source_4_brd

Unnamed: 0,broad_sample,Metadata_JCP2022
0,BRD-K09338665-001-07-1,JCP2022_905588
1,BRD-K48278478-001-01-2,JCP2022_912241
2,BRD-A85242401-001-12-3,JCP2022_900266
3,BRD-K93632104-001-17-2,JCP2022_915129
4,BRD-K57313110-001-06-8,JCP2022_904426
...,...,...
315,BRD-K80970344-201-10-9,JCP2022_915132
316,BRD-A69636825-003-04-7,JCP2022_915132
317,BRD-K87782578-001-03-9,JCP2022_999999
318,BRD-K98763141-001-30-8,JCP2022_999999


In [133]:
# Keep the pairs whose broad_sample is a Target-2 compound and whose JCP2022 id
# is not JCP2022_999999.
Target2_brd_jcp = source_4_brd[
    source_4_brd["broad_sample"].isin(target2_brd)
    & (source_4_brd["Metadata_JCP2022"] != "JCP2022_999999")
]
Target2_brd_jcp

Unnamed: 0,broad_sample,Metadata_JCP2022
0,BRD-K09338665-001-07-1,JCP2022_905588
1,BRD-K48278478-001-01-2,JCP2022_912241
2,BRD-A85242401-001-12-3,JCP2022_900266
3,BRD-K93632104-001-17-2,JCP2022_915129
4,BRD-K57313110-001-06-8,JCP2022_904426
...,...,...
312,BRD-K55567017-001-12-3,JCP2022_902702
313,BRD-K31170746-001-03-9,JCP2022_915132
314,BRD-A18992208-003-02-7,JCP2022_915132
315,BRD-K80970344-201-10-9,JCP2022_915132


In [134]:
# Number of distinct JCP2022 ids among the retained pairs.
Target2_brd_jcp["Metadata_JCP2022"].nunique(dropna=False)

289

In [135]:
# Number of distinct broad_sample ids among the retained pairs.
Target2_brd_jcp["broad_sample"].nunique(dropna=False)

292

In [50]:
# Display the Target-2 compound metadata table again before counting its identifiers.
target2

Unnamed: 0,broad_sample,InChIKey,pert_iname,pubchem_cid,target,pert_type,control_type,smiles,Standardized_SMILES,Standardized_InChI
0,BRD-K09338665-001-07-1,KBPLFHHGFOOTCA-UHFFFAOYSA-N,1-octanol,957.0,GJB4,trt,,CCCCCCCCO,CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3"
1,BRD-K48278478-001-01-2,LOUPRKONTZGTKE-AFHBHXEDSA-N,quinine,94175.0,KCNN4,trt,,COc1ccc2nccc([C@@H](O)[C@H]3C[C@@H]4CC[N@]3C[C...,C=C[C@H]1C[N@H+]2CC[C@H]1C[C@@H]2[C@H](O)c1ccn...,InChI=1S/C20H24N2O2/c1-3-13-12-22-9-7-14(13)10...
2,BRD-A85242401-001-12-3,KRGQEOSDQHTZMX-IGCYCDGOSA-N,ascorbic-acid,9888239.0,P3H1,trt,,OC[C@H](O)[C@H]1OC(=O)C(=O)C1O,OC[C@H](O)c1oc(O)c(O)c1O,InChI=1S/C6H8O6/c7-1-2(8)5-3(9)4(10)6(11)12-5/...
3,BRD-K93632104-001-17-2,YGSDEFSMJLZEOE-UHFFFAOYSA-N,salicylic-acid,118212070.0,AKR1C1,trt,,OC(=O)c1ccccc1O,O=C([O-])c1ccccc1[O-],"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8..."
4,BRD-K57313110-001-06-8,ODHCTXKNWHHXJC-VKHMYHEASA-N,pidolic-acid,7405.0,VEGFA,trt,,OC(=O)[C@@H]1CCC(=O)N1,O=C1CCC(C(=O)[O-])N1,"InChI=1S/C5H7NO3/c7-4-2-1-3(6-4)5(8)9/h3H,1-2H..."
...,...,...,...,...,...,...,...,...,...,...
302,BRD-A69636825-003-04-7,HSUGRBWQSSZJOP-UHFFFAOYSA-N,diltiazem,3076.0,CACNG1,trt,,COc1ccc(cc1)C1Sc2ccccc2N(CCN(C)C)C(=O)C1OC(C)=O,COc1ccc(C2Sc3ccccc3N(CC[NH+](C)C)C(=O)C2OC(C)=...,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...
303,BRD-K87782578-001-03-9,KXBDTLQSDKGAEB-UHFFFAOYSA-N,AVL-292,59174488.0,BTK,trt,,COCCOc1ccc(Nc2ncc(F)c(Nc3cccc(NC(=O)C=C)c3)n2)cc1,C=CC(=O)Nc1cccc(Nc2nc(Nc3ccc(OCCOC)cc3)ncc2F)c1,InChI=1S/C22H22FN5O3/c1-3-20(29)25-16-5-4-6-17...
304,BRD-K98763141-001-30-8,JZFPYUNJRRFVQU-UHFFFAOYSA-N,niflumic-acid,4488.0,UGT1A9,trt,,OC(=O)c1cccnc1Nc1cccc(c1)C(F)(F)F,O=C([O-])c1cccnc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-..."
305,BRD-K19975102-001-02-0,YYDUWLSETXNJJT-MTJSOVHGSA-N,GNF-5837,59397065.0,NTRK1,trt,,Cc1ccc(NC(=O)Nc2cc(ccc2F)C(F)(F)F)cc1Nc1ccc2c(...,Cc1ccc(NC(=O)Nc2cc(C(F)(F)F)ccc2F)cc1Nc1ccc2c(...,InChI=1S/C28H21F4N5O2/c1-15-4-6-19(35-27(39)37...


In [51]:
# Number of distinct InChIKey values in the Target-2 metadata.
target2["InChIKey"].nunique(dropna=False)

307

In [52]:
# Number of distinct pert_iname values in the Target-2 metadata.
target2["pert_iname"].nunique(dropna=False)

303

In [53]:
# Number of distinct broad_sample values in the Target-2 metadata.
target2["broad_sample"].nunique(dropna=False)

307

In [37]:
# Add an InChI column derived from each row's Standardized_SMILES
# ("Cannot_do" marks rows where the conversion failed).
# NOTE(review): this mutates `jumpcp` in place; if `jumpcp` is a filtered slice of
# another frame pandas may emit SettingWithCopyWarning — confirm.
jumpcp['Standardized_InChI'] = jumpcp["Standardized_SMILES"].parallel_apply(inchi_from_standardised_smile)
jumpcp

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2085), Label(value='0 / 2085'))), …

Unnamed: 0,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI,smiles,Standardized_SMILES,Standardized_InChI
0,JCP2022_000001,AAAHWCWPZPSPIW-UHFFFAOYSA-N,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...,CCc1nccn1-c1cccc(C2CCCN2C(=O)c2ccc(OCCN(C)C)cc...,CCc1nccn1-c1cccc(C2CCC[NH+]2C(=O)c2ccc(OCC[NH+...,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...
1,JCP2022_000002,AAAJHRMBUHXWLD-UHFFFAOYSA-N,InChI=1S/C11H13ClN2O/c12-10-4-2-9(3-5-10)8-14-...,OC1=NCCCN1Cc1ccc(Cl)cc1,OC1=[NH+]CCCN1Cc1ccc(Cl)cc1,InChI=1S/C11H13ClN2O/c12-10-4-2-9(3-5-10)8-14-...
2,JCP2022_000003,AAALVYBICLMAMA-UHFFFAOYSA-N,InChI=1S/C20H15N3O2/c24-19-15-11-17(21-13-7-3-...,O=C1N=C(O)c2cc(Nc3ccccc3)c(Nc3ccccc3)cc21,O=C1NC(=O)c2cc(Nc3ccccc3)c(Nc3ccccc3)cc21,InChI=1S/C20H15N3O2/c24-19-15-11-17(21-13-7-3-...
3,JCP2022_000004,AAANUZMCJQUYNX-UHFFFAOYSA-N,InChI=1S/C13H22N4O2S/c1-2-7-16-13(5-6-15-16)20...,CCCn1nccc1S(=O)(=O)N1CC2CCC1CNC2,CCCn1nccc1S(=O)(=O)[NH+]1CC2CCC1C[NH2+]C2,InChI=1S/C13H22N4O2S/c1-2-7-16-13(5-6-15-16)20...
4,JCP2022_000005,AAAQFGUYHFJNHI-UHFFFAOYSA-N,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...,CCN=C(O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,CCNC(=O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...
...,...,...,...,...,...,...
116748,JCP2022_116750,ZZZTXDPKNAOZPM-UHFFFAOYSA-N,InChI=1S/C22H27N3O2/c1-14-18(22(27)24-15(2)23-...,Cc1nc(C)c(CCC(=O)N(C2CC2)C2CCCc3ccccc32)c(O)n1,Cc1nc(C)c(CCC(=O)[NH+](C2CC2)C2CCCc3ccccc32)c(...,InChI=1S/C22H27N3O2/c1-14-18(22(27)24-15(2)23-...
116749,JCP2022_116751,ZZZUOLMMTJKOGE-UHFFFAOYSA-N,InChI=1S/C19H17N5O2/c1-23-9-6-13-7-10-24(19(26...,Cn1ccc2ccn(CCN=C(O)c3cnc4ccccc4n3)c(=O)c21,Cn1ccc2ccn(CCNC(=O)c3cnc4ccccc4n3)c(=O)c21,InChI=1S/C19H17N5O2/c1-23-9-6-13-7-10-24(19(26...
116750,JCP2022_116752,ZZZZIBSVRUABIA-UHFFFAOYSA-N,InChI=1S/C19H13F6N3O2S/c1-11-26-6-5-17(27-11)1...,Cc1nccc(-c2cccc(NS(=O)(=O)c3cc(C(F)(F)F)cc(C(F...,Cc1nccc(-c2cccc(NS(=O)(=O)c3cc(C(F)(F)F)cc(C(F...,InChI=1S/C19H13F6N3O2S/c1-11-26-6-5-17(27-11)1...
116751,JCP2022_116753,ZZZZVNLQWWXZTQ-UHFFFAOYSA-N,InChI=1S/C14H20N2O3/c1-18-13-5-3-2-4-12(13)14(...,COc1ccccc1C(O)=NCCN1CCOCC1,COc1ccccc1C(=O)NCC[NH+]1CCOCC1,InChI=1S/C14H20N2O3/c1-18-13-5-3-2-4-12(13)14(...


In [14]:
# Match Target-2 compounds to JUMP compounds by identical standardized SMILES.
df = target2.merge(jumpcp, on="Standardized_SMILES", how="inner")
df

Unnamed: 0,broad_sample,InChIKey,pert_iname,pubchem_cid,target,pert_type,control_type,smiles_x,Standardized_SMILES,Standardized_InChI_x,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI,smiles_y,Standardized_InChI_y
0,BRD-K09338665-001-07-1,KBPLFHHGFOOTCA-UHFFFAOYSA-N,1-octanol,957.0,GJB4,trt,,CCCCCCCCO,CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",JCP2022_043547,KBPLFHHGFOOTCA-UHFFFAOYSA-N,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3"
1,BRD-K93632104-001-17-2,YGSDEFSMJLZEOE-UHFFFAOYSA-N,salicylic-acid,118212070.0,AKR1C1,trt,,OC(=O)c1ccccc1O,O=C([O-])c1ccccc1[O-],"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8...",JCP2022_108326,YGSDEFSMJLZEOE-UHFFFAOYSA-N,"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8...",O=C(O)c1ccccc1O,"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8..."
2,BRD-K25244359-066-04-9,WPEWQEMJFLWMLV-UHFFFAOYSA-N,apatinib,11315474.0,CSK,trt,,O=C(Nc1ccc(cc1)C1(CCCC1)C#N)c1cccnc1NCc1ccncc1,N#CC1(c2ccc(NC(=O)c3cccnc3NCc3ccncc3)cc2)CCCC1,InChI=1S/C24H23N5O/c25-17-24(11-1-2-12-24)19-5...,JCP2022_100264,WPEWQEMJFLWMLV-UHFFFAOYSA-N,InChI=1S/C24H23N5O/c25-17-24(11-1-2-12-24)19-5...,N#CC1(c2ccc(NC(=O)c3cccnc3NCc3ccncc3)cc2)CCCC1,InChI=1S/C24H23N5O/c25-17-24(11-1-2-12-24)19-5...
3,BRD-K44067360-001-30-3,LPEPZBJOKDYZAD-UHFFFAOYSA-N,flufenamic-acid,3371.0,GJB4,trt,,OC(=O)c1ccccc1Nc1cccc(c1)C(F)(F)F,O=C([O-])c1ccccc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C14H10F3NO2/c15-14(16,17)9-4-3-5-10(8...",JCP2022_050861,LPEPZBJOKDYZAD-UHFFFAOYSA-N,"InChI=1S/C14H10F3NO2/c15-14(16,17)9-4-3-5-10(8...",O=C(O)c1ccccc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C14H10F3NO2/c15-14(16,17)9-4-3-5-10(8..."
4,BRD-K59058747-001-22-9,PWKSKIMOESPYIA-BYPYZUCNSA-N,acetylcysteine,12035.0,SLC7A11,trt,,CC(=O)N[C@@H](CS)C(O)=O,CC(=O)NC(CS)C(=O)[O-],"InChI=1S/C5H9NO3S/c1-3(7)6-4(2-10)5(8)9/h4,10H...",JCP2022_071429,PWKSKIMOESPYIA-UHFFFAOYSA-N,"InChI=1S/C5H9NO3S/c1-3(7)6-4(2-10)5(8)9/h4,10H...",CC(O)=NC(CS)C(=O)O,"InChI=1S/C5H9NO3S/c1-3(7)6-4(2-10)5(8)9/h4,10H..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,BRD-K31170746-001-03-9,QUGDTMONBLMLLD-UHFFFAOYSA-N,ANR-94,11805896.0,ADORA2A,trt,,CCOc1nc2c(N)ncnc2n1CC,CCOc1nc2c(N)ncnc2n1CC,InChI=1S/C9H13N5O/c1-3-14-8-6(7(10)11-5-12-8)1...,JCP2022_075916,QUGDTMONBLMLLD-UHFFFAOYSA-N,InChI=1S/C9H13N5O/c1-3-14-8-6(7(10)11-5-12-8)1...,CCOc1nc2c(N)ncnc2n1CC,InChI=1S/C9H13N5O/c1-3-14-8-6(7(10)11-5-12-8)1...
151,BRD-K80970344-201-10-9,VSWDORGPIHIGNW-UHFFFAOYSA-N,pyrrolidine-dithiocarbamate,65351.0,HSD11B1,trt,,SC(=S)N1CCCC1,S=C([S-])[NH+]1CCCC1,"InChI=1S/C5H9NS2/c7-5(8)6-3-1-2-4-6/h1-4H2,(H,...",JCP2022_096067,VSWDORGPIHIGNW-UHFFFAOYSA-N,"InChI=1S/C5H9NS2/c7-5(8)6-3-1-2-4-6/h1-4H2,(H,...",S=C(S)N1CCCC1,"InChI=1S/C5H9NS2/c7-5(8)6-3-1-2-4-6/h1-4H2,(H,..."
152,BRD-A69636825-003-04-7,HSUGRBWQSSZJOP-UHFFFAOYSA-N,diltiazem,3076.0,CACNG1,trt,,COc1ccc(cc1)C1Sc2ccccc2N(CCN(C)C)C(=O)C1OC(C)=O,COc1ccc(C2Sc3ccccc3N(CC[NH+](C)C)C(=O)C2OC(C)=...,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...,JCP2022_032357,HSUGRBWQSSZJOP-UHFFFAOYSA-N,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...,COc1ccc(C2Sc3ccccc3N(CCN(C)C)C(=O)C2OC(C)=O)cc1,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...
153,BRD-K98763141-001-30-8,JZFPYUNJRRFVQU-UHFFFAOYSA-N,niflumic-acid,4488.0,UGT1A9,trt,,OC(=O)c1cccnc1Nc1cccc(c1)C(F)(F)F,O=C([O-])c1cccnc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-...",JCP2022_043099,JZFPYUNJRRFVQU-UHFFFAOYSA-N,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-...",O=C(O)c1cccnc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-..."


In [15]:
# Match Target-2 compounds to JUMP compounds by identical standardized InChI.
df = target2.merge(jumpcp, on="Standardized_InChI", how="inner")
df

Unnamed: 0,broad_sample,InChIKey,pert_iname,pubchem_cid,target,pert_type,control_type,smiles_x,Standardized_SMILES_x,Standardized_InChI,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI,smiles_y,Standardized_SMILES_y
0,BRD-K09338665-001-07-1,KBPLFHHGFOOTCA-UHFFFAOYSA-N,1-octanol,957.0,GJB4,trt,,CCCCCCCCO,CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",JCP2022_043547,KBPLFHHGFOOTCA-UHFFFAOYSA-N,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",CCCCCCCCO,CCCCCCCCO
1,BRD-K93632104-001-17-2,YGSDEFSMJLZEOE-UHFFFAOYSA-N,salicylic-acid,118212070.0,AKR1C1,trt,,OC(=O)c1ccccc1O,O=C([O-])c1ccccc1[O-],"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8...",JCP2022_108326,YGSDEFSMJLZEOE-UHFFFAOYSA-N,"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8...",O=C(O)c1ccccc1O,O=C([O-])c1ccccc1[O-]
2,BRD-K25244359-066-04-9,WPEWQEMJFLWMLV-UHFFFAOYSA-N,apatinib,11315474.0,CSK,trt,,O=C(Nc1ccc(cc1)C1(CCCC1)C#N)c1cccnc1NCc1ccncc1,N#CC1(c2ccc(NC(=O)c3cccnc3NCc3ccncc3)cc2)CCCC1,InChI=1S/C24H23N5O/c25-17-24(11-1-2-12-24)19-5...,JCP2022_100264,WPEWQEMJFLWMLV-UHFFFAOYSA-N,InChI=1S/C24H23N5O/c25-17-24(11-1-2-12-24)19-5...,N#CC1(c2ccc(NC(=O)c3cccnc3NCc3ccncc3)cc2)CCCC1,N#CC1(c2ccc(NC(=O)c3cccnc3NCc3ccncc3)cc2)CCCC1
3,BRD-K44067360-001-30-3,LPEPZBJOKDYZAD-UHFFFAOYSA-N,flufenamic-acid,3371.0,GJB4,trt,,OC(=O)c1ccccc1Nc1cccc(c1)C(F)(F)F,O=C([O-])c1ccccc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C14H10F3NO2/c15-14(16,17)9-4-3-5-10(8...",JCP2022_050861,LPEPZBJOKDYZAD-UHFFFAOYSA-N,"InChI=1S/C14H10F3NO2/c15-14(16,17)9-4-3-5-10(8...",O=C(O)c1ccccc1Nc1cccc(C(F)(F)F)c1,O=C([O-])c1ccccc1Nc1cccc(C(F)(F)F)c1
4,BRD-K59058747-001-22-9,PWKSKIMOESPYIA-BYPYZUCNSA-N,acetylcysteine,12035.0,SLC7A11,trt,,CC(=O)N[C@@H](CS)C(O)=O,CC(=O)NC(CS)C(=O)[O-],"InChI=1S/C5H9NO3S/c1-3(7)6-4(2-10)5(8)9/h4,10H...",JCP2022_071429,PWKSKIMOESPYIA-UHFFFAOYSA-N,"InChI=1S/C5H9NO3S/c1-3(7)6-4(2-10)5(8)9/h4,10H...",CC(O)=NC(CS)C(=O)O,CC(=O)NC(CS)C(=O)[O-]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,BRD-K31170746-001-03-9,QUGDTMONBLMLLD-UHFFFAOYSA-N,ANR-94,11805896.0,ADORA2A,trt,,CCOc1nc2c(N)ncnc2n1CC,CCOc1nc2c(N)ncnc2n1CC,InChI=1S/C9H13N5O/c1-3-14-8-6(7(10)11-5-12-8)1...,JCP2022_075916,QUGDTMONBLMLLD-UHFFFAOYSA-N,InChI=1S/C9H13N5O/c1-3-14-8-6(7(10)11-5-12-8)1...,CCOc1nc2c(N)ncnc2n1CC,CCOc1nc2c(N)ncnc2n1CC
159,BRD-K80970344-201-10-9,VSWDORGPIHIGNW-UHFFFAOYSA-N,pyrrolidine-dithiocarbamate,65351.0,HSD11B1,trt,,SC(=S)N1CCCC1,S=C([S-])[NH+]1CCCC1,"InChI=1S/C5H9NS2/c7-5(8)6-3-1-2-4-6/h1-4H2,(H,...",JCP2022_096067,VSWDORGPIHIGNW-UHFFFAOYSA-N,"InChI=1S/C5H9NS2/c7-5(8)6-3-1-2-4-6/h1-4H2,(H,...",S=C(S)N1CCCC1,S=C([S-])[NH+]1CCCC1
160,BRD-A69636825-003-04-7,HSUGRBWQSSZJOP-UHFFFAOYSA-N,diltiazem,3076.0,CACNG1,trt,,COc1ccc(cc1)C1Sc2ccccc2N(CCN(C)C)C(=O)C1OC(C)=O,COc1ccc(C2Sc3ccccc3N(CC[NH+](C)C)C(=O)C2OC(C)=...,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...,JCP2022_032357,HSUGRBWQSSZJOP-UHFFFAOYSA-N,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...,COc1ccc(C2Sc3ccccc3N(CCN(C)C)C(=O)C2OC(C)=O)cc1,COc1ccc(C2Sc3ccccc3N(CC[NH+](C)C)C(=O)C2OC(C)=...
161,BRD-K98763141-001-30-8,JZFPYUNJRRFVQU-UHFFFAOYSA-N,niflumic-acid,4488.0,UGT1A9,trt,,OC(=O)c1cccnc1Nc1cccc(c1)C(F)(F)F,O=C([O-])c1cccnc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-...",JCP2022_043099,JZFPYUNJRRFVQU-UHFFFAOYSA-N,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-...",O=C(O)c1cccnc1Nc1cccc(C(F)(F)F)c1,O=C([O-])c1cccnc1Nc1cccc(C(F)(F)F)c1


In [19]:
# Match Target-2 compounds to JUMP compounds by the InChIKey supplied in each
# metadata table (no standardization involved).
df = pd.merge(
    target2,
    jumpcp,
    left_on="InChIKey",
    right_on="Metadata_InChIKey",
    how="inner",
)
df

Unnamed: 0,broad_sample,InChIKey,pert_iname,pubchem_cid,target,pert_type,control_type,smiles_x,Standardized_SMILES_x,Standardized_InChI_x,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI,smiles_y,Standardized_SMILES_y,Standardized_InChI_y
0,BRD-K09338665-001-07-1,KBPLFHHGFOOTCA-UHFFFAOYSA-N,1-octanol,957.0,GJB4,trt,,CCCCCCCCO,CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",JCP2022_043547,KBPLFHHGFOOTCA-UHFFFAOYSA-N,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",CCCCCCCCO,CCCCCCCCO,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3"
1,BRD-K93632104-001-17-2,YGSDEFSMJLZEOE-UHFFFAOYSA-N,salicylic-acid,118212070.0,AKR1C1,trt,,OC(=O)c1ccccc1O,O=C([O-])c1ccccc1[O-],"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8...",JCP2022_108326,YGSDEFSMJLZEOE-UHFFFAOYSA-N,"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8...",O=C(O)c1ccccc1O,O=C([O-])c1ccccc1[O-],"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8..."
2,BRD-K25244359-066-04-9,WPEWQEMJFLWMLV-UHFFFAOYSA-N,apatinib,11315474.0,CSK,trt,,O=C(Nc1ccc(cc1)C1(CCCC1)C#N)c1cccnc1NCc1ccncc1,N#CC1(c2ccc(NC(=O)c3cccnc3NCc3ccncc3)cc2)CCCC1,InChI=1S/C24H23N5O/c25-17-24(11-1-2-12-24)19-5...,JCP2022_100264,WPEWQEMJFLWMLV-UHFFFAOYSA-N,InChI=1S/C24H23N5O/c25-17-24(11-1-2-12-24)19-5...,N#CC1(c2ccc(NC(=O)c3cccnc3NCc3ccncc3)cc2)CCCC1,N#CC1(c2ccc(NC(=O)c3cccnc3NCc3ccncc3)cc2)CCCC1,InChI=1S/C24H23N5O/c25-17-24(11-1-2-12-24)19-5...
3,BRD-K44067360-001-30-3,LPEPZBJOKDYZAD-UHFFFAOYSA-N,flufenamic-acid,3371.0,GJB4,trt,,OC(=O)c1ccccc1Nc1cccc(c1)C(F)(F)F,O=C([O-])c1ccccc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C14H10F3NO2/c15-14(16,17)9-4-3-5-10(8...",JCP2022_050861,LPEPZBJOKDYZAD-UHFFFAOYSA-N,"InChI=1S/C14H10F3NO2/c15-14(16,17)9-4-3-5-10(8...",O=C(O)c1ccccc1Nc1cccc(C(F)(F)F)c1,O=C([O-])c1ccccc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C14H10F3NO2/c15-14(16,17)9-4-3-5-10(8..."
4,BRD-A54139254-001-10-9,AZYDQCGCBQYFSE-UHFFFAOYSA-N,4-CMTB,4307629.0,FFAR2,trt,,CC(C)C(C(=O)Nc1nccs1)c1ccc(Cl)cc1,CC(C)C(C(=O)Nc1nccs1)c1ccc(Cl)cc1,InChI=1S/C14H15ClN2OS/c1-9(2)12(10-3-5-11(15)6...,JCP2022_004940,AZYDQCGCBQYFSE-UHFFFAOYSA-N,InChI=1S/C14H15ClN2OS/c1-9(2)12(10-3-5-11(15)6...,CC(C)C(C(O)=Nc1nccs1)c1ccc(Cl)cc1,CC(C)C(C(=O)Nc1nccs1)c1ccc(Cl)cc1,InChI=1S/C14H15ClN2OS/c1-9(2)12(10-3-5-11(15)6...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,BRD-K80970344-201-10-9,VSWDORGPIHIGNW-UHFFFAOYSA-N,pyrrolidine-dithiocarbamate,65351.0,HSD11B1,trt,,SC(=S)N1CCCC1,S=C([S-])[NH+]1CCCC1,"InChI=1S/C5H9NS2/c7-5(8)6-3-1-2-4-6/h1-4H2,(H,...",JCP2022_096067,VSWDORGPIHIGNW-UHFFFAOYSA-N,"InChI=1S/C5H9NS2/c7-5(8)6-3-1-2-4-6/h1-4H2,(H,...",S=C(S)N1CCCC1,S=C([S-])[NH+]1CCCC1,"InChI=1S/C5H9NS2/c7-5(8)6-3-1-2-4-6/h1-4H2,(H,..."
178,BRD-A69636825-003-04-7,HSUGRBWQSSZJOP-UHFFFAOYSA-N,diltiazem,3076.0,CACNG1,trt,,COc1ccc(cc1)C1Sc2ccccc2N(CCN(C)C)C(=O)C1OC(C)=O,COc1ccc(C2Sc3ccccc3N(CC[NH+](C)C)C(=O)C2OC(C)=...,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...,JCP2022_032357,HSUGRBWQSSZJOP-UHFFFAOYSA-N,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...,COc1ccc(C2Sc3ccccc3N(CCN(C)C)C(=O)C2OC(C)=O)cc1,COc1ccc(C2Sc3ccccc3N(CC[NH+](C)C)C(=O)C2OC(C)=...,InChI=1S/C22H26N2O4S/c1-15(25)28-20-21(16-9-11...
179,BRD-K87782578-001-03-9,KXBDTLQSDKGAEB-UHFFFAOYSA-N,AVL-292,59174488.0,BTK,trt,,COCCOc1ccc(Nc2ncc(F)c(Nc3cccc(NC(=O)C=C)c3)n2)cc1,C=CC(=O)Nc1cccc(Nc2nc(Nc3ccc(OCCOC)cc3)ncc2F)c1,InChI=1S/C22H22FN5O3/c1-3-20(29)25-16-5-4-6-17...,JCP2022_047545,KXBDTLQSDKGAEB-UHFFFAOYSA-N,InChI=1S/C22H22FN5O3/c1-3-20(29)25-16-5-4-6-17...,C=CC(O)=Nc1cccc(N=c2[nH]c(Nc3ccc(OCCOC)cc3)ncc...,C=CC(=O)Nc1cccc(N=c2[n-]c(Nc3ccc(OCCOC)cc3)ncc...,InChI=1S/C22H22FN5O3/c1-3-20(29)25-16-5-4-6-17...
180,BRD-K98763141-001-30-8,JZFPYUNJRRFVQU-UHFFFAOYSA-N,niflumic-acid,4488.0,UGT1A9,trt,,OC(=O)c1cccnc1Nc1cccc(c1)C(F)(F)F,O=C([O-])c1cccnc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-...",JCP2022_043099,JZFPYUNJRRFVQU-UHFFFAOYSA-N,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-...",O=C(O)c1cccnc1Nc1cccc(C(F)(F)F)c1,O=C([O-])c1cccnc1Nc1cccc(C(F)(F)F)c1,"InChI=1S/C13H9F3N2O2/c14-13(15,16)8-3-1-4-9(7-..."
