In [10]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from typing import Tuple
from tqdm import tqdm

In [3]:
def get_morgan_fingerprint(mol: Chem.Mol) -> rdFingerprintGenerator.MorganFP:
    """
    Get the fingerprint of a molecule.
    """
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    return mfpgen.GetFingerprint(mol)

def calculate_tanimoto(mol1: Chem.Mol,
                       mol2: Chem.Mol) -> float:
    if mol1 is None or mol2 is None:
        return 0
    # do full circle back to smiles to mol
    mol1, mol2 = Chem.MolFromSmiles(Chem.MolToSmiles(mol1)), Chem.MolFromSmiles(Chem.MolToSmiles(mol2))
    fp1 = get_morgan_fingerprint(mol1)
    fp2 = get_morgan_fingerprint(mol2)
    return TanimotoSimilarity(fp1, fp2)

In [6]:
df = pd.read_csv('sept23_syndirella_CHIKV_MAC_final_car_routes.csv')
df

Unnamed: 0,1_r1_smiles,1_r2_smiles,1_reaction,2_r1_smiles,2_r2_smiles,2_reaction,compound_set,input_num,route_uuid,smiles,hit1_short,hit2_short,datetime,num_steps,flat_inchi
0,NC1CCC(=O)NC1,NC(=O)c1ccc(C(=O)O)c(Cl)c1,Amidation,,,,cx1371a-cx0300a,44.0,32fgo7,NC(=O)c1ccc(C(=O)N[C@@H]2CCC(=O)NC2)c(Cl)c1,cx1371a,cx0300a,2024-09-03 18:02:00,1.0,SMVDAYWKXWVCAN-UHFFFAOYSA-N
1,O=C(O)COc1cccc(F)c1,CNCc1cccc(O)n1,Amidation,,,,cx0540a-cx1118a,134.0,qehMd6,CN(Cc1cccc(O)n1)C(=O)COc1cccc(F)c1,cx1118a,cx0540a,2024-09-04 10:07:00,1.0,MNLAWAQYKQVGLK-UHFFFAOYSA-N
2,CNCc1ccc(O)nc1,CCOc1ncccc1C(=O)O,Amidation,,,,cx0522a-cx1118a,50.0,tTnD23,CCOc1ncccc1C(=O)N(C)Cc1ccc(O)nc1,cx0522a,cx1118a,2024-09-03 20:56:00,1.0,FCGOSHSMTQLVEX-UHFFFAOYSA-N
3,NC1CCCC1CO,Clc1c(I)ccc2[nH]cnc12,Buchwald-Hartwig_amination,,,,cx0894a-cx0406a,238.0,bk89Er,OC[C@@H]1CCC[C@H]1Nc1ccc2[nH]cnc2c1Cl,cx0406a,cx0894a,2024-09-04 11:35:00,1.0,WHLDZOYDHAHKLT-UHFFFAOYSA-N
4,NCc1ccc(O)s1,CCOC(=O)c1ccnn1C,Ester_amidation,,,,cx0892a-cx1151a,128.0,8NWsBn,Cn1nccc1C(=O)NCc1ccc(O)s1,cx1151a,cx0892a,2024-09-04 09:20:00,1.0,IMTCTUMLWQUKFC-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,OCCl,Clc1ccc2c(c1)NCO2,Nucleophilic_substitution_with_amine,OCN1COc2ccc(Cl)cc21,OB(O)c1ccn[nH]1,Sp2-sp2_Suzuki_coupling,cx0864d-cx0969b,300.0,6ACutS,OCN1COc2ccc(-c3ccn[nH]3)cc21,cx0864d,cx0969b,2024-09-04 11:31:00,2.0,CVNPYWIROGTFDL-UHFFFAOYSA-N
65,O=CO,CN(C)C(=O)CNc1cc[nH]c(=O)c1,Amidation,,,,cx1076a-cx0721a,161.0,4DEaMj,CN(C)C(=O)CN(C=O)c1cc[nH]c(=O)c1,cx1076a,cx0721a,2024-09-04 09:09:00,1.0,KKYCNGMNUUPLTM-UHFFFAOYSA-N
66,Nc1ccccc1N,COCC(=O)OC(C)(C)C,Ester_amidation,,,,cx0692a-cx1116a,283.0,3bhpFq,COCC(=O)Nc1ccccc1N,cx0692a,cx1116a,2024-09-05 11:12:00,1.0,PPHADCGGEPSVTM-UHFFFAOYSA-N
67,CNCc1cccc(=O)[nH]1,CCOC(=O)c1cn[nH]c1,Ester_amidation,,,,cx0732a-cx1135a,29.0,ZzGYCp,CN(Cc1cccc(=O)[nH]1)C(=O)c1cn[nH]c1,cx0732a,cx1135a,2024-09-03 19:00:00,1.0,ZJENMQHSGLZNHL-UHFFFAOYSA-N


In [7]:
enamine = pd.read_excel('../quoting/Q1924016_CHIKV_Reactants.xlsx')
enamine

Unnamed: 0,ID,SMILES,Catalogue ID,"Price per 20mg, EUR","Price per 50mg, EUR","Price per 100mg, EUR","Price per 250mg, EUR",Availability,Match Type,"Purity, %",Alternative_stock_salt_form,Alternative_stock_stereo_form
0,15,Nc1ccc2ncccc2c1,EN300-41836,17,18,18,18,In stock,Exact match,95.0,,
1,18,CNC1CCN(c2ccccc2F)C1=O,EN300-146245,0,0,0,0,In stock,Other salt form,95.0,EN300-75310,
2,19,Cc1[nH][nH]c(=O)c1CCO,EN300-260735,17,18,27,38,In stock,Exact match,95.0,,
3,25,O=c1cc[nH]c(=O)[nH]1,EN300-17138,17,18,18,18,In stock,Exact match,95.0,,
4,32,COCC(=O)Nc1ccc(O)cc1,EN300-76300,56,84,125,179,In stock,Exact match,95.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
9926,91469,Cc1c(CCO)c(=O)[nH]n1S(=O)(=O)c1ccc(I)cc1,EN300-18644000,0,0,0,0,Out of stock,Exact match,95.0,,
9927,91471,Cc1c(CCO)c(=O)[nH]n1C(=O)c1c(F)cccc1F,EN300-18636910,0,0,0,0,Out of stock,Exact match,95.0,,
9928,91473,Cc1ccc(-n2[nH]c(C(=O)O)c(CCO)c2=O)cc1C,EN300-236836,0,0,0,0,Out of stock,Exact match,95.0,,
9929,91475,Cc1ccc(S(=O)(=O)n2[nH]c(=O)c(CCO)c2C)cc1,EN300-25004434,0,0,0,0,Out of stock,Exact match,95.0,,


In [81]:
# just check if any intermediate is in enamine
#df = routes[routes['2_r1_smiles'].notna()] 

In [8]:
df.fillna('', inplace=True)

In [11]:
# go through row by row
for i, irow in tqdm(df.iterrows()):
    # check if all reactants in enamine
    for j, jrow in enamine[enamine['Availability'] == 'In stock'].iterrows():
        if calculate_tanimoto(Chem.MolFromSmiles(irow['1_r1_smiles']), Chem.MolFromSmiles(jrow['SMILES'])) == 1.0:
            df.loc[i, '1_r1'] = jrow['Catalogue ID']
        if calculate_tanimoto(Chem.MolFromSmiles(irow['1_r2_smiles']), Chem.MolFromSmiles(jrow['SMILES'])) == 1.0:
            df.loc[i, '1_r2'] = jrow['Catalogue ID']
        if calculate_tanimoto(Chem.MolFromSmiles(irow['2_r1_smiles']), Chem.MolFromSmiles(jrow['SMILES'])) == 1.0:
            df.loc[i, '2_r1'] = jrow['Catalogue ID']
        if calculate_tanimoto(Chem.MolFromSmiles(irow['2_r2_smiles']), Chem.MolFromSmiles(jrow['SMILES'])) == 1.0:
            df.loc[i, '2_r2'] = jrow['Catalogue ID']

69it [09:15,  8.05s/it]


In [12]:
df

Unnamed: 0,1_r1_smiles,1_r2_smiles,1_reaction,2_r1_smiles,2_r2_smiles,2_reaction,compound_set,input_num,route_uuid,smiles,hit1_short,hit2_short,datetime,num_steps,flat_inchi,1_r1,1_r2,2_r1,2_r2
0,NC1CCC(=O)NC1,NC(=O)c1ccc(C(=O)O)c(Cl)c1,Amidation,,,,cx1371a-cx0300a,44.0,32fgo7,NC(=O)c1ccc(C(=O)N[C@@H]2CCC(=O)NC2)c(Cl)c1,cx1371a,cx0300a,2024-09-03 18:02:00,1.0,SMVDAYWKXWVCAN-UHFFFAOYSA-N,EN300-56890,,,
1,O=C(O)COc1cccc(F)c1,CNCc1cccc(O)n1,Amidation,,,,cx0540a-cx1118a,134.0,qehMd6,CN(Cc1cccc(O)n1)C(=O)COc1cccc(F)c1,cx1118a,cx0540a,2024-09-04 10:07:00,1.0,MNLAWAQYKQVGLK-UHFFFAOYSA-N,EN300-00073,EN300-702794,,
2,CNCc1ccc(O)nc1,CCOc1ncccc1C(=O)O,Amidation,,,,cx0522a-cx1118a,50.0,tTnD23,CCOc1ncccc1C(=O)N(C)Cc1ccc(O)nc1,cx0522a,cx1118a,2024-09-03 20:56:00,1.0,FCGOSHSMTQLVEX-UHFFFAOYSA-N,EN300-2855135,EN300-21350,,
3,NC1CCCC1CO,Clc1c(I)ccc2[nH]cnc12,Buchwald-Hartwig_amination,,,,cx0894a-cx0406a,238.0,bk89Er,OC[C@@H]1CCC[C@H]1Nc1ccc2[nH]cnc2c1Cl,cx0406a,cx0894a,2024-09-04 11:35:00,1.0,WHLDZOYDHAHKLT-UHFFFAOYSA-N,EN300-76012,EN300-25644419,,
4,NCc1ccc(O)s1,CCOC(=O)c1ccnn1C,Ester_amidation,,,,cx0892a-cx1151a,128.0,8NWsBn,Cn1nccc1C(=O)NCc1ccc(O)s1,cx1151a,cx0892a,2024-09-04 09:20:00,1.0,IMTCTUMLWQUKFC-UHFFFAOYSA-N,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,OCCl,Clc1ccc2c(c1)NCO2,Nucleophilic_substitution_with_amine,OCN1COc2ccc(Cl)cc21,OB(O)c1ccn[nH]1,Sp2-sp2_Suzuki_coupling,cx0864d-cx0969b,300.0,6ACutS,OCN1COc2ccc(-c3ccn[nH]3)cc21,cx0864d,cx0969b,2024-09-04 11:31:00,2.0,CVNPYWIROGTFDL-UHFFFAOYSA-N,,,,EN300-200298
65,O=CO,CN(C)C(=O)CNc1cc[nH]c(=O)c1,Amidation,,,,cx1076a-cx0721a,161.0,4DEaMj,CN(C)C(=O)CN(C=O)c1cc[nH]c(=O)c1,cx1076a,cx0721a,2024-09-04 09:09:00,1.0,KKYCNGMNUUPLTM-UHFFFAOYSA-N,EN300-19898,,,
66,Nc1ccccc1N,COCC(=O)OC(C)(C)C,Ester_amidation,,,,cx0692a-cx1116a,283.0,3bhpFq,COCC(=O)Nc1ccccc1N,cx0692a,cx1116a,2024-09-05 11:12:00,1.0,PPHADCGGEPSVTM-UHFFFAOYSA-N,EN300-19093,,,
67,CNCc1cccc(=O)[nH]1,CCOC(=O)c1cn[nH]c1,Ester_amidation,,,,cx0732a-cx1135a,29.0,ZzGYCp,CN(Cc1cccc(=O)[nH]1)C(=O)c1cn[nH]c1,cx0732a,cx1135a,2024-09-03 19:00:00,1.0,ZJENMQHSGLZNHL-UHFFFAOYSA-N,,EN300-93017,,


In [13]:
df.to_csv('sept23_syndirella_CHIKV_MAC_final_car_routes_w_enamine.csv')

In [14]:

# Define the function to check if all reactants are in enamine
def check_reactants(row):
    if row['num_steps'] == 1:
        if row['1_r2_smiles'] == '':
            return not pd.isna(row['1_r1'])
        else:
            return not pd.isna(row['1_r1']) and not pd.isna(row['1_r2'])
    elif row['num_steps'] == 2:
        if row['1_r2_smiles'] == '':
            return not pd.isna(row['1_r1']) and (not pd.isna(row['2_r1']) or not pd.isna(row['2_r2']))
        return not pd.isna(row['1_r1']) and not pd.isna(row['1_r2']) and (not pd.isna(row['2_r1']) or not pd.isna(row['2_r2']))
    return False

# Apply the function to each row
df['all_reactants_in_enamine'] = df.apply(check_reactants, axis=1)
df

Unnamed: 0,1_r1_smiles,1_r2_smiles,1_reaction,2_r1_smiles,2_r2_smiles,2_reaction,compound_set,input_num,route_uuid,smiles,hit1_short,hit2_short,datetime,num_steps,flat_inchi,1_r1,1_r2,2_r1,2_r2,all_reactants_in_enamine
0,NC1CCC(=O)NC1,NC(=O)c1ccc(C(=O)O)c(Cl)c1,Amidation,,,,cx1371a-cx0300a,44.0,32fgo7,NC(=O)c1ccc(C(=O)N[C@@H]2CCC(=O)NC2)c(Cl)c1,cx1371a,cx0300a,2024-09-03 18:02:00,1.0,SMVDAYWKXWVCAN-UHFFFAOYSA-N,EN300-56890,,,,False
1,O=C(O)COc1cccc(F)c1,CNCc1cccc(O)n1,Amidation,,,,cx0540a-cx1118a,134.0,qehMd6,CN(Cc1cccc(O)n1)C(=O)COc1cccc(F)c1,cx1118a,cx0540a,2024-09-04 10:07:00,1.0,MNLAWAQYKQVGLK-UHFFFAOYSA-N,EN300-00073,EN300-702794,,,True
2,CNCc1ccc(O)nc1,CCOc1ncccc1C(=O)O,Amidation,,,,cx0522a-cx1118a,50.0,tTnD23,CCOc1ncccc1C(=O)N(C)Cc1ccc(O)nc1,cx0522a,cx1118a,2024-09-03 20:56:00,1.0,FCGOSHSMTQLVEX-UHFFFAOYSA-N,EN300-2855135,EN300-21350,,,True
3,NC1CCCC1CO,Clc1c(I)ccc2[nH]cnc12,Buchwald-Hartwig_amination,,,,cx0894a-cx0406a,238.0,bk89Er,OC[C@@H]1CCC[C@H]1Nc1ccc2[nH]cnc2c1Cl,cx0406a,cx0894a,2024-09-04 11:35:00,1.0,WHLDZOYDHAHKLT-UHFFFAOYSA-N,EN300-76012,EN300-25644419,,,True
4,NCc1ccc(O)s1,CCOC(=O)c1ccnn1C,Ester_amidation,,,,cx0892a-cx1151a,128.0,8NWsBn,Cn1nccc1C(=O)NCc1ccc(O)s1,cx1151a,cx0892a,2024-09-04 09:20:00,1.0,IMTCTUMLWQUKFC-UHFFFAOYSA-N,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,OCCl,Clc1ccc2c(c1)NCO2,Nucleophilic_substitution_with_amine,OCN1COc2ccc(Cl)cc21,OB(O)c1ccn[nH]1,Sp2-sp2_Suzuki_coupling,cx0864d-cx0969b,300.0,6ACutS,OCN1COc2ccc(-c3ccn[nH]3)cc21,cx0864d,cx0969b,2024-09-04 11:31:00,2.0,CVNPYWIROGTFDL-UHFFFAOYSA-N,,,,EN300-200298,False
65,O=CO,CN(C)C(=O)CNc1cc[nH]c(=O)c1,Amidation,,,,cx1076a-cx0721a,161.0,4DEaMj,CN(C)C(=O)CN(C=O)c1cc[nH]c(=O)c1,cx1076a,cx0721a,2024-09-04 09:09:00,1.0,KKYCNGMNUUPLTM-UHFFFAOYSA-N,EN300-19898,,,,False
66,Nc1ccccc1N,COCC(=O)OC(C)(C)C,Ester_amidation,,,,cx0692a-cx1116a,283.0,3bhpFq,COCC(=O)Nc1ccccc1N,cx0692a,cx1116a,2024-09-05 11:12:00,1.0,PPHADCGGEPSVTM-UHFFFAOYSA-N,EN300-19093,,,,False
67,CNCc1cccc(=O)[nH]1,CCOC(=O)c1cn[nH]c1,Ester_amidation,,,,cx0732a-cx1135a,29.0,ZzGYCp,CN(Cc1cccc(=O)[nH]1)C(=O)c1cn[nH]c1,cx0732a,cx1135a,2024-09-03 19:00:00,1.0,ZJENMQHSGLZNHL-UHFFFAOYSA-N,,EN300-93017,,,False


In [15]:
# multistep
df[df['num_steps'] == 2]

Unnamed: 0,1_r1_smiles,1_r2_smiles,1_reaction,2_r1_smiles,2_r2_smiles,2_reaction,compound_set,input_num,route_uuid,smiles,hit1_short,hit2_short,datetime,num_steps,flat_inchi,1_r1,1_r2,2_r1,2_r2,all_reactants_in_enamine
5,CC(C)(C)OC(=O)NCO,,N-Boc_deprotection,NCO,Cc1c(C(=O)O)ccnc1NC1CC=CC1,Amidation,cx0441a-cx0317a,171.0,wzbbXS,Cc1c(C(=O)NCO)ccnc1NC1CC=CC1,cx0317a,cx0441a,2024-09-04 09:36:00,2.0,YIIMVUSFSKUYCA-UHFFFAOYSA-N,,,,,False
14,O=Cc1nn(CC(=O)O)c2ccccc12,CNC,Amidation,Nc1ccc(O)cc1,CN(C)C(=O)Cn1nc(C=O)c2ccccc21,Reductive_amination,cx0935a-cx0312a,314.0,fbhmej,CN(C)C(=O)Cn1nc(CNc2ccc(O)cc2)c2ccccc21,cx0935a,cx0312a,2024-09-04 12:57:00,2.0,WLFMTZKZYACXSW-UHFFFAOYSA-N,,EN300-30964,EN300-33645,,False
18,O=c1cccc[nH]1,ClCc1noc(Br)n1,Nucleophilic_substitution_with_amine,O=c1ccccn1Cc1noc(Br)n1,CNc1nccc2ccccc12,N-nucleophilic_aromatic_substitution,cx0353a-cx0316a,140.0,Bpx294,CN(c1nc(Cn2ccccc2=O)no1)c1nccc2ccccc12,cx0353a,cx0316a,2024-09-04 10:49:00,2.0,OYAFCPFGSBONSE-UHFFFAOYSA-N,,,,EN300-1231023,False
21,O=C1CCC(Cl)NN1,CN,Nucleophilic_substitution_with_amine,CNC1CCC(=O)NN1,CCOc1ncccc1C(=O)O,Amidation,cx0522a-cx1135a,51.0,e5WDiF,CCOc1ncccc1C(=O)N(C)[C@H]1CCC(=O)NN1,cx0522a,cx1135a,2024-09-03 19:43:00,2.0,CTECJTBKOFPTMY-UHFFFAOYSA-N,,,,EN300-21350,False
22,O=Cc1nn(CC(=O)O)c2ccccc12,Nc1ccc(O)cc1,Reductive_amination,O=C(O)Cn1nc(CNc2ccc(O)cc2)c2ccccc21,CNC,Amidation,cx0312a-cx0935a,264.0,tCavCn,CN(C)C(=O)Cn1nc(CNc2ccc(O)cc2)c2ccccc21,cx0935a,cx0312a,2024-09-04 20:01:00,2.0,WLFMTZKZYACXSW-UHFFFAOYSA-N,,EN300-33645,,EN300-30964,False
32,Cc1cc(I)cc(F)n1,CNNC=O,N-nucleophilic_aromatic_substitution,Cc1cc(I)cc(N(C)NC=O)n1,CC(CCB1OC(C)(C)C(C)(C)O1)NC(N)=O,Sp3-sp2_Suzuki_coupling,cx0544a-cx1194a,33.0,oKid7K,Cc1cc(CC[C@H](C)NC(N)=O)cc(N(C)NC=O)n1,cx0544a,cx1194a,2024-09-03 18:02:00,2.0,OYZLDUQCBJYBED-UHFFFAOYSA-N,EN300-726488,,,,False
33,Nc1ccc(O)c(B(O)O)c1,COCC(=O)O,Amidation,O=C1CCC(NC(=O)c2csc(I)n2)CN1,COCC(=O)Nc1ccc(O)c(B(O)O)c1,Sp2-sp2_Suzuki_coupling,cx0935a-cx0300a,313.0,ZWFCao,COCC(=O)Nc1ccc(O)c(-c2nc(C(=O)N[C@@H]3CCC(=O)N...,cx0935a,cx0300a,2024-09-04 10:40:00,2.0,SBEBTLKTHHBVAY-UHFFFAOYSA-N,,EN300-20233,,,False
34,N#Cc1c(O)ccc(N)c1F,COCC(=O)O,Amidation,COCC(=O)Nc1ccc(O)c(C#N)c1F,CNCC(N)=O,N-nucleophilic_aromatic_substitution,cx0935a-cx0553a,317.0,sCHeek,COCC(=O)Nc1ccc(O)c(C#N)c1N(C)CC(N)=O,cx0935a,cx0553a,2024-09-04 14:44:00,2.0,XXPRWRCNORVHSY-UHFFFAOYSA-N,EN300-6802393,EN300-20233,,EN300-41185,True
36,O=C(O)C(CS)c1cccc(Br)c1,NC1CCC(=O)NC1,Amidation,O=C1CCC(NC(=O)C(CS)c2cccc(Br)c2)CN1,CCC(N)N,Buchwald-Hartwig_amination,cx1338a-cx0300a,40.0,TqMXju,CC[C@H](N)Nc1cccc([C@@H](CS)C(=O)N[C@@H]2CCC(=...,cx1338a,cx0300a,2024-09-05 16:16:00,2.0,JAHJLJNEYRGXQB-UHFFFAOYSA-N,,EN300-56890,,,False
39,Nc1cc(Br)c[nH]1,CCCO,Mitsunobu_reaction_with_amine_alcohol_and_thio...,CCCNc1cc(Br)c[nH]1,CC(C)CCB(O)O,Sp3-sp2_Suzuki_coupling,cx1278a-cx0353a,106.0,BbS6ek,CCCNc1cc(CCC(C)C)c[nH]1,cx1278a,cx0353a,2024-09-04 16:25:00,2.0,VXOZIAXPIMEJPP-UHFFFAOYSA-N,,EN300-19337,,EN300-1252973,False


In [97]:
df.to_csv('/Users/kate_fieseler/PycharmProjects/CHIKV-Mac-syndirella-run/sept23_syndirella_CHIKV_MAC_final_car_routes_w_enamine.csv', index=False)

In [19]:
df[df['all_reactants_in_enamine'] == False][df['num_steps']==2].to_csv('missing_two_step_routes.csv', index=False)

  df[df['all_reactants_in_enamine'] == False][df['num_steps']==2].to_csv('missing_two_step_routes.csv', index=False)
