In [67]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
%autoreload 2
from pathlib import Path
import molparse as mp
import hippo2 as hippo
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import Chem
import mout
import mcol

### load the restart animal

In [69]:
animal = hippo.HIPPO.from_pickle('pickles/hippo2_test_comps_shell_restart.pickle')

[36mpath[0m = [38;5;11mpickles/hippo2_test_comps_shell_restart.pickle[0m[95m [0m[0m


In [71]:
for base in animal.bases:
    if 'fegrow' in base.name:
        continue
    print(base)

Compound(matteo-ferla-rocs-cov-Z2737383535-1-of-1-step-base, C=CC(=O)N1Cc2ccccc2OC(C)(C)C1, #poses=1)
Compound(matteo-ferla-rocs-cov-Z4425282901-1-of-1-step-base, C=CC(=O)NCC1(NC(=O)c2ccc(F)c3ccccc23)CCOCC1, #poses=1)
Compound(matteo-ferla-rocs-mixed-Z420031520-1-of-1-step-base, Cc1ccccc1CNC(=O)c1scnc1C, #poses=1)
Compound(matteo-ferla-rocs-mixed-Z645758468-1-of-1-step-base, COc1ccc2ccccc2c1CNC(=O)c1c(C)nn2c(C)cc(C)nc12, #poses=1)
Compound(matteo-ferla-rocs-mixed-Z27634076-1-of-1-step-base, Cc1cc(C(=O)NC(C)c2ccccc2Cl)no1, #poses=2)
Compound(matteo-ferla-rocs-mixed-Z27634076-1-of-1-step-base-2, Cc1cc(C(=O)NC2Cc3cccc(Cl)c32)no1, #poses=2)
Compound(matteo-ferla-rocs-mixed-Z1083845368-1-of-1-step-base, CC(=O)NC(Cc1c[nH]c2ccccc12)C(=O)N1CCN(C)c2ccccc2C1, #poses=2)
Compound(matteo-ferla-rocs-cov-Z4899905008-1-of-1-step-base, C=CC(=O)NCc1ccccc1C(=O)Nc1cc(C(=O)Nc2nc(C)cs2)ccc1OC, #poses=1)
Compound(matteo-ferla-rocs-cov-Z4408138101-1-of-1-step-base, C=CC(=O)N(Cc1cc(Cl)c2c(c1)OCCCO2)C1CCCCNC1=O

### load the curated data

In [28]:
df = pd.read_excel('Quote_1782205_clean.xlsx')

In [29]:
df

Unnamed: 0,SMILES,Customer Code,Catalog ID,"Purity, %","Amount, mg",Collection,"Price, USD",Lead time,Detail
0,CCN1C(=NC=2C=CC=CC12)C(C)NC(=O)C,Z57841819,Z57841819,90,5,HTS,44,1-2 weeks,(stock)
1,CCCS(=O)(=O)NC=1ON=C(C)C1C,Z1607060571,Z1607060571,90,5,HTS,44,1-2 weeks,(stock)
2,CC(C)(C)NC(=O)CC1=NOC=2C=CC=CC12,Z99095183,Z99095183,90,5,HTS,44,1-2 weeks,(stock)
3,CC1=NOC(C)=C1CC(=O)NC=2C=CC=C(OCC=3C=CC=CN3)C2,Z383844704,Z383844704,90,5,HTS,44,1-2 weeks,(stock)
4,CC=1N=CSC1C(=O)NCC=2C=CC=CC2C,Z420031520,Z420031520,90,5,HTS,44,1-2 weeks,(stock)
...,...,...,...,...,...,...,...,...,...
121,ClC=1C=CC=2C(NC(=O)C3(CC3)C=4C=CC(=CC4)C=5N=NN...,PV-006984519164,Z8873590480,90,5,No starting material,0,unavailable,(unavailable)
122,ClC=1C=CC=2C(NC(=O)C(CC=3N=NNN3)C=4C=CC=CN4)=C...,PV-007011194285,Z8873590481,90,5,No starting material,0,unavailable,(unavailable)
123,CC=1C=C(C=C(C)C1CC(=O)NC=2C=CC=C3N=C(Cl)C=CC23...,PV-007016163177,Z8873590482,90,5,No starting material,0,unavailable,(unavailable)
124,COC(=O)C=1C=C2OC3(CCN(C3)C(=O)OC(C)(C)C)CC(N)C...,EN300_1129082,Z3002995587,90,5,No starting material,0,unavailable,(unavailable)


In [30]:
### match base compounds

In [31]:
fps = [FingerprintMols.FingerprintMol(Chem.MolFromSmiles(s), minPath=1, maxPath=7, fpSize=2048, bitsPerHash=2, useHs=True, tgtDensity=0.0, minSize=128) 
           for s in df['SMILES']]

def get_similar(df, smiles):
    mol = Chem.MolFromSmiles(Chem.CanonSmiles(smiles))
    fp = FingerprintMols.FingerprintMol(mol, minPath=1, maxPath=7, fpSize=2048, bitsPerHash=2, useHs=True, tgtDensity=0.0, minSize=128)
    scores = DataStructs.BulkTanimotoSimilarity(fp, fps)
    df['similarity'] = scores
    return df[df['similarity'] == 1.0]

In [36]:
curated_bases = hippo.cset.CompoundSet('curated bases')
for base in animal.get_compounds('base'):
    # print(base)
    # print(get_similar(df, base.smiles))
    matches = get_similar(df, base.smiles)
    if len(matches) == 1:
        curated_bases.add(base)

curated_bases

CompoundSet("curated bases", #compounds=88, #poses=184)

In [56]:
delete_list = hippo.cset.CompoundSet('delete list')

n = len(animal.compounds)

for i,comp in enumerate(animal.compounds):

    if i%100 == 0:
        # print(i)
        mout.progress(i,n)

    if 'hits' in comp.tags:
        continue

    if 'base' in comp.tags:
        curated = comp in curated_bases
        if curated:
            continue
        else:
            delete_list.add(comp)
            continue

    if 'elab' in comp.tags:
        base = comp.base
        if base is None:
            base_name = '-'.join(comp.name.split('-')[:-1]) + '-base'
            base = base_name
        
        curated = base in curated_bases
        if curated:
            continue
        else:
            delete_list.add(comp)
            continue

    mout.error('Unknown compound')

mout.finish()

print(delete_list)

[0m[################################################ ] [38;5;27m 99.95%[0m[0mCompoundSet("delete list", #compounds=14197, #poses=49818)


In [62]:
animal._delete_compounds(delete_list, debug=True)

[1m[34m>>> [1mCompoundSet("hits", #compounds=38, #poses=49)[0m[0m
[1m[34m>>> [1mCompound(x0152, O=C1CN(C(=O)COc2ccccc2)CCN1, #poses=2)[0m[0m
[36mreferences deleted[0m = 0[0m[95m [0m[0m           ;5;27m 91.23%[0m[0m
[36mcompounds deleted[0m = 0[0m[95m [0m[0m            ;5;27m 91.23%[0m
[0m[############################################     ] [38;5;27m 91.23%[0m

In [42]:
curated_bases[0]

Compound(matteo-ferla-rocs-cov-Z2737383535-1-of-1-step-base, C=CC(=O)N1Cc2ccccc2OC(C)(C)C1, #poses=1)

In [63]:
animal.write_pickle('pickles/2A_hippo_post_trim.pickle')

                                                            [0m
[0m[############################################     ] [38;5;27m 91.23%[0mDone.[0m
[0m[############################################     ] [38;5;27m 91.23%[0m

In [64]:
animal.compounds

CompoundSet("compounds", #compounds=65607, #poses=216823)

In [65]:
animal.summary()

[1mHIPPO(hippo2_test)[0m[0m                              
[36mtarget_name[0m = A71EV2A[0m[95m [0m[0m            ;5;27m 91.23%[0m
[36mmax_lead_time[0m = 20[0m[95m workdays[0m[0m       ;5;27m 91.23%[0m
[36mmax_bb_price[0m = 100[0m[95m $[0m[0m              ;5;27m 91.23%[0m
[36mmin_bb_quantity[0m = 20[0m[95m mg[0m[0m           ;5;27m 91.23%[0m
[36m#compound_sets[0m = 1[0m[95m [0m[0m               ;5;27m 91.23%[0m
[36m#compounds[0m = 65607[0m[95m [0m[0m               ;5;27m 91.23%[0m
[36m#poses[0m = 216823[0m[95m [0m[0m                  ;5;27m 91.23%[0m
[36m#tags[0m = 8[0m[95m [0m[0m                        ;5;27m 91.23%[0m
[0m                                                        ;5;27m 91.23%[0m
[4mcompound sets:[0m                                      ;5;27m 91.23%[0m
CompoundSet("hits", #compounds=38, #poses=49)[0m           ;5;27m 91.23%[0m
[0m                                                        ;5;27m 91.23%[0m
[4

In [66]:
mout.finish()

[0m[32;1m OK[0m                                          
