In [3]:
import os

import django
from blocks.block import Block
from blocks.blockset import BlockSet
from molgen.blockreactor import MyReactionEnvironment
from molgen.multismarts_rxnenv import MultiSmartsRxnEnv
from molgen.singlereactant_rxnenv import SingleReactantRxnEnv

# Select the Django settings module for this notebook; the environment variable
# is set before django.setup() is called below.
os.environ["DJANGO_SETTINGS_MODULE"] = "djangochem.settings.denn_molgen"
# This must be run to set up access to the Django settings and make database access work etc.
django.setup()

from IPython.display import HTML

# Render matplotlib figures inline in the notebook. `InteractiveShell.magic()`
# is deprecated, so the line magic is invoked through `run_line_magic`.
get_ipython().run_line_magic('matplotlib', 'inline')
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools  # headsup: this import change the behavior of dataframes with mols in them

# Global PandasTools configuration: molecule representation is set to SVG and
# the molecule size to (200, 200) — presumably pixels; confirm against RDKit docs.
PandasTools.molRepresentation = 'svg'
PandasTools.molSize = (200, 200)

# constants
# NOTE(review): the name suggests a Hartree -> eV conversion factor; the commonly
# quoted value is ~27.211386, so the trailing digits here should be confirmed.
HA_TO_EV = 27.211399
# RDKit periodic table handle, fetched once at import time.
PERIODICTABLE = Chem.GetPeriodicTable()

import pandas as pd

# this is a little helper function to render images inside a dataframe
# once again, there are ways to monkey patch the rendering of dataframes, but I am trying to 
# avoid most of that to make things a bit easier to understand

def show(df):
    """Wrap ``df`` in an IPython ``HTML`` object built from its unescaped HTML table.

    ``escape=False`` keeps markup stored in the cells (e.g. SVG drawings)
    intact so that it is rendered rather than shown as text.
    """
    table_markup = df.to_html(escape=False)
    return HTML(table_markup)


from rdkit.Chem import AllChem
from rdkit.Chem.rdDistGeom import EmbedMolecule
from rdkit.Chem.rdmolops import Kekulize
from rdkit.Chem.rdmolfiles import MolToSmiles

In [4]:
def _grow_generations(products, reagents, env, n_gens):
    """React ``products`` with ``reagents`` inside ``env`` for ``n_gens`` rounds.

    The blocks produced in each round are merged into ``products`` so the next
    round can build on them. A ``ValueError`` raised by ``react`` is printed
    and that round is skipped (best effort, same as a failed round producing
    nothing).

    Returns the accumulated product set.
    """
    for _ in range(n_gens):
        with env:
            try:
                run_prods = list(products.react(reagents))
            except ValueError as err:
                print(err)
                continue
            generation = BlockSet(run_prods)
        products |= generation
    return products


def fuse(reactants_smiles, bridges_smiles, links_smiles=None, n_gens=3, mix=True,
         fuse4=False, linkings=False, n_gens_l=1):
    """Enumerate fused (and optionally linked) molecules from SMILES building blocks.

    Args:
        reactants_smiles: iterable of SMILES strings; each one is turned into a
            ``Block`` and added to the product set as a seed.
        bridges_smiles: iterable of SMILES strings used as reagents in the
            fusion rounds.
        links_smiles: iterable of SMILES strings used as reagents in the
            linking rounds. Only read when ``linkings`` is true; ``None`` is
            treated as an empty list.
        n_gens: number of fusion rounds run after each seed is added.
        mix: when true, the products accumulated at the moment a seed is added
            are also merged into that seed's reagent set.
        fuse4: when true, run the single-reactant reaction once over the
            cleaned products and add whatever it yields.
        linkings: when true, run ``n_gens_l`` linking rounds against
            ``links_smiles`` after all fusion rounds.
        n_gens_l: number of linking rounds.

    Returns:
        pandas.DataFrame with one row per product and the columns ``mol``
        (the block's ``.mol``), ``smiles`` and ``inchi_key``.

    Note:
        ``[At]`` is passed to the reaction environments alongside the SMARTS;
        it appears to mark the reactive sites — confirm against the rxnenv
        implementations.
    """
    if links_smiles is None:
        links_smiles = []

    # Both fusion reactions share the same reactant pattern and differ only in
    # the order in which atoms 7 and 8 are stitched into the new ring.
    fuse_2_reactants = "[*:4]~[c;H1:1]:[c:2]([At:3])~[*:5].[c;H1:7]~[a:8]-[At:9]"
    smarts_strings_fuse_2 = [
        fuse_2_reactants + ">>[*:4]:[*:7]:[*:8]:[*:5]",
        fuse_2_reactants + ">>[*:4]:[*:8]:[*:7]:[*:5]",
    ]

    smarts_strings_fuse_4_single = (
        "[c;H1:1]~[$([#6;H0;x3]([#6;H0;x3])[#6;H0;x3]):2]"
        "~[$([#6;H0;x3]([#6;H0;x3])[#6;H0;x3]):3]~[c;H1:4]"
        ">>[c]2:[c:1]:[c:2]:[c:3]:[c:4]:[c]2"
    )

    # The linking environment is given the same reaction twice, as before.
    linking_rxn = "[*:1][At:3].[*:2][At:4]" + ">>[*:1][*:2].[At:3][At:4]"
    smarts_strings_linking = [linking_rxn, linking_rxn]

    substitution_env_2 = MultiSmartsRxnEnv(smarts_strings_fuse_2, ["At", "At"])
    substitution_env_4_single = SingleReactantRxnEnv(smarts_strings_fuse_4_single, [])
    linking_env_2 = MultiSmartsRxnEnv(smarts_strings_linking, ["At", "At"])
    cleaner_env = MyReactionEnvironment("", ['At'])

    all_products = BlockSet()
    for reactant in reactants_smiles:
        # Seed the product set with the bare reactant.
        all_products |= BlockSet([Block(reactant)])

        # Blocks used to extend the molecules: the bridges and, when `mix` is
        # set, everything produced so far (including the seed just added).
        reagents = BlockSet([Block(smi) for smi in bridges_smiles])
        if mix:
            reagents |= all_products

        all_products = _grow_generations(all_products, reagents, substitution_env_2, n_gens)

    if linkings:
        reagents_l = BlockSet([Block(smi) for smi in links_smiles])
        all_products = _grow_generations(all_products, reagents_l, linking_env_2, n_gens_l)

    with cleaner_env:
        all_products = BlockSet(list(all_products.cleaned()))

    if fuse4:
        with substitution_env_4_single:
            try:
                run_prods = list(all_products.react())
            except ValueError as err:
                # Nothing new to add. Previously the code fell through and
                # reused `run_prods` from an earlier stage (or hit a NameError).
                print(err)
            else:
                all_products |= BlockSet(run_prods)

    mols = [block.mol for block in all_products]
    df_fused = pd.DataFrame(mols, columns=['mol'])
    df_fused['smiles'] = [Chem.MolToSmiles(mol) for mol in mols]
    df_fused['inchi_key'] = [Chem.InchiToInchiKey(Chem.MolToInchi(mol)) for mol in mols]
    return df_fused
    

In [None]:
# Seed molecules for the enumeration.
reactants_smiles = [
    '[At]c1cc([At])ccc1',
    '[At]C1=CC=C([At])S1',
]

# Blocks that get fused onto the seeds.
bridges_smiles = [
    '[At]c1ccc([At])cc1',
    '[At]C1=CC=C([At])S1',
    '[At]C1=CC=C([At])O1',
    '[At]c1cc([At])ccc1',
    '[At]c1ccc([At])nc1',
    '[At]C1=CC=C([At])N1',
]


In [23]:
# Collects the result frame of each enumeration run.
dfs = list()

In [22]:
reactants_smiles = [
    '[At]c1cc([At])ccc1',
    '[At]c1cc([At])cc([At])c1',
    '[At]c1ccc([At])cc1',
]
bridges_smiles = [
    '[At]C1=CC=C([At])S1',
    '[At]C1=CC=C([At])O1',
    '[At]C1=CC=C([At])N1',
]

# The third positional parameter of fuse() is `links_smiles`, so the generation
# count has to be passed by keyword; no linker blocks are used in this run.
dfres = fuse(reactants_smiles, bridges_smiles, None, n_gens=3)
# A bare expression in the middle of a cell is not displayed, so print the count.
print(dfres.smiles.count())
dfs.append(dfres)

NameError: name 'dfs' is not defined

In [24]:
# Preview up to 10 products. The sample size is capped so small result sets do
# not raise, and the seed keeps the preview reproducible across re-runs.
show(dfres.sample(min(10, len(dfres)), random_state=0))

Unnamed: 0,mol,smiles,inchi_key
728,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nS\nS\n,c1ccc2c(c1)sc1c2ccc2ccsc21,GHBPPUPSUFNMPC-UHFFFAOYSA-N
461,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNH\nO\nS\n,c1cc2c([nH]1)oc1cc3ccsc3cc12,LTGUHCDCTJQBCQ-UHFFFAOYSA-N
48,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\n,c1ccc2c(c1)oc1c3ccoc3ccc21,SARRFHJVNNNDGD-UHFFFAOYSA-N
263,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nS\nNH\n,c1ccc2c(c1)ccc1c3ccsc3[nH]c21,UDCQAOBTQYEVSQ-UHFFFAOYSA-N
569,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nS\nNH\n,c1cc2ccc3ccsc3c2[nH]1,GOIPVTPZHPXGMH-UHFFFAOYSA-N
221,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nNH\n,c1ccc2c(c1)oc1c2ccc2cc[nH]c21,RAOZCIPYWRYQKN-UHFFFAOYSA-N
436,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNH\nNH\nO\n,c1cc2c([nH]1)[nH]c1c3ccoc3ccc21,NEHGTJPEZDSSLG-UHFFFAOYSA-N
637,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\n,c1ccc2c(c1)ccc1c3ccccc3oc21,BCBSVZISIWCHFM-UHFFFAOYSA-N
746,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nS\nS\nNH\n,c1cc2sc3cc4ccsc4cc3c2[nH]1,DKWMDTUZIDKGSH-UHFFFAOYSA-N
17,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNH\nNH\nO\n,c1cc2c([nH]1)[nH]c1c2ccc2ccoc21,VZEVADKOJQXSCG-UHFFFAOYSA-N


In [18]:
from IPython.display import display

# Building blocks for this run; commented-out entries are alternatives that
# are currently switched off.
reactants_smiles = [
    # '[At]c1cc([At])ccc1',
    '[At]c1cc([At])cc([At])c1',
    # '[At]C1=C([At])SC([At])=C1[At]',
    # '[At]c1ccc([At])cc1',
    # '[At]C1=CSC=C1',
]

bridges_smiles = [
    '[At]C1=CC=C([At])S1',
    # '[At]C1=CSC=C1[At]',
    # '[At]C1=CSC=C1',
    # '[At]C1=C([At])SC([At])=C1[At]',
    # '[At]C1=CC=C([At])O1',
    # '[At]C1=CC=C([At])N1',
    # '[At]c1ccc([At])cc1',
]

links_smiles = [
    '[At]C1=CC=C([At])S1',
]

# Keyword arguments make the positional booleans readable at the call site.
dfres = fuse(reactants_smiles, bridges_smiles, links_smiles,
             n_gens=3, mix=True, fuse4=False, linkings=True, n_gens_l=1)
print(dfres.smiles.count())
#dfs.append(dfres)

# show() only builds the HTML object; because it is not the last expression of
# the cell it has to be displayed explicitly. Sample sizes are capped and
# seeded so the cell neither fails on small results nor changes between runs.
display(show(dfres.sample(min(5, len(dfres)), random_state=0)))

# Render before opening the file so a drawing failure does not leave an empty
# file behind.
sample_grid = Draw.MolsToGridImage(dfres.sample(min(14, len(dfres)), random_state=0)['mol'],
                                   molsPerRow=7, useSVG=True)
with open('/home/denn/harvard/SF/Meetings/Group_Meeting/pi_th_sample1.svg','w') as f:
    # Inside a notebook RDKit may return an IPython SVG object rather than a
    # str; its markup is then available as `.data`.
    f.write(getattr(sample_grid, 'data', sample_grid))

524


In [19]:
# Load the pickled reference table that the generated molecules are matched
# against below (joined on its `inchikey` column).
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
df_emol=pd.read_pickle('/home/denn/home/ml/data/emolecules_smiles_sdf.pickle')

In [20]:
# Inner join: keep the generated molecules whose InChIKey is also present in
# the reference table.
dfm = df_emol.merge(dfres, how='inner', left_on='inchikey', right_on='inchi_key')
dfm.shape

(22, 6)

In [21]:
# Render the whole merged table (molecule drawings included) as HTML.
show(dfm)

Unnamed: 0,inchi,inchikey,smiles_x,mol,smiles_y,inchi_key
0,InChI=1S/C12H8S/c1-2-4-10-9(3-1)5-6-12-11(10)7-8-13-12/h1-8H,LJOLGGXHRVADAA-UHFFFAOYSA-N,c1ccc2c(c1)ccc1sccc12,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nS\n,c1ccc2c(c1)ccc1sccc12,LJOLGGXHRVADAA-UHFFFAOYSA-N
1,InChI=1S/C14H10/c1-2-6-12-10-14-8-4-3-7-13(14)9-11(12)5-1/h1-10H,MWPLVEDNUUSJAV-UHFFFAOYSA-N,c1ccc2cc3ccccc3cc2c1,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n,c1ccc2cc3ccccc3cc2c1,MWPLVEDNUUSJAV-UHFFFAOYSA-N
2,InChI=1S/C18H12/c1-2-7-15-12-18-16(11-14(15)6-1)10-9-13-5-3-4-8-17(13)18/h1-12H,DXBHBZVCASKNBY-UHFFFAOYSA-N,c1ccc2cc3c(ccc4ccccc43)cc2c1,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n,c1ccc2cc3c(ccc4ccccc43)cc2c1,DXBHBZVCASKNBY-UHFFFAOYSA-N
3,InChI=1S/C18H12/c1-2-6-14-10-18-12-16-8-4-3-7-15(16)11-17(18)9-13(14)5-1/h1-12H,IFLREYGFSNHWGE-UHFFFAOYSA-N,c1ccc2cc3cc4ccccc4cc3cc2c1,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n,c1ccc2cc3cc4ccccc4cc3cc2c1,IFLREYGFSNHWGE-UHFFFAOYSA-N
4,InChI=1S/C8H6S/c1-2-4-8-7(3-1)5-6-9-8/h1-6H,FCEHBMOGCRZNNI-UHFFFAOYSA-N,c1ccc2sccc2c1,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nS\n,c1ccc2sccc2c1,FCEHBMOGCRZNNI-UHFFFAOYSA-N
5,InChI=1S/C16H10S/c1-2-6-12-11(5-1)9-10-14-13-7-3-4-8-15(13)17-16(12)14/h1-10H,YEUHHUCOSQOCIX-UHFFFAOYSA-N,c1ccc2c(c1)ccc1c3ccccc3sc21,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nS\n,c1ccc2c(c1)ccc1c3ccccc3sc21,YEUHHUCOSQOCIX-UHFFFAOYSA-N
6,InChI=1S/C12H8S/c1-3-7-11-9(5-1)10-6-2-4-8-12(10)13-11/h1-8H,IYYZUPMFVPLQIF-UHFFFAOYSA-N,c1ccc2c(c1)sc1ccccc12,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nS\n,c1ccc2c(c1)sc1ccccc12,IYYZUPMFVPLQIF-UHFFFAOYSA-N
7,InChI=1S/C18H12/c1-2-8-14-13(7-1)15-9-3-4-11-17(15)18-12-6-5-10-16(14)18/h1-12H,SLGBZMMZGDRARJ-UHFFFAOYSA-N,c1ccc2c(c1)c1ccccc1c1ccccc21,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n,c1ccc2c(c1)c1ccccc1c1ccccc21,SLGBZMMZGDRARJ-UHFFFAOYSA-N
8,InChI=1S/C10H8S/c1-2-5-9(6-3-1)10-7-4-8-11-10/h1-8H,PJRGDKFLFAYRBV-UHFFFAOYSA-N,c1ccc(-c2cccs2)cc1,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nS\n,c1ccc(-c2cccs2)cc1,PJRGDKFLFAYRBV-UHFFFAOYSA-N
9,InChI=1S/C18H12/c1-3-7-16-13(5-1)9-11-15-12-10-14-6-2-4-8-17(14)18(15)16/h1-12H,TUAHORSUHVUKBD-UHFFFAOYSA-N,c1ccc2c(c1)ccc1ccc3ccccc3c12,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n,c1ccc2c(c1)ccc1ccc3ccccc3c12,TUAHORSUHVUKBD-UHFFFAOYSA-N


In [21]:
# Render before opening the file so a drawing failure does not leave an empty
# file behind.
known_grid = Draw.MolsToGridImage(dfm['mol'], molsPerRow=7, useSVG=True)
with open('/home/denn/harvard/SF/Meetings/Group_Meeting/know_pi.svg','w') as f:
    # Inside a notebook RDKit may return an IPython SVG object rather than a
    # str; its markup is then available as `.data`.
    f.write(getattr(known_grid, 'data', known_grid))

In [None]:
# NOTE(review): this needs at least four entries in `dfs`; only one append is
# visible in this notebook, so confirm the index before running.
show(dfs[3])#.sample(10))

In [None]:
# Per-column non-null counts of the fourth stored result.
# NOTE(review): needs at least four entries in `dfs` — confirm the index.
print(dfs[3].count())