# How to use the trained molecule VAE


In [1]:
import sys
sys.path.insert(0, '..')

import grammar.zinc_grammar
import molecule_vae


Using TensorFlow backend.


### You need a `.hdf5` file of trained weights; set the path below to wherever you saved it.

In [2]:
# Path to the trained weights file that is passed to the grammar model below;
# change it to wherever your .hdf5 file lives.
grammar_weights = "../zinc_vae_L56.hdf5"

In [3]:
# Construct the grammar model from the weights file path
# (presumably this loads the trained weights -- see molecule_vae to confirm).
grammar_model = molecule_vae.ZincGrammarModel(grammar_weights)



### Here are some example SMILES strings.

The `encode` method takes a list of SMILES strings and maps each one (deterministically) to its posterior mean estimate.

This returns a matrix of shape `[num_inputs, 56]`.

In [4]:
# Five example molecules written as SMILES strings.
# Raw string literals are used because SMILES uses "\" as a bond-direction
# marker (see the last entry); in a normal literal "\c" is an invalid escape
# sequence that newer Python versions warn about.
smiles = [
    r"C[C@@H]1CN(C(=O)c2cc(Br)cn2C)CC[C@H]1[NH3+]",
    r"CC[NH+](CC)[C@](C)(CC)[C@H](O)c1cscc1Br",
    r"O=C(Nc1nc[nH]n1)c1cccnc1Nc1cccc(F)c1",
    r"Cc1c(/C=N/c2cc(Br)ccn2)c(O)n2c(nc3ccccc32)c1C#N",
    r"CSc1nncn1/N=C\c1cc(Cl)ccc1F",
]

In [7]:
# Display the grammar definition string from grammar.zinc_grammar
# (production rules separated by newlines, as the output below shows).
grammar.zinc_grammar.gram

"smiles -> chain\natom -> bracket_atom\natom -> aliphatic_organic\natom -> aromatic_organic\naliphatic_organic -> 'B'\naliphatic_organic -> 'C'\naliphatic_organic -> 'N'\naliphatic_organic -> 'O'\naliphatic_organic -> 'S'\naliphatic_organic -> 'P'\naliphatic_organic -> 'F'\naliphatic_organic -> 'I'\naliphatic_organic -> 'Cl'\naliphatic_organic -> 'Br'\naromatic_organic -> 'c'\naromatic_organic -> 'n'\naromatic_organic -> 'o'\naromatic_organic -> 's'\nbracket_atom -> '[' BAI ']'\nBAI -> isotope symbol BAC\nBAI -> symbol BAC\nBAI -> isotope symbol\nBAI -> symbol\nBAC -> chiral BAH\nBAC -> BAH\nBAC -> chiral\nBAH -> hcount BACH\nBAH -> BACH\nBAH -> hcount\nBACH -> charge class\nBACH -> charge\nBACH -> class\nsymbol -> aliphatic_organic\nsymbol -> aromatic_organic\nisotope -> DIGIT\nisotope -> DIGIT DIGIT\nisotope -> DIGIT DIGIT DIGIT\nDIGIT -> '1'\nDIGIT -> '2'\nDIGIT -> '3'\nDIGIT -> '4'\nDIGIT -> '5'\nDIGIT -> '6'\nDIGIT -> '7'\nDIGIT -> '8'\nchiral -> '@'\nchiral -> '@@'\nhcount -> 'H'

In [18]:
import numpy as np

# Encode the example SMILES strings with the grammar model; the result has one
# row per input string.
z1 = grammar_model.encode(smiles)
# Parenthesised print: same output under Python 2, and valid syntax under Python 3.
print(z1.shape)

(5, 56)


### Decoding is stochastic.

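Calling the `decode` method draws a single sample from the decoding distribution, so repeated calls on the same latent vectors can return different strings (compare the two outputs below).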

In [20]:
# First stochastic decode of the grammar-model latent vectors: print one
# sampled string per encoded input.
for mol in grammar_model.decode(z1):
    # Parenthesised print works under both Python 2 and Python 3.
    print(mol)

C[C@@H]1CN(C(=O)c2cc(Br)so2C)CC[C@H]1#[NH2+]
CC[NH+](C)=C[C@](C)CCC[C@H](O)c1cs
OC(Nc1nc[nH]n1c1)cccnc1Nc1cccc(F)c1F
Oc1c(/C#N/c2cc(I)cs)c2c(=O)n2c(nc4ccc)cc322c1
COc1scnn1/S=C/c1\c(I)ccc1FFs1


In [16]:
# Decode the same latent vectors a second time; because decoding samples,
# the printed strings can differ from the previous cell's output.
for mol in grammar_model.decode(z1):
    # Parenthesised print works under both Python 2 and Python 3.
    print(mol)

C[C@@H]1CN(C(=O)c2cc(Br)cs2C)CC[C@H]1[NH3+]
CC[NH+](CC)#[C@](C)CCCC(C(O))c1nc
OC(Nc1nc[nH]n1c1)cccnc1Nc1cccc(F)c1F
Cc1c(/C=N/c2cc(Br)co2)nc(N)c2c(nc4cccc)cc3c1
CSc1ncnn1C=Nc1nc(Br)Clccc1


## The character model has the same interface as the grammar model.

In [8]:
# Path to the trained weights file that is passed to the character model below.
# Kept relative to the notebook (like the grammar weights path) rather than
# pointing into one user's home directory; change it to wherever your .hdf5
# file lives.
char_weights = "../zinc_str_vae_L56_E99.hdf5"

In [9]:
# Construct the character model from the weights file path
# (presumably this loads the trained weights -- see molecule_vae to confirm).
char_model = molecule_vae.ZincCharacterModel(char_weights)

In [10]:
# Encode the same example SMILES strings with the character model.
z2 = char_model.encode(smiles)
# Parenthesised print: same output under Python 2, and valid syntax under Python 3.
print(z2.shape)

(5, 56)


In the character model, each encoded SMILES string is likewise a 56-dimensional vector.

In [11]:
# First decode of the character-model latent vectors: print one string per
# encoded input.
for mol in char_model.decode(z2):
    # Parenthesised print works under both Python 2 and Python 3.
    print(mol)

C[C@@H]1CN(C(=O)c2cc(Br)cc2C)CC[C@H]1[NH3+]
CC[NH+](CC)[C@](C)(CC)[C@H](O)c1cscc1Br
O=C(Nc1nn[nH]n1)c1ccccc1Nc1cccc(F)c1
Cn1c(/N=C/c2cc(Br)cc22)c(C)n2c(nc3ccccc32)c1C#N
CCc1nnnc1/N=C/c1cc(Cl)ccc1F


In [12]:
# Decode the same character-model latent vectors again so the two outputs can
# be compared.
for mol in char_model.decode(z2):
    # Parenthesised print works under both Python 2 and Python 3.
    print(mol)

C[C@@H]1CN(C(=O)c2cc(Br)cn3C)CC[C@H]1[NH3+]
CC[NH+](CC)[C@](C)(CC)[C@H](O)c1cscc1Br
O=C(Nc1nc[nC]n1)c1cccnc1Nc1cccc(F)c1
Cc1c(/N=C/c2cc(Br)cc2Fcc(C)n2c(nc3ccccc32)c1C#N
CNc1nnnn1/N=C\c1cc(Cl)ccc1F
