#### Example use of the autoencoder

In [3]:
# Load trained model and embeddings
from mol_encode import MolecularAutoencoder
import pandas as pd
import numpy as np

# latent_dim / max_length mirror the values encoded in the checkpoint name below.
autoencoder = MolecularAutoencoder(latent_dim=64, max_length=120)
autoencoder.build_vocab()  # Must rebuild vocab with same settings as training
autoencoder.load_model('saved_models/ae_onehot_cnn_maxlen120_latentdim64')

# Get embeddings from new CSV
df = pd.read_csv('data/new_mols.csv')
embeddings = []
decoded_smiles = []

# One molecule per call: each result is indexed by 'embedding' and 'decoded'.
for smiles in df['smiles']:
    result = autoencoder.embed_and_decode(smiles)
    embeddings.append(result['embedding'])
    decoded_smiles.append(result['decoded'])

# Stack the per-molecule embeddings into a single numpy array
embeddings_array = np.array(embeddings)



2024-12-24 13:48:59.953091: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8907
2024-12-24 13:49:00.038447: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory




2024-12-24 13:49:00.322583: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.




In [4]:
from itables import show
# Put each input SMILES next to its decoded counterpart, then render the
# resulting table with itables.
smiles_comparison = pd.DataFrame(
    {
        "orig_smiles": df["smiles"],
        "decoded_smiles": decoded_smiles,
    }
)
show(smiles_comparison)

orig_smiles,decoded_smiles
Loading ITables v2.2.4 from the internet... (need help?),


In [8]:
# Batch mode: a much more efficient way of handling more than 5 SMILES.

# Load the model (same constructor settings and checkpoint as the checkpoint name indicates)
autoencoder = MolecularAutoencoder(
    latent_dim=64,
    max_length=120,
)
autoencoder.build_vocab()
autoencoder.load_model(
    'saved_models/ae_onehot_cnn_maxlen120_latentdim64'
)

# Run the whole CSV through the model in batches of 64
results = autoencoder.process_csv_batch(
    'data/new_mols.csv',
    batch_size=64,
)

# numpy array of shape (n_molecules, latent_dim)
embeddings = results['embeddings']

# Display the full results mapping as the cell output
results



{'embeddings': array([[-2.4786696 ,  1.237923  ,  1.7503992 , ...,  2.405345  ,
          2.6119413 , -2.037056  ],
        [-3.8888922 , -2.130727  , -2.6783981 , ..., -4.729442  ,
          1.1778733 , -1.0587604 ],
        [ 1.0430715 ,  4.6842327 , -0.29096732, ..., -0.60494375,
         -7.0482597 ,  5.207939  ],
        ...,
        [-6.371494  ,  1.4254801 ,  3.3906155 , ..., -3.9127812 ,
         -0.02598864,  1.1460123 ],
        [ 1.5440798 , -5.456609  ,  1.0330768 , ...,  2.7896981 ,
         -0.8563098 , -1.297301  ],
        [-2.5847285 , -4.487835  , -2.0344357 , ..., -3.7544408 ,
         -5.0206704 , -0.5285016 ]], dtype=float32),
 'decoded': ['CC(C)(C)NC(=O)[C@H]1CC[C@H]2[C@@H]3CC[C@H]4NC(=O)C=C[C@]4(C)[C@H]3CC[C@]12C',
  'O=C(NCC1CCOCC1)c1cnc(Nc2ccc(Cl)cc2Cl)nc1C(F)(F)F',
  'N#Cc1ccc(Cn2cncc2CN2CCN(c3cccc(Cl)c3)C(=O)C2)cc1',
  'Cc1cn([C@H]2C[C@H]([18F]O[C@@H](CO)O2)c(=O)[nH]c1=O',
  'CCCCCCCCC=CCCCCCCCC(=O)OCC1COP(=O)(O)C1',
  'O=C1CC[C@H](N2C(=O)c3cccc(NCc4ccc(CN5CC