# Compute molecular embeddings from SMILES

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import sys

import tensorflow as tf
import tensorflow.keras.backend as K
# Only show TensorFlow log messages at ERROR level. `tf.logging` is the
# TensorFlow 1.x logging API, so this notebook expects a TF 1.x install.
tf.logging.set_verbosity(tf.logging.ERROR)

import cpmolgan.utils
import cpmolgan.inference as infr

### Inputs

In [2]:
# Notebook configuration: everything a user may need to adjust lives here.
args = {
    # Input table of compounds (a `SMILES` column is read from it later on).
    'compounds_file': 'example_compounds.csv',
    # Destination CSV for the compounds plus their embedding columns.
    'output_file': 'example_molecular_embeddings.csv',
    # Weight files handed to the inference model.
    'neural_net': {
        'autoencoder': '../cpmolgan/data/selfies_EncoderDecoder_epoch0010.h5',
        'wgan': {
            'C': '../cpmolgan/data/wgan_C_500epochs.h5',
            'D': '../cpmolgan/data/wgan_D_500epochs.h5',
            'G': '../cpmolgan/data/wgan_G_500epochs.h5',
            'condition_encoder': '../cpmolgan/data/wgan_condition_encoder_500epochs.h5',
            'classifier': '../cpmolgan/data/wgan_classifier.h5',
        },
    },
    # Value written to CUDA_VISIBLE_DEVICES when the compute environment is set.
    'gpu': '7',
}

### Read data and standardize SMILES

In [3]:
# Load the compound table and attach a standardized version of each SMILES.
cpds = pd.read_csv(args['compounds_file'])
cpds = cpds.assign(
    SMILES_standard=cpmolgan.utils.clean_smiles_parallel(cpds["SMILES"])
)

# Keep only compounds whose standardized SMILES is strictly shorter than the
# maximum length exposed by the inference module, then renumber the rows.
# NOTE(review): `len` assumes every standardized entry is a string; confirm
# what clean_smiles_parallel returns for SMILES it cannot parse.
keep_idx = cpds["SMILES_standard"].apply(
    lambda smi: len(smi) < infr.max_smiles_length
)
cpds = cpds.loc[keep_idx].reset_index(drop=True)

### Set compute environment

In [4]:
# Restrict CUDA to the GPU(s) listed in args['gpu']. Device indices used below
# are relative to this restricted set.
# NOTE(review): this must take effect before TensorFlow first initializes the
# GPU; confirm nothing earlier in the kernel session has already done so.
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu'])
# '0' selects the first device among those left visible by the line above.
gpu_options = tf.GPUOptions(visible_device_list='0')
# Build a TF 1.x session with these GPU options and hand it to the Keras
# backend so Keras uses this session.
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
K.set_session(sess)
# Turn on soft device placement.
tf.config.set_soft_device_placement(True)
# Turn on device-placement logging (this can be verbose).
tf.debugging.set_log_device_placement(True)

### Load model and compute embeddings

In [5]:
# Build the inference model from the weight files listed in the configuration.
model = infr.InferenceModel(args['neural_net'])

# Convert the standardized SMILES to selfies; `valid_idx` flags the rows whose
# conversion succeeded, and only those rows are kept.
smiles = cpds["SMILES_standard"].values.astype(str)
selfies_encoded, valid_idx = model.encode_smiles_to_selfies(smiles)
cpds['selfies'] = selfies_encoded
n_invalid = (valid_idx == False).sum()
print("Removing %i compounds with unvalid selfies" % n_invalid)
cpds = cpds.loc[valid_idx].reset_index(drop=True)

# Encode the remaining selfies and name one column per latent dimension.
latent_matrix = model.encode_selfies_to_latent(cpds["selfies"])
n_latent_dims = latent_matrix.shape[1]
embd_cols = ['MolEmb_' + str(dim) for dim in range(n_latent_dims)]

# Final table: compound columns followed by the embedding columns. Both frames
# carry a fresh 0..n-1 index here, so the column-wise concat lines up by row.
latents = pd.concat(
    [cpds, pd.DataFrame(latent_matrix, columns=embd_cols)],
    axis=1,
)

Removing 1 compounds with unvalid selfies


### Save results

In [6]:
# Write the compounds and their embedding columns to the configured CSV file,
# leaving out the DataFrame index.
latents.to_csv(path_or_buf=args['output_file'], index=False)