In [1]:
import os
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import tensorflow.keras.backend as K
import logging 

import sys
import inference as infr
import utils

from rdkit import Chem



In [2]:
# Root logger: report INFO and above, one "<LEVEL> - <message>" line per record.
logging.basicConfig(
    format='%(levelname)s - %(message)s',
    level=logging.INFO,
)
# Restrict TensorFlow's own log output to errors.
tf.logging.set_verbosity(tf.logging.ERROR)

### Arguments

In [8]:

# Notebook configuration: GPU index, input metadata table, and the weight
# files handed to the inference model.
args = dict(
    gpu=7,
    filename_train_cpds='Data/Compound_dataset/CellPainting_30kcpds_normalized_profiles__final__only_metadata.csv',
    model_weight_paths=dict(
        autoencoder='Trained_models/Selfies_Autoencoder/selfies_EncoderDecoder_epoch0010.h5',
        wgan=dict(
            C='Trained_models/WGAN/wgan_C_500epochs.h5',
            D='Trained_models/WGAN/wgan_D_500epochs.h5',
            G1='Trained_models/WGAN/wgan_G1_500epochs.h5',
            condition_encoder='Trained_models/WGAN/wgan_condition_encoder_500epochs.h5',
            classifier='Trained_models/WGAN/wgan_classifier.h5',
        ),
    ),
)

# The output path is derived from the input path by swapping the file-name suffix.
args['output_file'] = args['filename_train_cpds'].replace(
    'normalized_profiles__final__only_metadata',
    'molecular_embeddings__unique_SMILES',
)
args['output_file']

'Data/Compound_dataset/CellPainting_30kcpds_molecular_embeddings__unique_SMILES.csv'

### Load data

In [5]:
# Load the compound metadata table; the first CSV column is used as the index.
train = pd.read_csv(args['filename_train_cpds'], index_col=0)

# Discard the rows whose broad sample is 'DMSO'.
train = train.loc[train.Metadata_broad_sample != 'DMSO'].reset_index(drop=True)

# A missing SMILES is read by pandas as a float NaN, on which len() below
# would raise TypeError; drop such rows before applying the length filter.
train = train.dropna(subset=['SMILES_standard']).reset_index(drop=True)

# Keep only SMILES strictly shorter than the limit defined by the inference module.
keep_idx = train.SMILES_standard.apply(lambda x: len(x) < infr.max_smiles_length)
train = train.loc[keep_idx].reset_index(drop=True)

# Keep only unique SMILES to reduce computation and storage, since there are
# several repetitions per compound.
train = pd.DataFrame(train[["SMILES_standard"]].drop_duplicates()).reset_index(drop=True)


### Load model and set GPU

In [6]:
# Expose only the configured GPU to this process, then bind the session to
# visible device '0'.
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu'])
gpu_options = tf.GPUOptions(visible_device_list='0')
session_config = tf.ConfigProto(gpu_options=gpu_options)

# Create the session and register it as the one used by the Keras backend.
sess = tf.Session(config=session_config)
K.set_session(sess)

# Enable soft device placement and device-placement logging.
tf.config.set_soft_device_placement(True)
tf.debugging.set_log_device_placement(True)

# Build the inference model from the configured weight files.
model = infr.InferenceModel(args['model_weight_paths'])


### Compute embeddings for the training set

In [10]:
# The embeddings are cached on disk: reuse the CSV when it already exists,
# otherwise compute them from the SMILES and write the CSV.
if os.path.isfile(args['output_file']):
    print('loading file %s'%args['output_file'])
    train = pd.read_csv(args['output_file'], index_col=0)
else:
    logging.info("Computing selfies")
    train_smiles = train.SMILES_standard.values.astype(str)
    train_selfies, valid_idx = model.encode_smiles_to_selfies(train_smiles)

    # Apply the same validity mask to the selfies and to the table so that
    # both keep the same rows in the same order.
    logging.info("Removing %i unvalid selfies", (valid_idx == False).sum())
    train_selfies = train_selfies[valid_idx]
    train = train.loc[valid_idx].reset_index(drop=True)

    logging.info("Computing latent representations")
    train_latents = model.encode_selfies_to_latent(train_selfies)

    logging.info("Saving latent representations")
    # One 'MolEmb_<i>' column per latent dimension, appended next to the SMILES column.
    embd_cols = ['MolEmb_%d' % i for i in range(train_latents.shape[1])]
    latent_frame = pd.DataFrame(train_latents, columns=embd_cols)
    train = pd.concat([train, latent_frame], axis=1)
    train.to_csv(args['output_file'])

loading file Data/Compound_dataset/CellPainting_30kcpds_molecular_embeddings__unique_SMILES.csv
