#  Selfies Implementation - CS598 Deep Learning

## Drug Discovery: Variational Autoencoder Techniques for Molecule Generation

## Approach 4: Selfies

Team Members:

- Andrew Jacobson jonaj2@illinois.edu
- Dixon Liang dixonl2@illinois.edu
- John Judge jmjudge2@illinois.edu
- Megan Masanz mjneuman@illinois.edu

Implementation Description:

- Baseline Model
- Character Based Chemical VAE
- Aspuru-Guzik 

References include
* [Automatic Chemical Design Using a Data-Driven Continuous Representation of Molecules](https://arxiv.org/abs/1610.02415)
* https://github.com/aspuru-guzik-group/chemical_vae
* https://github.com/deepchem/deepchem
* https://github.com/molecularsets/moses (Leveraged in Notebook #3, but included here for completeness)
* https://github.com/Azure/azureml-examples/blob/main/tutorials/an-introduction/2.pytorch-model.ipynb

Requirements for notebook to run:
* Run this notebook inside an AzureML workspace (or provide configuration)
* No data is required as the training script will download the dataset

In [6]:
import azureml.core #adding core - this by default is in notebooks run on computer in Azure ML
from azureml.core import Workspace #needed for connecting to workspace
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration, DEFAULT_GPU_IMAGE 
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies


# Connect to the Azure ML workspace using the locally available configuration
# (see "Requirements for notebook to run" above).
ws = Workspace.from_config()

### The imports above and the cell below provide what this notebook needs to run inside an Azure ML environment:
- connecting to a workspace
- creating a remote compute cluster for training

In [7]:
cluster_name = 'mmdsvm04e'

# Reuse the compute target if the workspace already has one with this name;
# otherwise provision a new cluster and block until it is ready.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('creating new.')
    provisioning = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC12', min_nodes=0, max_nodes=1)
    compute_target = ComputeTarget.create(ws, cluster_name, provisioning)
    compute_target.wait_for_completion(show_output=True)
else:
    print('found existing:', compute_target.name)

found existing: mmdsvm04e


### Below are the required packages for running an Azure ML Experiment

In [8]:
# Named environment in which the remote training run will execute.
myenv = Environment('deepchem_backend_selfies')

# Conda-managed packages first, then the extra channels and pip-installed packages,
# added in the same order as before.
conda_dep = CondaDependencies.create(
    python_version='3.7.10',
    conda_packages=['tensorflow-gpu==2.4.1', 'rdkit', 'openmm', 'pdbfixer'],
)
for channel in ("conda-forge", "omnia"):
    conda_dep.add_channel(channel)
for pip_package in ("azureml-sdk", "deepchem", "selfies", "numpy==1.19.4", "IPython"):
    conda_dep.add_pip_package(pip_package)

# Keep a copy of the dependency file next to the training script.
conda_dep.save(path="./train/condadep.yml")

# Attach the dependencies and the GPU base image, then register the environment.
myenv.python.conda_dependencies = conda_dep
myenv.docker.base_image = DEFAULT_GPU_IMAGE
myenv.register(workspace=ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-cuda10.0-cudnn7-ubuntu16.04:20210113.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "deepchem_backend_selfies",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
 

### Get the current working directory and inside of it, create a subdirectory called "train" to hold the training script - contents of this will be passed to the remote machine for training

In [9]:
# `os` is first imported in a later cell; import it here so this cell also
# works on a fresh kernel when the notebook is run top to bottom.
import os

# Remember the notebook's working directory; the "train" folder is created inside it.
cwd = os.getcwd()
current_dir = cwd
print(cwd)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/mm-deepchem/code/Users/meganmasanz.work/vae_training


In [10]:
import os

# Folder (under the current working directory) that holds the training script;
# create it if it does not exist yet.
working_dir = os.getcwd()
script_folder = os.path.join(working_dir, "train")
print(script_folder)
os.makedirs(script_folder, exist_ok=True)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/mm-deepchem/code/Users/meganmasanz.work/vae_training/train


### The code below is the training and model evaluation script. Modifying the script, specifically its `main` section, changes the parameters used in training.

### Parameters this script will allow changes to include:
- epoch_count = 20
- learning_rate = 0.0001

*The start and stop annealing steps have been set equal, this means that Cost Annealing will be disabled during runs*

*Note the commented out code will generate the images for valid molecules generated and can be uncommented if a user was interested in generating the images*


In [46]:
%%writefile $script_folder/train.py


import sys
import os
import requests
import subprocess
import shutil
import IPython
from logging import getLogger, StreamHandler, INFO
from deepchem.models.optimizers import Adam, ExponentialDecay
from deepchem.models.seqtoseq import AspuruGuzikAutoEncoder
import rdkit
import numpy as np
import deepchem
import rdkit
import tensorflow as tf
from azureml.core import Run
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
import random
import selfies

# DEFINE THE MODEL
#train_smiles, batch_size,learning_rate
def get_model(single_chars, train_smiles, learning_rate):
    """Build the AspuruGuzikAutoEncoder used for training.

    Args:
        single_chars: the single-character token alphabet handed to the model.
        train_smiles: the encoded training strings; only their count and the
            length of the longest one are used here.
        learning_rate: initial learning rate, wrapped in an ExponentialDecay
            schedule with factor 0.95 and a step count of
            len(train_smiles) / batch_size.

    Returns:
        The constructed (untrained) AspuruGuzikAutoEncoder, stored under 'vae'.
    """
    batch_size = 100
    longest = max(map(len, train_smiles))
    schedule = ExponentialDecay(learning_rate, 0.95, len(train_smiles)/batch_size)
    return AspuruGuzikAutoEncoder(
        single_chars,
        longest,
        model_dir='vae',
        batch_size=batch_size,
        learning_rate=schedule,
        teacher_forcing_ratio=0.5,
    )




# GENERATE MOLECULES AND TEST IF THEY ARE VALID
def generate_molecules(model, reverse_map, n_molecules=10000, embedding_dimension=196):
    """Sample molecules from the model's latent space and log the valid fraction.

    Random normal embeddings are decoded by the model into single-character
    strings, translated back to SELFIES through ``reverse_map``, decoded to
    SMILES, and checked with RDKit.

    Args:
        model: trained model exposing ``predict_from_embeddings``.
        reverse_map: dict mapping each single-character token back to its
            SELFIES token.
        n_molecules: number of latent vectors to sample.
        embedding_dimension: width of each sampled latent vector. Defaults to
            196, the value previously hard-coded here; it must match the
            model's embedding size.

    Returns:
        The list of SMILES strings that RDKit could parse. The fraction
        ``len(valid) / n_molecules`` is logged to the run as 'valid'
        (0.0 when ``n_molecules`` is 0).
    """
    run = Run.get_context()
    predictions = model.predict_from_embeddings(
        np.random.normal(size=(n_molecules, embedding_dimension)))
    valid = []

    # Use RDKit to check that each generated molecule parses.
    for p in predictions:
        selfie = ''.join(reverse_map[s] for s in ''.join(p))
        smiles = selfies.decoder(selfie)
        # NOTE(review): some selfies versions appear to return None when
        # decoding fails - confirm for the pinned version. Skip such results
        # rather than passing None to RDKit.
        if smiles is None:
            continue
        if rdkit.Chem.MolFromSmiles(smiles) is not None:
            valid.append(smiles)
            print(smiles)

    # Avoid a ZeroDivisionError when no molecules were requested.
    valid_fraction = len(valid) / n_molecules if n_molecules else 0.0
    run.log('valid', valid_fraction)
    return valid


def get_mol(smiles):
    """Parse a SMILES string into a kekulized RDKit molecule.

    Args:
        smiles: SMILES string to parse.

    Returns:
        The kekulized molecule, or None when RDKit cannot parse ``smiles``.
    """
    mol = rdkit.Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # Kekulize modifies the molecule in place and returns None, so return the
    # molecule itself rather than the call's result.
    rdkit.Chem.Kekulize(mol)
    return mol
   


def generate_sequences(epochs, train_smiles):
    """Yield (input, target) pairs for autoencoder training.

    Logs the epoch count to the current run, then for each epoch prints its
    1-based number and yields every training string paired with itself.
    """
    Run.get_context().log('epochs', epochs)
    for epoch_number in range(1, epochs + 1):
        print('epoch:', epoch_number)
        yield from ((sequence, sequence) for sequence in train_smiles)
            

#deepchem has its own fit model variation
def train(model, epochs, train_smiles):
    """Fit the model on the (input, target) pairs produced by generate_sequences."""
    sequence_pairs = generate_sequences(epochs, train_smiles)
    model.fit_sequences(sequence_pairs)

    
#train_smiles, batch_size,learning_rate    
def modeltrain(epoch_count, learning_rate):
    """Load ZINC15, encode it as single-character SELFIES, train the VAE and sample from it.

    Pipeline: molecule -> SMILES -> SELFIES -> one unicode character per
    SELFIES token -> model. The multi-character SELFIES tokens are mapped to
    single characters because the model is given a single-character alphabet.

    Args:
        epoch_count: number of passes over the training strings.
        learning_rate: initial learning rate handed to get_model.

    Raises:
        Exception: any error raised while building, training or sampling the
            model is printed and then re-raised so the run does not report
            success after a failure.
    """
    os.makedirs('data', exist_ok=True)
    run = Run.get_context()
    tasks, datasets, _ = deepchem.molnet.load_zinc15(
        featurizer='raw',
        splitter=None,
        transformers=[],
        data_dir='data',
        save_dir='data')
    print(tasks)

    data = datasets[0]
    train_selfies = []
    for X, _, _, _ in data.itersamples():
        encoded = selfies.encoder(rdkit.Chem.MolToSmiles(X))
        # NOTE(review): some selfies versions appear to return None when a
        # molecule cannot be encoded - confirm for the pinned version.
        # Skip empty results instead of failing while tokenizing.
        if encoded:
            train_selfies.append(encoded)
    print(len(train_selfies))

    # Tokenize every SELFIES string once; the same token lists build the
    # vocabulary and the translated training strings, so the two cannot drift.
    tokenized = [_split_selfies_tokens(s) for s in train_selfies]
    tokens = sorted({token for token_list in tokenized for token in token_list})
    print(len(tokens))

    # One unicode character (starting at code point 1000) per SELFIES token.
    single_chars = [chr(1000 + i) for i in range(len(tokens))]
    token_to_char = dict(zip(tokens, single_chars))
    reverse_map = dict(zip(single_chars, tokens))

    # Translate each SELFIES string, keeping every token. The previous
    # slicing-based split stripped the leading '[' so the first token never
    # matched the vocabulary and was silently dropped.
    train_smiles = [
        ''.join(token_to_char[token] for token in token_list)
        for token_list in tokenized
    ]
    for translated in train_smiles[:5]:
        print(translated)

    try:
        seed = 123
        tf.random.set_seed(seed)
        device_name = tf.test.gpu_device_name()
        print('***************')
        print(device_name)
        print('***************')
        run.log('device_name', device_name)

        with tf.device(device_name):
            model = get_model(single_chars, train_smiles, learning_rate)
            train(model, epoch_count, train_smiles)
            generate_molecules(model, reverse_map)

    except Exception as e:
        print(e)
        # Re-raise so the failure is visible to the caller and the job.
        raise


def _split_selfies_tokens(selfie):
    """Split a SELFIES string into bracketed tokens, e.g. '[C][=O]' -> ['[C]', '[=O]'].

    Characters outside brackets (such as a '.' between fragments) are ignored.
    """
    tokens = []
    # Every token ends with ']'; the piece after the final ']' is not a token.
    for piece in selfie.split(']')[:-1]:
        start = piece.rfind('[')
        if start != -1:
            tokens.append(piece[start:] + ']')
    return tokens



def main():
    """Seed the random number generators and run training with the chosen hyperparameters."""
    seed = 123
    # Seed Python, TensorFlow and NumPy.
    for seed_fn in (random.seed, tf.random.set_seed, np.random.seed):
        seed_fn(seed)
    print('in main')
    # Edit these two values to change the training run.
    modeltrain(epoch_count=20, learning_rate=0.0001)


if __name__ == "__main__":
    main()


Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/mm-deepchem/code/Users/meganmasanz.work/vae_training/train/train.py


### The cell below creates an experiment named 'vae-eep-chem-selfies_504b' and submits the train.py file from the training folder to the compute cluster created above

In [47]:
from azureml.core import Experiment, ScriptRunConfig
from azureml.widgets import RunDetails

# Create the experiment handle once (it was previously constructed twice with
# identical arguments).
experiment = Experiment(workspace=ws, name="vae-eep-chem-selfies_504b")

# Run train.py from the script folder in the registered environment on the cluster.
script_config = ScriptRunConfig(
    source_directory=script_folder,
    script='train.py',
    environment=myenv,
    compute_target=cluster_name,
)
run = experiment.submit(config=script_config)

In [48]:
# Show the run-monitoring widget for the submitted run inside the notebook.
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [None]:
# Call the method: the bare attribute reference only evaluated the bound
# method and never waited for the run.
run.wait_for_completion(show_output=True)