#  Approach 3 - Moses with Teacher Forcing - CS598 Deep Learning

## Drug Discovery: Variational Autoencoder Techniques for Molecule Generation

## Approach 3: Moses with Teacher Forcing

Team Members:

- Andrew Jacobson jonaj2@illinois.edu
- Dixon Liang dixonl2@illinois.edu
- John Judge jmjudge2@illinois.edu
- Megan Masanz mjneuman@illinois.edu

Implementation Description:

- Baseline Model
- Character Based Chemical VAE
- Aspuru-Guzik 

References include
* [Automatic Chemical Design Using a Data-Driven Continuous Representation of Molecules](https://arxiv.org/abs/1610.02415)
* https://github.com/aspuru-guzik-group/chemical_vae
* https://github.com/deepchem/deepchem
* https://github.com/molecularsets/moses 
* https://github.com/Azure/azureml-examples/blob/main/tutorials/an-introduction/2.pytorch-model.ipynb

Requirements for notebook to run:
* Run this notebook inside an AzureML workspace (or provide configuration)
* No data is required as the training script will download the dataset

In [10]:
import azureml.core #adding core - this by default is in notebooks run on computer in Azure ML
from azureml.core import Workspace #needed for connecting to workspace
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration, DEFAULT_GPU_IMAGE 
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies


ws = Workspace.from_config()

###  Below is importing the requirements for this notebook to run inside an Azure ML environment for :
- connecting to a workspace
- creating remote computer for training 

In [11]:
cluster_name = 'mmdsvm04-moses'
#cluster_name = 'mmdsvm04d'
try:
    compute_target = ComputeTarget(workspace=ws,  name=cluster_name )
    print('found existing:', compute_target.name)
    
except ComputeTargetException:
    print('creating new.')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC12',
        min_nodes=0,
        max_nodes=1)
    
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True)

found existing: mmdsvm04-moses


### Below are the required packaged for running an Azure ML Experiment

In [12]:
myenv = Environment('deepchem_backend3')

#already created
conda_dep = CondaDependencies().create(python_version='3.7.10', conda_packages=['tensorflow-gpu==2.4.1', 'rdkit', 'openmm', 'pdbfixer'])
conda_dep.add_channel("conda-forge")
conda_dep.add_channel("omnia")
conda_dep.add_pip_package("azureml-sdk")
conda_dep.add_pip_package("deepchem")
conda_dep.add_pip_package("molsets")
#1.19.4
conda_dep.add_pip_package("numpy==1.19.4")
#IPython
conda_dep.add_pip_package("IPython")
conda_dep.save(path="./train/condadep.yml")
myenv.python.conda_dependencies=conda_dep
#myenv.docker.enabled = True
myenv.docker.base_image = DEFAULT_GPU_IMAGE
myenv.register(workspace=ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-cuda10.0-cudnn7-ubuntu16.04:20210113.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "deepchem_backend2",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
        

### Get the current working directory and inside of it, create a subdirectory called "train" to hold the training script - contents of this will be passed to the remote machine for training

In [13]:
cwd = os.getcwd()
current_dir = cwd
print(cwd)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/mm-deepchem/code/Users/meganmasanz.work/vae_training


In [14]:
import os
script_folder = os.path.join(os.getcwd(), "train")
print(script_folder)
os.makedirs(script_folder, exist_ok = True)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/mm-deepchem/code/Users/meganmasanz.work/vae_training/train


### The code below is actually the training and the model evaluation script.  Modifiying the script below will specifically in the main section will change parameters in the training.

### Parameters this script will allow changes to include:
- epoch_count = 20
- config.lr_end  =0.0001
- config.lr_start=0.0001
- config.q_dropout

*The start and stop annealing steps have been set equal, this means that Cost Annealing will be disabled during runs*

*Note the commented out code will generate the images for valid molecules generated and can be uncommented if a user was interested in generating the images*


In [90]:
%%writefile $script_folder/train.py

###################################################
import sys
import os
import requests
import subprocess
import shutil
import IPython
from logging import getLogger, StreamHandler, INFO
from deepchem.models.optimizers import Adam, ExponentialDecay
from deepchem.models.seqtoseq import AspuruGuzikAutoEncoder
import rdkit
import numpy as np
import deepchem
import rdkit
import tensorflow as tf
from azureml.core import Run
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
import random
import moses
import torch
from moses.vae.model import VAE
from moses.vae.config import get_parser
from moses.vae.trainer import VAETrainer
import numpy as np



    
def train_model(epochs=2):
    print('train model')
    ##################
    os.makedirs('data', exist_ok = True)
    run = Run.get_context()
    run.log("epochs", epochs)
    tasks, datasets, _ = deepchem.molnet.load_zinc15(
    featurizer='raw',
    splitter=None,
    transformers=[],
    data_dir='data', 
    save_dir='data')
    print(tasks)


    data = datasets[0]
    train_smiles = []
    for X, _, _, _ in data.itersamples():
        train_smiles.append(rdkit.Chem.MolToSmiles(X))
    print(len(train_smiles))
    run.log('datasetsize', len(train_smiles))
    for smile in train_smiles[0:5]:
        print(smile)

    # DEFINE THE SMILES TOKENS AND MAX_LENGTHS
    tokens = set()
    for s in train_smiles:
        tokens = tokens.union(set(s))
    tokens = sorted(list(tokens))
    max_length = max(len(s) for s in train_smiles)
    
    ##################
    #Namespace(clip_grad=50, d_cell='gru', d_d_h=512, d_dropout=0, d_n_layers=3, d_z=128, 
    #freeze_embeddings=False, kl_start=0, kl_w_end=0.05, kl_w_start=0, log_file=None, 
    #lr_end=0.00030000000000000003, lr_n_mult=1, lr_n_period=5, lr_n_restarts=1, 
    #lr_start=0.00030000000000000003, model_save=None, n_batch=512, n_jobs=1, 
    #n_last=1000, n_workers=1, q_bidir=False, q_cell='gru', q_d_h=256, q_dropout=0.5, q_n_layers=1)
    run = Run.get_context()
    
    config = get_parser().parse_known_args()[0]
    config.log_file = None
    config.model_save = None
    config.lr_n_restarts = 1
    #updated to remove type
    config.lr_n_period = epochs

    config.d_n_layers=3
    config.q_n_layers=3
    config.d_dropout=0
    config.q_dropout=0.0
    run.log('drop_out', config.q_dropout)
    
    run.log('encoding_layers', config.q_n_layers)
    run.log('decoding_layers', config.d_n_layers)
    #encoder  - set to 3 layers

    print(config)
    
    run.log("lr", config.lr_end)

    trainer = VAETrainer(config)
    model = VAE(trainer.get_vocabulary(train_smiles), config)
    model.cuda(device="cuda")
    print('fitting model')
    model = trainer.fit(model, train_smiles)
    return model, max_length

def generate_molecules(model, n_molecules=1000, max_length=100000):
    run = Run.get_context()
    print('max_length =' + str(max_length))
    predictions = model.sample(n_molecules, max_length) 
    valid = []

    #using chem from rdkit to ensure generated molecules are valid
    count = 0
    for p in predictions:
      count += 1
      smiles = ''.join(p)
      if count < 10:
        print(smiles)
      if rdkit.Chem.MolFromSmiles(smiles) is not None:
        valid.append(smiles) 

    print(len(valid) / n_molecules)
    
    run.log('valid', (len(valid) / n_molecules))
    
    return valid


def main():
    print('nearly default config')
    model, max_length = train_model(20)
    print('max_length = ' + str(max_length))
    valid = generate_molecules(model, 5000, max_length)

    print(len(valid), 'valid molecules')
    count = 0
    for v in valid:
        count += 1
        if count > 20:
            break
        print(v)

if __name__ == "__main__":
    print(torch.cuda.device_count())
    torch.cuda.get_device_name(0)

    main()

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/mm-deepchem/code/Users/meganmasanz.work/vae_training/train/train.py


### Below will create an experiment with the name 'moses-teacher-forcing-final_0504' using the train.py file in the training folder leveraging the compute cluster created

In [91]:
from azureml.core import Experiment, ScriptRunConfig
from azureml.widgets import RunDetails

experiment = Experiment(workspace = ws, name = "moses-teacher-forcing-final_0504")
script_config = ScriptRunConfig(source_directory = script_folder, script = 'train.py', environment=myenv, compute_target = cluster_name)

experiment = Experiment(workspace=ws, name = "moses-teacher-forcing-final_0504" )
run = experiment.submit(config= script_config)

In [92]:
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [25]:
run.wait_for_completion

<bound method Run.wait_for_completion of Run(Experiment: moses-teacher-forcing,
Id: moses-teacher-forcing_1619967239_9191078e,
Type: azureml.scriptrun,
Status: Preparing)>