In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader
from torch.autograd import Variable

import math, random, sys
import numpy as np
import argparse
from argparse import Namespace
from collections import deque
import pickle as pickle

from jtnn import *
from auxiliaries import build_parser, set_random_seed
import rdkit
import pandas as pd
import json, os
from rdkit import RDLogger
import pathlib

In [2]:
# Raise the RDKit logger threshold to CRITICAL so lower-severity messages are not printed.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

# Absolute path of the current working directory; every other path in the
# notebook is built relative to it.
root = os.fspath(pathlib.Path().absolute())

## Arguments

In [3]:
# Values that take precedence over the defaults loaded from the JSON file.
cmd_args = dict(
    beta=0.002,
    max_beta=1.0,
    latent_size=4,
)

# Directory handed to the training data loader.
train_path = os.path.join(root, 'rafa-processed')

In [4]:
# Load the default generator settings and layer the notebook overrides on top.
model_dir = os.path.join(root, 'gen_models')
json_path = os.path.join(model_dir, 'default_gen_args.json')
with open(json_path) as handle:
    arguments = json.load(handle)
arguments.update(cmd_args)

# Seed the RNGs. The seed must be read from `arguments`: no `args` name exists
# at module level, so the previous `args['seed']` lookup raised NameError on a
# fresh kernel whenever the JSON provided a seed.
if 'seed' in arguments:
    set_random_seed(arguments['seed'])
else:
    # The result of set_random_seed() is stored as the seed, so it is assumed
    # to return the value it picked -- TODO confirm in auxiliaries.
    arguments['seed'] = set_random_seed()
arguments['cuda'] = torch.cuda.is_available()

## Train

In [15]:
def train(parametrization):
    """Build a JTNNVAE from the merged settings, train it and return its score.

    Args:
        parametrization: dict of settings for this run. It is merged over the
            module-level ``arguments`` dict, so its entries win on conflict.

    Returns:
        Whatever ``trainer`` returns for the freshly built model.

    Side effects:
        Creates ``<model_dir>/<model_name>/`` and writes the merged settings to
        ``model.json`` inside it, unless that file already exists.
    """
    args = {**arguments, **parametrization}
    # NOTE(review): ':.3f' keeps only three decimals of the learning rate, so
    # runs whose lr differs only beyond that (and share every other field)
    # map to the same directory and overwrite each other's checkpoints.
    model_name = f"gen-ls{args['latent_size']}-lr{args['lr']:.3f}-h{args['hidden_size']}-l{args['num_layers']}-e{args['epoch']}-s{args['seed']}"
    args['save_dir'] = os.path.join(model_dir, model_name)

    # Persist the settings next to the checkpoints; an existing file is kept.
    os.makedirs(args['save_dir'], exist_ok=True)
    dump_json_path = os.path.join(args['save_dir'], 'model.json')
    if not os.path.exists(dump_json_path):
        with open(dump_json_path, "w") as fp:
            json.dump(args, fp, sort_keys=True, indent=4)

    args = Namespace(**args)

    # Read the vocabulary inside a context manager so the file handle is
    # closed instead of being leaked on every call.
    with open(args.vocab) as vocab_file:
        vocab = Vocab([x.strip("\r\n ") for x in vocab_file])

    print(args)
    model = JTNNVAE(vocab, args)
    if args.cuda:
        model = model.cuda()
    return trainer(model, args, vocab)

In [16]:
def trainer(model, args, vocab):
    """Initialise ``model``, run the training loop and return its evaluation score.

    Args:
        model: module called as ``model(batch, beta)``; the call must return the
            five values ``(loss, kl_div, wacc, tacc, sacc)``.
        args: namespace providing ``lr``, ``anneal_rate``, ``load_epoch``,
            ``beta``, ``epoch``, ``batch_size``, ``clip_norm``, ``print_iter``,
            ``save_iter``, ``save_dir``, ``anneal_iter``, ``kl_anneal_iter``,
            ``warmup``, ``max_beta`` and ``step_beta``.
        vocab: vocabulary object forwarded to ``MolTreeFolder``.

    Returns:
        The result of ``evaluate(model)`` after the final weights are saved.

    Side effects:
        Writes ``model.iter-<step>`` checkpoints and a final ``model`` file
        into ``args.save_dir``; prints progress to stdout.
    """
    # Fresh initialisation: 1-D parameters are zeroed, everything else gets
    # Xavier-normal weights.
    for param in model.parameters():
        if param.dim() == 1:
            nn.init.constant_(param, 0)
        else:
            nn.init.xavier_normal_(param)
    print(("Model #Params: %dK" % (sum([x.nelement() for x in model.parameters()]) / 1000,)))

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = lr_scheduler.ExponentialLR(optimizer, args.anneal_rate)

    def param_norm(m):
        # Global L2 norm over all parameters.
        return math.sqrt(sum(p.norm().item() ** 2 for p in m.parameters()))

    def grad_norm(m):
        # Global L2 norm over all gradients that exist.
        return math.sqrt(sum(p.grad.norm().item() ** 2 for p in m.parameters() if p.grad is not None))

    # The step counter starts from args.load_epoch, as in the original code.
    total_step = args.load_epoch
    beta = args.beta
    # Running sums of [KL, word acc %, topo acc %, assembly acc %].
    meters = np.zeros(4)

    for epoch in range(args.epoch):
        print(f"Currently at epoch: {epoch+1}")
        loader = MolTreeFolder(train_path, vocab, args.batch_size, num_workers=4)
        for batch in loader:
            total_step += 1
            model.zero_grad()
            loss, kl_div, wacc, tacc, sacc = model(batch, beta)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm)
            optimizer.step()

            meters = meters + np.array([kl_div, wacc * 100, tacc * 100, sacc * 100])

            if total_step % args.print_iter == 0:
                # Report averages over the last print_iter steps, then reset.
                meters /= args.print_iter
                print(("[%d] Beta: %.3f, KL: %.2f, Word: %.2f, Topo: %.2f, Assm: %.2f, PNorm: %.2f, GNorm: %.2f" % (total_step, beta, meters[0], meters[1], meters[2], meters[3], param_norm(model), grad_norm(model))))
                sys.stdout.flush()
                meters *= 0

            if total_step % args.save_iter == 0:
                torch.save(model.state_dict(), args.save_dir + "/model.iter-" + str(total_step))

            if total_step % args.anneal_iter == 0:
                scheduler.step()
                # Read the rate the optimizer will actually use. Calling
                # scheduler.get_lr() outside of step() is deprecated and can
                # report a value that differs from the applied one.
                print(("learning rate: %.6f" % optimizer.param_groups[0]['lr']))

            if total_step % args.kl_anneal_iter == 0 and total_step >= args.warmup:
                # NOTE(review): args.step_beta must be present in the settings;
                # confirm the defaults JSON defines it.
                beta = min(args.max_beta, beta + args.step_beta)

    torch.save(model.state_dict(), args.save_dir + "/model")

    return evaluate(model)


In [17]:
def evaluate(model, n_samples=10000):
    """Count the distinct values produced by repeated prior sampling.

    Args:
        model: object exposing ``sample_prior()``; each returned value must be
            hashable because the results are de-duplicated with a set.
        n_samples: number of ``sample_prior()`` calls to make (default 10000).

    Returns:
        int: the number of unique values among the ``n_samples`` draws.
    """
    # Sampling is inference only, so gradient tracking is switched off to
    # avoid building autograd graphs for every draw.
    with torch.no_grad():
        unique_samples = {model.sample_prior() for _ in range(n_samples)}
    # TODO(review): an earlier draft appears to have subtracted the samples
    # already present in a reference molecule CSV; that is not implemented here.
    return len(unique_samples)

## Hyperparameter Search

In [18]:
# Default epoch count for runs that do not pass their own 'epoch'; any
# parametrization containing 'epoch' overrides it when train() merges settings.
arguments.update({'epoch': 1})

In [None]:
from ax.service.managed_loop import optimize

# Search over learning rate, hidden size, layer count and epoch count,
# maximising the value returned by train() across 20 trials.
best_parameters, values, experiment, model = optimize(
    parameters=[
        dict(name="lr", type="range", bounds=[1e-5, 5e-3]),
        dict(name="hidden_size", type="range", value_type="int", bounds=[32, 256]),
        dict(name="num_layers", type="range", value_type="int", bounds=[1, 3]),
        # latent_size is currently fixed through cmd_args and not searched:
        # dict(name="latent_size", type="range", value_type="int", bounds=[40, 100]),
        dict(name="epoch", type="range", value_type="int", bounds=[20, 60]),
    ],
    evaluation_function=train,
    minimize=False,
    total_trials=20,
)
means, covariances = values
print('best parameters:', best_parameters)
print(means)


[INFO 05-27 01:43:45] ax.modelbridge.dispatch_utils: Using Bayesian Optimization generation strategy: GenerationStrategy(name='Sobol+GPEI', steps=[Sobol for 5 trials, GPEI for subsequent trials]). Iterations after 5 will take longer to generate due to  model-fitting.
[INFO 05-27 01:43:45] ax.service.managed_loop: Started full optimization with 20 steps.
[INFO 05-27 01:43:45] ax.service.managed_loop: Running optimization trial 1...


Namespace(anneal_iter=40000, anneal_rate=0.9, batch_size=32, beta=0.002, clip_norm=50.0, cuda=True, depthG=3, depthT=20, epoch=25, hidden_size=243, kl_anneal_iter=2000, latent_size=4, load_epoch=0, lr=0.003665708072781563, max_beta=1.0, n_out=1, num_layers=2, print_iter=100, save_dir='/home/huang651/junction-tree/gen_models/gen2d-lr0.00367-h243-l2-e25-s3765065388', save_iter=5000, seed=3765065388, target='homo', total_trials=50, use_activation=True, vocab='/home/huang651/port-to-botorch/rafa-pred-model/data/rafa/vocab.txt', warmup=20000)
Model #Params: 1376K
Currently at epoch: 1
[100] Beta: 0.002, KL: 48.87, Word: 64.62, Topo: 91.03, Assm: 93.71, PNorm: 83.41, GNorm: 24.12
[200] Beta: 0.002, KL: 46.50, Word: 79.43, Topo: 97.54, Assm: 96.81, PNorm: 88.99, GNorm: 12.67
[300] Beta: 0.002, KL: 46.16, Word: 81.64, Topo: 98.51, Assm: 96.98, PNorm: 93.02, GNorm: 14.99
[400] Beta: 0.002, KL: 51.77, Word: 83.39, Topo: 98.70, Assm: 97.14, PNorm: 96.55, GNorm: 11.36
Currently at epoch: 2
[500] B