In [1]:
#training test for prediction capabilities 
import math, random, sys
sys.path.insert(0, '/home/marcase/hgraph2graph/')
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader
import networkx as nx
import rdkit
import numpy as np
import argparse
import os
from tqdm.auto import tqdm
import pickle
from hgraph import *
from hgraph.inc_graph import *
from hgraph.encoder import *
import matplotlib.pyplot as plt
from hgraph.predict import HierPredict
from torch.autograd import Variable

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
vocab = '/home/marcase/hgraph2graph/data/cyclic_peptides/cyclic_vocab_new.txt'
vocab = [x.strip("\r\n ").split() for x in open(vocab)]
vocab = PairVocab(vocab)
vocab.vocab[21][0]
vocab.hvocab

['CC',
 'CC(C)C',
 'CC(C)O',
 'CCC(=O)O',
 'CCC(C)C',
 'CCC(N)=O',
 'CCC1=CC=C(O)C=C1',
 'CCC1=CC=CC=C1',
 'CCC1=CN=CN1',
 'CCC1=CNC2=C1C=CC=C2',
 'CCCC(=O)O',
 'CCCC(N)=O',
 'CCCCCN',
 'CCCCNC(=N)N',
 'CCCSC',
 'CCO',
 'CCSCC(=O)CSCC',
 'CN1CCCC1C(=O)O',
 'CN1CCCC1C=O',
 'CNCC(=O)O',
 'CNCC=O',
 'NCC=O',
 'O=CC1CCCN1']

In [3]:
class Args:
    train = '/home/marcase/hgraph2graph/predict/preprocessed_train/'
    train_labels = '/home/marcase/hgraph2graph/predict/preprocessed_train_labels/'
    test = '/home/marcase/hgraph2graph/predict/preprocessed_test/'
    test_labels = '/home/marcase/hgraph2graph/predict/preprocessed_test_labels/'
    vocab = vocab
    save_dir = 'test/'
    atom_vocab = common_atom_vocab
    load_model = None
    seed = 7
    rnn_type = 'LSTM'
    hidden_size=125
    embed_size=250
    batch_size=32
    latent_size=32
    depthT=15
    depthG=15
    diterT=1
    diterG=3
    dropout=0.0
    lr = 1e-3
    clip_norm=5.0
    step_beta=0.001
    max_beta=1.0
    warmup=10000
    kl_anneal_iter=2000
    epoch=2000
    anneal_rate=0.9
    anneal_iter=25000
    print_iter=50
    save_iter=1000000
    model = '/home/marcase/hgraph2graph/ckpt/cyclic_new2/model.ckpt.140000'
    load_model = True
    nsample = 1
    max_nodes=200
    max_edges=400
    max_AA = 6
    max_sub_nodes = 50
    label_size=2
    lock_pretrain_weights=False
    
args=Args()

model = HierVAE(args).cuda()



In [4]:
torch.manual_seed(args.seed)
random.seed(args.seed)

for param in model.parameters():
    if param.dim() == 1:
        nn.init.constant_(param, 0)
    else:
        nn.init.xavier_normal_(param)
        
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=args.lr)
scheduler = lr_scheduler.ExponentialLR(optimizer, args.anneal_rate)

if args.load_model:
    print('continuing from checkpoint ' + args.model)
    model_state, optimizer_state, total_step, beta = torch.load(args.model)
                
    #initialize weights in model.predict that don't exist from pre-training
    for key in model.predict.state_dict().keys():
        model_state['predict.' + key] = model.predict.state_dict()[key]
    
    if args.lock_pretrain_weights:
        for param in model.parameters():
            param.requires_grad = False

        layers=list(model_state.keys())
        layers_split = [l.split('.') for l in layers]
        print([l for l in layers_split if 'predict' in l])
        model.predict.ff1.requires_grad = True
        model.predict.ff2.requires_grad = True

    
    model.load_state_dict(model_state)

else:
    total_step = beta = 0

param_norm = lambda m: math.sqrt(sum([p.norm().item() ** 2 for p in m.parameters()]))
grad_norm = lambda m: math.sqrt(sum([p.grad.norm().item() ** 2 for p in m.parameters() if p.grad is not None]))


continuing from checkpoint /home/marcase/hgraph2graph/ckpt/cyclic_new2/model.ckpt.140000


In [17]:
meters = np.zeros(2)
meters_list = list(meters)
validation_list = list(np.zeros(1))
total_step = 0
for epoch in range(args.epoch):
    random.seed(args.seed)
    dataset_x = DataFolder(args.train, args.batch_size,shuffle = False)
    dataset_y = DataFolder(args.train_labels, args.batch_size,shuffle = False)
    dataset_x.data_files = ['tensors-0.pkl','tensors-1.pkl', 'tensors-2.pkl', 'tensors-3.pkl', 'tensors-4.pkl', 'tensors-5.pkl', 'tensors-6.pkl', 'tensors-7.pkl']
    dataset_y.data_files = ['tensors_labels-0.pkl','tensors_labels-1.pkl', 'tensors_labels-2.pkl', 'tensors_labels-3.pkl', 'tensors_labels-4.pkl', 'tensors_labels-5.pkl', 'tensors_labels-6.pkl', 'tensors_labels-7.pkl']
    model.train()
    for batch_x,batch_y in zip(dataset_x,dataset_y):
        total_step += 1
        model.zero_grad()
        y_pred = model.forward(*batch_x, beta=beta) 
        y_true = torch.Tensor([int(y) for y in batch_y]).cuda()
        y_true = y_true.type(torch.LongTensor).cuda()
        loss = criterion(y_pred,y_true)
        
        accuracy = torch.sum(torch.argmax(y_pred, dim=1).cuda() == y_true)/len(y_pred)
        loss = Variable(loss, requires_grad = True)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm)
        optimizer.step()

        meters = meters + np.array([loss.item(),accuracy.cpu()])
        meters_list.append(meters)

        if total_step % args.print_iter == 0:
            meters /= args.print_iter
            print("[%d] Beta: %.3f, loss: %.3f, accuracy: %.3f, PNorm: %.2f, GNorm: %.2f" % (total_step, beta, meters[0], meters[1], param_norm(model), grad_norm(model)))
            sys.stdout.flush()
            meters *= 0
        
        if total_step % args.save_iter == 0:
            ckpt = (model.state_dict(), optimizer.state_dict(), total_step, beta)
            torch.save(ckpt, os.path.join(args.save_dir, f"model.ckpt.{total_step}"))

        if total_step % args.anneal_iter == 0:
            scheduler.step()
            print("learning rate: %.6f" % scheduler.get_lr()[0])

        if total_step >= args.warmup and total_step % args.kl_anneal_iter == 0:
            beta = min(args.max_beta, beta + args.step_beta)
    
    #"validation" set
    model.eval()
    torch.no_grad()
    dataset_x = DataFolder(args.test, args.batch_size,shuffle = False)
    dataset_y = DataFolder(args.test_labels, args.batch_size,shuffle = False)
    dataset_x.data_files = ['tensors-0.pkl','tensors-1.pkl', 'tensors-2.pkl']
    dataset_y.data_files = ['tensors_labels-0.pkl','tensors_labels-1.pkl', 'tensors_labels-2.pkl']
    random.seed()
    i=0
    for batch_x,batch_y in zip(dataset_x,dataset_y):
        if i == random.randint(0, 32):
            batch_x0 = batch_x
            batch_y0 = batch_y
            print(i)
            break
        i += 1

    y_pred = model.forward(*batch_x0, beta=beta)
    y_pred = torch.argmax(y_pred, dim=1)
    y_true = torch.Tensor([int(y) for y in batch_y0]).cuda()
    y_true = y_true.type(torch.LongTensor).cuda()
    print('Accuracy on validation set: %.3f' % (torch.sum(y_pred == y_true)/len(y_pred)).item())
    validation_list.append((torch.sum(y_pred == y_true)/len(y_pred)).item())

[50] Beta: 0.065, loss: 1.012, accuracy: 0.421, PNorm: 286.57, GNorm: 0.00
27
Accuracy on validation set: 0.469
[100] Beta: 0.065, loss: 0.996, accuracy: 0.432, PNorm: 286.57, GNorm: 0.00
[150] Beta: 0.065, loss: 1.017, accuracy: 0.409, PNorm: 286.57, GNorm: 0.00
Accuracy on validation set: 0.531
[200] Beta: 0.065, loss: 0.993, accuracy: 0.425, PNorm: 286.57, GNorm: 0.00
[250] Beta: 0.065, loss: 1.014, accuracy: 0.424, PNorm: 286.57, GNorm: 0.00
18
Accuracy on validation set: 0.531
[300] Beta: 0.065, loss: 1.011, accuracy: 0.419, PNorm: 286.57, GNorm: 0.00
4
Accuracy on validation set: 0.469
[350] Beta: 0.065, loss: 0.993, accuracy: 0.436, PNorm: 286.57, GNorm: 0.00
[400] Beta: 0.065, loss: 1.033, accuracy: 0.436, PNorm: 286.57, GNorm: 0.00
16
Accuracy on validation set: 0.344
[450] Beta: 0.065, loss: 0.993, accuracy: 0.428, PNorm: 286.57, GNorm: 0.00
[500] Beta: 0.065, loss: 1.016, accuracy: 0.412, PNorm: 286.57, GNorm: 0.00
Accuracy on validation set: 0.312
[550] Beta: 0.065, loss: 1

KeyboardInterrupt: 

In [7]:
i = 0
dataset_x = DataFolder(args.train, args.batch_size,shuffle = False)
dataset_y = DataFolder(args.train_labels, args.batch_size,shuffle = False)
dataset_x.data_files = ['tensors-0.pkl','tensors-1.pkl', 'tensors-2.pkl', 'tensors-3.pkl', 'tensors-4.pkl', 'tensors-5.pkl', 'tensors-6.pkl', 'tensors-7.pkl']
dataset_y.data_files = ['tensors_labels-0.pkl','tensors_labels-1.pkl', 'tensors_labels-2.pkl', 'tensors_labels-3.pkl', 'tensors_labels-4.pkl', 'tensors_labels-5.pkl', 'tensors_labels-6.pkl', 'tensors_labels-7.pkl']
for batch_x,batch_y in zip(tqdm(dataset_x),tqdm(dataset_y)):
    batch_x0 = batch_x
    batch_y0 = batch_y
    # print(len(batch_x0[1][0][5]))
    # print(len(batch_y))
    if i > 1:
        break
    
    i += 1

  0%|          | 0/8000 [00:00<?, ?it/s]
  0%|          | 1/8000 [00:00<50:30,  2.64it/s]
  0%|          | 2/8000 [00:00<25:18,  5.27it/s][A
  0%|          | 2/8000 [00:00<26:31,  5.02it/s]


In [8]:
y_pred = model.forward_predict(*batch_x0, beta=beta)
y_pred = torch.argmax(y_pred, dim=1)
y_pred

tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 1], device='cuda:0')

In [9]:
y_true = torch.Tensor([int(y) for y in batch_y0]).cuda()
y_true = y_true.type(torch.LongTensor).cuda()
y_true

tensor([0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 1, 1], device='cuda:0')

In [10]:
torch.sum(y_pred == y_true)/args.batch_size

tensor(0.5000, device='cuda:0')

In [21]:
i = 0
dataset_x = DataFolder(args.test, args.batch_size,shuffle = False)
dataset_y = DataFolder(args.test_labels, args.batch_size,shuffle = False)
dataset_x.data_files = ['tensors-0.pkl','tensors-1.pkl', 'tensors-2.pkl']
dataset_y.data_files = ['tensors_labels-0.pkl','tensors_labels-1.pkl', 'tensors_labels-2.pkl']
for batch_x,batch_y in zip(tqdm(dataset_x),tqdm(dataset_y)):
    if i == random.randint(0, 32):
        batch_x0 = batch_x
        batch_y0 = batch_y
        print(i)
        break
    i += 1

print(i)

  0%|          | 0/3000 [00:00<?, ?it/s]
  0%|          | 1/3000 [00:00<16:45,  2.98it/s]
  0%|          | 3/3000 [00:00<05:37,  8.88it/s][A
  0%|          | 3/3000 [00:00<06:00,  8.32it/s]

3
3





In [22]:
y_pred = model.forward_predict(*batch_x0, beta=beta)
y_pred = torch.argmax(y_pred, dim=1)
y_pred

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 1, 0, 0, 1, 0, 0, 0], device='cuda:0')

In [23]:
y_true = torch.Tensor([int(y) for y in batch_y0]).cuda()
y_true = y_true.type(torch.LongTensor).cuda()
y_true

tensor([0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
        0, 1, 1, 1, 0, 1, 1, 0], device='cuda:0')

In [24]:
(torch.sum(y_pred == y_true)/args.batch_size).item()

0.5

<hgraph.dataset.DataFolder at 0x14e446207790>