## Import dependencies

In [1]:
import os
import random
import time
from argparse import Namespace
from datetime import datetime
from shutil import copyfile

import numpy as np
import torch

from data.loader import DataLoader
from model.trainer import GCNTrainer
from utils import (
    scorer,
    constant,
    helper,
)
from utils.vocab import Vocab



## Parse arguments

In [2]:
# Every tunable for this run lives in this one mapping. The blank-line grouping
# below follows the key names only; the entries keep their original order so that
# `opt` iterates in the same sequence as before.
args = Namespace(**{
    # dataset and vocabulary locations
    'data_dir': 'dataset/definition/textbook',
    'vocab_dir': 'dataset/definition/textbook/vocab',

    # embedding / hidden sizes, layer count and dropout rates
    'emb_dim': 300,
    'ner_dim': 30,
    'pos_dim': 30,
    'hidden_dim': 200,
    'num_layers': 2,
    'input_dropout': 0.5,
    'gcn_dropout': 0.5,
    'word_dropout': 0.04,
    'topn': 1e10,
    'lower': False,
    'ratio': 1,
    'only_label': 0,

    # loss-related weights
    'sent_loss': 100.0,
    'dep_path_loss': 100.0,
    'consistency_loss': 1.0,

    # graph / pooling / MLP options
    'prune_k': -1,
    'conv_l2': 0,
    'pooling': 'max',
    'pooling_l2': 0.003,
    'mlp_layers': 2,
    'no_adj': False,

    # recurrent-layer options
    'rnn': True,
    'rnn_hidden': 200,
    'rnn_layers': 1,
    'rnn_dropout': 0.5,

    # optimisation schedule
    'lr': 0.0003,
    'lr_decay': 0.9,
    'decay_epoch': 5,
    'optim': 'adamax',
    'num_epoch': 100,
    'batch_size': 50,
    'max_grad_norm': 5.0,

    # logging and checkpointing
    'log_step': 20,
    'log': 'logs.txt',
    'save_epoch': 100,
    'save_dir': './saved_models',
    'id': 'first_model',
    'info': '',

    # runtime switches
    'seed': 0,
    'cuda': False,
    'cpu': False,
    'load': False,
    'model_file': None,
})

# `vars` hands back the Namespace's own attribute dict, so `opt[...]` and
# `args.<name>` are two views of the same storage.
opt = vars(args)

# The number of classes is derived from the label table rather than configured by hand.
opt.update(num_class=len(constant.LABEL_TO_ID))




## Set random seed

In [3]:
# Seed every random number generator used by this notebook from the single
# configured seed, so that changing `args.seed` changes all of them together.
# (Python's `random` used to be pinned to a hard-coded 1234 and ignored the setting.)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# Without a usable GPU, switch the cuda flag off; because `opt` is `vars(args)`,
# opt['cuda'] reflects the change too. With a GPU and cuda requested, seed the
# CUDA generator as well.
if not torch.cuda.is_available():
    args.cuda = False
elif args.cuda:
    torch.cuda.manual_seed(args.seed)



## Load vocab

In [4]:
# Both artefacts live side by side in the vocab directory.
vocab_file = os.path.join(opt['vocab_dir'], 'vocab.pkl')
embedding_file = os.path.join(opt['vocab_dir'], 'embedding.npy')

# Vocabulary: the set of unique words the dataset contains. Its size is recorded
# in the options under 'vocab_size'.
vocab = Vocab(vocab_file, load=True)
opt['vocab_size'] = vocab.size

# Word embeddings: one vector per vocabulary word.
emb_matrix = np.load(embedding_file)

print(f"""Loaded vocab with {vocab.size} words and {emb_matrix.shape[1]} dims.""")

Loaded vocab with 26106 words and 300 dims.




## Load data

In [5]:
print(f"Loading data from {opt['data_dir']} with batch size {opt['batch_size']}...")

# Training split first, then the dev split (loaded with evaluation=True).
train_file = os.path.join(opt['data_dir'], 'train.json')
train_batch = DataLoader(train_file, opt, vocab, evaluation=False)
dev_file = os.path.join(opt['data_dir'], 'dev.json')
dev_batch = DataLoader(dev_file, opt, vocab, evaluation=True)

# Ids of at most one character get a leading '0'; longer ids are used verbatim.
model_id = opt['id']
if len(model_id) <= 1:
    model_id = '0' + model_id

# Everything this run writes goes under <save_dir>/<model_id>.
model_save_dir = os.path.join(opt['save_dir'], model_id)
opt['model_save_dir'] = model_save_dir
os.makedirs(model_save_dir, exist_ok=True)

Loading data from dataset/definition/textbook with batch size 50...
354 batches created for dataset/definition/textbook/train.json
45 batches created for dataset/definition/textbook/dev.json


## Save config and vocab, create the training log

In [17]:
# Persist the run configuration and the vocabulary next to the checkpoints.
config_path = os.path.join(model_save_dir, 'config.json')
helper.save_config(opt, config_path, verbose=True)

vocab_copy_path = os.path.join(model_save_dir, 'vocab.pkl')
vocab.save(vocab_copy_path)

# Tab-separated per-epoch log; the header names the columns written by the training loop.
log_path = os.path.join(model_save_dir, opt['log'])
log_header = "# epoch\ttrain_loss\tsent_loss\tdep_path_loss\tdev_loss\tdev_score\tbest_dev_score"
file_logger = helper.FileLogger(log_path, header=log_header)


Config saved to file ./saved_models/first_model/config.json
Overwriting old vocab file at ./saved_models/first_model/vocab.pkl


## Train model

In [18]:
# Build the trainer from the options dict, passing in the embedding matrix read
# from embedding.npy.
trainer = GCNTrainer(opt, emb_matrix=emb_matrix)

Finetune all embeddings.




In [19]:
# Reverse lookup of constant.LABEL_TO_ID: label id -> label name.
id2label = {label_id: label for label, label_id in constant.LABEL_TO_ID.items()}

# One dev score is appended per finished epoch.
dev_score_history = list()

# The learning rate starts at the configured value and may be decayed during training.
current_lr = opt['lr']


In [20]:
# Step counter across all epochs, plus the wall-clock start of the whole run.
global_step = 0
global_start_time = time.time()

# Template for the periodic progress line. Fields, in order: timestamp, step,
# max_steps, epoch, num_epoch, loss, sent_loss, dep_path_loss, seconds per batch, lr.
format_str = (
    '{}: step {}/{} (epoch {}/{}), '
    'loss = {:.6f}, sent_loss = {:.6f}, dep_path_loss = {:.6f} '
    '({:.3f} sec/batch), lr: {:.6f}'
)

# Total number of updates: batches per epoch times number of epochs.
max_steps = opt['num_epoch'] * len(train_batch)

In [21]:

# Main training loop.
#
# For each of opt['num_epoch'] epochs:
#   1. run one pass over train_batch, accumulating the three losses returned by
#      trainer.update and printing a progress line every opt['log_step'] steps;
#   2. evaluate on dev_batch and compute macro precision / recall / F1;
#   3. write one row to the file logger;
#   4. checkpoint the model, copying it to best_model.pt when the dev F1 is a new best;
#   5. optionally decay the learning rate.
for epoch in range(1, opt['num_epoch'] + 1):
    train_loss = 0
    train_sent_loss = 0
    train_dep_path_loss = 0
    for batch in train_batch:
        start_time = time.time()
        global_step += 1
        loss, sent_loss, dep_path_loss = trainer.update(batch)
        train_loss += loss
        train_sent_loss += sent_loss
        train_dep_path_loss += dep_path_loss
        if global_step % opt['log_step'] == 0:
            # Duration covers only the batch that was just processed.
            duration = time.time() - start_time
            print(
                format_str.format(
                    datetime.now(), global_step, max_steps, epoch,
                    opt['num_epoch'], loss, sent_loss, dep_path_loss, duration, current_lr
                )
            )

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    for batch in dev_batch:
        preds, _, loss, _ = trainer.predict(batch)
        predictions += preds
        dev_loss += loss
    # Flatten: every item of every prediction becomes its own single-label list.
    # NOTE(review): ids are shifted by +1 before the id2label lookup; presumably this
    # lines predictions up with constant.LABEL_TO_ID. Confirm against trainer.predict.
    predictions = [[id2label[l + 1]] for p in predictions for l in p]

    # Summed loss / num_examples * batch_size: average loss per batch.
    train_loss = train_loss / train_batch.num_examples * opt['batch_size']
    train_sent_loss = train_sent_loss / train_batch.num_examples * opt['batch_size']
    train_dep_path_loss = train_dep_path_loss / train_batch.num_examples * opt['batch_size']
    dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']

    dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions, method='macro')
    print(
        f"epoch {epoch}: train_loss = {train_loss:.6f}, "
        f"train_sent_loss = {train_sent_loss:.6f}, "
        f"train_dep_path_loss = {train_dep_path_loss:.6f}, "
        f"dev_loss = {dev_loss:.6f}, dev_f1 = {dev_f1:.4f}"
    )
    dev_score = dev_f1
    # The last column is the best dev score so far, including this epoch.
    file_logger.log(
        f"{epoch}\t{train_loss:.6f}\t{train_sent_loss:.6f}"
        f"\t{train_dep_path_loss:.6f}\t{dev_loss:.6f}"
        f"\t{dev_score:.4f}\t{max([dev_score] + dev_score_history):.4f}"
    )

    # save
    model_file = model_save_dir + f'/checkpoint_epoch_{epoch}.pt'
    trainer.save(model_file, epoch)
    if epoch == 1 or dev_score > max(dev_score_history):
        copyfile(model_file, model_save_dir + '/best_model.pt')
        # Report a new best only when one was actually saved. These two calls
        # used to sit outside the `if` and fired on every epoch.
        print("new best model saved.")
        file_logger.log(
            f"new best model saved at epoch {epoch}: {dev_p * 100:.2f}\t{dev_r * 100:.2f}\t{dev_score * 100:.2f}"
        )
    # Only every save_epoch-th checkpoint is kept on disk; the rest are removed
    # once the best-model copy (if any) has been taken.
    if epoch % opt['save_epoch'] != 0:
        os.remove(model_file)

    # lr schedule: decay only for sgd / adagrad / adadelta, once more than
    # decay_epoch scores are recorded and the dev score did not improve on the
    # previous epoch.
    if (
            len(dev_score_history) > opt['decay_epoch'] and
            dev_score <= dev_score_history[-1] and
            opt['optim'] in ['sgd', 'adagrad', 'adadelta']
    ):
        current_lr *= opt['lr_decay']

    trainer.update_lr(current_lr)
    dev_score_history += [dev_score]

# Printed once, after the final epoch. This used to be inside the loop and was
# printed after every epoch.
print("Training ended with {} epochs.".format(opt['num_epoch']))

  terms_out = pool(F.softmax(outputs), terms.unsqueeze(2).byte(), type=pool_type)
  defs_out = pool(F.softmax(outputs), defs.unsqueeze(2).byte(), type=pool_type)
  score = torch.where(mask[i].unsqueeze(1), next_score, score)


2024-04-07 18:50:11.741906: step 20/35400 (epoch 1/100), loss = 1011.164185, sent_loss = 0.683836, dep_path_loss = 17.089998 (0.263 sec/batch), lr: 0.000300
2024-04-07 18:50:19.363878: step 40/35400 (epoch 1/100), loss = 730.274231, sent_loss = 0.689742, dep_path_loss = 8.228372 (0.253 sec/batch), lr: 0.000300
2024-04-07 18:50:25.702109: step 60/35400 (epoch 1/100), loss = 687.183960, sent_loss = 0.648916, dep_path_loss = 4.087361 (0.380 sec/batch), lr: 0.000300
2024-04-07 18:50:32.375247: step 80/35400 (epoch 1/100), loss = 587.915466, sent_loss = 0.621392, dep_path_loss = 5.546593 (0.258 sec/batch), lr: 0.000300


KeyboardInterrupt: 