## Import dependencies

In [1]:
import json
import os
import pickle
import random
import time
from argparse import Namespace
from collections import (
    Counter,
    defaultdict,
)
from shutil import (
    copyfile,
)

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchcrf import CRF

from utils import (
    torch_utils,
    scorer,
    constant,
)
from utils.constant import UNK_ID



## Parse arguments

In [2]:
args = Namespace(
    data_dir='dataset/definition/textbook',
    vocab_dir='dataset/definition/textbook/vocab',
    emb_dim=300,
    ner_dim=30,
    pos_dim=30,
    hidden_dim=200,
    num_layers=2,
    input_dropout=0.5,
    gcn_dropout=0.5,
    word_dropout=0.04,
    topn=10000000000.0,
    lower=False,
    ratio=1,
    only_label=0,
    sent_loss=100.0,
    dep_path_loss=100.0,
    consistency_loss=1.0,
    prune_k=-1,
    conv_l2=0,
    pooling='max',
    pooling_l2=0.003,
    mlp_layers=2,
    no_adj=False,
    rnn=True,
    rnn_hidden=200,
    rnn_layers=1,
    rnn_dropout=0.5,
    lr=0.0003,
    lr_decay=0.9,
    decay_epoch=5,
    optim='adamax',
    num_epoch=100,
    batch_size=50,
    max_grad_norm=5.0,
    log_step=20,
    log='logs.txt',
    save_epoch=100,
    save_dir='./saved_models',
    id='1',
    info='',
    seed=0,
    cuda=torch.cuda.is_available(),
    cpu=not torch.cuda.is_available(),
    load=False,
    model_file=None
)
opt = vars(args)

opt['num_class'] = len(constant.LABEL_TO_ID)




## Set random seed

In [3]:
torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)



## Load vocab

In [4]:
class Vocab:
    def __init__(self, filename):
        with open(filename, 'rb') as f:
            self.id2word = pickle.load(f)
            self.word2id = defaultdict(
                lambda: UNK_ID,
                {self.id2word[idx]: idx for idx in range(len(self.id2word))}
            )

    @property
    def size(self):
        return len(self.id2word)


In [5]:
# vocabulary: set of unique words that the dataset contains.
vocab = Vocab(os.path.join(opt['vocab_dir'], 'vocab.pkl'))
opt['vocab_size'] = vocab.size

# word embedding: vector representation of each word in the vocabulary
emb_matrix = np.load(os.path.join(opt['vocab_dir'], 'embedding.npy'))

print(f"""Loaded vocab with {vocab.size} words and {emb_matrix.shape[1]} dims.""")

Loaded vocab with 26106 words and 300 dims.


## Load data

In [6]:
class DataLoader:
    """
    Load data from json files, preprocess and prepare batches.
    """

    def __init__(self, filename, opt, vocab, evaluation=False):
        self.batch_size = opt['batch_size']
        self.opt = opt
        self.vocab = vocab
        self.eval = evaluation

        with open(filename) as f:
            data = self.preprocess(json.load(f), vocab, opt)

        # shuffle for training
        if not evaluation:
            indices = list(range(len(data)))
            random.shuffle(indices)
            data = [data[i] for i in indices]

        self.id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
        self.sent_id2label = dict([(v, k) for k, v in constant.SENT_LABEL_TO_ID.items()])
        self.labels = [[self.id2label[l]] for d in data for l in d[-2]]
        self.sent_labels = [self.sent_id2label[d[-1]] for d in data]

        # chunk into batches
        data = [
            data[idx:idx + opt['batch_size']]
            for idx in range(0, len(data), opt['batch_size'])
        ]
        self.data = data
        print(f"{len(data)} batches created for {filename}")

    def preprocess(self, dataset, vocab, opt):
        """ Preprocess the data and convert to ids. """
        processed = []
        for sentence in dataset:
            tokens = list(sentence['tokens'])
            if opt['lower']:
                tokens = [t.lower() for t in tokens]

            tokens = [vocab.word2id[token] for token in tokens]
            pos = [constant.POS_TO_ID[pos] for pos in sentence['pos']]
            labels = [constant.LABEL_TO_ID[label] for label in sentence['labels']]

            # Parses the dependency head information
            head = [int(x) for x in sentence['heads']]
            # checks for at least one root in the dependency tree
            assert any([x == -1 for x in head])

            # Initializes dependency path representation
            dep_path = [0] * len(sentence['tokens'])
            for i in sentence['dep_path']:
                if i != -1:
                    dep_path[i] = 1

            # Constructs adjacency matrix
            # indicating direct connections between tokens.
            adj = np.zeros((len(sentence['heads']), len(sentence['heads'])))
            for i, h in enumerate(sentence['heads']):
                adj[i][h] = 1
                adj[h][i] = 1

            if self.eval or self.opt['only_label'] != 1 or sentence['label'] != 'none':
                counter = Counter(sentence['labels'])
                terms = [0] * len(sentence['labels'])
                defs = [0] * len(sentence['labels'])
                # Identifies 'Term' and 'Definition' entities within the labels, marking their positions.
                if counter['B-Term'] == 1 and counter['B-Definition'] == 1:
                    for i, label in enumerate(sentence['labels']):
                        if 'Term' in label:
                            terms[i] = 1
                        if 'Definition' in label:
                            defs[i] = 1

                processed.append(
                    (
                        tokens, pos, head, terms, defs, dep_path, adj, labels,
                        constant.SENT_LABEL_TO_ID[sentence['label']])
                )

        return processed

    def gold(self):
        """ Return gold labels as a list. """
        return self.labels

    def sent_gold(self):
        return self.sent_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """ Get a batch with index. """
        batch = self.data[index]
        batch_size = len(batch)
        batch = list(zip(*batch))
        assert len(batch) == 9

        # sort all fields by lens for easy RNN operations
        lens = [len(x) for x in batch[0]]
        batch, orig_idx = self.sort_all(batch, lens)

        # word dropout
        if not self.eval:
            words = [
                self.word_dropout(sent, self.opt['word_dropout'])
                for sent in batch[0]
            ]
        else:
            words = batch[0]

        # convert to tensors
        words = self.get_long_tensor(words, batch_size)
        masks = torch.eq(words, 0)
        pos = self.get_long_tensor(batch[1], batch_size)
        head = self.get_long_tensor(batch[2], batch_size)
        terms = self.get_long_tensor(batch[3], batch_size)
        defs = self.get_long_tensor(batch[4], batch_size)
        dep_path = self.get_long_tensor(batch[5], batch_size).float()
        adj = self.get_float_tensor2D(batch[6], batch_size)

        labels = self.get_long_tensor(batch[7], batch_size)

        sent_labels = torch.FloatTensor(batch[8])

        return words, masks, pos, head, terms, defs, adj, labels, sent_labels, dep_path, orig_idx

    def __iter__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)

    @staticmethod
    def get_long_tensor(tokens_list, batch_size):
        """ Convert list of list of tokens to a padded LongTensor. """
        token_len = max(len(x) for x in tokens_list)
        tokens = torch.LongTensor(batch_size, token_len).fill_(constant.PAD_ID)
        for i, s in enumerate(tokens_list):
            tokens[i, :len(s)] = torch.LongTensor(s)

        return tokens

    @staticmethod
    def get_float_tensor2D(tokens_list, batch_size):
        """ Convert list of list of tokens to a padded LongTensor. """
        token_len = max(len(x) for x in tokens_list)
        tokens = torch.FloatTensor(batch_size, token_len, token_len).fill_(constant.PAD_ID)
        for i, s in enumerate(tokens_list):
            tokens[i, :len(s), :len(s)] = torch.FloatTensor(s)
        return tokens

    @staticmethod
    def sort_all(batch, lens):
        """ Sort all fields by descending order of lens, and return the original indices. """
        unsorted_all = [lens] + [range(len(lens))] + list(batch)
        sorted_all = [list(t) for t in zip(*sorted(zip(*unsorted_all), reverse=True))]
        return sorted_all[2:], sorted_all[1]

    @staticmethod
    def word_dropout(tokens, dropout):
        """ Randomly dropout tokens (IDs) and replace them with <UNK> tokens. """
        return [
            constant.UNK_ID if x != constant.UNK_ID and np.random.random() < dropout else x
            for x in tokens
        ]


In [7]:
print(f"Loading data from {opt['data_dir']} with batch size {opt['batch_size']}...")
train_batch = DataLoader(os.path.join(opt['data_dir'], 'train.json'), opt, vocab, evaluation=False)
dev_batch = DataLoader(os.path.join(opt['data_dir'], 'dev.json'), opt, vocab, evaluation=True)

model_save_dir = os.path.join(opt['save_dir'], opt['id'])
opt['model_save_dir'] = os.path.join(opt['save_dir'], opt['id'])
os.makedirs(model_save_dir, exist_ok=True)

Loading data from dataset/definition/textbook with batch size 50...
354 batches created for dataset/definition/textbook/train.json
45 batches created for dataset/definition/textbook/dev.json


## Train model

In [8]:
class GCN(nn.Module):
    """ A GCN/Contextualized GCN module operated on dependency graphs. """

    def __init__(self, opt, embeddings, mem_dim, num_layers):
        super(GCN, self).__init__()
        self.opt = opt
        self.layers = num_layers
        self.use_cuda = opt['cuda']
        self.mem_dim = mem_dim
        self.in_dim = opt['emb_dim'] + opt['pos_dim']

        self.emb, self.pos_emb = embeddings

        # rnn layer
        if self.opt.get('rnn', False):
            input_size = self.in_dim
            self.rnn = nn.LSTM(input_size, opt['rnn_hidden'], opt['rnn_layers'], batch_first=True,
                               dropout=opt['rnn_dropout'], bidirectional=True)
            self.in_dim = opt['rnn_hidden'] * 2
            self.rnn_drop = nn.Dropout(opt['rnn_dropout'])  # use on last layer output

        self.in_drop = nn.Dropout(opt['input_dropout'])
        self.gcn_drop = nn.Dropout(opt['gcn_dropout'])

        # gcn layer
        self.W = nn.ModuleList()

        for layer in range(self.layers):
            input_dim = self.in_dim if layer == 0 else self.mem_dim
            self.W.append(nn.Linear(input_dim, self.mem_dim))

    def conv_l2(self):
        conv_weights = []
        for w in self.W:
            conv_weights += [w.weight, w.bias]
        return sum([x.pow(2).sum() for x in conv_weights])

    def encode_with_rnn(self, rnn_inputs, masks, batch_size):
        seq_lens = list(masks.data.eq(constant.PAD_ID).long().sum(1).squeeze())
        h0, c0 = self.rnn_zero_state(batch_size, self.opt['rnn_hidden'], self.opt['rnn_layers'])
        rnn_inputs = nn.utils.rnn.pack_padded_sequence(rnn_inputs, seq_lens, batch_first=True)
        rnn_outputs, (ht, ct) = self.rnn(rnn_inputs, (h0, c0))
        rnn_outputs, _ = nn.utils.rnn.pad_packed_sequence(rnn_outputs, batch_first=True)
        return rnn_outputs

    def forward(self, adj, inputs):
        words, masks, pos, head, terms, defs, adj = inputs  # unpack
        word_embs = self.emb(words)
        embs = [word_embs]
        if self.opt['pos_dim'] > 0:
            embs += [self.pos_emb(pos)]
        embs = torch.cat(embs, dim=2)
        embs = self.in_drop(embs)

        # rnn layer
        if self.opt.get('rnn', False):
            gcn_inputs = self.rnn_drop(self.encode_with_rnn(embs, masks, words.size()[0]))
        else:
            gcn_inputs = embs

        lstm_outs = gcn_inputs.clone()

        # gcn layer
        denom = adj.sum(2).unsqueeze(2) + 1
        mask = (adj.sum(2) + adj.sum(1)).eq(0).unsqueeze(2)

        for l in range(self.layers):
            Ax = adj.bmm(gcn_inputs)
            AxW = self.W[l](Ax)
            AxW = AxW + self.W[l](gcn_inputs)  # self loop
            AxW = AxW / denom

            gAxW = F.relu(AxW)
            gcn_inputs = self.gcn_drop(gAxW) if l < self.layers - 1 else gAxW

        return lstm_outs, masks.unsqueeze(2), gcn_inputs

    @staticmethod
    def rnn_zero_state(batch_size, hidden_dim, num_layers, bidirectional=True, use_cuda=True):
        total_layers = num_layers * 2 if bidirectional else num_layers
        state_shape = (total_layers, batch_size, hidden_dim)
        h0 = c0 = Variable(torch.zeros(*state_shape), requires_grad=False)
        if torch.cuda.is_available():
            return h0.cuda(), c0.cuda()
        else:
            return h0, c0


In [9]:
class GCNRelationModel(nn.Module):
    def __init__(self, opt, emb_matrix=None):
        super().__init__()
        self.opt = opt
        self.emb_matrix = emb_matrix

        # create embedding layers
        self.emb = nn.Embedding(opt['vocab_size'], opt['emb_dim'], padding_idx=constant.PAD_ID)
        self.pos_emb = nn.Embedding(len(constant.POS_TO_ID), opt['pos_dim']) if opt['pos_dim'] > 0 else None
        embeddings = (self.emb, self.pos_emb)
        self.init_embeddings()

        # gcn layer
        self.gcn = GCN(opt, embeddings, opt['hidden_dim'], opt['num_layers'])

        # output mlp layers
        in_dim = opt['hidden_dim'] * 2
        layers = [nn.Linear(in_dim, opt['hidden_dim']), nn.ReLU()]
        for _ in range(self.opt['mlp_layers'] - 1):
            layers += [nn.Linear(opt['hidden_dim'], opt['hidden_dim']), nn.ReLU()]
        self.out_mlp = nn.Sequential(*layers)

        # gcn output mlp layers
        in_dim = opt['hidden_dim']
        layers = [nn.Linear(in_dim, opt['hidden_dim']), nn.ReLU()]
        for _ in range(self.opt['mlp_layers'] - 1):
            layers += [nn.Linear(opt['hidden_dim'], opt['hidden_dim']), nn.ReLU()]
        self.gcn_out_mlp = nn.Sequential(*layers)

    def init_embeddings(self):
        if self.emb_matrix is None:
            self.emb.weight.data[1:, :].uniform_(-1.0, 1.0)
        else:
            self.emb_matrix = torch.from_numpy(self.emb_matrix)
            self.emb.weight.data.copy_(self.emb_matrix)
        # decide finetuning
        if self.opt['topn'] <= 0:
            print("Do not finetune word embedding layer.")
            self.emb.weight.requires_grad = False
        elif self.opt['topn'] < self.opt['vocab_size']:
            print("Finetune top {} word embeddings.".format(self.opt['topn']))
            self.emb.weight.register_hook(lambda x: torch_utils.keep_partial_grad(x, self.opt['topn']))
        else:
            print("Finetune all embeddings.")

    def forward(self, inputs):
        words, masks, pos, head, terms, defs, adj = inputs  # unpack
        l = (masks.data.cpu().numpy() == 0).astype(np.int64).sum(1)
        maxlen = max(l)

        h, pool_mask, gcn_outputs = self.gcn(adj, inputs)

        outputs = self.out_mlp(h)
        gcn_outputs = self.gcn_out_mlp(gcn_outputs)
        return outputs, gcn_outputs


In [10]:
class GCNClassifier(nn.Module):
    """ A wrapper classifier for GCNRelationModel. """

    def __init__(self, opt, emb_matrix=None):
        super().__init__()
        self.opt = opt
        self.gcn_model = GCNRelationModel(opt, emb_matrix=emb_matrix)
        in_dim = opt['hidden_dim']
        self.classifier = nn.Linear(in_dim * 2, opt['num_class'])
        self.selector = nn.Sequential(nn.Linear(in_dim, 1), nn.Sigmoid())

        in_dim = opt['hidden_dim']
        layers = [nn.Linear(in_dim, opt['hidden_dim']), nn.ReLU()]
        for _ in range(self.opt['mlp_layers'] - 1):
            layers += [nn.Linear(opt['hidden_dim'], opt['hidden_dim']), nn.ReLU()]
        self.out_mlp = nn.Sequential(*layers)

        self.sent_classifier = nn.Sequential(nn.Linear(in_dim, 1), nn.Sigmoid())
        self.term_classifier = nn.Sequential(nn.Linear(in_dim * 2, 1), nn.Sigmoid())
        self.opt = opt

    def conv_l2(self):
        return self.gcn_model.gcn.conv_l2()

    def forward(self, inputs):
        _, masks, _, _, terms, defs, _ = inputs  # unpack

        outputs, gcn_outputs = self.gcn_model(inputs)
        logits = self.classifier(torch.cat([outputs, gcn_outputs], dim=2))

        pool_type = self.opt['pooling']
        out = self.pool(outputs, masks.unsqueeze(2), type=pool_type)
        out = self.out_mlp(out)
        sent_logits = self.sent_classifier(out)

        terms_out = self.pool(F.softmax(outputs), terms.unsqueeze(2).byte(), type=pool_type)
        defs_out = self.pool(F.softmax(outputs), defs.unsqueeze(2).byte(), type=pool_type)
        term_def = (terms_out * defs_out).sum(1).mean()
        not_term_def = (terms_out * defs_out[torch.randperm(terms_out.shape[0])]).sum(1).mean()

        selections = self.selector(gcn_outputs)

        defs_out = self.pool(outputs, defs.unsqueeze(2).byte(), type=pool_type).repeat(1, outputs.shape[1]).view(
            *outputs.shape)
        term_selections = self.term_classifier(torch.cat([defs_out, outputs], dim=2))

        return logits, sent_logits.squeeze(), selections.squeeze(), term_def, not_term_def, term_selections

    @staticmethod
    def pool(h, mask, type='max'):
        if type == 'max':
            h = h.masked_fill(mask.bool(), -constant.INFINITY_NUMBER)
            return torch.max(h, 1)[0]
        elif type == 'avg':
            h = h.masked_fill(mask.bool(), 0)
            return h.sum(1) / (mask.size(1) - mask.float().sum(1))
        else:
            h = h.masked_fill(mask.bool(), 0)
            return h.sum(1)


In [11]:
class GCNTrainer:
    def __init__(self, opt, emb_matrix=None):
        self.opt = opt
        self.emb_matrix = emb_matrix
        self.model = GCNClassifier(opt, emb_matrix=emb_matrix)
        self.criterion = nn.CrossEntropyLoss(reduction="none")
        self.parameters = [
            parameter
            for parameter in self.model.parameters()
            if parameter.requires_grad
        ]
        self.crf = CRF(self.opt['num_class'], batch_first=True)
        self.bc = nn.BCELoss()
        if torch.cuda.is_available():
            self.model.cuda()
            self.criterion.cuda()
            self.crf.cuda()
            self.bc.cuda()

        self.optimizer = torch_utils.get_optimizer(
            opt['optim'],
            self.parameters,
            opt['lr']
        )

    def update(self, batch):
        inputs, labels, sent_labels, dep_path, tokens, head, lens = self.unpack_batch(batch, self.opt['cuda'])

        _, _, _, _, terms, _, _ = inputs

        # step forward
        self.model.train()
        self.optimizer.zero_grad()
        logits, class_logits, selections, term_def, not_term_def, term_selections = self.model(inputs)

        labels = labels - 1
        labels[labels < 0] = 0
        mask = inputs[1].float()
        mask[mask == 0.] = -1.
        mask[mask == 1.] = 0.
        mask[mask == -1.] = 1.
        mask = mask.byte()
        loss = -self.crf(logits, labels, mask=mask)

        sent_loss = self.bc(class_logits, sent_labels)
        loss += self.opt['sent_loss'] * sent_loss

        selection_loss = self.bc(selections.view(-1, 1), dep_path.view(-1, 1))
        loss += self.opt['dep_path_loss'] * selection_loss

        term_def_loss = -self.opt['consistency_loss'] * (term_def - not_term_def)
        loss += term_def_loss
        # loss += self.opt['consistency_loss'] * not_term_def

        term_loss = self.opt['sent_loss'] * self.bc(term_selections.view(-1, 1), terms.float().view(-1, 1))
        loss += term_loss

        loss_val = loss.item()
        # backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.opt['max_grad_norm'])
        self.optimizer.step()
        return loss_val, sent_loss.item(), term_loss.item()

    def predict(self, batch, unsort=True):
        inputs, labels, sent_labels, dep_path, tokens, head, lens = self.unpack_batch(batch, self.opt['cuda'])

        orig_idx = batch[-1]
        # forward
        self.model.eval()
        logits, sent_logits, _, _, _, _ = self.model(inputs)

        labels = labels - 1
        labels[labels < 0] = 0
        mask = inputs[1].float()
        mask[mask == 0.] = -1.
        mask[mask == 1.] = 0.
        mask[mask == -1.] = 1.
        mask = mask.byte()
        loss = -self.crf(logits, labels, mask=mask)

        probs = F.softmax(logits, dim=1)
        predictions = self.crf.decode(logits, mask=mask)

        sent_predictions = sent_logits.round().long().data.cpu().numpy()

        if unsort:
            _, predictions, probs, sent_predictions = [list(t) for t in zip(*sorted(
                zip(orig_idx, predictions, probs, sent_predictions)))]
        return predictions, probs, loss.item(), sent_predictions

    def update_lr(self, new_lr):
        """
        This function updates the learning rate of the optimizer used by the Trainer.
        It sets a new learning rate new_lr for all parameter groups in the optimizer.

        The learning rate influences the speed and quality of the learning process
        """
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = new_lr

    def load(self, filename):
        """
        The load function loads a model from the given filename using torch.load.

        It then loads the state dictionary of the model and the configuration from the checkpoint.

        If an exception occurs during the loading process,
        it prints an error message and exits the program.
        """
        try:
            checkpoint = torch.load(filename)
        except BaseException:
            checkpoint = torch.load(filename, map_location=torch.device('cpu'))
        self.model.load_state_dict(checkpoint['model'])
        self.opt = checkpoint['config']

    def save(self, filename, epoch):
        params = {
            'model':  self.model.state_dict(),
            'config': self.opt,
        }
        try:
            torch.save(params, filename)
            print("model saved to {}".format(filename))
        except BaseException:
            print("[Warning: Saving failed... continuing anyway.]")

    @staticmethod
    def unpack_batch(batch, cuda):
        if torch.cuda.is_available():
            inputs = [Variable(b.cuda()) for b in batch[:7]]
            labels = Variable(batch[7].cuda())
            sent_labels = Variable(batch[8].cuda())
            dep_path = Variable(batch[9].cuda())
        else:
            inputs = [Variable(b) for b in batch[:7]]
            labels = Variable(batch[7])
            sent_labels = Variable(batch[8])
            dep_path = Variable(batch[9])

        tokens = batch[0]
        head = batch[3]
        lens = batch[1].eq(0).long().sum(1).squeeze()
        return inputs, labels, sent_labels, dep_path, tokens, head, lens


In [12]:
trainer = GCNTrainer(opt, emb_matrix=emb_matrix)

Finetune all embeddings.




In [13]:
id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
dev_score_history = []
current_lr = opt['lr']

In [14]:
global_step = 0
global_start_time = time.time()
max_steps = len(train_batch) * opt['num_epoch']

In [None]:
# start training
for epoch in range(1, opt['num_epoch'] + 1):
    train_loss = 0
    train_sent_loss = 0
    train_dep_path_loss = 0
    for i, batch in enumerate(train_batch):
        start_time = time.time()
        global_step += 1
        loss, sent_loss, dep_path_loss = trainer.update(batch)
        train_loss += loss
        train_sent_loss += sent_loss
        train_dep_path_loss += dep_path_loss
        if global_step % opt['log_step'] == 0:
            duration = time.time() - start_time
            print(
                f"{time.strftime('%H:%M:%S', time.localtime())}: step {global_step}/{max_steps} (epoch {epoch}/{opt['num_epoch']}), loss = {loss:.6f}, sent_loss = {sent_loss:.6f}, dep_path_loss = {dep_path_loss:.6f} ({duration:.3f} sec/batch), lr: {current_lr:.6f}"
            )

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    for i, batch in enumerate(dev_batch):
        preds, _, loss, _ = trainer.predict(batch)
        predictions += preds
        dev_loss += loss

    predictions = [[id2label[l + 1]] for p in predictions for l in p]
    train_loss = train_loss / len(train_batch) * opt['batch_size']  # avg loss per batch
    train_sent_loss = train_sent_loss / len(train_batch) * opt['batch_size']  # avg loss per batch
    train_dep_path_loss = train_dep_path_loss / len(train_batch) * opt['batch_size']  # avg loss per batch
    dev_loss = dev_loss / len(dev_batch) * opt['batch_size']

    dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions, method='macro')
    print(
        f"epoch {epoch}: train_loss = {train_loss:.6f}, "
        f"train_sent_loss = {train_sent_loss:.6f}, "
        f"train_dep_path_loss = {train_dep_path_loss:.6f}, "
        f"dev_loss = {dev_loss:.6f}, dev_f1 = {dev_f1:.4f}"
    )
    dev_score = dev_f1

    # save
    model_file = model_save_dir + f'/checkpoint_epoch_{epoch}.pt'
    trainer.save(model_file, epoch)
    if (epoch == 1 or dev_score > max(dev_score_history)) and epoch % opt['save_epoch'] == 0:
        copyfile(model_file, model_save_dir + '/best_model.pt')

        print(f"new best model saved at epoch {epoch}: {dev_p * 100:.2f}\t{dev_r * 100:.2f}\t{dev_score * 100:.2f}")

    # lr schedule
    if (
            len(dev_score_history) > opt['decay_epoch'] and
            dev_score <= dev_score_history[-1] and
            opt['optim'] in ['sgd', 'adagrad', 'adadelta']
    ):
        current_lr *= opt['lr_decay']

    trainer.update_lr(current_lr)
    dev_score_history += [dev_score]

    print("Training ended with {} epochs.".format(epoch))

  terms_out = self.pool(F.softmax(outputs), terms.unsqueeze(2).byte(), type=pool_type)
  defs_out = self.pool(F.softmax(outputs), defs.unsqueeze(2).byte(), type=pool_type)
  score = torch.where(mask[i].unsqueeze(1), next_score, score)


01:59:36: step 20/35400 (epoch 1/100), loss = 912.394592, sent_loss = 0.605355, dep_path_loss = 7.904532 (0.344 sec/batch), lr: 0.000300
01:59:42: step 40/35400 (epoch 1/100), loss = 780.796692, sent_loss = 0.650625, dep_path_loss = 8.824707 (0.257 sec/batch), lr: 0.000300
01:59:47: step 60/35400 (epoch 1/100), loss = 835.857971, sent_loss = 0.638335, dep_path_loss = 7.909427 (0.301 sec/batch), lr: 0.000300
01:59:54: step 80/35400 (epoch 1/100), loss = 571.141479, sent_loss = 0.607847, dep_path_loss = 5.441769 (0.267 sec/batch), lr: 0.000300
01:59:59: step 100/35400 (epoch 1/100), loss = 756.993103, sent_loss = 0.586071, dep_path_loss = 4.268195 (0.371 sec/batch), lr: 0.000300
02:00:06: step 120/35400 (epoch 1/100), loss = 810.769470, sent_loss = 0.441815, dep_path_loss = 7.167094 (0.254 sec/batch), lr: 0.000300
02:00:12: step 140/35400 (epoch 1/100), loss = 594.306335, sent_loss = 0.481428, dep_path_loss = 4.250006 (0.288 sec/batch), lr: 0.000300
02:00:19: step 160/35400 (epoch 1/100)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


epoch 1: train_loss = 34531.108598, train_sent_loss = 27.384392, train_dep_path_loss = 291.068073, dev_loss = 25822.843348, dev_f1 = 0.2335
model saved to ./saved_models/1/checkpoint_epoch_1.pt
Training ended with 1 epochs.


  terms_out = self.pool(F.softmax(outputs), terms.unsqueeze(2).byte(), type=pool_type)
  defs_out = self.pool(F.softmax(outputs), defs.unsqueeze(2).byte(), type=pool_type)


02:01:35: step 360/35400 (epoch 2/100), loss = 578.206909, sent_loss = 0.531332, dep_path_loss = 2.718844 (0.282 sec/batch), lr: 0.000300
02:01:41: step 380/35400 (epoch 2/100), loss = 452.817261, sent_loss = 0.375164, dep_path_loss = 4.162277 (0.283 sec/batch), lr: 0.000300
02:01:47: step 400/35400 (epoch 2/100), loss = 632.138123, sent_loss = 0.568697, dep_path_loss = 4.397583 (0.305 sec/batch), lr: 0.000300
02:01:56: step 420/35400 (epoch 2/100), loss = 526.174255, sent_loss = 0.433998, dep_path_loss = 3.143573 (0.425 sec/batch), lr: 0.000300
02:02:03: step 440/35400 (epoch 2/100), loss = 662.606689, sent_loss = 0.426815, dep_path_loss = 4.662734 (0.289 sec/batch), lr: 0.000300
02:02:10: step 460/35400 (epoch 2/100), loss = 847.318970, sent_loss = 0.494855, dep_path_loss = 5.670879 (0.376 sec/batch), lr: 0.000300
02:02:16: step 480/35400 (epoch 2/100), loss = 666.997864, sent_loss = 0.525844, dep_path_loss = 3.012007 (0.319 sec/batch), lr: 0.000300
02:02:22: step 500/35400 (epoch 2/

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


epoch 2: train_loss = 29223.149057, train_sent_loss = 23.580261, train_dep_path_loss = 184.560276, dev_loss = 25951.950294, dev_f1 = 0.2754
model saved to ./saved_models/1/checkpoint_epoch_2.pt
Training ended with 2 epochs.


  terms_out = self.pool(F.softmax(outputs), terms.unsqueeze(2).byte(), type=pool_type)
  defs_out = self.pool(F.softmax(outputs), defs.unsqueeze(2).byte(), type=pool_type)


02:04:18: step 720/35400 (epoch 3/100), loss = 392.521881, sent_loss = 0.392996, dep_path_loss = 1.604202 (1.104 sec/batch), lr: 0.000300
02:04:34: step 740/35400 (epoch 3/100), loss = 490.627441, sent_loss = 0.515500, dep_path_loss = 3.180596 (0.787 sec/batch), lr: 0.000300
02:04:51: step 760/35400 (epoch 3/100), loss = 588.346375, sent_loss = 0.363829, dep_path_loss = 3.823520 (0.839 sec/batch), lr: 0.000300
02:05:02: step 780/35400 (epoch 3/100), loss = 318.281494, sent_loss = 0.347403, dep_path_loss = 2.923869 (0.309 sec/batch), lr: 0.000300
02:05:15: step 800/35400 (epoch 3/100), loss = 631.860596, sent_loss = 0.446777, dep_path_loss = 4.156435 (0.296 sec/batch), lr: 0.000300
02:05:24: step 820/35400 (epoch 3/100), loss = 647.165649, sent_loss = 0.528728, dep_path_loss = 2.137358 (0.342 sec/batch), lr: 0.000300
