# BERT
微调将最后一层的第一个token即[CLS]的隐藏向量作为句子的表示，然后输入到softmax层进行分类。

In [10]:
!pip install transformers

Collecting transformers
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
Collecting tokenizers==0.8.1.rc1
  Downloading tokenizers-0.8.1rc1-cp38-cp38-win_amd64.whl (1.9 MB)
Collecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.91-cp38-cp38-win_amd64.whl (1.2 MB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py): started
  Building wheel for sacremoses (setup.py): finished with status 'done'
  Created wheel for sacremoses: filename=sacremoses-0.0.43-py3-none-any.whl size=893262 sha256=5df2728df59b0177f9da0fa538f18f507b7a40513205bc56cb02fdecebd4adc7
  Stored in directory: c:\users\learn\appdata\local\pip\cache\wheels\7b\78\f4\27d43a65043e1b75dbddaa421b573eddc67e712be4b1c80677
Successfully built sacremoses
Installing collected packages: sacremoses, tokenizers, sentencepiece, transformers
Successfully installed sacremoses-0.0.43 sentencepiece-0.1.91 tokeniz

In [49]:
import logging
import random

import numpy as np
import torch

# Root logger: timestamp, level name, then the message.
logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s', level=logging.INFO)

# Seed every random number generator used in this notebook so that the
# shuffles and weight initialisations below are repeatable.
seed = 666
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)

# Pick the compute device: the GPU with index `gpu` when CUDA is available
# (a negative `gpu` forces the CPU), otherwise the CPU.
gpu = 0
use_cuda = gpu >= 0 and torch.cuda.is_available()
device = torch.device("cuda", gpu) if use_cuda else torch.device("cpu")
if use_cuda:
    torch.cuda.set_device(gpu)
logging.info("Use cuda: %s, gpu id: %d.", use_cuda, gpu)

2020-08-05 22:17:33,423 INFO: Use cuda: True, gpu id: 0.


In [50]:
# split data to 10 fold
fold_num = 10
data_file = '../input/train_set.csv'
import pandas as pd

def all_data2fold(fold_num, num=200000, data_path=None):  # 10000->200000
    """Split the labelled training set into ``fold_num`` equally sized folds.

    The rows are shuffled, grouped by label, and each label's rows are dealt
    out to the folds in contiguous, non-overlapping slices so that every fold
    receives roughly the same number of rows per label.  Folds that end up
    larger than ``total // fold_num`` hand their surplus to the folds that
    end up smaller, and each fold is shuffled once more before being returned.

    Args:
        fold_num: Number of folds to create.
        num: Only the first ``num`` rows of the file are used.
        data_path: Tab-separated file with ``text`` and ``label`` columns.
            When ``None`` the module-level ``data_file`` is read.

    Returns:
        A list of ``fold_num`` dicts ``{'label': [...], 'text': [...]}``, each
        holding exactly ``total // fold_num`` rows.
    """
    path = data_file if data_path is None else data_path
    frame = pd.read_csv(path, sep='\t', encoding='UTF-8')
    texts = frame['text'].tolist()[:num]
    labels = frame['label'].tolist()[:num]

    total = len(labels)

    # Shuffle once so that the per-label slices below are random samples.
    order = list(range(total))
    np.random.shuffle(order)
    all_texts = [texts[i] for i in order]
    all_labels = [labels[i] for i in order]

    # Positions (into the shuffled lists) of every row, grouped by label.
    label2id = {}
    for pos, label in enumerate(all_labels):
        label2id.setdefault(str(label), []).append(pos)

    all_index = [[] for _ in range(fold_num)]
    for members in label2id.values():
        base, extra = divmod(len(members), fold_num)
        # A running offset keeps the slices disjoint.  Indexing from
        # ``fold * base`` while the first ``extra`` folds take ``base + 1``
        # rows would make neighbouring folds share rows and drop the tail.
        start = 0
        for fold in range(fold_num):
            size = base + 1 if fold < extra else base
            all_index[fold].extend(members[start:start + size])
            start += size

    batch_size = total // fold_num
    spare_texts = []
    spare_labels = []
    spare_start = 0
    fold_data = []
    for fold in range(fold_num):
        fold_texts = [all_texts[i] for i in all_index[fold]]
        fold_labels = [all_labels[i] for i in all_index[fold]]
        size = len(fold_labels)

        if size > batch_size:
            # Oversized fold: park the surplus for the smaller folds.  Fold
            # sizes never increase with the fold index, so the surplus is
            # always collected before it is needed.
            spare_texts.extend(fold_texts[batch_size:])
            spare_labels.extend(fold_labels[batch_size:])
            fold_texts = fold_texts[:batch_size]
            fold_labels = fold_labels[:batch_size]
        elif size < batch_size:
            # Undersized fold: top it up from the parked surplus.
            spare_end = spare_start + batch_size - size
            fold_texts = fold_texts + spare_texts[spare_start:spare_end]
            fold_labels = fold_labels + spare_labels[spare_start:spare_end]
            spare_start = spare_end

        assert batch_size == len(fold_labels)

        # Shuffle inside the fold so rows are no longer grouped by label.
        order = list(range(batch_size))
        np.random.shuffle(order)
        fold_data.append({
            'label': [fold_labels[i] for i in order],
            'text': [fold_texts[i] for i in order],
        })

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))

    return fold_data


fold_data = all_data2fold(10)

2020-08-05 22:17:43,921 INFO: Fold lens [20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000]


In [51]:
# build train, dev, test data
fold_id = 9

# dev: the held-out fold
dev_data = fold_data[fold_id]

# train: every fold except the held-out one, so that no fold is silently
# dropped when fold_id is not the last index
train_texts = []
train_labels = []
for i, data in enumerate(fold_data):
    if i == fold_id:
        continue
    train_texts.extend(data['text'])
    train_labels.extend(data['label'])

train_data = {'label': train_labels, 'text': train_texts}

# test: the file carries no labels, so every row gets the placeholder label 0
test_data_file = '../input/test_a.csv'
f = pd.read_csv(test_data_file, sep='\t', encoding='UTF-8')
texts = f['text'].tolist()
test_data = {'label': [0] * len(texts), 'text': texts}

In [52]:
# build vocab
from collections import Counter
from transformers import BasicTokenizer

basic_tokenizer = BasicTokenizer()


class Vocab():
    """Word and label vocabulary built from a training split.

    Words are whitespace-separated tokens of ``train_data['text']``; a word is
    kept when it occurs at least ``min_count`` times.  Ids 0 and 1 are reserved
    for ``[PAD]`` and ``[UNK]`` in both the word table and the table of
    pretrained ("ext") words.
    """

    def __init__(self, train_data):
        """Build the vocabulary from a dict with ``'text'`` and ``'label'`` lists."""
        self.min_count = 5
        self.pad = 0
        self.unk = 1
        self._id2word = ['[PAD]', '[UNK]']
        self._id2extword = ['[PAD]', '[UNK]']

        self._id2label = []
        self.target_names = []

        self.build_vocab(train_data)

        self._word2id = self._reverse(self._id2word)
        self._label2id = self._reverse(self._id2label)
        # Created here (not only in load_pretrained_embs) so that extword2id()
        # is usable before any embedding file has been loaded.
        self._extword2id = self._reverse(self._id2extword)

        logging.info("Build vocab: words %d, labels %d." % (self.word_size, self.label_size))

    @staticmethod
    def _reverse(items):
        """Return a dict mapping each item of ``items`` to its position."""
        return dict(zip(items, range(len(items))))

    def build_vocab(self, data):
        """Count words and labels in ``data`` and fill the id tables."""
        self.word_counter = Counter()

        for text in data['text']:
            self.word_counter.update(text.split())

        # most_common() yields words by decreasing frequency, so frequent
        # words receive the smallest ids.
        for word, count in self.word_counter.most_common():
            if count >= self.min_count:
                self._id2word.append(word)

        label2name = {0: '科技', 1: '股票', 2: '体育', 3: '娱乐', 4: '时政', 5: '社会', 6: '教育', 7: '财经',
                      8: '家居', 9: '游戏', 10: '房产', 11: '时尚', 12: '彩票', 13: '星座'}

        self.label_counter = Counter(data['label'])

        # Label ids are taken to be 0 .. n-1, where n is the number of
        # distinct labels seen; each needs an entry in label2name.
        for label in range(len(self.label_counter)):
            self._id2label.append(label)
            self.target_names.append(label2name[label])

    def load_pretrained_embs(self, embfile):
        """Load word vectors from a text embedding file.

        The first line holds ``<word_count> <embedding_dim>``; every following
        non-empty line holds a word followed by its vector components.

        Returns:
            A ``(word_count + 2, embedding_dim)`` float64 array.  Row ``unk``
            is the mean of all loaded vectors, row ``pad`` is zero, and the
            whole array is divided by its standard deviation.
        """
        with open(embfile, encoding='utf-8') as f:
            header = f.readline().split()
            word_count, embedding_dim = int(header[0]), int(header[1])

            index = len(self._id2extword)
            embeddings = np.zeros((word_count + index, embedding_dim))
            for line in f:
                values = line.split()
                if not values:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                self._id2extword.append(values[0])
                vector = np.array(values[1:], dtype='float64')
                embeddings[self.unk] += vector  # running sum for the mean
                embeddings[index] = vector
                index += 1

        embeddings[self.unk] = embeddings[self.unk] / word_count
        embeddings = embeddings / np.std(embeddings)

        self._extword2id = self._reverse(self._id2extword)

        # A repeated word in the file would make the id mapping ambiguous.
        assert len(set(self._id2extword)) == len(self._id2extword)

        return embeddings

    def word2id(self, xs):
        """Map a word, or a list of words, to ids; unseen words map to ``unk``."""
        if isinstance(xs, list):
            return [self._word2id.get(x, self.unk) for x in xs]
        return self._word2id.get(xs, self.unk)

    def extword2id(self, xs):
        """Map a pretrained word, or a list of them, to ids; unseen map to ``unk``."""
        if isinstance(xs, list):
            return [self._extword2id.get(x, self.unk) for x in xs]
        return self._extword2id.get(xs, self.unk)

    def label2id(self, xs):
        """Map a label, or a list of labels, to ids.

        Labels missing from the table fall back to ``self.unk`` (1).
        """
        if isinstance(xs, list):
            return [self._label2id.get(x, self.unk) for x in xs]
        return self._label2id.get(xs, self.unk)

    @property
    def word_size(self):
        """Number of entries in the word table, including ``[PAD]``/``[UNK]``."""
        return len(self._id2word)

    @property
    def extword_size(self):
        """Number of entries in the pretrained-word table."""
        return len(self._id2extword)

    @property
    def label_size(self):
        """Number of distinct labels."""
        return len(self._id2label)


vocab = Vocab(train_data)

2020-08-05 22:18:36,918 INFO: Build vocab: words 5978, labels 14.


In [53]:
# build module
import torch.nn as nn
import torch.nn.functional as F


class Attention(nn.Module):
    """Pool a sequence of hidden states into one vector with learned weights.

    Each hidden state is projected with a learned matrix and bias, scored by
    its dot product with a learned query vector, and the softmax of those
    scores weights the sum of the projected states.
    """

    def __init__(self, hidden_size):
        super().__init__()
        projection = torch.empty(hidden_size, hidden_size)
        projection.normal_(mean=0.0, std=0.05)
        self.weight = nn.Parameter(projection)

        self.bias = nn.Parameter(torch.zeros(hidden_size))

        query = torch.empty(hidden_size)
        query.normal_(mean=0.0, std=0.05)
        self.query = nn.Parameter(query)

    def forward(self, batch_hidden, batch_masks):
        """Return ``(pooled, attn_scores)``.

        batch_hidden: b x len x hidden_size
        batch_masks:  b x len, 1 for real positions and 0 for padding
        pooled:       b x hidden_size
        attn_scores:  b x len, the softmax output before padding is zeroed
        """
        # Linear projection of every position: b x len x hidden
        projected = batch_hidden.matmul(self.weight) + self.bias

        # One raw score per position: b x len
        raw_scores = projected.matmul(self.query)

        padding = (1 - batch_masks).bool()
        attn_scores = F.softmax(raw_scores.masked_fill(padding, float(-1e32)), dim=1)

        # For a fully masked row, -1e32 yields a uniform 1/len distribution
        # (-inf would yield NaN), so the padded positions are zeroed again
        # before the weighted sum.
        pooling_weights = attn_scores.masked_fill(padding, 0.0)

        # Weighted sum of the projected states: b x hidden
        pooled = torch.bmm(pooling_weights.unsqueeze(1), projected).squeeze(1)

        return pooled, attn_scores


# build word encoder
bert_path = '../emb/bert-mini/'
dropout = 0.5 #0.15->0.5

from transformers import BertModel


class WordBertEncoder(nn.Module):
    """Encode each sentence (a row of token ids) into one vector with BERT."""

    def __init__(self):
        super(WordBertEncoder, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.tokenizer = WhitespaceTokenizer()
        self.bert = BertModel.from_pretrained(bert_path)

        # False: use the hidden state of the first token; True: use BERT's
        # pooled output.
        self.pooled = False
        logging.info('Build Bert encoder with pooled {}.'.format(self.pooled))

    def encode(self, tokens):
        """Convert a list of word strings to BERT input ids via the tokenizer."""
        tokens = self.tokenizer.tokenize(tokens)
        return tokens

    def get_bert_parameters(self):
        """Return two optimizer parameter groups for the BERT weights.

        Parameters whose name contains ``bias`` or ``LayerNorm.weight`` get
        no weight decay; all others get a weight decay of 0.01.
        """
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in self.bert.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in self.bert.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        return optimizer_parameters

    # Fine-tuning uses the last layer's hidden vector of the first token
    # ([CLS]) as the sentence representation, which is then fed to the
    # softmax layer for classification.
    def forward(self, input_ids, token_type_ids, attention_mask=None):
        """Return one representation per sentence.

        input_ids:      sen_num x bert_len
        token_type_ids: sen_num x bert_len
        attention_mask: optional sen_num x bert_len mask forwarded to BERT.
            When omitted (the default) no mask is passed on.
            NOTE(review): callers in this file pad input_ids with 0 and pass
            no mask — confirm whether padding positions should be masked.
        returns:        sen_num x 256
        """
        outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids,
                            attention_mask=attention_mask)
        # Index instead of tuple-unpacking: this works both when the model
        # returns a plain tuple and when it returns an output object that
        # supports integer indexing.
        # sen_num x bert_len x 256, sen_num x 256
        sequence_output, pooled_output = outputs[0], outputs[1]

        if self.pooled:
            reps = pooled_output
        else:
            reps = sequence_output[:, 0, :]  # sen_num x 256

        if self.training:
            reps = self.dropout(reps)

        return reps


class WhitespaceTokenizer():
    """Map already-split tokens to ids using a one-token-per-line vocab file."""

    def __init__(self, vocab_file=None):
        """Load the vocabulary.

        Args:
            vocab_file: Path of the vocab file.  When ``None`` the file
                ``vocab.txt`` under the module-level ``bert_path`` is used.
        """
        if vocab_file is None:
            vocab_file = bert_path + 'vocab.txt'
        self._token2id = self.load_vocab(vocab_file)
        self._id2token = {v: k for k, v in self._token2id.items()}
        self.max_len = 256
        # Use the real position of [UNK] in the vocab file; fall back to 1
        # when the file has no such entry.
        self.unk = self._token2id.get('[UNK]', 1)

        logging.info("Build Bert vocab with size %d." % (self.vocab_size))

    def load_vocab(self, vocab_file):
        """Read ``vocab_file`` and return ``{token: line_index}``."""
        # The context manager closes the handle; the explicit encoding keeps
        # the result independent of the platform's default encoding.
        with open(vocab_file, 'r', encoding='utf-8') as f:
            lines = [line.strip() for line in f]
        vocab = dict(zip(lines, range(len(lines))))
        return vocab

    def tokenize(self, tokens):
        """Wrap ``tokens`` in ``[CLS]`` / ``[SEP]`` and return their ids.

        ``tokens`` must be a list of at most ``max_len - 2`` strings.
        """
        assert len(tokens) <= self.max_len - 2
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        output_tokens = self.token2id(tokens)
        return output_tokens

    def token2id(self, xs):
        """Map a token, or a list of tokens, to ids; unknown tokens map to ``unk``."""
        if isinstance(xs, list):
            return [self._token2id.get(x, self.unk) for x in xs]
        return self._token2id.get(xs, self.unk)

    @property
    def vocab_size(self):
        """Number of distinct tokens in the vocabulary."""
        return len(self._id2token)


# build sent encoder
sent_hidden_size = 256
sent_num_layers = 2


class SentEncoder(nn.Module):
    """Bidirectional LSTM over the sentence vectors of each document."""

    def __init__(self, sent_rep_size):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        # Layer count and hidden width come from the module-level settings.
        lstm_options = dict(
            input_size=sent_rep_size,
            hidden_size=sent_hidden_size,
            num_layers=sent_num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.sent_lstm = nn.LSTM(**lstm_options)

    def forward(self, sent_reps, sent_masks):
        """Encode the sentences of each document.

        sent_reps:  b x doc_len x sent_rep_size
        sent_masks: b x doc_len
        returns:    b x doc_len x hidden*2, multiplied by the mask so that
                    positions with mask 0 are zeroed
        """
        lstm_out, _ = self.sent_lstm(sent_reps)  # b x doc_len x hidden*2
        masked_out = lstm_out * sent_masks.unsqueeze(2)

        # Dropout is applied only while the module is in training mode.
        return self.dropout(masked_out) if self.training else masked_out

In [54]:
# build model
class Model(nn.Module):
    """Document classifier: BERT per sentence, LSTM + attention per document."""

    def __init__(self, vocab):
        super().__init__()
        self.sent_rep_size = 256
        self.doc_rep_size = sent_hidden_size * 2
        self.all_parameters = {}

        # Sub-modules are created in a fixed order: encoder, LSTM, attention,
        # output layer.
        self.word_encoder = WordBertEncoder()
        bert_parameters = self.word_encoder.get_bert_parameters()
        self.sent_encoder = SentEncoder(self.sent_rep_size)
        self.sent_attention = Attention(self.doc_rep_size)
        self.out = nn.Linear(self.doc_rep_size, vocab.label_size, bias=True)

        # Everything outside BERT that is trainable goes into one flat list.
        basic_parameters = []
        for module in (self.sent_encoder, self.sent_attention, self.out):
            basic_parameters.extend(p for p in module.parameters() if p.requires_grad)

        if use_cuda:
            self.to(device)

        # The two keys are what the Optimizer class dispatches on.
        if basic_parameters:
            self.all_parameters["basic_parameters"] = basic_parameters
        self.all_parameters["bert_parameters"] = bert_parameters

        logging.info('Build model with bert word encoder, lstm sent encoder.')

        para_num = sum(p.numel() for p in self.parameters())
        logging.info('Model param num: %.2f M.' % (para_num / 1e6))

    def forward(self, batch_inputs):
        """Return class logits, b x num_labels.

        batch_inputs is a triple (input ids, token type ids, token masks),
        each of shape b x doc_len x sent_len.
        """
        input_ids, type_ids, token_masks = batch_inputs
        batch_size, max_doc_len, max_sent_len = input_ids.shape[:3]
        flat_shape = (batch_size * max_doc_len, max_sent_len)  # sen_num x sent_len

        # Every sentence of every document is encoded independently.
        sent_reps = self.word_encoder(input_ids.view(flat_shape), type_ids.view(flat_shape))
        sent_reps = sent_reps.view(batch_size, max_doc_len, self.sent_rep_size)  # b x doc_len x sent_rep_size

        # A sentence counts as present when it has at least one unmasked token.
        token_masks = token_masks.view(batch_size, max_doc_len, max_sent_len)
        sent_masks = token_masks.bool().any(2).float()  # b x doc_len

        sent_hiddens = self.sent_encoder(sent_reps, sent_masks)  # b x doc_len x doc_rep_size
        doc_reps, _ = self.sent_attention(sent_hiddens, sent_masks)  # b x doc_rep_size

        return self.out(doc_reps)  # b x num_labels
    
model = Model(vocab)

2020-08-05 22:18:36,962 INFO: Build Bert vocab with size 5981.
2020-08-05 22:18:36,963 INFO: loading configuration file ../emb/bert-mini/config.json
2020-08-05 22:18:36,964 INFO: Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 256,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 5981
}

2020-08-05 22:18:36,966 INFO: loading weights file ../emb/bert-mini/pytorch_model.bin
2020-08-05 22:18:37,044 INFO: All model checkpoint weights were used when initializing BertModel.

2020-08-05 22:18:37,046 INFO: All the weights of BertModel were initialized from the model checkpoint at ../emb/bert-mini/.
If your task is similar to the task the model of the ckeckpoint was tr

In [55]:
# build optimizer
learning_rate = 2e-4
bert_lr = 5e-5
decay = .75
decay_step = 1000
from transformers import AdamW, get_linear_schedule_with_warmup


class Optimizer:
    """Bundle one optimizer and one LR scheduler per parameter group name.

    ``model_parameters`` maps a name to parameters:

    * names starting with ``"basic"`` hold a flat list of tensors and get
      ``torch.optim.Adam`` with a step-wise exponential decay
      (``decay ** (step // decay_step)``);
    * names starting with ``"bert"`` hold a list of parameter-group dicts
      and get ``AdamW`` with a linear schedule over ``steps`` steps and no
      warm-up.

    Raises:
        ValueError: if a name starts with neither prefix.
    """

    def __init__(self, model_parameters, steps):
        self.all_params = []
        self.optims = []
        self.schedulers = []

        for name, parameters in model_parameters.items():
            if name.startswith("basic"):
                optim = torch.optim.Adam(parameters, lr=learning_rate)
                scheduler = torch.optim.lr_scheduler.LambdaLR(
                    optim, lr_lambda=lambda step: decay ** (step // decay_step))
                self.all_params.extend(parameters)
            elif name.startswith("bert"):
                optim = AdamW(parameters, bert_lr, eps=1e-8)
                scheduler = get_linear_schedule_with_warmup(optim, 0, steps)
                # Flatten the parameter groups for gradient clipping.
                for group in parameters:
                    self.all_params.extend(group['params'])
            else:
                # An unknown group would otherwise be left out of training
                # without any notice.
                raise ValueError("no named parameters: %r" % (name,))

            self.optims.append(optim)
            self.schedulers.append(scheduler)

        self.num = len(self.optims)

    def step(self):
        """Apply one update with every optimizer, advance its scheduler, clear grads."""
        for optim, scheduler in zip(self.optims, self.schedulers):
            optim.step()
            scheduler.step()
            optim.zero_grad()

    def zero_grad(self):
        """Clear the gradients held by every optimizer."""
        for optim in self.optims:
            optim.zero_grad()

    def get_lr(self):
        """Return the current learning rates as a string like ``' 0.00020 0.00005'``."""
        lrs = tuple(self._last_lr(scheduler) for scheduler in self.schedulers)
        return (' %.5f' * self.num) % lrs

    @staticmethod
    def _last_lr(scheduler):
        """Most recent LR of the scheduler's last parameter group."""
        # get_last_lr() is the accessor meant for use outside step(); fall
        # back to get_lr() on schedulers that do not provide it.
        reader = getattr(scheduler, 'get_last_lr', None)
        if reader is None:
            reader = scheduler.get_lr
        return reader()[-1]

In [56]:
# build dataset
def sentence_split(text, vocab, max_sent_len=256, max_segment=16):
    """Cut a whitespace-tokenised document into fixed-length segments.

    Args:
        text: Document as a string of whitespace-separated tokens; it must
            contain at least one token.
        vocab: Object exposing the known words through ``_word2id`` (a dict)
            or, failing that, ``_id2word`` (a list).
        max_sent_len: Maximum number of tokens per segment.
        max_segment: Maximum number of segments kept.  Longer documents keep
            their first and last ``max_segment // 2`` segments; a value
            below 2 keeps only the first segment.

    Returns:
        A list of ``[segment_length, tokens]`` pairs, where tokens that are
        not in the vocabulary are replaced by ``'<UNK>'``.
    """
    words = text.strip().split()

    # Hash lookup instead of scanning the id2word list for every token.
    known = getattr(vocab, '_word2id', None)
    if known is None:
        known = set(vocab._id2word)

    segments = []
    for start in range(0, len(words), max_sent_len):
        segment = [word if word in known else '<UNK>'
                   for word in words[start:start + max_sent_len]]
        segments.append([len(segment), segment])

    assert len(segments) > 0
    if len(segments) <= max_segment:
        return segments

    half = int(max_segment / 2)
    if half == 0:
        # segments[-0:] would be the whole list, so handle this case apart.
        return segments[:1]
    return segments[:half] + segments[-half:]


def get_examples(data, word_encoder, vocab, max_sent_len=256, max_segment=8):
    """Turn raw documents into model-ready examples.

    Each document is split with ``sentence_split`` (two positions per
    segment are reserved, hence ``max_sent_len - 2``) and every segment is
    converted to token ids by ``word_encoder.encode``.

    Returns:
        A list of ``[label_id, segment_count, segments]`` where every segment
        is ``[token_count, token_ids, token_type_ids]`` and the token type
        ids are all 0.
    """
    to_label_id = vocab.label2id
    examples = []

    for text, label in zip(data['text'], data['label']):
        label_id = to_label_id(label)

        doc = []
        for _, sent_words in sentence_split(text, vocab, max_sent_len - 2, max_segment):
            token_ids = word_encoder.encode(sent_words)
            # Length after encoding, not the word count of the raw segment.
            token_count = len(token_ids)
            doc.append([token_count, token_ids, [0] * token_count])

        examples.append([label_id, len(doc), doc])

    logging.info('Total %d docs.' % len(examples))
    return examples

In [57]:
# build loader

def batch_slice(data, batch_size):
    """Yield consecutive lists of at most ``batch_size`` items from ``data``.

    Every batch except possibly the last holds exactly ``batch_size`` items;
    nothing is yielded for empty ``data``.
    """
    total = len(data)
    for first in range(0, total, batch_size):
        last = min(first + batch_size, total)
        yield [data[pos] for pos in range(first, last)]


def data_iter(data, batch_size, shuffle=True, noise=1.0):
    """Yield batches of examples.

    With ``shuffle`` the examples are permuted in place, ordered by
    decreasing length (``example[1]``) with a random jitter of up to
    ``noise`` so that documents of similar length share a batch, and the
    resulting batches are yielded in random order.  Without ``shuffle`` the
    examples are batched in their given order.
    """
    if shuffle:
        np.random.shuffle(data)

        # Negated so that argsort puts the longest documents first.
        jittered = [-(example[1] + np.random.uniform(-noise, noise)) for example in data]
        ordered = [data[pos] for pos in np.argsort(jittered).tolist()]
    else:
        ordered = data

    batches = list(batch_slice(ordered, batch_size))

    if shuffle:
        np.random.shuffle(batches)

    yield from batches

In [58]:
# some function
from sklearn.metrics import f1_score, precision_score, recall_score


def get_score(y_ture, y_pred):
    """Compute macro-averaged precision, recall and F1 as percentages.

    Returns:
        ``(text, f1)`` where ``text`` is the string form of the tuple
        ``(precision, recall, f1)`` and every value is rounded to two
        decimals.
    """
    y_ture = np.array(y_ture)
    y_pred = np.array(y_pred)

    precision, recall, f1 = (
        reformat(metric(y_ture, y_pred, average='macro') * 100, 2)
        for metric in (precision_score, recall_score, f1_score)
    )

    return str((precision, recall, f1)), f1


def reformat(num, n):
    """Return ``num`` as a float rounded to ``n`` decimals via fixed-point formatting."""
    return float('{:0.{digits}f}'.format(num, digits=n))

In [59]:
# build trainer

import time
from sklearn.metrics import classification_report

clip = 5.0
epochs = 1
early_stops = 3
log_interval = 50

test_batch_size = 16
train_batch_size = 16

save_model = '../output/bert.bin'
save_test = '../output/bert.csv'
save_pred = '../output/SubmitBert.csv'

class Trainer():
    """Train ``model``, evaluate it on the dev split and predict the test split.

    The raw splits (``train_data``, ``dev_data``, ``test_data``) and the run
    settings (``epochs``, batch sizes, ``clip``, output paths, ...) are read
    from module-level names.
    """

    def __init__(self, model, vocab):
        self.model = model
        self.report = True

        # Convert the raw splits into [label_id, doc_len, sentences] examples.
        self.train_data = get_examples(train_data, model.word_encoder, vocab)
        self.batch_num = int(np.ceil(len(self.train_data) / float(train_batch_size)))
        self.dev_data = get_examples(dev_data, model.word_encoder, vocab)
        self.test_data = get_examples(test_data, model.word_encoder, vocab)

        # criterion
        self.criterion = nn.CrossEntropyLoss()

        # label name
        self.target_names = vocab.target_names

        # optimizer; the schedule length is fixed here from the current
        # value of `epochs`
        self.optimizer = Optimizer(model.all_parameters, steps=self.batch_num * epochs)

        # count
        self.step = 0
        self.early_stop = -1
        self.best_train_f1, self.best_dev_f1 = 0, 0
        self.last_epoch = epochs

    def train(self):
        """Run up to ``epochs`` epochs, saving the model whenever dev F1 does not drop.

        Training stops early after ``early_stops`` consecutive epochs without
        reaching the best dev F1.
        """
        logging.info('Start training...')
        for epoch in range(1, epochs + 1):
            train_f1 = self._train(epoch)

            dev_f1 = self._eval(epoch)

            if self.best_dev_f1 <= dev_f1:
                logging.info(
                    "Exceed history dev = %.2f, current dev = %.2f" % (self.best_dev_f1, dev_f1))
                torch.save(self.model.state_dict(), save_model)

                self.best_train_f1 = train_f1
                self.best_dev_f1 = dev_f1
                self.early_stop = 0
            else:
                self.early_stop += 1
                if self.early_stop == early_stops:
                    logging.info(
                        "Eearly stop in epoch %d, best train: %.2f, dev: %.2f" % (
                            epoch - early_stops, self.best_train_f1, self.best_dev_f1))
                    self.last_epoch = epoch
                    break

    def test(self):
        """Reload the saved weights and write test predictions to ``save_test``."""
        # map_location lets a checkpoint saved on one device be restored on
        # the device selected for this run.
        self.model.load_state_dict(torch.load(save_model, map_location=device))
        self._eval(self.last_epoch + 1, test=True)

    def _train(self, epoch):
        """Train for one epoch and return the macro F1 over the training batches."""
        self.optimizer.zero_grad()
        self.model.train()

        start_time = time.time()
        epoch_start_time = time.time()
        overall_losses = 0
        losses = 0
        batch_idx = 1
        y_pred = []
        y_true = []
        for batch_data in data_iter(self.train_data, train_batch_size, shuffle=True):
            torch.cuda.empty_cache()
            batch_inputs, batch_labels = self.batch2tensor(batch_data)
            batch_outputs = self.model(batch_inputs)
            loss = self.criterion(batch_outputs, batch_labels)
            loss.backward()

            loss_value = loss.detach().cpu().item()
            losses += loss_value
            overall_losses += loss_value

            y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
            y_true.extend(batch_labels.cpu().numpy().tolist())

            # Clip across all parameter groups, then let the optimizer
            # wrapper step every optimizer and scheduler and clear the grads.
            nn.utils.clip_grad_norm_(self.optimizer.all_params, max_norm=clip)
            self.optimizer.step()

            self.step += 1

            if batch_idx % log_interval == 0:
                elapsed = time.time() - start_time

                lrs = self.optimizer.get_lr()
                logging.info(
                    '| epoch {:3d} | step {:3d} | batch {:3d}/{:3d} | lr{} | loss {:.4f} | s/batch {:.2f}'.format(
                        epoch, self.step, batch_idx, self.batch_num, lrs,
                        losses / log_interval,
                        elapsed / log_interval))

                # Restart the windowed loss and timer for the next interval.
                losses = 0
                start_time = time.time()

            batch_idx += 1

        overall_losses /= self.batch_num
        during_time = time.time() - epoch_start_time

        # reformat
        overall_losses = reformat(overall_losses, 4)
        score, f1 = get_score(y_true, y_pred)

        logging.info(
            '| epoch {:3d} | score {} | f1 {} | loss {:.4f} | time {:.2f}'.format(epoch, score, f1,
                                                                                  overall_losses,
                                                                                  during_time))
        # The per-class report is only produced when the predicted label set
        # equals the true label set.
        if set(y_true) == set(y_pred) and self.report:
            report = classification_report(y_true, y_pred, digits=4, target_names=self.target_names)
            logging.info('\n' + report)

        return f1

    def _eval(self, epoch, test=False):
        """Evaluate on the dev split, or predict the test split when ``test`` is set.

        In test mode the predicted labels are written to ``save_test``.
        Returns the macro F1 against the labels stored with the examples.
        """
        self.model.eval()
        start_time = time.time()
        data = self.test_data if test else self.dev_data
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = self.batch2tensor(batch_data)
                batch_outputs = self.model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

            score, f1 = get_score(y_true, y_pred)

            during_time = time.time() - start_time

            if test:
                df = pd.DataFrame({'label': y_pred})
                df.to_csv(save_test, index=False, sep=',')
            else:
                logging.info(
                    '| epoch {:3d} | dev | score {} | f1 {} | time {:.2f}'.format(epoch, score, f1,
                                                                              during_time))
                if set(y_true) == set(y_pred) and self.report:
                    report = classification_report(y_true, y_pred, digits=4, target_names=self.target_names)
                    logging.info('\n' + report)

        return f1

    def batch2tensor(self, batch_data):
        '''
        Pad a batch of examples into dense tensors.

        batch_data: [[label, doc_len, [[sent_len, [sent_id0, ...], [sent_id1, ...]], ...]]

        Returns ((input_ids, token_type_ids, masks), labels).  The first three
        have shape batch x max_doc_len x max_sent_len; padding positions are 0
        in all of them and real positions are 1 in masks.
        '''
        batch_size = len(batch_data)
        doc_labels = []
        doc_lens = []
        doc_max_sent_len = []
        for doc_data in batch_data:
            doc_labels.append(doc_data[0])
            doc_lens.append(doc_data[1])
            doc_max_sent_len.append(max(sent_data[0] for sent_data in doc_data[2]))

        max_doc_len = max(doc_lens)
        max_sent_len = max(doc_max_sent_len)

        batch_inputs1 = torch.zeros((batch_size, max_doc_len, max_sent_len), dtype=torch.int64)
        batch_inputs2 = torch.zeros((batch_size, max_doc_len, max_sent_len), dtype=torch.int64)
        batch_masks = torch.zeros((batch_size, max_doc_len, max_sent_len), dtype=torch.float32)
        batch_labels = torch.LongTensor(doc_labels)

        for b in range(batch_size):
            for sent_idx in range(doc_lens[b]):
                sent_data = batch_data[b][2][sent_idx]
                sent_len = sent_data[0]
                # One slice assignment per sentence instead of one scalar
                # tensor write per token.
                batch_inputs1[b, sent_idx, :sent_len] = torch.tensor(
                    sent_data[1][:sent_len], dtype=torch.int64)
                batch_inputs2[b, sent_idx, :sent_len] = torch.tensor(
                    sent_data[2][:sent_len], dtype=torch.int64)
                batch_masks[b, sent_idx, :sent_len] = 1

        if use_cuda:
            batch_inputs1 = batch_inputs1.to(device)
            batch_inputs2 = batch_inputs2.to(device)
            batch_masks = batch_masks.to(device)
            batch_labels = batch_labels.to(device)

        return (batch_inputs1, batch_inputs2, batch_masks), batch_labels

In [22]:
# train
trainer = Trainer(model, vocab)

2020-08-05 20:25:29,202 INFO: Total 9000 docs.
2020-08-05 20:25:33,808 INFO: Total 1000 docs.
2020-08-05 20:29:35,751 INFO: Total 50000 docs.


In [23]:
trainer.train()

2020-08-05 20:29:35,756 INFO: Start training...
2020-08-05 20:30:10,666 INFO: | epoch   1 | step  50 | batch  50/563 | lr 0.00020 0.00005 | loss 2.0914 | s/batch 0.70
2020-08-05 20:30:43,995 INFO: | epoch   1 | step 100 | batch 100/563 | lr 0.00020 0.00004 | loss 1.2988 | s/batch 0.67
2020-08-05 20:31:15,396 INFO: | epoch   1 | step 150 | batch 150/563 | lr 0.00020 0.00004 | loss 0.8630 | s/batch 0.63
2020-08-05 20:31:44,046 INFO: | epoch   1 | step 200 | batch 200/563 | lr 0.00020 0.00003 | loss 0.8021 | s/batch 0.57
2020-08-05 20:32:16,674 INFO: | epoch   1 | step 250 | batch 250/563 | lr 0.00020 0.00003 | loss 0.7161 | s/batch 0.65
2020-08-05 20:32:47,482 INFO: | epoch   1 | step 300 | batch 300/563 | lr 0.00020 0.00002 | loss 0.6842 | s/batch 0.62
2020-08-05 20:33:21,307 INFO: | epoch   1 | step 350 | batch 350/563 | lr 0.00020 0.00002 | loss 0.4465 | s/batch 0.68
2020-08-05 20:33:54,678 INFO: | epoch   1 | step 400 | batch 400/563 | lr 0.00020 0.00001 | loss 0.6726 | s/batch 0.67


In [24]:
epochs = 2
test_batch_size = 16
train_batch_size = 16

trainer.train()

2020-08-05 20:37:35,047 INFO: Start training...
2020-08-05 20:38:04,320 INFO: | epoch   1 | step 613 | batch  50/563 | lr 0.00020 0.00000 | loss 0.4142 | s/batch 0.59
2020-08-05 20:38:36,416 INFO: | epoch   1 | step 663 | batch 100/563 | lr 0.00020 0.00000 | loss 0.4278 | s/batch 0.64
2020-08-05 20:39:14,541 INFO: | epoch   1 | step 713 | batch 150/563 | lr 0.00020 0.00000 | loss 0.4002 | s/batch 0.76
2020-08-05 20:39:49,871 INFO: | epoch   1 | step 763 | batch 200/563 | lr 0.00020 0.00000 | loss 0.3876 | s/batch 0.71
2020-08-05 20:40:21,396 INFO: | epoch   1 | step 813 | batch 250/563 | lr 0.00020 0.00000 | loss 0.4394 | s/batch 0.63
2020-08-05 20:40:56,025 INFO: | epoch   1 | step 863 | batch 300/563 | lr 0.00020 0.00000 | loss 0.3988 | s/batch 0.69
2020-08-05 20:41:27,124 INFO: | epoch   1 | step 913 | batch 350/563 | lr 0.00020 0.00000 | loss 0.3317 | s/batch 0.62
2020-08-05 20:41:58,650 INFO: | epoch   1 | step 963 | batch 400/563 | lr 0.00020 0.00000 | loss 0.4484 | s/batch 0.63


In [25]:
epochs = 3
test_batch_size = 16
train_batch_size = 16

trainer.train()

2020-08-05 20:56:13,833 INFO: Start training...
2020-08-05 20:56:45,732 INFO: | epoch   1 | step 1739 | batch  50/563 | lr 0.00015 0.00000 | loss 0.3370 | s/batch 0.64
2020-08-05 20:57:19,243 INFO: | epoch   1 | step 1789 | batch 100/563 | lr 0.00015 0.00000 | loss 0.3598 | s/batch 0.67
2020-08-05 20:57:56,755 INFO: | epoch   1 | step 1839 | batch 150/563 | lr 0.00015 0.00000 | loss 0.2991 | s/batch 0.75
2020-08-05 20:58:31,096 INFO: | epoch   1 | step 1889 | batch 200/563 | lr 0.00015 0.00000 | loss 0.3400 | s/batch 0.69
2020-08-05 20:59:03,468 INFO: | epoch   1 | step 1939 | batch 250/563 | lr 0.00015 0.00000 | loss 0.2985 | s/batch 0.65
2020-08-05 20:59:34,389 INFO: | epoch   1 | step 1989 | batch 300/563 | lr 0.00015 0.00000 | loss 0.3709 | s/batch 0.62
2020-08-05 21:00:03,308 INFO: | epoch   1 | step 2039 | batch 350/563 | lr 0.00011 0.00000 | loss 0.3739 | s/batch 0.58
2020-08-05 21:00:36,041 INFO: | epoch   1 | step 2089 | batch 400/563 | lr 0.00011 0.00000 | loss 0.2811 | s/bat

2020-08-05 21:14:13,286 INFO: | epoch   3 | step 3265 | batch 450/563 | lr 0.00008 0.00000 | loss 0.3000 | s/batch 0.63
2020-08-05 21:14:43,958 INFO: | epoch   3 | step 3315 | batch 500/563 | lr 0.00008 0.00000 | loss 0.2692 | s/batch 0.61
2020-08-05 21:15:15,577 INFO: | epoch   3 | step 3365 | batch 550/563 | lr 0.00008 0.00000 | loss 0.2482 | s/batch 0.63
2020-08-05 21:15:23,652 INFO: | epoch   3 | score (88.56, 85.69, 87.0) | f1 87.0 | loss 0.2860 | time 363.95
2020-08-05 21:15:23,672 INFO: 
              precision    recall  f1-score   support

          科技     0.9247    0.9258    0.9252      1697
          股票     0.9056    0.9304    0.9178      1680
          体育     0.9698    0.9815    0.9756      1405
          娱乐     0.9221    0.9392    0.9306       971
          时政     0.8815    0.8901    0.8858       710
          社会     0.8471    0.8638    0.8554       558
          教育     0.9341    0.9033    0.9184       455
          财经     0.8314    0.7318    0.7784       384
          家居 

In [26]:
# Another training pass on the same `trainer` object (no new Trainer is built
# in this cell). The first log line under this cell reports step 3428 at
# batch 50, so the step counter again continues from the previous run.
# NOTE(review): presumably train() reads the module-level `epochs` at call
# time — the Trainer definition is not visible here; TODO confirm.
epochs = 4
# NOTE(review): batch sizes are re-assigned after the trainer was built;
# whether the existing trainer picks these values up is not visible — verify.
test_batch_size = 16
train_batch_size = 16

trainer.train()

2020-08-05 21:18:56,675 INFO: Start training...
2020-08-05 21:19:32,634 INFO: | epoch   1 | step 3428 | batch  50/563 | lr 0.00008 0.00000 | loss 0.2673 | s/batch 0.72
2020-08-05 21:20:04,644 INFO: | epoch   1 | step 3478 | batch 100/563 | lr 0.00008 0.00000 | loss 0.2813 | s/batch 0.64
2020-08-05 21:20:37,147 INFO: | epoch   1 | step 3528 | batch 150/563 | lr 0.00008 0.00000 | loss 0.2791 | s/batch 0.65
2020-08-05 21:21:05,177 INFO: | epoch   1 | step 3578 | batch 200/563 | lr 0.00008 0.00000 | loss 0.2947 | s/batch 0.56
2020-08-05 21:21:40,157 INFO: | epoch   1 | step 3628 | batch 250/563 | lr 0.00008 0.00000 | loss 0.2932 | s/batch 0.70
2020-08-05 21:22:12,603 INFO: | epoch   1 | step 3678 | batch 300/563 | lr 0.00008 0.00000 | loss 0.2396 | s/batch 0.65
2020-08-05 21:22:42,573 INFO: | epoch   1 | step 3728 | batch 350/563 | lr 0.00008 0.00000 | loss 0.3062 | s/batch 0.60
2020-08-05 21:23:13,128 INFO: | epoch   1 | step 3778 | batch 400/563 | lr 0.00008 0.00000 | loss 0.2482 | s/bat

2020-08-05 21:37:26,299 INFO: | epoch   3 | step 5004 | batch 500/563 | lr 0.00005 0.00000 | loss 0.2465 | s/batch 0.67
2020-08-05 21:37:57,304 INFO: | epoch   3 | step 5054 | batch 550/563 | lr 0.00005 0.00000 | loss 0.3313 | s/batch 0.62
2020-08-05 21:38:05,958 INFO: | epoch   3 | score (90.78, 88.79, 89.73) | f1 89.73 | loss 0.2565 | time 362.29
2020-08-05 21:38:05,978 INFO: 
              precision    recall  f1-score   support

          科技     0.9350    0.9328    0.9339      1697
          股票     0.9170    0.9464    0.9315      1680
          体育     0.9780    0.9822    0.9801      1405
          娱乐     0.9270    0.9547    0.9406       971
          时政     0.8966    0.8915    0.8941       710
          社会     0.8677    0.8817    0.8747       558
          教育     0.9546    0.9253    0.9397       455
          财经     0.8676    0.7682    0.8149       384
          家居     0.9016    0.8824    0.8919       374
          游戏     0.9088    0.8925    0.9005       279
          房产     0.8962

## 超参重置：重建 Trainer，在全量数据（训练集 180000 条）上重新训练 3 个 epoch

In [60]:
# Restart training: set the epoch count, build a fresh Trainer from the
# `model` and `vocab` objects already present in the session, then train.
# Unlike the earlier cells, the log under this cell starts at step 50 /
# batch 50 of 11250, i.e. the step counter begins from the start again.
# NOTE(review): `epochs` is assigned before the Trainer is constructed —
# presumably the Trainer reads this module-level value; the Trainer
# definition is not visible here, TODO confirm.
epochs = 3
# NOTE(review): `model` is passed in as-is; this cell does not re-create it.
# If training from scratch is intended, confirm that an earlier cell rebuilt
# the model before this one ran.
trainer = Trainer(model, vocab)
trainer.train()

2020-08-05 22:33:52,956 INFO: Total 180000 docs.
2020-08-05 22:35:33,362 INFO: Total 20000 docs.
2020-08-05 22:39:35,427 INFO: Total 50000 docs.
2020-08-05 22:39:35,680 INFO: Start training...
2020-08-05 22:40:13,343 INFO: | epoch   1 | step  50 | batch  50/11250 | lr 0.00020 0.00005 | loss 2.2432 | s/batch 0.75
2020-08-05 22:40:46,629 INFO: | epoch   1 | step 100 | batch 100/11250 | lr 0.00020 0.00005 | loss 1.3210 | s/batch 0.67
2020-08-05 22:41:21,178 INFO: | epoch   1 | step 150 | batch 150/11250 | lr 0.00020 0.00005 | loss 1.0208 | s/batch 0.69
2020-08-05 22:41:52,593 INFO: | epoch   1 | step 200 | batch 200/11250 | lr 0.00020 0.00005 | loss 0.8483 | s/batch 0.63
2020-08-05 22:42:26,534 INFO: | epoch   1 | step 250 | batch 250/11250 | lr 0.00020 0.00005 | loss 0.7456 | s/batch 0.68
2020-08-05 22:42:57,711 INFO: | epoch   1 | step 300 | batch 300/11250 | lr 0.00020 0.00005 | loss 0.6130 | s/batch 0.62
2020-08-05 22:43:23,291 INFO: | epoch   1 | step 350 | batch 350/11250 | lr 0.000

2020-08-05 23:15:02,596 INFO: | epoch   1 | step 3350 | batch 3350/11250 | lr 0.00008 0.00005 | loss 0.2454 | s/batch 0.62
2020-08-05 23:15:32,447 INFO: | epoch   1 | step 3400 | batch 3400/11250 | lr 0.00008 0.00004 | loss 0.2067 | s/batch 0.60
2020-08-05 23:16:02,393 INFO: | epoch   1 | step 3450 | batch 3450/11250 | lr 0.00008 0.00004 | loss 0.2750 | s/batch 0.60
2020-08-05 23:16:36,081 INFO: | epoch   1 | step 3500 | batch 3500/11250 | lr 0.00008 0.00004 | loss 0.3266 | s/batch 0.67
2020-08-05 23:17:06,252 INFO: | epoch   1 | step 3550 | batch 3550/11250 | lr 0.00008 0.00004 | loss 0.2249 | s/batch 0.60
2020-08-05 23:17:35,731 INFO: | epoch   1 | step 3600 | batch 3600/11250 | lr 0.00008 0.00004 | loss 0.2465 | s/batch 0.59
2020-08-05 23:18:08,428 INFO: | epoch   1 | step 3650 | batch 3650/11250 | lr 0.00008 0.00004 | loss 0.3542 | s/batch 0.65
2020-08-05 23:18:43,067 INFO: | epoch   1 | step 3700 | batch 3700/11250 | lr 0.00008 0.00004 | loss 0.2844 | s/batch 0.69
2020-08-05 23:19

2020-08-05 23:51:16,157 INFO: | epoch   1 | step 6700 | batch 6700/11250 | lr 0.00004 0.00004 | loss 0.2156 | s/batch 0.71
2020-08-05 23:51:50,252 INFO: | epoch   1 | step 6750 | batch 6750/11250 | lr 0.00004 0.00004 | loss 0.2336 | s/batch 0.68
2020-08-05 23:52:24,736 INFO: | epoch   1 | step 6800 | batch 6800/11250 | lr 0.00004 0.00004 | loss 0.2157 | s/batch 0.69
2020-08-05 23:52:56,217 INFO: | epoch   1 | step 6850 | batch 6850/11250 | lr 0.00004 0.00004 | loss 0.2169 | s/batch 0.63
2020-08-05 23:53:28,341 INFO: | epoch   1 | step 6900 | batch 6900/11250 | lr 0.00004 0.00004 | loss 0.1728 | s/batch 0.64
2020-08-05 23:54:02,469 INFO: | epoch   1 | step 6950 | batch 6950/11250 | lr 0.00004 0.00004 | loss 0.2027 | s/batch 0.68
2020-08-05 23:54:34,832 INFO: | epoch   1 | step 7000 | batch 7000/11250 | lr 0.00003 0.00004 | loss 0.2482 | s/batch 0.65
2020-08-05 23:55:06,825 INFO: | epoch   1 | step 7050 | batch 7050/11250 | lr 0.00003 0.00004 | loss 0.2430 | s/batch 0.64
2020-08-05 23:55

2020-08-06 00:27:30,659 INFO: | epoch   1 | step 10050 | batch 10050/11250 | lr 0.00001 0.00004 | loss 0.1871 | s/batch 0.73
2020-08-06 00:28:00,292 INFO: | epoch   1 | step 10100 | batch 10100/11250 | lr 0.00001 0.00004 | loss 0.2441 | s/batch 0.59
2020-08-06 00:28:31,986 INFO: | epoch   1 | step 10150 | batch 10150/11250 | lr 0.00001 0.00003 | loss 0.2121 | s/batch 0.63
2020-08-06 00:29:04,497 INFO: | epoch   1 | step 10200 | batch 10200/11250 | lr 0.00001 0.00003 | loss 0.2144 | s/batch 0.65
2020-08-06 00:29:36,419 INFO: | epoch   1 | step 10250 | batch 10250/11250 | lr 0.00001 0.00003 | loss 0.1902 | s/batch 0.64
2020-08-06 00:30:10,081 INFO: | epoch   1 | step 10300 | batch 10300/11250 | lr 0.00001 0.00003 | loss 0.2173 | s/batch 0.67
2020-08-06 00:30:42,039 INFO: | epoch   1 | step 10350 | batch 10350/11250 | lr 0.00001 0.00003 | loss 0.2350 | s/batch 0.64
2020-08-06 00:31:15,193 INFO: | epoch   1 | step 10400 | batch 10400/11250 | lr 0.00001 0.00003 | loss 0.2043 | s/batch 0.66


2020-08-06 01:02:13,507 INFO: | epoch   2 | step 12450 | batch 1200/11250 | lr 0.00001 0.00003 | loss 0.1629 | s/batch 0.62
2020-08-06 01:02:50,674 INFO: | epoch   2 | step 12500 | batch 1250/11250 | lr 0.00001 0.00003 | loss 0.1769 | s/batch 0.74
2020-08-06 01:03:23,270 INFO: | epoch   2 | step 12550 | batch 1300/11250 | lr 0.00001 0.00003 | loss 0.1908 | s/batch 0.65
2020-08-06 01:03:56,192 INFO: | epoch   2 | step 12600 | batch 1350/11250 | lr 0.00001 0.00003 | loss 0.1816 | s/batch 0.66
2020-08-06 01:04:26,265 INFO: | epoch   2 | step 12650 | batch 1400/11250 | lr 0.00001 0.00003 | loss 0.1823 | s/batch 0.60
2020-08-06 01:04:58,980 INFO: | epoch   2 | step 12700 | batch 1450/11250 | lr 0.00001 0.00003 | loss 0.1871 | s/batch 0.65
2020-08-06 01:05:32,079 INFO: | epoch   2 | step 12750 | batch 1500/11250 | lr 0.00001 0.00003 | loss 0.1259 | s/batch 0.66
2020-08-06 01:06:06,614 INFO: | epoch   2 | step 12800 | batch 1550/11250 | lr 0.00001 0.00003 | loss 0.2132 | s/batch 0.69
2020-08-

2020-08-06 01:37:58,171 INFO: | epoch   2 | step 15800 | batch 4550/11250 | lr 0.00000 0.00003 | loss 0.1328 | s/batch 0.66
2020-08-06 01:38:28,675 INFO: | epoch   2 | step 15850 | batch 4600/11250 | lr 0.00000 0.00003 | loss 0.1736 | s/batch 0.61
2020-08-06 01:38:59,233 INFO: | epoch   2 | step 15900 | batch 4650/11250 | lr 0.00000 0.00003 | loss 0.1771 | s/batch 0.61
2020-08-06 01:39:33,591 INFO: | epoch   2 | step 15950 | batch 4700/11250 | lr 0.00000 0.00003 | loss 0.1565 | s/batch 0.69
2020-08-06 01:40:03,388 INFO: | epoch   2 | step 16000 | batch 4750/11250 | lr 0.00000 0.00003 | loss 0.1443 | s/batch 0.60
2020-08-06 01:40:36,729 INFO: | epoch   2 | step 16050 | batch 4800/11250 | lr 0.00000 0.00003 | loss 0.1422 | s/batch 0.67
2020-08-06 01:41:07,550 INFO: | epoch   2 | step 16100 | batch 4850/11250 | lr 0.00000 0.00003 | loss 0.1852 | s/batch 0.62
2020-08-06 01:41:42,193 INFO: | epoch   2 | step 16150 | batch 4900/11250 | lr 0.00000 0.00003 | loss 0.1664 | s/batch 0.69
2020-08-

2020-08-06 02:13:50,221 INFO: | epoch   2 | step 19150 | batch 7900/11250 | lr 0.00000 0.00002 | loss 0.1563 | s/batch 0.58
2020-08-06 02:14:21,166 INFO: | epoch   2 | step 19200 | batch 7950/11250 | lr 0.00000 0.00002 | loss 0.1567 | s/batch 0.62
2020-08-06 02:14:50,366 INFO: | epoch   2 | step 19250 | batch 8000/11250 | lr 0.00000 0.00002 | loss 0.1876 | s/batch 0.58
2020-08-06 02:15:23,000 INFO: | epoch   2 | step 19300 | batch 8050/11250 | lr 0.00000 0.00002 | loss 0.1867 | s/batch 0.65
2020-08-06 02:15:56,461 INFO: | epoch   2 | step 19350 | batch 8100/11250 | lr 0.00000 0.00002 | loss 0.1407 | s/batch 0.67
2020-08-06 02:16:27,596 INFO: | epoch   2 | step 19400 | batch 8150/11250 | lr 0.00000 0.00002 | loss 0.1219 | s/batch 0.62
2020-08-06 02:16:59,793 INFO: | epoch   2 | step 19450 | batch 8200/11250 | lr 0.00000 0.00002 | loss 0.1673 | s/batch 0.64
2020-08-06 02:17:38,433 INFO: | epoch   2 | step 19500 | batch 8250/11250 | lr 0.00000 0.00002 | loss 0.1564 | s/batch 0.77
2020-08-

2020-08-06 02:49:14,584 INFO: | epoch   2 | step 22450 | batch 11200/11250 | lr 0.00000 0.00002 | loss 0.1882 | s/batch 0.63
2020-08-06 02:49:45,068 INFO: | epoch   2 | step 22500 | batch 11250/11250 | lr 0.00000 0.00002 | loss 0.1451 | s/batch 0.61
2020-08-06 02:49:45,295 INFO: | epoch   2 | score (94.48, 93.92, 94.2) | f1 94.2 | loss 0.1630 | time 7207.89
2020-08-06 02:49:45,624 INFO: 
              precision    recall  f1-score   support

          科技     0.9483    0.9523    0.9503     35027
          股票     0.9546    0.9561    0.9553     33251
          体育     0.9889    0.9887    0.9888     28283
          娱乐     0.9637    0.9702    0.9670     19920
          时政     0.9136    0.9296    0.9216     13515
          社会     0.9111    0.9093    0.9102     11009
          教育     0.9550    0.9495    0.9522      8987
          财经     0.8991    0.8726    0.8856      7957
          家居     0.9442    0.9393    0.9417      7063
          游戏     0.9356    0.9178    0.9266      5291
          房产  

2020-08-06 03:24:32,291 INFO: | epoch   3 | step 24850 | batch 2350/11250 | lr 0.00000 0.00001 | loss 0.0947 | s/batch 0.62
2020-08-06 03:25:05,125 INFO: | epoch   3 | step 24900 | batch 2400/11250 | lr 0.00000 0.00001 | loss 0.1289 | s/batch 0.66
2020-08-06 03:25:36,823 INFO: | epoch   3 | step 24950 | batch 2450/11250 | lr 0.00000 0.00001 | loss 0.1343 | s/batch 0.63
2020-08-06 03:26:08,444 INFO: | epoch   3 | step 25000 | batch 2500/11250 | lr 0.00000 0.00001 | loss 0.1907 | s/batch 0.63
2020-08-06 03:26:38,389 INFO: | epoch   3 | step 25050 | batch 2550/11250 | lr 0.00000 0.00001 | loss 0.1238 | s/batch 0.60
2020-08-06 03:27:11,231 INFO: | epoch   3 | step 25100 | batch 2600/11250 | lr 0.00000 0.00001 | loss 0.1462 | s/batch 0.66
2020-08-06 03:27:43,895 INFO: | epoch   3 | step 25150 | batch 2650/11250 | lr 0.00000 0.00001 | loss 0.1308 | s/batch 0.65
2020-08-06 03:28:14,492 INFO: | epoch   3 | step 25200 | batch 2700/11250 | lr 0.00000 0.00001 | loss 0.1157 | s/batch 0.61
2020-08-

2020-08-06 04:00:07,979 INFO: | epoch   3 | step 28200 | batch 5700/11250 | lr 0.00000 0.00001 | loss 0.1662 | s/batch 0.59
2020-08-06 04:00:40,847 INFO: | epoch   3 | step 28250 | batch 5750/11250 | lr 0.00000 0.00001 | loss 0.1194 | s/batch 0.66
2020-08-06 04:01:12,284 INFO: | epoch   3 | step 28300 | batch 5800/11250 | lr 0.00000 0.00001 | loss 0.0892 | s/batch 0.63
2020-08-06 04:01:43,412 INFO: | epoch   3 | step 28350 | batch 5850/11250 | lr 0.00000 0.00001 | loss 0.0953 | s/batch 0.62
2020-08-06 04:02:18,101 INFO: | epoch   3 | step 28400 | batch 5900/11250 | lr 0.00000 0.00001 | loss 0.1335 | s/batch 0.69
2020-08-06 04:02:44,314 INFO: | epoch   3 | step 28450 | batch 5950/11250 | lr 0.00000 0.00001 | loss 0.1505 | s/batch 0.52
2020-08-06 04:03:18,228 INFO: | epoch   3 | step 28500 | batch 6000/11250 | lr 0.00000 0.00001 | loss 0.1243 | s/batch 0.68
2020-08-06 04:03:50,211 INFO: | epoch   3 | step 28550 | batch 6050/11250 | lr 0.00000 0.00001 | loss 0.1271 | s/batch 0.64
2020-08-

2020-08-06 04:36:07,271 INFO: | epoch   3 | step 31550 | batch 9050/11250 | lr 0.00000 0.00000 | loss 0.1320 | s/batch 0.61
2020-08-06 04:36:40,728 INFO: | epoch   3 | step 31600 | batch 9100/11250 | lr 0.00000 0.00000 | loss 0.1125 | s/batch 0.67
2020-08-06 04:37:11,435 INFO: | epoch   3 | step 31650 | batch 9150/11250 | lr 0.00000 0.00000 | loss 0.1010 | s/batch 0.61
2020-08-06 04:37:45,127 INFO: | epoch   3 | step 31700 | batch 9200/11250 | lr 0.00000 0.00000 | loss 0.1286 | s/batch 0.67
2020-08-06 04:38:19,204 INFO: | epoch   3 | step 31750 | batch 9250/11250 | lr 0.00000 0.00000 | loss 0.1315 | s/batch 0.68
2020-08-06 04:38:50,602 INFO: | epoch   3 | step 31800 | batch 9300/11250 | lr 0.00000 0.00000 | loss 0.1734 | s/batch 0.63
2020-08-06 04:39:19,357 INFO: | epoch   3 | step 31850 | batch 9350/11250 | lr 0.00000 0.00000 | loss 0.1130 | s/batch 0.58
2020-08-06 04:39:51,132 INFO: | epoch   3 | step 31900 | batch 9400/11250 | lr 0.00000 0.00000 | loss 0.0945 | s/batch 0.64
2020-08-

In [61]:

# Run the test routine of the `trainer` object that exists in the session.
# NOTE(review): what test() evaluates or writes is defined in Trainer, which
# is not visible from this cell — check the class before relying on its output.
trainer.test()

  _warn_prf(average, modifier, msg_start, len(result))
