# BERT
微调时，将最后一层第一个 token（即 [CLS]）的隐藏向量作为句子的表示，然后输入到 softmax 层进行分类。

In [21]:
!pip install transformers



In [22]:
import logging
import random

import numpy as np
import torch
import pandas as pd
from collections import Counter
from transformers import BasicTokenizer
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel
from transformers import AdamW, get_linear_schedule_with_warmup,get_cosine_with_hard_restarts_schedule_with_warmup
from sklearn.metrics import f1_score, precision_score, recall_score
import time
from sklearn.metrics import classification_report


# Root logger: timestamp, level and message on every record.
logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

# Seed every random number generator used below so runs are repeatable.
seed = 666
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)

# Select the compute device: GPU `gpu` when CUDA is usable, otherwise CPU.
gpu = 0
use_cuda = torch.cuda.is_available() and gpu >= 0
device = torch.device("cuda", gpu) if use_cuda else torch.device("cpu")
if use_cuda:
    torch.cuda.set_device(gpu)
logging.info("Use cuda: %s, gpu id: %d.", use_cuda, gpu)

2020-08-16 23:52:24,263 INFO: Use cuda: True, gpu id: 0.


In [23]:
# split data to 10 fold

def all_data2fold(fold_num, num=10000, data_path=None):
    """Split the first ``num`` rows of a labelled TSV file into stratified folds.

    The rows are shuffled, grouped by label, and each label's samples are dealt
    out to the folds as evenly as possible. The folds are then rebalanced so
    that every fold holds exactly ``total // fold_num`` samples; the remaining
    ``total % fold_num`` samples are dropped.

    Args:
        fold_num: number of folds to build.
        num: maximum number of rows (from the top of the file) to use.
        data_path: TSV file with ``text`` and ``label`` columns. When omitted,
            the module-level ``data_file`` is used.

    Returns:
        A list of ``fold_num`` dicts ``{'label': [...], 'text': [...]}``.
    """
    path = data_file if data_path is None else data_path
    frame = pd.read_csv(path, sep='\t', encoding='UTF-8')
    texts = frame['text'].tolist()[:num]
    labels = frame['label'].tolist()[:num]
    total = len(labels)

    # TODO: data augmentation (shuffling the word order of each text) was
    # sketched here but never enabled.

    # Global shuffle so the fold assignment does not depend on file order.
    order = list(range(total))
    np.random.shuffle(order)
    all_texts = [texts[i] for i in order]
    all_labels = [labels[i] for i in order]

    # Positions (into the shuffled lists) of every sample, grouped by label.
    label2id = {}
    for position, label in enumerate(all_labels):
        label2id.setdefault(label, []).append(position)

    # Deal each label's samples out to the folds. The first `extra` folds get
    # one more sample; a running offset keeps the slices disjoint, so no sample
    # lands in two folds and none is skipped.
    all_index = [[] for _ in range(fold_num)]
    for positions in label2id.values():
        base, extra = divmod(len(positions), fold_num)
        offset = 0
        for fold in range(fold_num):
            size = base + 1 if fold < extra else base
            all_index[fold].extend(positions[offset:offset + size])
            offset += size

    # Rebalance: folds that are too large donate their surplus to a pool, and
    # folds that are too small draw from it. Fold sizes are non-increasing, so
    # the pool always holds enough by the time a small fold is reached.
    batch_size = total // fold_num
    other_texts = []
    other_labels = []
    start = 0
    fold_data = []
    for fold in range(fold_num):
        fold_size = len(all_index[fold])
        cur_texts = [all_texts[i] for i in all_index[fold]]
        cur_labels = [all_labels[i] for i in all_index[fold]]

        if fold_size > batch_size:
            fold_texts = cur_texts[:batch_size]
            fold_labels = cur_labels[:batch_size]
            other_texts.extend(cur_texts[batch_size:])
            other_labels.extend(cur_labels[batch_size:])
        elif fold_size < batch_size:
            end = start + batch_size - fold_size
            fold_texts = cur_texts + other_texts[start:end]
            fold_labels = cur_labels + other_labels[start:end]
            start = end
        else:
            fold_texts = cur_texts
            fold_labels = cur_labels

        assert batch_size == len(fold_labels)

        # Shuffle inside the fold so samples are not ordered by label.
        inner = list(range(batch_size))
        np.random.shuffle(inner)
        fold_data.append({
            'label': [fold_labels[i] for i in inner],
            'text': [fold_texts[i] for i in inner],
        })

    logging.info("Fold lens %s", [len(data['label']) for data in fold_data])

    return fold_data

In [24]:
# build vocab
# Module-level BasicTokenizer instance; nothing in the cells shown here reads it.
basic_tokenizer = BasicTokenizer()

class Vocab():
    """Word and label vocabulary built from a training split.

    ``train_data`` is a dict with a ``'text'`` list (whitespace-separated
    tokens) and a ``'label'`` list. Words seen at least ``min_count`` times get
    an id; ids 0 and 1 are reserved for ``[PAD]`` and ``[UNK]``.
    """

    def __init__(self, train_data):
        self.min_count = 5
        self.pad = 0
        self.unk = 1
        self._id2word = ['[PAD]', '[UNK]']
        self._id2extword = ['[PAD]', '[UNK]']

        self._id2label = []
        self.target_names = []

        self.build_vocab(train_data)

        self._word2id = self._index(self._id2word)
        self._label2id = self._index(self._id2label)
        # Defined up front so extword2id() works (everything maps to [UNK])
        # even before load_pretrained_embs() has been called.
        self._extword2id = self._index(self._id2extword)

        logging.info("Build vocab: words %d, labels %d.", self.word_size, self.label_size)

    @staticmethod
    def _index(items):
        """Return a dict mapping each item to its position in ``items``."""
        return {item: position for position, item in enumerate(items)}

    def build_vocab(self, data):
        """Count words and labels in ``data`` and fill the id tables."""
        self.word_counter = Counter()
        for text in data['text']:
            self.word_counter.update(text.split())

        # most_common() yields words by descending frequency.
        for word, count in self.word_counter.most_common():
            if count >= self.min_count:
                self._id2word.append(word)

        label2name = {0: '科技', 1: '股票', 2: '体育', 3: '娱乐', 4: '时政', 5: '社会', 6: '教育', 7: '财经',
                      8: '家居', 9: '游戏', 10: '房产', 11: '时尚', 12: '彩票', 13: '星座'}

        self.label_counter = Counter(data['label'])

        # Walk the labels that actually occur. For contiguous ids 0..n-1 this
        # matches range(n); it also stays correct when some ids are absent.
        for label in sorted(self.label_counter):
            self._id2label.append(label)
            self.target_names.append(label2name.get(label, str(label)))

    def load_pretrained_embs(self, embfile):
        """Load word2vec-style text embeddings and return them as an array.

        The first line of ``embfile`` holds ``word_count embedding_dim``; every
        following line holds a word and its vector. Row 0 stays zero for
        ``[PAD]``, row 1 becomes the mean vector for ``[UNK]``, and the whole
        matrix is divided by its standard deviation.
        """
        with open(embfile, encoding='utf-8') as f:
            lines = f.readlines()
        header = lines[0].split()
        word_count, embedding_dim = int(header[0]), int(header[1])

        index = len(self._id2extword)
        embeddings = np.zeros((word_count + index, embedding_dim))
        for line in lines[1:]:
            values = line.split()
            self._id2extword.append(values[0])
            vector = np.array(values[1:], dtype='float64')
            embeddings[self.unk] += vector  # running sum, averaged below
            embeddings[index] = vector
            index += 1

        embeddings[self.unk] = embeddings[self.unk] / word_count
        embeddings = embeddings / np.std(embeddings)

        self._extword2id = self._index(self._id2extword)

        # Duplicate words in the file would silently shadow each other.
        assert len(set(self._id2extword)) == len(self._id2extword)

        return embeddings

    def word2id(self, xs):
        """Map a word or a list of words to ids; unknown words map to ``unk``."""
        if isinstance(xs, list):
            return [self._word2id.get(x, self.unk) for x in xs]
        return self._word2id.get(xs, self.unk)

    def extword2id(self, xs):
        """Map a word or a list of words to pretrained-embedding ids."""
        if isinstance(xs, list):
            return [self._extword2id.get(x, self.unk) for x in xs]
        return self._extword2id.get(xs, self.unk)

    def label2id(self, xs):
        """Map a label or a list of labels to ids.

        NOTE(review): unseen labels fall back to ``self.unk`` (1), which is
        also a valid class id — confirm this is intended.
        """
        if isinstance(xs, list):
            return [self._label2id.get(x, self.unk) for x in xs]
        return self._label2id.get(xs, self.unk)

    @property
    def word_size(self):
        return len(self._id2word)

    @property
    def extword_size(self):
        return len(self._id2extword)

    @property
    def label_size(self):
        return len(self._id2label)

In [25]:
# build module

class Attention(nn.Module):
    """Pools a sequence of hidden states with a learned query vector."""

    def __init__(self, hidden_size):
        super().__init__()
        # Projection applied to every hidden state before scoring.
        self.weight = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.weight.data.normal_(mean=0.0, std=0.05)

        # Projection bias, initialised to zero.
        self.bias = nn.Parameter(torch.zeros(hidden_size))

        # Query vector the projected states are scored against.
        self.query = nn.Parameter(torch.Tensor(hidden_size))
        self.query.data.normal_(mean=0.0, std=0.05)

    def forward(self, batch_hidden, batch_masks):
        """Return ``(pooled, attn_scores)``.

        batch_hidden: b x len x hidden_size
        batch_masks:  b x len, 1 for real positions and 0 for padding
        """
        # b x len x hidden
        projected = torch.matmul(batch_hidden, self.weight) + self.bias

        # b x len: one score per position
        logits = torch.matmul(projected, self.query)

        # True wherever the position is padding.
        padding = (1 - batch_masks).bool()

        attn_scores = F.softmax(logits.masked_fill(padding, float(-1e32)), dim=1)  # b x len

        # For a fully masked row, filling with -1e32 makes softmax return
        # 1/len (filling with -inf would give nan), so zero the padded
        # positions once more before pooling.
        pooling_weights = attn_scores.masked_fill(padding, 0.0)

        # Weighted sum over positions: b x hidden
        batch_outputs = torch.bmm(pooling_weights.unsqueeze(1), projected).squeeze(1)

        return batch_outputs, attn_scores


class WordBertEncoder(nn.Module):
    """Encodes each text segment with BERT and returns one vector per segment.

    Reads the notebook-level globals ``dropout`` and ``bert_path``.
    """

    def __init__(self):
        super(WordBertEncoder, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.tokenizer = WhitespaceTokenizer()

        self.bert = BertModel.from_pretrained(bert_path)

        # False: use the last-layer [CLS] hidden state; True: use BERT's pooled output.
        self.pooled = False
        logging.info('Build Bert encoder with pooled {}.'.format(self.pooled))

    def encode(self, tokens):
        """Convert a list of word strings to BERT input ids ([CLS] ... [SEP])."""
        return self.tokenizer.tokenize(tokens)

    def get_bert_parameters(self):
        """Return two optimizer parameter groups for the BERT weights.

        Biases and LayerNorm weights get no weight decay; everything else
        uses a weight decay of 0.01.
        """
        no_decay = ['bias', 'LayerNorm.weight']
        decayed, undecayed = [], []
        for name, param in self.bert.named_parameters():
            if any(nd in name for nd in no_decay):
                undecayed.append(param)
            else:
                decayed.append(param)
        return [
            {'params': decayed, 'weight_decay': 0.01},
            {'params': undecayed, 'weight_decay': 0.0},
        ]

    def forward(self, input_ids, token_type_ids, attention_mask=None):
        """Return one representation per segment.

        Fine-tuning uses the last layer's hidden vector of the first token
        ([CLS]) as the segment representation.

        input_ids:      sen_num x bert_len
        token_type_ids: sen_num x bert_len
        attention_mask: optional sen_num x bert_len mask forwarded to BERT;
            when omitted BERT uses its default and attends to every position,
            padding included.
        """
        outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids,
                            attention_mask=attention_mask)
        # Index instead of tuple-unpacking: this works whether BERT returns a
        # plain tuple or a ModelOutput (iterating a ModelOutput yields keys).
        # sequence_output: sen_num x bert_len x hidden, pooled_output: sen_num x hidden
        sequence_output, pooled_output = outputs[0], outputs[1]

        if self.pooled:
            reps = pooled_output
        else:
            reps = sequence_output[:, 0, :]  # sen_num x hidden

        if self.training:
            reps = self.dropout(reps)

        return reps


class WhitespaceTokenizer():
    """WhitespaceTokenizer with vocab.

    Maps already-split tokens to ids from ``vocab.txt`` under the
    notebook-level ``bert_path``; no sub-word splitting is performed.
    """

    def __init__(self):
        import os

        # os.path.join copes with bert_path with or without a trailing separator.
        vocab_file = os.path.join(bert_path, 'vocab.txt')
        self._token2id = self.load_vocab(vocab_file)
        self._id2token = {v: k for k, v in self._token2id.items()}
        self.max_len = 256
        # NOTE(review): assumes [UNK] is id 1 in vocab.txt — confirm.
        self.unk = 1

        logging.info("Build Bert vocab with size %d." % (self.vocab_size))

    def load_vocab(self, vocab_file):
        """Read one token per line and return a ``token -> line index`` dict."""
        # `with` closes the handle; utf-8 is explicit so the result does not
        # depend on the platform's default encoding.
        with open(vocab_file, 'r', encoding='utf-8') as f:
            tokens = [line.strip() for line in f]
        return dict(zip(tokens, range(len(tokens))))

    def tokenize(self, tokens):
        """Wrap ``tokens`` in [CLS]/[SEP] and return their ids.

        Raises AssertionError when the segment would exceed ``max_len``
        once the two special tokens are added.
        """
        assert len(tokens) <= self.max_len - 2
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        return self.token2id(tokens)

    def token2id(self, xs):
        """Map a token or list of tokens to ids; unknown tokens map to ``unk``."""
        if isinstance(xs, list):
            return [self._token2id.get(x, self.unk) for x in xs]
        return self._token2id.get(xs, self.unk)

    @property
    def vocab_size(self):
        return len(self._id2token)


class SentEncoder(nn.Module):
    """Bidirectional LSTM over the sequence of segment representations.

    Reads the notebook-level globals ``dropout``, ``sent_hidden_size`` and
    ``sent_num_layers``.
    """

    def __init__(self, sent_rep_size):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        lstm_config = dict(
            input_size=sent_rep_size,
            hidden_size=sent_hidden_size,
            num_layers=sent_num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.sent_lstm = nn.LSTM(**lstm_config)

    def forward(self, sent_reps, sent_masks):
        """Return masked LSTM states.

        sent_reps:  b x doc_len x sent_rep_size
        sent_masks: b x doc_len
        """
        lstm_states, _ = self.sent_lstm(sent_reps)  # b x doc_len x hidden*2

        # Zero the states at masked (padding) positions.
        masked_states = lstm_states * sent_masks.unsqueeze(2)

        return self.dropout(masked_states) if self.training else masked_states

In [26]:
# build model
class Model(nn.Module):
    """Document classifier: BERT per segment, BiLSTM and attention over segments.

    Reads the notebook-level globals ``sent_hidden_size``, ``use_cuda`` and
    ``device``. ``all_parameters`` exposes two groups for the optimizer:
    ``basic_parameters`` (LSTM, attention, output layer) and
    ``bert_parameters`` (BERT weights, split by weight decay).
    """

    def __init__(self, vocab):
        super().__init__()
        # Size of one segment vector; forward() reshapes the encoder output to it.
        self.sent_rep_size = 256
        # The LSTM is bidirectional, so a document vector is twice its hidden size.
        self.doc_rep_size = sent_hidden_size * 2
        self.all_parameters = {}

        self.word_encoder = WordBertEncoder()
        bert_parameters = self.word_encoder.get_bert_parameters()

        self.sent_encoder = SentEncoder(self.sent_rep_size)
        self.sent_attention = Attention(self.doc_rep_size)
        self.out = nn.Linear(self.doc_rep_size, vocab.label_size, bias=True)

        # Trainable non-BERT weights, in module order.
        basic_parameters = [
            param
            for module in (self.sent_encoder, self.sent_attention, self.out)
            for param in module.parameters()
            if param.requires_grad
        ]

        if use_cuda:
            self.to(device)

        if basic_parameters:
            self.all_parameters["basic_parameters"] = basic_parameters
        self.all_parameters["bert_parameters"] = bert_parameters

        logging.info('Build model with bert word encoder, lstm sent encoder.')

        para_num = sum(param.numel() for param in self.parameters())
        logging.info('Model param num: %.2f M.' % (para_num / 1e6))

    def forward(self, batch_inputs):
        """Return class logits of shape b x num_labels.

        batch_inputs is ``(input_ids, token_type_ids, masks)``, each of shape
        b x doc_len x sent_len.
        """
        # NOTE(review): the token mask is not forwarded to the word encoder,
        # so BERT also attends to padded positions.
        token_ids, type_ids, token_masks = batch_inputs
        batch_size, max_doc_len, max_sent_len = token_ids.shape[:3]
        flat_shape = (batch_size * max_doc_len, max_sent_len)

        # Flatten documents so every segment is one BERT input: sen_num x sent_len
        token_ids = token_ids.view(*flat_shape)
        type_ids = type_ids.view(*flat_shape)
        token_masks = token_masks.view(*flat_shape)

        sent_reps = self.word_encoder(token_ids, type_ids)  # sen_num x sent_rep_size

        # Back to per-document layout.
        sent_reps = sent_reps.view(batch_size, max_doc_len, self.sent_rep_size)
        token_masks = token_masks.view(batch_size, max_doc_len, max_sent_len)
        # A segment is real when at least one of its tokens is unmasked: b x doc_len
        sent_masks = token_masks.bool().any(2).float()

        sent_hiddens = self.sent_encoder(sent_reps, sent_masks)  # b x doc_len x doc_rep_size
        doc_reps, atten_scores = self.sent_attention(sent_hiddens, sent_masks)  # b x doc_rep_size

        return self.out(doc_reps)  # b x num_labels

In [27]:

class Optimizer:
    """Bundles one optimizer and LR scheduler per parameter group.

    ``model_parameters`` maps a group name to its parameters:

    * names starting with ``"basic"`` hold a flat list of parameters, trained
      with RMSprop at ``learning_rate`` and decayed by ``decay`` every
      ``decay_step`` steps;
    * names starting with ``"bert"`` hold a list of optimizer param-group
      dicts, trained with AdamW at ``bert_lr`` and a linear schedule with no
      warm-up over ``steps`` steps.

    ``learning_rate``, ``decay``, ``decay_step`` and ``bert_lr`` are
    notebook-level globals.

    Raises:
        ValueError: if a group name starts with neither prefix.
    """

    def __init__(self, model_parameters, steps):
        self.all_params = []   # every parameter, used for gradient clipping
        self.optims = []
        self.schedulers = []

        for name, parameters in model_parameters.items():
            if name.startswith("basic"):
                optim = torch.optim.RMSprop(parameters, lr=learning_rate)
                self.optims.append(optim)

                # Multiply the LR by `decay` once every `decay_step` steps.
                step_decay = lambda step: decay ** (step // decay_step)
                scheduler = torch.optim.lr_scheduler.LambdaLR(optim, lr_lambda=step_decay)
                self.schedulers.append(scheduler)
                self.all_params.extend(parameters)
            elif name.startswith("bert"):
                optim_bert = AdamW(parameters, bert_lr, eps=1e-8)
                self.optims.append(optim_bert)

                scheduler_bert = get_linear_schedule_with_warmup(optim_bert, 0, steps)
                self.schedulers.append(scheduler_bert)

                for group in parameters:
                    self.all_params.extend(group['params'])
            else:
                # The original built this exception without raising it, so an
                # unknown group was silently left without an optimizer.
                raise ValueError("no named parameters: %r" % (name,))

        self.num = len(self.optims)

    def step(self):
        """Apply one update, advance each scheduler, and clear the gradients."""
        for optim, scheduler in zip(self.optims, self.schedulers):
            optim.step()
            scheduler.step()
            optim.zero_grad()

    def zero_grad(self):
        """Clear the gradients of every managed optimizer."""
        for optim in self.optims:
            optim.zero_grad()

    @staticmethod
    def _current_lr(scheduler):
        """Return the scheduler's current LR for its last parameter group."""
        # get_last_lr() is the supported accessor; fall back to get_lr() on
        # PyTorch versions that do not provide it.
        getter = getattr(scheduler, 'get_last_lr', None) or scheduler.get_lr
        return getter()[-1]

    def get_lr(self):
        """Return the current learning rates as a string like ``' 0.00020 0.00005'``."""
        lrs = tuple(self._current_lr(scheduler) for scheduler in self.schedulers)
        return (' %.5f' * self.num) % lrs

In [28]:
# build dataset
def sentence_split(text, vocab, max_sent_len=256, max_segment=16):
    """Cut a whitespace-tokenised document into fixed-length segments.

    Args:
        text: document as whitespace-separated tokens.
        vocab: object exposing ``_word2id`` (preferred) or ``_id2word``; words
            it does not contain are replaced by ``'<UNK>'``.
        max_sent_len: maximum number of words per segment.
        max_segment: maximum number of segments kept. Longer documents keep
            their first and last segments and drop the middle.

    Returns:
        A list of ``[segment_length, segment_words]`` pairs.

    Raises:
        AssertionError: if ``text`` contains no tokens.
    """
    words = text.strip().split()

    # Hash-based membership: the original scanned the `_id2word` list for
    # every word, which is O(vocab size) per lookup.
    known_words = getattr(vocab, '_word2id', None)
    if known_words is None:
        known_words = set(vocab._id2word)

    segments = []
    for start in range(0, len(words), max_sent_len):
        chunk = words[start:start + max_sent_len]
        segment = [word if word in known_words else '<UNK>' for word in chunk]
        segments.append([len(segment), segment])

    assert len(segments) > 0

    if len(segments) <= max_segment:
        return segments

    # Keep the head and the tail of the document. An odd budget gives the
    # extra segment to the head. The tail is guarded because `segments[-0:]`
    # would return every segment instead of none.
    head = (max_segment + 1) // 2
    tail = max_segment // 2
    return segments[:head] + (segments[-tail:] if tail else [])


def get_examples(data, word_encoder, vocab, max_sent_len=256, max_segment=8):
    """Turn raw texts and labels into model-ready examples.

    Each example is ``[label_id, segment_count, segments]`` and each segment
    is ``[token_count, token_ids, token_type_ids]``. Two positions per segment
    are reserved for the special tokens the encoder adds.
    """
    to_label_id = vocab.label2id
    examples = []

    for text, label in zip(data['text'], data['label']):
        label_id = to_label_id(label)

        doc = []
        for _, sent_words in sentence_split(text, vocab, max_sent_len - 2, max_segment):
            token_ids = word_encoder.encode(sent_words)
            token_count = len(token_ids)
            # Single-segment input: every token type id is 0.
            doc.append([token_count, token_ids, [0] * token_count])

        examples.append([label_id, len(doc), doc])

    logging.info('Total %d docs.' % len(examples))
    return examples

In [29]:
# build loader

def batch_slice(data, batch_size):
    """Yield consecutive batches of ``data``; the last one may be shorter."""
    total = len(data)
    batch_count = int(np.ceil(total / float(batch_size)))
    for batch_idx in range(batch_count):
        first = batch_idx * batch_size
        last = min(first + batch_size, total)
        yield [data[pos] for pos in range(first, last)]


def data_iter(data, batch_size, shuffle=True, noise=1.0):
    """Yield batches of examples.

    With ``shuffle`` set, ``data`` is shuffled in place, sorted by descending
    segment count (``example[1]``) perturbed by uniform noise in
    ``[-noise, noise]``, cut into batches, and the batches are shuffled.
    Without it, the examples are batched in their given order.
    """
    if shuffle:
        np.random.shuffle(data)
        # Negated keys give a descending sort; the noise breaks ties randomly.
        noisy_keys = [-(example[1] + np.random.uniform(-noise, noise)) for example in data]
        ordered = [data[i] for i in np.argsort(noisy_keys).tolist()]
    else:
        ordered = data

    batches = list(batch_slice(ordered, batch_size))

    if shuffle:
        np.random.shuffle(batches)

    yield from batches

In [30]:
# some function

def get_score(y_ture, y_pred):
    """Return ``((precision, recall, f1), f1)`` as macro-averaged percentages.

    Every value is rounded to two decimals.
    """
    y_ture = np.array(y_ture)
    y_pred = np.array(y_pred)

    precision, recall, f1 = (
        reformat(metric(y_ture, y_pred, average='macro') * 100)
        for metric in (precision_score, recall_score, f1_score)
    )

    return (precision, recall, f1), f1


def reformat(num):
    """Round ``num`` to two decimals via fixed-point string formatting."""
    return float(format(num, ".2f"))

def reformat4(num):
    """Round ``num`` to four decimals via fixed-point string formatting."""
    return float(format(num, ".4f"))

In [31]:
# build trainer

class Trainer():
    """Runs training, dev evaluation with early stopping, and test prediction.

    Reads the notebook-level globals ``train_data``, ``dev_data``,
    ``test_data``, ``train_batch_size``, ``test_batch_size``, ``epochs``,
    ``early_stops``, ``clip``, ``log_interval``, ``save_model``,
    ``save_test``, ``use_cuda`` and ``device``.
    """

    def __init__(self, model, vocab):
        self.model = model
        self.report = True  # log a per-class classification report

        self.train_data = get_examples(train_data, model.word_encoder, vocab)
        self.batch_num = int(np.ceil(len(self.train_data) / float(train_batch_size)))
        self.dev_data = get_examples(dev_data, model.word_encoder, vocab)
        self.test_data = get_examples(test_data, model.word_encoder, vocab)

        # criterion
        self.criterion = nn.CrossEntropyLoss()

        # label name
        self.target_names = vocab.target_names

        # optimizer: schedulers are sized for the full run
        self.optimizer = Optimizer(model.all_parameters, steps=self.batch_num * epochs)

        # count
        self.step = 0
        self.early_stop = -1
        self.best_train_f1, self.best_dev_f1 = 0, 0
        self.last_epoch = epochs

    def train(self):
        """Train for up to ``epochs`` epochs, saving the best dev checkpoint.

        Stops early once the dev F1 has failed to match or beat the best
        score for ``early_stops`` consecutive epochs.
        """
        logging.info('Start training...')
        for epoch in range(1, epochs + 1):
            train_f1 = self._train(epoch)

            dev_f1 = self._eval(epoch)

            if self.best_dev_f1 <= dev_f1:
                logging.info(
                    "Exceed history dev = %.2f, current dev = %.2f" % (self.best_dev_f1, dev_f1))
                torch.save(self.model.state_dict(), save_model)

                self.best_train_f1 = train_f1
                self.best_dev_f1 = dev_f1
                self.early_stop = 0
            else:
                self.early_stop += 1
                if self.early_stop == early_stops:
                    logging.info(
                        "Eearly stop in epoch %d, best train: %.2f, dev: %.2f" % (
                            epoch - early_stops, self.best_train_f1, self.best_dev_f1))
                    self.last_epoch = epoch
                    break

    def test(self):
        """Reload the best checkpoint and write test predictions to ``save_test``."""
        # map_location lets a checkpoint saved on GPU be restored on CPU.
        self.model.load_state_dict(torch.load(save_model, map_location=device))
        self._eval(self.last_epoch + 1, test=True)

    def _train(self, epoch):
        """Run one training epoch and return the macro F1 on the training data."""
        self.optimizer.zero_grad()
        self.model.train()

        start_time = time.time()
        epoch_start_time = time.time()
        overall_losses = 0
        losses = 0
        y_pred = []
        y_true = []
        batches = data_iter(self.train_data, train_batch_size, shuffle=True)
        for batch_idx, batch_data in enumerate(batches, start=1):
            # NOTE(review): releasing the CUDA cache on every batch costs time;
            # presumably kept to limit memory use with variable-size batches.
            torch.cuda.empty_cache()
            batch_inputs, batch_labels = self.batch2tensor(batch_data)
            batch_outputs = self.model(batch_inputs)
            loss = self.criterion(batch_outputs, batch_labels)
            loss.backward()

            loss_value = loss.detach().cpu().item()
            losses += loss_value
            overall_losses += loss_value

            y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
            y_true.extend(batch_labels.cpu().numpy().tolist())

            nn.utils.clip_grad_norm_(self.optimizer.all_params, max_norm=clip)
            # Steps every optimizer and scheduler, then clears the gradients.
            self.optimizer.step()

            self.step += 1

            if batch_idx % log_interval == 0:
                elapsed = time.time() - start_time

                lrs = self.optimizer.get_lr()
                logging.info(
                    '| epoch {:3d} | step {:3d} | batch {:3d}/{:3d} | lr{} | loss {:.4f} | s/batch {:.2f}'.format(
                        epoch, self.step, batch_idx, self.batch_num, lrs,
                        losses / log_interval,
                        elapsed / log_interval))

                losses = 0
                start_time = time.time()

        overall_losses /= self.batch_num
        during_time = time.time() - epoch_start_time

        # reformat
        overall_losses = reformat4(overall_losses)
        score, f1 = get_score(y_true, y_pred)

        logging.info(
            '| epoch {:3d} | score {} | f1 {} | loss {:.4f} | time {:.2f}'.format(epoch, score, f1,
                                                                                  overall_losses,
                                                                                  during_time))
        # The report is only produced when predictions cover exactly the
        # labels present in the gold data.
        if set(y_true) == set(y_pred) and self.report:
            report = classification_report(y_true, y_pred, digits=4, target_names=self.target_names)
            logging.info('\n' + report)

        return f1

    def _eval(self, epoch, test=False):
        """Evaluate on the dev set, or predict the test set when ``test`` is set.

        In test mode the predicted labels are written to ``save_test``.
        Returns the macro F1 against the labels stored in the data.
        """
        self.model.eval()
        start_time = time.time()
        data = self.test_data if test else self.dev_data
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = self.batch2tensor(batch_data)
                batch_outputs = self.model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

            score, f1 = get_score(y_true, y_pred)

            during_time = time.time() - start_time

            if test:
                df = pd.DataFrame({'label': y_pred})
                df.to_csv(save_test, index=False, sep=',')
            else:
                logging.info(
                    '| epoch {:3d} | dev | score {} | f1 {} | time {:.2f}'.format(epoch, score, f1,
                                                                              during_time))
                if set(y_true) == set(y_pred) and self.report:
                    report = classification_report(y_true, y_pred, digits=4, target_names=self.target_names)
                    logging.info('\n' + report)

        return f1

    def batch2tensor(self, batch_data):
        '''
        Pad a batch of examples into dense tensors.

        batch_data layout:
            [[label, doc_len, [[sent_len, [sent_id0, ...], [sent_id1, ...]], ...]]

        Returns ((input_ids, token_type_ids, masks), labels). The first three
        are b x max_doc_len x max_sent_len, zero-padded; masks is 1.0 at real
        token positions.
        '''
        batch_size = len(batch_data)
        doc_labels = [doc_data[0] for doc_data in batch_data]
        doc_lens = [doc_data[1] for doc_data in batch_data]

        max_doc_len = max(doc_lens)
        max_sent_len = max(
            max(sent_data[0] for sent_data in doc_data[2]) for doc_data in batch_data
        )

        batch_inputs1 = torch.zeros((batch_size, max_doc_len, max_sent_len), dtype=torch.int64)
        batch_inputs2 = torch.zeros((batch_size, max_doc_len, max_sent_len), dtype=torch.int64)
        batch_masks = torch.zeros((batch_size, max_doc_len, max_sent_len), dtype=torch.float32)
        batch_labels = torch.LongTensor(doc_labels)

        # Write one whole segment per assignment instead of one element at a
        # time; per-element tensor writes dominate the cost of this method.
        for b, doc_data in enumerate(batch_data):
            for sent_idx in range(doc_lens[b]):
                sent_len, token_ids, type_ids = doc_data[2][sent_idx]
                batch_inputs1[b, sent_idx, :sent_len] = torch.tensor(
                    token_ids[:sent_len], dtype=torch.int64)
                batch_inputs2[b, sent_idx, :sent_len] = torch.tensor(
                    type_ids[:sent_len], dtype=torch.int64)
                batch_masks[b, sent_idx, :sent_len] = 1

        if use_cuda:
            batch_inputs1 = batch_inputs1.to(device)
            batch_inputs2 = batch_inputs2.to(device)
            batch_masks = batch_masks.to(device)
            batch_labels = batch_labels.to(device)

        return (batch_inputs1, batch_inputs2, batch_masks), batch_labels

## 初始参数设定

In [35]:
# Number of stratified folds; the last fold is used as the dev set.
fold_num = 10
fold_id = fold_num - 1
# Tab-separated input files read with pandas.
data_file = '../input/train_set.csv'
test_data_file = '../input/test_a.csv'

# Where the best checkpoint and the test predictions are written.
save_model = '../output/bert20200817.bin'
save_test = '../output/bert20200817.csv'

# build word encoder
# Directory holding the pretrained BERT weights and vocab.txt.
bert_path = '../emb/bert-mini/'

# build sent encoder
# Hidden size and depth of the bidirectional LSTM in SentEncoder.
sent_hidden_size = 256
sent_num_layers = 2

In [36]:
# Build the folds from at most the first 200000 rows of data_file.
fold_data = all_data2fold(fold_num,200000)

2020-08-16 23:52:48,215 INFO: Fold lens [20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000]


# build train, dev, test data

In [37]:
# dev: the held-out fold
dev_data = fold_data[fold_id]

# train: every fold before the dev fold, concatenated in order
train_texts, train_labels = [], []
for i in range(fold_id):
    data = fold_data[i]
    train_texts += data['text']
    train_labels += data['label']

train_data = dict(label=train_labels, text=train_texts)

# test: the file has no labels, so 0 is used as a placeholder for every row
f = pd.read_csv(test_data_file, sep='\t', encoding='UTF-8')
texts = f['text'].tolist()
test_data = dict(label=[0] * len(texts), text=texts)

In [38]:
# Word and label vocabulary, built from the training split only.
vocab = Vocab(train_data)

2020-08-16 23:53:42,901 INFO: Build vocab: words 5983, labels 14.


## 设置超参

In [39]:
# build optimizer
learning_rate = 2e-4    # RMSprop LR for the non-BERT ("basic") parameters
dropout = 0.25          # dropout rate in the word and sentence encoders
bert_lr = 5e-5          # AdamW LR for the BERT parameters
decay = .75             # the basic LR is multiplied by this ...
decay_step = 1000       # ... once every decay_step optimizer steps
clip = 5.0              # max gradient norm used for clipping
epochs = 3              # maximum number of training epochs
early_stops = 3         # non-improving dev epochs tolerated before stopping
log_interval = 50       # log training progress every this many batches
test_batch_size = 10    # batch size for dev/test evaluation
train_batch_size = 10   # batch size for training

In [40]:
# Build the classifier; Model moves itself to `device` when use_cuda is set.
model = Model(vocab)

2020-08-16 23:53:42,938 INFO: Build Bert vocab with size 5981.
2020-08-16 23:53:42,941 INFO: loading configuration file ../emb/bert-mini/config.json
2020-08-16 23:53:42,944 INFO: Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 256,
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 5981
}

2020-08-16 23:53:42,947 INFO: loading weights file ../emb/bert-mini/pytorch_model.bin
2020-08-16 23:53:43,028 INFO: Build Bert e

In [41]:
# train
# Constructing the Trainer converts the train, dev and test splits into examples.
trainer = Trainer(model, vocab)

2020-08-17 00:08:50,473 INFO: Total 180000 docs.
2020-08-17 00:10:29,734 INFO: Total 20000 docs.
2020-08-17 00:14:43,636 INFO: Total 50000 docs.


## 目标使loss尽量最小之后，再进行预测

In [42]:
# Run training; the checkpoint with the best dev F1 is written to save_model.
trainer.train()

2020-08-17 00:14:46,015 INFO: Start training...
2020-08-17 00:15:12,415 INFO: | epoch   1 | step  50 | batch  50/18000 | lr 0.00020 0.00005 | loss 1.8312 | s/batch 0.53
2020-08-17 00:15:34,471 INFO: | epoch   1 | step 100 | batch 100/18000 | lr 0.00020 0.00005 | loss 1.0157 | s/batch 0.44
2020-08-17 00:15:57,242 INFO: | epoch   1 | step 150 | batch 150/18000 | lr 0.00020 0.00005 | loss 0.7807 | s/batch 0.46
2020-08-17 00:16:21,822 INFO: | epoch   1 | step 200 | batch 200/18000 | lr 0.00020 0.00005 | loss 0.7633 | s/batch 0.49
2020-08-17 00:16:45,623 INFO: | epoch   1 | step 250 | batch 250/18000 | lr 0.00020 0.00005 | loss 0.7405 | s/batch 0.48
2020-08-17 00:17:08,194 INFO: | epoch   1 | step 300 | batch 300/18000 | lr 0.00020 0.00005 | loss 0.5798 | s/batch 0.45
2020-08-17 00:17:31,828 INFO: | epoch   1 | step 350 | batch 350/18000 | lr 0.00020 0.00005 | loss 0.5903 | s/batch 0.47
2020-08-17 00:17:55,783 INFO: | epoch   1 | step 400 | batch 400/18000 | lr 0.00020 0.00005 | loss 0.4864

2020-08-17 00:41:40,306 INFO: | epoch   1 | step 3400 | batch 3400/18000 | lr 0.00008 0.00005 | loss 0.3152 | s/batch 0.51
2020-08-17 00:42:03,890 INFO: | epoch   1 | step 3450 | batch 3450/18000 | lr 0.00008 0.00005 | loss 0.3905 | s/batch 0.47
2020-08-17 00:42:27,096 INFO: | epoch   1 | step 3500 | batch 3500/18000 | lr 0.00008 0.00005 | loss 0.3249 | s/batch 0.46
2020-08-17 00:42:52,343 INFO: | epoch   1 | step 3550 | batch 3550/18000 | lr 0.00008 0.00005 | loss 0.3488 | s/batch 0.50
2020-08-17 00:43:15,038 INFO: | epoch   1 | step 3600 | batch 3600/18000 | lr 0.00008 0.00005 | loss 0.3331 | s/batch 0.45
2020-08-17 00:43:38,816 INFO: | epoch   1 | step 3650 | batch 3650/18000 | lr 0.00008 0.00005 | loss 0.2393 | s/batch 0.48
2020-08-17 00:44:03,492 INFO: | epoch   1 | step 3700 | batch 3700/18000 | lr 0.00008 0.00005 | loss 0.2966 | s/batch 0.49
2020-08-17 00:44:26,885 INFO: | epoch   1 | step 3750 | batch 3750/18000 | lr 0.00008 0.00005 | loss 0.3827 | s/batch 0.47
2020-08-17 00:44

2020-08-17 01:07:45,413 INFO: | epoch   1 | step 6750 | batch 6750/18000 | lr 0.00004 0.00004 | loss 0.2847 | s/batch 0.43
2020-08-17 01:08:07,147 INFO: | epoch   1 | step 6800 | batch 6800/18000 | lr 0.00004 0.00004 | loss 0.2452 | s/batch 0.43
2020-08-17 01:08:27,659 INFO: | epoch   1 | step 6850 | batch 6850/18000 | lr 0.00004 0.00004 | loss 0.2017 | s/batch 0.41
2020-08-17 01:08:49,003 INFO: | epoch   1 | step 6900 | batch 6900/18000 | lr 0.00004 0.00004 | loss 0.2438 | s/batch 0.43
2020-08-17 01:09:11,457 INFO: | epoch   1 | step 6950 | batch 6950/18000 | lr 0.00004 0.00004 | loss 0.1810 | s/batch 0.45
2020-08-17 01:09:35,740 INFO: | epoch   1 | step 7000 | batch 7000/18000 | lr 0.00003 0.00004 | loss 0.2358 | s/batch 0.49
2020-08-17 01:09:59,828 INFO: | epoch   1 | step 7050 | batch 7050/18000 | lr 0.00003 0.00004 | loss 0.2500 | s/batch 0.48
2020-08-17 01:10:20,579 INFO: | epoch   1 | step 7100 | batch 7100/18000 | lr 0.00003 0.00004 | loss 0.2326 | s/batch 0.41
2020-08-17 01:10

2020-08-17 01:34:00,324 INFO: | epoch   1 | step 10100 | batch 10100/18000 | lr 0.00001 0.00004 | loss 0.2781 | s/batch 0.55
2020-08-17 01:34:23,554 INFO: | epoch   1 | step 10150 | batch 10150/18000 | lr 0.00001 0.00004 | loss 0.2379 | s/batch 0.46
2020-08-17 01:34:44,320 INFO: | epoch   1 | step 10200 | batch 10200/18000 | lr 0.00001 0.00004 | loss 0.1466 | s/batch 0.42
2020-08-17 01:35:07,105 INFO: | epoch   1 | step 10250 | batch 10250/18000 | lr 0.00001 0.00004 | loss 0.2260 | s/batch 0.46
2020-08-17 01:35:30,512 INFO: | epoch   1 | step 10300 | batch 10300/18000 | lr 0.00001 0.00004 | loss 0.1925 | s/batch 0.47
2020-08-17 01:35:52,736 INFO: | epoch   1 | step 10350 | batch 10350/18000 | lr 0.00001 0.00004 | loss 0.2543 | s/batch 0.44
2020-08-17 01:36:16,017 INFO: | epoch   1 | step 10400 | batch 10400/18000 | lr 0.00001 0.00004 | loss 0.2486 | s/batch 0.47
2020-08-17 01:36:38,957 INFO: | epoch   1 | step 10450 | batch 10450/18000 | lr 0.00001 0.00004 | loss 0.2336 | s/batch 0.46


2020-08-17 01:59:37,139 INFO: | epoch   1 | step 13400 | batch 13400/18000 | lr 0.00000 0.00004 | loss 0.1803 | s/batch 0.46
2020-08-17 02:00:02,112 INFO: | epoch   1 | step 13450 | batch 13450/18000 | lr 0.00000 0.00004 | loss 0.2223 | s/batch 0.50
2020-08-17 02:00:25,864 INFO: | epoch   1 | step 13500 | batch 13500/18000 | lr 0.00000 0.00004 | loss 0.2231 | s/batch 0.47
2020-08-17 02:00:49,060 INFO: | epoch   1 | step 13550 | batch 13550/18000 | lr 0.00000 0.00004 | loss 0.2474 | s/batch 0.46
2020-08-17 02:01:11,991 INFO: | epoch   1 | step 13600 | batch 13600/18000 | lr 0.00000 0.00004 | loss 0.2172 | s/batch 0.46
2020-08-17 02:01:33,442 INFO: | epoch   1 | step 13650 | batch 13650/18000 | lr 0.00000 0.00004 | loss 0.1821 | s/batch 0.43
2020-08-17 02:01:57,236 INFO: | epoch   1 | step 13700 | batch 13700/18000 | lr 0.00000 0.00004 | loss 0.2518 | s/batch 0.48
2020-08-17 02:02:19,566 INFO: | epoch   1 | step 13750 | batch 13750/18000 | lr 0.00000 0.00004 | loss 0.2038 | s/batch 0.45


2020-08-17 02:25:18,774 INFO: | epoch   1 | step 16700 | batch 16700/18000 | lr 0.00000 0.00003 | loss 0.1464 | s/batch 0.47
2020-08-17 02:25:41,669 INFO: | epoch   1 | step 16750 | batch 16750/18000 | lr 0.00000 0.00003 | loss 0.2280 | s/batch 0.46
2020-08-17 02:26:07,351 INFO: | epoch   1 | step 16800 | batch 16800/18000 | lr 0.00000 0.00003 | loss 0.2047 | s/batch 0.51
2020-08-17 02:26:30,099 INFO: | epoch   1 | step 16850 | batch 16850/18000 | lr 0.00000 0.00003 | loss 0.1569 | s/batch 0.45
2020-08-17 02:26:55,255 INFO: | epoch   1 | step 16900 | batch 16900/18000 | lr 0.00000 0.00003 | loss 0.2223 | s/batch 0.50
2020-08-17 02:27:19,489 INFO: | epoch   1 | step 16950 | batch 16950/18000 | lr 0.00000 0.00003 | loss 0.1519 | s/batch 0.48
2020-08-17 02:27:45,897 INFO: | epoch   1 | step 17000 | batch 17000/18000 | lr 0.00000 0.00003 | loss 0.2087 | s/batch 0.53
2020-08-17 02:28:09,946 INFO: | epoch   1 | step 17050 | batch 17050/18000 | lr 0.00000 0.00003 | loss 0.2173 | s/batch 0.48


2020-08-17 02:54:46,204 INFO: | epoch   2 | step 19100 | batch 1100/18000 | lr 0.00000 0.00003 | loss 0.2087 | s/batch 0.46
2020-08-17 02:55:07,835 INFO: | epoch   2 | step 19150 | batch 1150/18000 | lr 0.00000 0.00003 | loss 0.1364 | s/batch 0.43
2020-08-17 02:55:30,506 INFO: | epoch   2 | step 19200 | batch 1200/18000 | lr 0.00000 0.00003 | loss 0.1769 | s/batch 0.45
2020-08-17 02:55:51,757 INFO: | epoch   2 | step 19250 | batch 1250/18000 | lr 0.00000 0.00003 | loss 0.1839 | s/batch 0.42
2020-08-17 02:56:15,329 INFO: | epoch   2 | step 19300 | batch 1300/18000 | lr 0.00000 0.00003 | loss 0.1490 | s/batch 0.47
2020-08-17 02:56:36,307 INFO: | epoch   2 | step 19350 | batch 1350/18000 | lr 0.00000 0.00003 | loss 0.1664 | s/batch 0.42
2020-08-17 02:56:57,793 INFO: | epoch   2 | step 19400 | batch 1400/18000 | lr 0.00000 0.00003 | loss 0.1815 | s/batch 0.43
2020-08-17 02:57:22,128 INFO: | epoch   2 | step 19450 | batch 1450/18000 | lr 0.00000 0.00003 | loss 0.1288 | s/batch 0.49
2020-08-

2020-08-17 03:21:07,243 INFO: | epoch   2 | step 22450 | batch 4450/18000 | lr 0.00000 0.00003 | loss 0.1606 | s/batch 0.50
2020-08-17 03:21:29,811 INFO: | epoch   2 | step 22500 | batch 4500/18000 | lr 0.00000 0.00003 | loss 0.1504 | s/batch 0.45
2020-08-17 03:21:54,083 INFO: | epoch   2 | step 22550 | batch 4550/18000 | lr 0.00000 0.00003 | loss 0.1494 | s/batch 0.49
2020-08-17 03:22:16,843 INFO: | epoch   2 | step 22600 | batch 4600/18000 | lr 0.00000 0.00003 | loss 0.1512 | s/batch 0.46
2020-08-17 03:22:40,275 INFO: | epoch   2 | step 22650 | batch 4650/18000 | lr 0.00000 0.00003 | loss 0.1952 | s/batch 0.47
2020-08-17 03:23:00,929 INFO: | epoch   2 | step 22700 | batch 4700/18000 | lr 0.00000 0.00003 | loss 0.1953 | s/batch 0.41
2020-08-17 03:23:20,733 INFO: | epoch   2 | step 22750 | batch 4750/18000 | lr 0.00000 0.00003 | loss 0.0758 | s/batch 0.40
2020-08-17 03:23:43,573 INFO: | epoch   2 | step 22800 | batch 4800/18000 | lr 0.00000 0.00003 | loss 0.1189 | s/batch 0.46
2020-08-

2020-08-17 03:47:04,481 INFO: | epoch   2 | step 25800 | batch 7800/18000 | lr 0.00000 0.00003 | loss 0.1404 | s/batch 0.48
2020-08-17 03:47:26,551 INFO: | epoch   2 | step 25850 | batch 7850/18000 | lr 0.00000 0.00003 | loss 0.1695 | s/batch 0.44
2020-08-17 03:47:47,787 INFO: | epoch   2 | step 25900 | batch 7900/18000 | lr 0.00000 0.00003 | loss 0.1872 | s/batch 0.42
2020-08-17 03:48:08,288 INFO: | epoch   2 | step 25950 | batch 7950/18000 | lr 0.00000 0.00003 | loss 0.1615 | s/batch 0.41
2020-08-17 03:48:32,901 INFO: | epoch   2 | step 26000 | batch 8000/18000 | lr 0.00000 0.00003 | loss 0.1470 | s/batch 0.49
2020-08-17 03:48:55,346 INFO: | epoch   2 | step 26050 | batch 8050/18000 | lr 0.00000 0.00003 | loss 0.2089 | s/batch 0.45
2020-08-17 03:49:22,011 INFO: | epoch   2 | step 26100 | batch 8100/18000 | lr 0.00000 0.00003 | loss 0.2394 | s/batch 0.53
2020-08-17 03:49:43,817 INFO: | epoch   2 | step 26150 | batch 8150/18000 | lr 0.00000 0.00003 | loss 0.1551 | s/batch 0.44
2020-08-

2020-08-17 04:13:07,825 INFO: | epoch   2 | step 29100 | batch 11100/18000 | lr 0.00000 0.00002 | loss 0.1311 | s/batch 0.49
2020-08-17 04:13:31,715 INFO: | epoch   2 | step 29150 | batch 11150/18000 | lr 0.00000 0.00002 | loss 0.1083 | s/batch 0.48
2020-08-17 04:13:54,378 INFO: | epoch   2 | step 29200 | batch 11200/18000 | lr 0.00000 0.00002 | loss 0.1840 | s/batch 0.45
2020-08-17 04:14:16,093 INFO: | epoch   2 | step 29250 | batch 11250/18000 | lr 0.00000 0.00002 | loss 0.1501 | s/batch 0.43
2020-08-17 04:14:37,926 INFO: | epoch   2 | step 29300 | batch 11300/18000 | lr 0.00000 0.00002 | loss 0.1298 | s/batch 0.44
2020-08-17 04:15:03,360 INFO: | epoch   2 | step 29350 | batch 11350/18000 | lr 0.00000 0.00002 | loss 0.1774 | s/batch 0.51
2020-08-17 04:15:31,164 INFO: | epoch   2 | step 29400 | batch 11400/18000 | lr 0.00000 0.00002 | loss 0.1378 | s/batch 0.56
2020-08-17 04:15:54,851 INFO: | epoch   2 | step 29450 | batch 11450/18000 | lr 0.00000 0.00002 | loss 0.1511 | s/batch 0.47


2020-08-17 04:38:48,521 INFO: | epoch   2 | step 32400 | batch 14400/18000 | lr 0.00000 0.00002 | loss 0.1083 | s/batch 0.45
2020-08-17 04:39:13,009 INFO: | epoch   2 | step 32450 | batch 14450/18000 | lr 0.00000 0.00002 | loss 0.1698 | s/batch 0.49
2020-08-17 04:39:34,039 INFO: | epoch   2 | step 32500 | batch 14500/18000 | lr 0.00000 0.00002 | loss 0.1325 | s/batch 0.42
2020-08-17 04:39:57,063 INFO: | epoch   2 | step 32550 | batch 14550/18000 | lr 0.00000 0.00002 | loss 0.1371 | s/batch 0.46
2020-08-17 04:40:21,802 INFO: | epoch   2 | step 32600 | batch 14600/18000 | lr 0.00000 0.00002 | loss 0.1667 | s/batch 0.49
2020-08-17 04:40:43,978 INFO: | epoch   2 | step 32650 | batch 14650/18000 | lr 0.00000 0.00002 | loss 0.1923 | s/batch 0.44
2020-08-17 04:41:08,329 INFO: | epoch   2 | step 32700 | batch 14700/18000 | lr 0.00000 0.00002 | loss 0.1553 | s/batch 0.49
2020-08-17 04:41:34,974 INFO: | epoch   2 | step 32750 | batch 14750/18000 | lr 0.00000 0.00002 | loss 0.2058 | s/batch 0.53


2020-08-17 05:04:37,111 INFO: | epoch   2 | step 35700 | batch 17700/18000 | lr 0.00000 0.00002 | loss 0.1551 | s/batch 0.53
2020-08-17 05:05:00,046 INFO: | epoch   2 | step 35750 | batch 17750/18000 | lr 0.00000 0.00002 | loss 0.1214 | s/batch 0.46
2020-08-17 05:05:24,251 INFO: | epoch   2 | step 35800 | batch 17800/18000 | lr 0.00000 0.00002 | loss 0.1832 | s/batch 0.48
2020-08-17 05:05:49,273 INFO: | epoch   2 | step 35850 | batch 17850/18000 | lr 0.00000 0.00002 | loss 0.1913 | s/batch 0.50
2020-08-17 05:06:12,793 INFO: | epoch   2 | step 35900 | batch 17900/18000 | lr 0.00000 0.00002 | loss 0.1685 | s/batch 0.47
2020-08-17 05:06:34,073 INFO: | epoch   2 | step 35950 | batch 17950/18000 | lr 0.00000 0.00002 | loss 0.1147 | s/batch 0.43
2020-08-17 05:06:58,157 INFO: | epoch   2 | step 36000 | batch 18000/18000 | lr 0.00000 0.00002 | loss 0.1393 | s/batch 0.48
2020-08-17 05:06:58,385 INFO: | epoch   2 | score (94.83, 94.39, 94.61) | f1 94.61 | loss 0.1550 | time 8450.74
2020-08-17 05

2020-08-17 05:33:56,907 INFO: | epoch   3 | step 38100 | batch 2100/18000 | lr 0.00000 0.00001 | loss 0.1230 | s/batch 0.49
2020-08-17 05:34:18,350 INFO: | epoch   3 | step 38150 | batch 2150/18000 | lr 0.00000 0.00001 | loss 0.1182 | s/batch 0.43
2020-08-17 05:34:42,526 INFO: | epoch   3 | step 38200 | batch 2200/18000 | lr 0.00000 0.00001 | loss 0.1258 | s/batch 0.48
2020-08-17 05:35:04,720 INFO: | epoch   3 | step 38250 | batch 2250/18000 | lr 0.00000 0.00001 | loss 0.0904 | s/batch 0.44
2020-08-17 05:35:28,100 INFO: | epoch   3 | step 38300 | batch 2300/18000 | lr 0.00000 0.00001 | loss 0.1052 | s/batch 0.47
2020-08-17 05:35:51,352 INFO: | epoch   3 | step 38350 | batch 2350/18000 | lr 0.00000 0.00001 | loss 0.0846 | s/batch 0.46
2020-08-17 05:36:16,817 INFO: | epoch   3 | step 38400 | batch 2400/18000 | lr 0.00000 0.00001 | loss 0.1018 | s/batch 0.51
2020-08-17 05:36:40,079 INFO: | epoch   3 | step 38450 | batch 2450/18000 | lr 0.00000 0.00001 | loss 0.1351 | s/batch 0.47
2020-08-

2020-08-17 06:00:06,072 INFO: | epoch   3 | step 41450 | batch 5450/18000 | lr 0.00000 0.00001 | loss 0.1017 | s/batch 0.51
2020-08-17 06:00:29,298 INFO: | epoch   3 | step 41500 | batch 5500/18000 | lr 0.00000 0.00001 | loss 0.0918 | s/batch 0.46
2020-08-17 06:00:52,472 INFO: | epoch   3 | step 41550 | batch 5550/18000 | lr 0.00000 0.00001 | loss 0.1077 | s/batch 0.46
2020-08-17 06:01:18,198 INFO: | epoch   3 | step 41600 | batch 5600/18000 | lr 0.00000 0.00001 | loss 0.1886 | s/batch 0.51
2020-08-17 06:01:42,610 INFO: | epoch   3 | step 41650 | batch 5650/18000 | lr 0.00000 0.00001 | loss 0.1408 | s/batch 0.49
2020-08-17 06:02:03,974 INFO: | epoch   3 | step 41700 | batch 5700/18000 | lr 0.00000 0.00001 | loss 0.0961 | s/batch 0.43
2020-08-17 06:02:29,535 INFO: | epoch   3 | step 41750 | batch 5750/18000 | lr 0.00000 0.00001 | loss 0.1383 | s/batch 0.51
2020-08-17 06:02:51,392 INFO: | epoch   3 | step 41800 | batch 5800/18000 | lr 0.00000 0.00001 | loss 0.0928 | s/batch 0.44
2020-08-

2020-08-17 06:25:57,484 INFO: | epoch   3 | step 44800 | batch 8800/18000 | lr 0.00000 0.00001 | loss 0.1128 | s/batch 0.46
2020-08-17 06:26:20,408 INFO: | epoch   3 | step 44850 | batch 8850/18000 | lr 0.00000 0.00001 | loss 0.0690 | s/batch 0.46
2020-08-17 06:26:46,383 INFO: | epoch   3 | step 44900 | batch 8900/18000 | lr 0.00000 0.00001 | loss 0.1429 | s/batch 0.52
2020-08-17 06:27:08,149 INFO: | epoch   3 | step 44950 | batch 8950/18000 | lr 0.00000 0.00001 | loss 0.1164 | s/batch 0.44
2020-08-17 06:27:30,041 INFO: | epoch   3 | step 45000 | batch 9000/18000 | lr 0.00000 0.00001 | loss 0.1058 | s/batch 0.44
2020-08-17 06:27:53,564 INFO: | epoch   3 | step 45050 | batch 9050/18000 | lr 0.00000 0.00001 | loss 0.1134 | s/batch 0.47
2020-08-17 06:28:17,108 INFO: | epoch   3 | step 45100 | batch 9100/18000 | lr 0.00000 0.00001 | loss 0.1267 | s/batch 0.47
2020-08-17 06:28:39,107 INFO: | epoch   3 | step 45150 | batch 9150/18000 | lr 0.00000 0.00001 | loss 0.0868 | s/batch 0.44
2020-08-

2020-08-17 06:51:44,165 INFO: | epoch   3 | step 48100 | batch 12100/18000 | lr 0.00000 0.00001 | loss 0.1351 | s/batch 0.45
2020-08-17 06:52:08,436 INFO: | epoch   3 | step 48150 | batch 12150/18000 | lr 0.00000 0.00001 | loss 0.1154 | s/batch 0.49
2020-08-17 06:52:29,198 INFO: | epoch   3 | step 48200 | batch 12200/18000 | lr 0.00000 0.00001 | loss 0.1429 | s/batch 0.42
2020-08-17 06:52:50,751 INFO: | epoch   3 | step 48250 | batch 12250/18000 | lr 0.00000 0.00001 | loss 0.1093 | s/batch 0.43
2020-08-17 06:53:15,721 INFO: | epoch   3 | step 48300 | batch 12300/18000 | lr 0.00000 0.00001 | loss 0.0878 | s/batch 0.50
2020-08-17 06:53:40,237 INFO: | epoch   3 | step 48350 | batch 12350/18000 | lr 0.00000 0.00001 | loss 0.1126 | s/batch 0.49
2020-08-17 06:54:03,896 INFO: | epoch   3 | step 48400 | batch 12400/18000 | lr 0.00000 0.00001 | loss 0.0648 | s/batch 0.47
2020-08-17 06:54:24,894 INFO: | epoch   3 | step 48450 | batch 12450/18000 | lr 0.00000 0.00001 | loss 0.1266 | s/batch 0.42


2020-08-17 07:17:46,471 INFO: | epoch   3 | step 51400 | batch 15400/18000 | lr 0.00000 0.00000 | loss 0.0702 | s/batch 0.47
2020-08-17 07:18:10,809 INFO: | epoch   3 | step 51450 | batch 15450/18000 | lr 0.00000 0.00000 | loss 0.1050 | s/batch 0.49
2020-08-17 07:18:37,757 INFO: | epoch   3 | step 51500 | batch 15500/18000 | lr 0.00000 0.00000 | loss 0.0915 | s/batch 0.54
2020-08-17 07:19:01,177 INFO: | epoch   3 | step 51550 | batch 15550/18000 | lr 0.00000 0.00000 | loss 0.0615 | s/batch 0.47
2020-08-17 07:19:27,778 INFO: | epoch   3 | step 51600 | batch 15600/18000 | lr 0.00000 0.00000 | loss 0.1688 | s/batch 0.53
2020-08-17 07:19:50,442 INFO: | epoch   3 | step 51650 | batch 15650/18000 | lr 0.00000 0.00000 | loss 0.1385 | s/batch 0.45
2020-08-17 07:20:14,128 INFO: | epoch   3 | step 51700 | batch 15700/18000 | lr 0.00000 0.00000 | loss 0.1031 | s/batch 0.47
2020-08-17 07:20:40,193 INFO: | epoch   3 | step 51750 | batch 15750/18000 | lr 0.00000 0.00000 | loss 0.0992 | s/batch 0.52


2020-08-17 07:48:48,191 INFO: Exceed history dev = 95.09, current dev = 95.49


In [43]:

# test
trainer.test()

# BERT 优化参数比较

以 rows=2000 作为测试行数，对优化器、sent_hidden_size、batch_size 和 fold_num 进行调优，各组合的结果如下：

| 优化器              | sent_hidden_size | batch_size | loss   | f1    | time   | fold_num | 排行 |
| :------------------ | :--------------- | :--------- | :----- | :---- | :----- | :------- | :--- |
| torch.optim.SGD     | 256              | 10         | 2.4983 | 8.3   | 99.81  | 10       | 10   |
| torch.optim.RMSprop | 256              | 10         | 1.8028 | 26.84 | 99.65  | 10       | 3    |
| torch.optim.Adagrad | 256              | 10         | 1.9230 | 14.65 | 98.70  | 10       | 9    |
| torch.optim.RMSprop | 512              | 10         | 1.9720 | 23.27 | 101.48 | 10       | 7    |
| torch.optim.RMSprop | 128              | 10         | 1.7905 | 23.56 | 105.51 | 10       | 5    |
| torch.optim.RMSprop | 256              | 13         | 1.8140 | 22.94 | 97.86  | 10       | 8    |
| torch.optim.RMSprop | 256              | 16         | 1.9063 | 23.56 | 96.54  | 10       | 6    |
| torch.optim.RMSprop | 256              | 6          | 1.8069 | 25.22 | 123.51 | 10       | 4    |
| torch.optim.RMSprop | 256              | 10         | 1.3445 | 54.43 | 111.90 | 15       | 1    |
| torch.optim.RMSprop | 256              | 10         | 1.3791 | 52.17 | 111.60 | 20       | 2    |


由以上对比结果，选择排名第一的组合：

| 优化器              | sent_hidden_size | batch_size | loss   | f1       | time   | fold_num | 排行 |
| :------------------- | :---------------- | :---------- | :------ | :-------- | :------ | :-------- | :---- |
| torch.optim.RMSprop  | 256              | 10         | 1.3445 | 54.43    | 111.90 | 15       | 1 |

# 以上述排名第一的参数组合训练后，得分：0.9562