# 思路
先把transformers官方版本复现一遍

In [1]:
import torch
from torch import nn
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
from transformers.data.data_collator import default_data_collator
from loguru import logger

import os
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Union

from filelock import FileLock
from code.config import args
import pandas as pd
import numpy as np

from torch.nn import CrossEntropyLoss, MSELoss

from transformers.modeling_outputs import TokenClassifierOutput
# from transformers import PreTrainedTokenizer#, is_torch_available

from transformers import Trainer, AutoModelForTokenClassification, AutoConfig
from transformers.modeling_bert import BertPreTrainedModel, BertModel
from datetime import datetime

from seqeval.metrics import f1_score, precision_score, accuracy_score, recall_score, classification_report

# api

## config

In [3]:
max_seq_length = 250

# model_name_or_path = 'bert-base-cased'
model_name_or_path = 'clue/roberta_chinese_base'
epochs = 10

## data prepare

### InputExample | InputFeatures | Split

In [4]:
@dataclass
class InputExample:
    """
    A single training/test example for token classification.
    Args:
        guid: Unique id for the example.
        words: list. The words of the sequence.
        labels: (Optional) list. The labels for each word of the sequence. This should be
        specified for train and dev examples, but not for test examples.
    """

    guid: str
    words: List[str]
    labels: Optional[List[str]]


@dataclass
class InputFeatures:
    """
    A single set of features of data.
    Property names are the same names as the corresponding inputs to a model.
    """

    input_ids: List[int]
    attention_mask: List[int]
    token_type_ids: Optional[List[int]] = None
    label_ids: Optional[List[int]] = None


class Split(Enum):
    train = "train"
    dev = "val"
    test = "test"


### read_examples_from_file

In [5]:
def read_examples_from_file(fp) -> List[InputExample]:
    """
    :fp: corpus路径
    """
    file_path = os.path.join(fp)
    guid_index = 1
    examples = []
    with open(file_path, encoding="utf-8") as f:
        words = []
        labels = []
        for line in f:
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
                    examples.append(InputExample(guid=f"{guid_index}", words=words, labels=labels))
                    guid_index += 1
                    words = []
                    labels = []
            else:
                splits = line.split(" ")
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[-1].replace("\n", ""))
                else:
                    # Examples could have no label for mode = "test"
                    labels.append("O")
        if words:
            examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
    return examples

In [6]:
examples = read_examples_from_file(f'{args.DATA_GEN}train_val.txt')
examples[:2]

[InputExample(guid='1', words=['7', '.', '服', '药', '1', '个', '月', '症', '状', '无', '缓', '解', '，', '应', '去', '医', '院', '就', '诊', '。', '8', '.', '对', '本', '品', '过', '敏', '者', '禁', '用', '，', '过', '敏', '体', '质', '者', '慎', '用', '。', '9', '.', '本', '品', '性', '状', '发', '生', '改', '变', '时', '禁', '止', '使', '用', '。', '1', '0', '.', '请', '将', '本', '品', '放', '在', '儿', '童', '不', '能', '接', '触', '的', '地', '方', '。', '1', '1', '.', '如', '正', '在', '使', '用', '其', '他', '药', '品', '，', '使', '用', '本', '品', '前', '请', '咨', '询', '医', '师', '或', '药', '师', '。', '丸', '剂', '(', '水', '蜜', '丸', ')', '镀', '铝', '复', '合', '膜', '，', '每', '袋', '装', '6', '克', '，', '每', '盒', '装', '1', '0', '袋', '。', '补', '气', '养', '血', '，', '调', '经', '止', '带', '。', '用', '于', '气', '血', '两', '虚', '，', '身', '体', '瘦', '弱', '，', '腰', '膝', '酸', '软', '，', '月', '经', '量', '少', '、', '后', '错', '，', '带', '下', '补', '气', '养', '血', '，', '调', '经', '止', '带', '6', 'g', '*', '1', '0', '袋', '孕', '妇', '禁', '用', '。', '补', '气', '养', '血', '、', '调', '经', '止', '带', '，',

In [6]:
len(examples)

1957

### get_labels

In [7]:
def get_labels(path: str) -> List[str]:
    if path:
        with open(path, "r") as f:
            labels = f.read().splitlines()
        if "O" not in labels:
            labels = ["O"] + labels
        return labels
    else:
        return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

In [8]:
labels = get_labels(f'{args.DATA_GEN}torch_ner_data/labels.txt')
labels

['O',
 'B-DRUG',
 'I-DRUG',
 'B-DRUG_INGREDIENT',
 'I-DRUG_INGREDIENT',
 'B-DISEASE',
 'I-DISEASE',
 'B-SYMPTOM',
 'I-SYMPTOM',
 'B-SYNDROME',
 'I-SYNDROME',
 'B-DISEASE_GROUP',
 'I-DISEASE_GROUP',
 'B-FOOD',
 'I-FOOD',
 'B-FOOD_GROUP',
 'I-FOOD_GROUP',
 'B-PERSON_GROUP',
 'I-PERSON_GROUP',
 'B-DRUG_GROUP',
 'I-DRUG_GROUP',
 'B-DRUG_DOSAGE',
 'I-DRUG_DOSAGE',
 'B-DRUG_TASTE',
 'I-DRUG_TASTE',
 'B-DRUG_EFFICACY',
 'I-DRUG_EFFICACY']

In [9]:
label_map = {i: label for i, label in enumerate(labels)}

### convert_examples_to_features

In [10]:
def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_seq_length: int,
    tokenizer,#: PreTrainedTokenizer,
    cls_token_at_end=False,
    cls_token="[CLS]",
    cls_token_segment_id=0,
    sep_token="[SEP]",
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    pad_token_label_id=-100,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """ Loads a data file into a list of `InputFeatures`
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """
    # TODO clean up all this to leverage built-in features of tokenizers

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10_000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)

            # bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space.
            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = tokenizer.num_special_tokens_to_add()
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[: (max_seq_length - special_tokens_count)]
            label_ids = label_ids[: (max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
            input_ids += [pad_token] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

#         if ex_index < 5:
#             logger.info("*** Example ***")
#             logger.info("guid: %s", example.guid)
#             logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
#             logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
#             logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
#             logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
#             logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))

        if "token_type_ids" not in tokenizer.model_input_names:
            segment_ids = None

            
            
        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label_ids=label_ids
            )
        )
    return features

In [11]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [53]:
features = convert_examples_to_features(examples, labels, max_length, tokenizer)

2020-09-29 17:25:37.340 | INFO     | __main__:convert_examples_to_features:31 - Writing example %d of %d
2020-09-29 17:25:37.346 | INFO     | __main__:convert_examples_to_features:110 - *** Example ***
2020-09-29 17:25:37.346 | INFO     | __main__:convert_examples_to_features:111 - guid: %s
2020-09-29 17:25:37.346 | INFO     | __main__:convert_examples_to_features:112 - tokens: %s
2020-09-29 17:25:37.347 | INFO     | __main__:convert_examples_to_features:113 - input_ids: %s
2020-09-29 17:25:37.347 | INFO     | __main__:convert_examples_to_features:114 - input_mask: %s
2020-09-29 17:25:37.348 | INFO     | __main__:convert_examples_to_features:115 - segment_ids: %s
2020-09-29 17:25:37.348 | INFO     | __main__:convert_examples_to_features:116 - label_ids: %s
2020-09-29 17:25:37.353 | INFO     | __main__:convert_examples_to_features:110 - *** Example ***
2020-09-29 17:25:37.353 | INFO     | __main__:convert_examples_to_features:111 - guid: %s
2020-09-29 17:25:37.353 | INFO     | __main__:

In [54]:
features[1]

InputFeatures(input_ids=[101, 1020, 100, 1034, 100, 886, 100, 995, 100, 1052, 1010, 100, 100, 100, 100, 100, 100, 100, 991, 999, 1020, 100, 1034, 100, 886, 1039, 100, 100, 100, 100, 100, 1099, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 1099, 100, 100, 1099, 100, 100, 1099, 100, 100, 886, 100, 100, 100, 121, 119, 125, 176, 886, 100, 100, 1099, 122, 121, 100, 120, 100, 115, 124, 100, 120, 100, 886, 100, 100, 100, 100, 886, 100, 100, 1102, 100, 100, 100, 100, 100, 1099, 100, 100, 100, 1006, 886, 133, 171, 187, 120, 135, 980, 100, 1102, 100, 100, 886, 100, 100, 1099, 100, 100, 100, 100, 886, 100, 100, 100, 100, 100, 100, 885, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 1056, 100, 100, 886, 100, 100, 1099, 100, 100, 100, 100, 886, 100, 100, 100, 100, 100, 100, 885, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 1056, 886, 997, 100, 1099, 976, 100, 124, 118, 126, 100, 1099, 976, 1033, 124, 100, 100, 100, 100, 100, 100, 886, 1037, 100, 100, 122, 1

### NerDataset

In [11]:
class NerDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach
    soon.
    """

    features: List[InputFeatures]
    pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
    # Use cross entropy ignore_index as padding label id so that only
    # real label ids contribute to the loss later.

    def __init__(
        self,
        examples,
        tokenizer, #: PreTrainedTokenizer,
        labels: List[str],
        model_type: str,
        max_seq_length: Optional[int] = None,
    ):
        
        # TODO clean up all this to leverage built-in features of tokenizers
        self.features = convert_examples_to_features(
            examples,
            labels,
            max_seq_length,
            tokenizer,
            cls_token_at_end=bool(model_type in ["xlnet"]),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=False,
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(tokenizer.padding_side == "left"),
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=tokenizer.pad_token_type_id,
            pad_token_label_id=self.pad_token_label_id,
        )

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i) -> InputFeatures:
        return self.features[i]


### data loader

In [11]:
train_data_loader = DataLoader(features, batch_size=6, shuffle=True,collate_fn=default_data_collator)

NameError: name 'features' is not defined

In [12]:
len(train_data_loader)  # size of features / batch_size

2

In [32]:
# for step, inputs in enumerate(train_data_loader):
#     print(step)
#     print(inputs)
#     break

0
{'labels': tensor([[-100,    0,    0,  ...,    0,    0, -100],
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    0,    0,  ...,    0,    0, -100],
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    0,    0,  ...,    0,    0, -100],
        [-100,    0,    0,  ...,    0,    0, -100]]), 'input_ids': tensor([[ 101,  125,  119,  ...,  100,  100,  102],
        [ 101,  100,  100,  ...,    0,    0,    0],
        [ 101,  100,  100,  ...,  176,  115,  102],
        [ 101,  100,  100,  ...,    0,    0,    0],
        [ 101, 1039,  100,  ...,  100,  100,  102],
        [ 101,  100,  100,  ...,  100,  100,  102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
  

## Trainer

In [12]:
from transformers.trainer_utils import nested_concat, nested_numpify, EvalPrediction
from typing import Dict, List, Optional, Tuple

In [13]:
def compute_metrics(p: EvalPrediction) -> Dict:
    preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
#     print(preds_list)
#     print(out_label_list)
    return {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

In [14]:
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
    preds = np.argmax(predictions, axis=2)

    batch_size, seq_len = preds.shape

    out_label_list = [[] for _ in range(batch_size)]
    preds_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                out_label_list[i].append(label_map[label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])
#         print(len(out_label_list[i]), len(preds_list[i]))
    return preds_list, out_label_list


### train_args

In [15]:
config = AutoConfig.from_pretrained(
        model_name_or_path,
        num_labels=len(labels),
        id2label= label_map,
        label2id={label: i for i, label in enumerate(labels)},
    )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=621.0, style=ProgressStyle(description_…




In [27]:
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-DRUG",
    "2": "I-DRUG",
    "3": "B-DRUG_INGREDIENT",
    "4": "I-DRUG_INGREDIENT",
    "5": "B-DISEASE",
    "6": "I-DISEASE",
    "7": "B-SYMPTOM",
    "8": "I-SYMPTOM",
    "9": "B-SYNDROME",
    "10": "I-SYNDROME",
    "11": "B-DISEASE_GROUP",
    "12": "I-DISEASE_GROUP",
    "13": "B-FOOD",
    "14": "I-FOOD",
    "15": "B-FOOD_GROUP",
    "16": "I-FOOD_GROUP",
    "17": "B-PERSON_GROUP",
    "18": "I-PERSON_GROUP",
    "19": "B-DRUG_GROUP",
    "20": "I-DRUG_GROUP",
    "21": "B-DRUG_DOSAGE",
    "22": "I-DRUG_DOSAGE",
    "23": "B-DRUG_TASTE",
    "24": "I-DRUG_TASTE",
    "25": "B-DRUG_EFFICACY",
    "26": "I-DRUG_EFFICACY"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-DISEASE": 5,
    "B

### MyBertForTokenClassification

In [16]:
class MyBertForTokenClassification(BertPreTrainedModel):

    authorized_unexpected_keys = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=True,
        return_dict=True,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        
#         return outputs, logits
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [14]:
model = MyBertForTokenClassification(config)

model = model.to('cuda')

model.train()

MyBertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [23]:
for step, inputs in enumerate(train_data_loader):
#     print(step)
#     print(inputs)
    for k, v in inputs.items():
        if isinstance(v, torch.Tensor):
            inputs[k] = v.to('cuda')
    outputs, logits = model(**inputs)
#     print(outputs[0])
    break

In [33]:
logits.size()

torch.Size([6, 200, 27])

In [34]:
inputs['attention_mask'].size()

torch.Size([6, 200])

In [36]:
active_loss = inputs['attention_mask'].view(-1) == 1

In [37]:
active_loss

tensor([True, True, True,  ..., True, True, True], device='cuda:0')

In [38]:
active_logits = logits.view(-1, 27)

In [40]:
active_logits.size()

torch.Size([1200, 27])

In [24]:
len(outputs)

2

In [30]:
len(outputs[1]),outputs[1]

(13,
 (tensor([[[ 0.9684,  2.7308,  0.0000,  ..., -0.6839, -2.0568,  0.1047],
           [-0.4525,  0.7580,  1.0821,  ..., -0.9978, -1.4021,  0.0256],
           [-0.1976,  0.0000,  1.2650,  ..., -2.1700, -2.6246, -0.7749],
           ...,
           [ 0.4136, -0.3217, -0.2158,  ..., -1.3179, -1.3585, -0.5520],
           [ 0.9265, -0.1301, -0.5426,  ..., -0.3856, -1.7954, -1.3358],
           [ 0.5968, -0.1118,  0.0000,  ..., -1.6809, -1.4990,  0.1104]],
  
          [[ 0.9684,  2.7308, -0.0140,  ..., -0.6839, -2.0568,  0.1047],
           [ 1.4905,  2.9093,  2.0309,  ..., -0.8984, -1.8409,  0.7107],
           [-1.2150,  1.2836,  1.4268,  ..., -0.6602, -2.2829,  0.0000],
           ...,
           [ 0.3063,  1.0189,  0.3453,  ..., -1.9695, -2.3599, -0.3869],
           [-0.0974, -0.5618,  0.7042,  ...,  0.6677, -1.9689,  0.3352],
           [-0.2496,  0.0063,  0.5047,  ..., -1.9593, -0.9733,  0.0000]],
  
          [[ 0.9684,  2.7308, -0.0140,  ..., -0.6839, -2.0568,  0.1047],
      

In [28]:
outputs[0].size(),outputs[0]

(torch.Size([6, 200, 768]),
 tensor([[[ 1.7437,  0.9291,  1.2363,  ...,  1.0962,  0.7286, -0.3223],
          [ 1.2185,  0.3670,  0.5582,  ...,  0.8326,  0.1728, -0.2502],
          [ 1.6449,  0.0837,  0.7441,  ...,  2.2655, -1.0283, -0.6514],
          ...,
          [ 2.3095, -0.4218, -0.2199,  ...,  2.2289, -0.5987, -0.3178],
          [ 1.4366,  1.5790, -1.5793,  ...,  0.5487, -0.8128,  0.2379],
          [ 1.3789, -0.1577, -0.0923,  ...,  1.7618, -0.0452, -0.0863]],
 
         [[ 1.5035,  3.0063,  0.0811,  ...,  1.2835, -0.4432, -0.2661],
          [ 2.1502,  1.5924,  0.1841,  ..., -0.1408,  0.2129,  0.5527],
          [ 0.2781,  0.0848, -0.6194,  ...,  1.3902, -0.7994, -0.9657],
          ...,
          [ 1.5841, -0.5406, -0.3822,  ...,  1.6574, -0.0868, -0.0596],
          [ 2.2570, -0.5259, -1.0676,  ...,  0.9329, -0.8196, -0.1864],
          [ 1.2071,  0.7652,  0.2261,  ...,  0.9018,  1.1688,  0.2020]],
 
         [[ 1.1645,  1.2092,  0.8128,  ...,  1.3815,  0.6917,  0.1494],


## 5 fold

In [17]:
from sklearn.model_selection import KFold

In [18]:
folds = KFold(5, shuffle=True, random_state=2019)

In [20]:
!pip freeze | grep transf

transformers==3.3.0


In [21]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-bert-wwm')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=647.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=19.0, style=ProgressStyle(description_w…




In [22]:
model_name_or_path = 'hfl/chinese-bert-wwm'

In [None]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(examples)):
#     print(trn_idx, val_idx)
    examples_train = pd.Series(examples).iloc[trn_idx].tolist()
    examples_val = pd.Series(examples).iloc[val_idx].tolist()
    trainset = NerDataset(examples_train, tokenizer, labels, config.model_type, max_seq_length)
    train_data_loader = DataLoader(trainset, batch_size=6, shuffle=True,collate_fn=default_data_collator)
    valset = NerDataset(examples_val, tokenizer, labels, config.model_type, max_seq_length)
    val_data_loader = DataLoader(valset, batch_size=6, shuffle=True,collate_fn=default_data_collator)
    
#     model = MyBertForTokenClassification(config)
    model = AutoModelForTokenClassification.from_pretrained(model_name_or_path, 
                                                            config = config)
    model = model.to('cuda')
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
    
    time_start = datetime.now()
    
    epochs = 50
    for epoch in range(epochs):
        losst = 0
        for step, inputs in enumerate(train_data_loader):
            optimizer.zero_grad()
        #     print(step)
        #     print(inputs)
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to('cuda')
#             print(inputs)
#             break
            outputs = model(**inputs)
            loss = outputs[0]
            losst += loss
            loss.backward()
            optimizer.step()
#             if step % 100 == 0:
#                 print(epoch, step, loss.item())
        
        with torch.no_grad():
            model.eval()
            lossv = 0
            preds: torch.Tensor = None
            label_ids: torch.Tensor = None
            for stepv, inputsv in enumerate(val_data_loader):
                for k, v in inputsv.items():
                    if isinstance(v, torch.Tensor):
                        inputsv[k] = v.to('cuda')
                outputs = model(**inputsv)
                lossv += outputs[0] 
                logits = outputs[1]
                preds = logits if preds is None else nested_concat(preds, logits, dim=0)
                label_ids = inputsv['labels'] if label_ids is None else nested_concat(label_ids, inputsv['labels'], dim=0)
            preds = nested_numpify(preds)
            label_ids = nested_numpify(label_ids)
            metrics = compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
            now = datetime.now()
            print(f'{now.strftime("%Y-%m-%d %H:%M:%S")} {int((now-time_start).total_seconds())}s epoch: {epoch}; train loss: {losst}; val loss: {lossv}\nmetrics:{metrics}')
        model.train()
        time_start = now
    break

2020-09-30 17:20:16.027 | INFO     | __main__:convert_examples_to_features:31 - Writing example %d of %d
2020-09-30 17:20:21.302 | INFO     | __main__:convert_examples_to_features:31 - Writing example %d of %d


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411578458.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at hfl/chinese-bert-wwm were not used when initializing RobertaForTokenClassification: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at hfl/chinese-bert-wwm and are newly initialized: ['embeddings.word_embeddings.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.LayerNorm.weight', 'embeddings.LayerNorm.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.LayerNorm.weig

2020-09-30 17:21:18 34s epoch: 0; train loss: 194.565673828125; val loss: 36.21962356567383
metrics:{'precision': 0.07949977668602054, 'recall': 0.099302649930265, 'f1': 0.08830460126503782}
2020-09-30 17:21:53 35s epoch: 1; train loss: 138.8235321044922; val loss: 33.32533264160156
metrics:{'precision': 0.10346611484738748, 'recall': 0.16736401673640167, 'f1': 0.12787723785166238}
2020-09-30 17:22:29 35s epoch: 2; train loss: 124.23117065429688; val loss: 31.883255004882812
metrics:{'precision': 0.09140182310914018, 'recall': 0.10348675034867504, 'f1': 0.09706959706959707}
2020-09-30 17:23:04 35s epoch: 3; train loss: 116.52886962890625; val loss: 30.178844451904297
metrics:{'precision': 0.11733258300506472, 'recall': 0.23263598326359833, 'f1': 0.15598989993453663}


In [23]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(examples)):
#     print(trn_idx, val_idx)
    examples_train = pd.Series(examples).iloc[trn_idx].tolist()
    examples_val = pd.Series(examples).iloc[val_idx].tolist()
    trainset = NerDataset(examples_train, tokenizer, labels, config.model_type, max_seq_length)
    train_data_loader = DataLoader(trainset, batch_size=6, shuffle=True,collate_fn=default_data_collator)
    valset = NerDataset(examples_val, tokenizer, labels, config.model_type, max_seq_length)
    val_data_loader = DataLoader(valset, batch_size=6, shuffle=True,collate_fn=default_data_collator)
    
    model = MyBertForTokenClassification.from_pretrained(model_name_or_path, config=config)
    
    model = model.to('cuda')
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
    
    time_start = datetime.now()
    
    epochs = 50
    for epoch in range(epochs):
        losst = 0
        model.train()
        for step, inputs in enumerate(train_data_loader):
            optimizer.zero_grad()
        #     print(step)
        #     print(inputs)
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to('cuda')
#             print(inputs)
#             break
            outputs = model(**inputs)
            loss = outputs[0]
            losst += loss.item()
            loss.backward()
            optimizer.step()
#             if step % 100 == 0:
#                 print(epoch, step, loss.item())
        
        with torch.no_grad():
            model.eval()
            lossv = 0
            preds: torch.Tensor = None
            label_ids: torch.Tensor = None
            for stepv, inputsv in enumerate(val_data_loader):
                for k, v in inputsv.items():
                    if isinstance(v, torch.Tensor):
                        inputsv[k] = v.to('cuda')
                outputs = model(**inputsv)
                lossv += outputs[0] 
                logits = outputs[1]
                preds = logits if preds is None else nested_concat(preds, logits, dim=0)
                label_ids = inputsv['labels'] if label_ids is None else nested_concat(label_ids, inputsv['labels'], dim=0)
            preds = nested_numpify(preds)
            label_ids = nested_numpify(label_ids)
            metrics = compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
            now = datetime.now()
            print(f'{now.strftime("%Y-%m-%d %H:%M:%S")} {int((now-time_start).total_seconds())}s epoch: {epoch}; train loss: {losst}; val loss: {lossv}\nmetrics:{metrics}')
        
        time_start = now
    break

2020-09-30 16:58:08.923 | INFO     | __main__:convert_examples_to_features:31 - Writing example %d of %d
2020-09-30 16:58:13.903 | INFO     | __main__:convert_examples_to_features:31 - Writing example %d of %d
Some weights of the model checkpoint at bert-base-cased were not used when initializing MyBertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing MyBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing MyBertForTokenClassification from the checkpoint of a model that you expect to be exact

2020-09-30 16:58:53 35s epoch: 0; train loss: 233.0487336218357; val loss: 46.893863677978516
metrics:{'precision': 0.15670969678775143, 'recall': 0.14560669456066946, 'f1': 0.15095430884904568}
2020-09-30 16:59:28 35s epoch: 1; train loss: 166.47629860043526; val loss: 39.51923751831055
metrics:{'precision': 0.2821650399290151, 'recall': 0.26610878661087867, 'f1': 0.2739018087855297}
2020-09-30 17:00:04 35s epoch: 2; train loss: 138.5117626786232; val loss: 33.81781768798828
metrics:{'precision': 0.33513206475433116, 'recall': 0.3291492329149233, 'f1': 0.3321137067267098}
2020-09-30 17:00:39 35s epoch: 3; train loss: 124.96450391411781; val loss: 32.58758544921875
metrics:{'precision': 0.3378901212277534, 'recall': 0.36541143654114366, 'f1': 0.35111230233181456}
2020-09-30 17:01:15 35s epoch: 4; train loss: 115.32277190685272; val loss: 31.041973114013672
metrics:{'precision': 0.3672627235213205, 'recall': 0.3723849372384937, 'f1': 0.3698060941828255}
2020-09-30 17:01:51 35s epoch: 5;

KeyboardInterrupt: 

In [90]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(examples)):
#     print(trn_idx, val_idx)
    examples_train = pd.Series(examples).iloc[trn_idx].tolist()
    examples_val = pd.Series(examples).iloc[val_idx].tolist()
    trainset = NerDataset(examples_train, tokenizer, labels, config.model_type, max_seq_length)
    train_data_loader = DataLoader(trainset, batch_size=6, shuffle=True,collate_fn=default_data_collator)
    valset = NerDataset(examples_val, tokenizer, labels, config.model_type, max_seq_length)
    val_data_loader = DataLoader(valset, batch_size=6, shuffle=True,collate_fn=default_data_collator)
    
#     model = MyBertForTokenClassification(config)

#     model = model.to('cuda')
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
    
    time_start = datetime.now()
    
    epochs = 50
    for epoch in range(epochs):
        losst = 0
        for step, inputs in enumerate(train_data_loader):
            optimizer.zero_grad()
        #     print(step)
        #     print(inputs)
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to('cuda')
#             print(inputs)
#             break
            outputs = model(**inputs)
            loss = outputs[0]
            losst += loss.item()
            loss.backward()
            optimizer.step()
#             if step % 100 == 0:
#                 print(epoch, step, loss.item())
        
        with torch.no_grad():
            model.eval()
            lossv = 0
            preds: torch.Tensor = None
            label_ids: torch.Tensor = None
            for stepv, inputsv in enumerate(val_data_loader):
                for k, v in inputsv.items():
                    if isinstance(v, torch.Tensor):
                        inputsv[k] = v.to('cuda')
                outputs = model(**inputsv)
                lossv += outputs[0] 
                logits = outputs[1]
                preds = logits if preds is None else nested_concat(preds, logits, dim=0)
                label_ids = inputsv['labels'] if label_ids is None else nested_concat(label_ids, inputsv['labels'], dim=0)
            preds = nested_numpify(preds)
            label_ids = nested_numpify(label_ids)
            metrics = compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
            now = datetime.now()
            print(f'{now.strftime("%Y-%m-%d %H:%M:%S")} {int((now-time_start).total_seconds())}s epoch: {epoch}; train loss: {losst}; val loss: {lossv}\nmetrics:{metrics}')
        model.train()
        time_start = now
    break

2020-09-30 16:14:24.209 | INFO     | __main__:convert_examples_to_features:31 - Writing example %d of %d
2020-09-30 16:14:29.097 | INFO     | __main__:convert_examples_to_features:31 - Writing example %d of %d


2020-09-30 16:15:05 35s epoch: 0; train loss: 199.09524354338646; val loss: 52.30827713012695
metrics:{'precision': 0.0, 'recall': 0.0, 'f1': 0}
2020-09-30 16:15:40 35s epoch: 1; train loss: 197.51129357516766; val loss: 52.45339584350586
metrics:{'precision': 0.0, 'recall': 0.0, 'f1': 0}
2020-09-30 16:16:16 35s epoch: 2; train loss: 195.32498371601105; val loss: 52.4373664855957
metrics:{'precision': 0.01778496362166532, 'recall': 0.006136680613668061, 'f1': 0.009124844462878473}
2020-09-30 16:16:51 35s epoch: 3; train loss: 193.34185588359833; val loss: 51.69658279418945
metrics:{'precision': 0.012589928057553957, 'recall': 0.003905160390516039, 'f1': 0.005961251862891207}
2020-09-30 16:17:27 35s epoch: 4; train loss: 190.9053521603346; val loss: 51.945770263671875
metrics:{'precision': 0.029885057471264367, 'recall': 0.010878661087866108, 'f1': 0.01595092024539877}
2020-09-30 16:18:03 35s epoch: 5; train loss: 189.11482363939285; val loss: 52.24580764770508
metrics:{'precision': 0.0

KeyboardInterrupt: 

In [27]:
for stepv, inputsv in enumerate(val_data_loader):
    for k, v in inputsv.items():
        if isinstance(v, torch.Tensor):
            inputsv[k] = v.to('cuda')
    outputs = model(**inputsv)
    break

In [28]:
outputs[0]

tensor(0.7820, device='cuda:0', grad_fn=<NllLossBackward>)

In [33]:
inputsv['labels'].size()

torch.Size([6, 250])

In [30]:
outputs[1].size()

torch.Size([6, 250, 27])

In [60]:
outputs[1]

tensor([[[ 7.2478, -0.9541,  0.3502,  ..., -1.7577,  0.8842,  1.6488],
         [ 9.1770,  0.4352,  1.2137,  ..., -1.2776,  1.2094, -1.8445],
         [ 9.2644,  1.6467,  1.2957,  ..., -2.3641, -0.5144,  0.7105],
         ...,
         [ 7.5256, -0.1242,  0.6146,  ..., -1.9995, -0.8282,  0.6976],
         [ 6.6889, -0.8332,  0.5084,  ..., -1.7485, -0.5956,  0.9563],
         [ 7.7120,  0.1404,  0.7082,  ..., -0.8425, -0.8540,  0.2866]],

        [[ 7.6796,  0.0823,  0.4245,  ..., -1.7960,  0.1496,  0.8459],
         [10.2698,  0.8696,  1.0195,  ..., -1.0480,  1.3240, -1.1397],
         [ 8.5301,  1.4456,  0.8132,  ..., -2.6058, -1.2604,  1.3601],
         ...,
         [ 8.3323, -0.8301,  0.1058,  ..., -1.7747, -0.8403, -0.2943],
         [ 6.8183, -0.8213,  0.8348,  ..., -1.5840, -0.7093,  0.0195],
         [ 7.3432,  0.2935,  0.7771,  ..., -1.3801, -1.3548, -0.1197]],

        [[ 7.1986, -1.1691, -1.4432,  ..., -1.6779,  0.7274,  1.7633],
         [ 5.2698, -0.9619, -2.1792,  ..., -1

In [64]:
preds = nested_numpify(outputs[1].detach())

In [65]:
label_ids = nested_numpify(inputsv['labels'])

In [83]:
metrics = compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))

229 229
229 229
81 81
217 217
214 214
88 88
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

In [74]:
metrics

{'precision': 0.0, 'recall': 0.0, 'f1': 0}

In [49]:
id2label= {i: label for i, label in enumerate(labels)}
label2id={label: i for i, label in enumerate(labels)}

In [50]:
y_true =inputsv['labels'].cpu().detach().numpy()

In [51]:
preds = np.argmax(outputs[1].cpu().detach().numpy(), 2)

In [52]:
preds.shape

(6, 250)

In [53]:
y_true.shape

(6, 250)