In [1]:
!pip install fugashi ipadic

[0m

In [2]:
import os
import time
import json
import unicodedata
import itertools
from functools import reduce
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertJapaneseTokenizer, BertForTokenClassification

# Pretrained Japanese BERT checkpoint (Hugging Face model id)
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
# Number of passes over the training set in the fine-tuning loop
EPOCHS = 3
# max_length passed to the tokenizer: every encoded sequence is padded/truncated to this many tokens
MAX_WORDS_LEN = 64

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Prefer the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [4]:
!git clone https://github.com/stockmarkteam/ner-wikipedia-dataset ../data

fatal: destination path '../data' already exists and is not an empty directory.


In [5]:
# Load the data. The file is opened in a `with` block so the handle is closed,
# and decoded explicitly as UTF-8 so the Japanese text loads the same way on
# every platform regardless of the default locale encoding.
with open('../data/ner-wikipedia-dataset/ner.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Dictionary mapping each named-entity type name to its integer label ID
# (0 is implicitly reserved for "not an entity").
type_id_dict = {
    "人名": 1,
    "法人名": 2,
    "政治的組織名": 3,
    "その他の組織名": 4,
    "地名": 5,
    "施設名": 6,
    "製品名": 7,
    "イベント名": 8
}
# Reverse lookup: label ID -> type name
type_name_dict = {v: k for k, v in type_id_dict.items()}

dataset[1]

{'curid': '2415078',
 'text': 'レッドフォックス株式会社は、東京都千代田区に本社を置くITサービス企業である。',
 'entities': [{'name': 'レッドフォックス株式会社', 'span': [0, 12], 'type': '法人名'},
  {'name': '東京都千代田区', 'span': [14, 21], 'type': '地名'}]}

In [6]:
# Convert each entity's category name into its integer label and NFKC-normalize
# the text. The conversion is guarded so that re-running this cell is harmless:
# NFKC normalization is idempotent, and entities that were already converted
# (no 'type' key left) are skipped instead of raising KeyError.
for sample in dataset:
    sample['text'] = unicodedata.normalize('NFKC', sample['text'])
    for e in sample["entities"]:
        if 'type' in e:
            e['type_id'] = type_id_dict[e.pop('type')]
dataset[1]

{'curid': '2415078',
 'text': 'レッドフォックス株式会社は、東京都千代田区に本社を置くITサービス企業である。',
 'entities': [{'name': 'レッドフォックス株式会社', 'span': [0, 12], 'type_id': 2},
  {'name': '東京都千代田区', 'span': [14, 21], 'type_id': 5}]}

In [7]:
# Split the dataset (in its stored order) into train / validation / test parts:
# the first n_train samples, the next n_val samples, and everything that remains.
n = len(dataset)
n_train, n_val = 3500, 1000
dataset_train, dataset_val, dataset_test = (
    dataset[:n_train],
    dataset[n_train:n_train + n_val],
    dataset[n_train + n_val:],
)

print(f"Length of train: {len(dataset_train)}")
print(f"Length of val: {len(dataset_val)}")
print(f"Length of test: {len(dataset_test)}")

Length of train: 3500
Length of val: 1000
Length of test: 843


In [8]:
class NerTokenizerForTrain(BertJapaneseTokenizer):
    """BertJapaneseTokenizer with helpers that build token-level NER labels for training."""

    def create_tokens_and_labels(self, splitted):
        """Tokenize labelled text segments and give every token its segment's label.

        Args:
            splitted: list of dicts with keys 'text' and 'label', e.g.
                [{'text': 'レッドフォックス株式会社', 'label': 2},
                 {'text': 'は、', 'label': 0},
                 {'text': '東京都千代田区', 'label': 5},
                 {'text': 'に本社を置くITサービス企業である。', 'label': 0}]

        Returns:
            (tokens, labels): two parallel lists, e.g.
                ['レッド', 'フォックス', '株式会社', 'は', '、', '東京', '都', ...]
                [2, 2, 2, 0, 0, 5, 5, ...]
        """
        tokens, labels = [], []
        for segment in splitted:
            segment_text = segment['text']
            segment_label = segment['label']
            # Split the segment with the inherited tokenizer.
            segment_tokens = self.tokenize(segment_text)
            tokens += segment_tokens
            # One copy of the segment label per produced token.
            labels += [segment_label] * len(segment_tokens)
        return tokens, labels

    def encoding_for_bert(self, tokens, labels, max_length):
        """Encode a token sequence and attach a label sequence of the same padded length.

        Args:
            tokens: list of tokens.
            labels: list of labels, one per token.
            max_length: length the encoding is padded / truncated to.

        Returns:
            The result of `encode_plus` (input_ids, token_type_ids, attention_mask)
            with an additional 'labels' entry: a list of `max_length` ints in which
            the positions of [CLS], [SEP] and [PAD] carry label 0.
        """
        encoding = self.encode_plus(
            tokens,
            truncation=True,
            padding='max_length',
            max_length=max_length,
        )
        # Label 0 for [CLS], then at most max_length-2 token labels, then label 0 for [SEP].
        framed = [0, *labels[:max_length - 2], 0]
        # Label 0 for each remaining [PAD] position.
        framed.extend([0] * (max_length - len(framed)))
        encoding['labels'] = framed
        return encoding

    def encode_plus_tagged(self, text, entities, max_length):
        """Encode a sentence and build its label sequence from annotated entities.

        Args:
            text: the original sentence.
            entities: list of dicts, each with a 'span' ([start, end] character
                offsets into `text`) and a 'type_id' (integer label).
            max_length: length the encoding is padded / truncated to.

        Returns:
            The encoding produced by `encoding_for_bert`.
        """
        # Walk the entities from left to right, cutting the text at their boundaries.
        ordered = sorted(entities, key=lambda ent: ent['span'][0])
        pieces = []
        cursor = 0
        for ent in ordered:
            begin, finish = ent['span'][0], ent['span'][1]
            type_id = ent['type_id']
            pieces += [
                # Text between the previous entity and this one: label 0.
                {'text': text[cursor:begin], 'label': 0},
                # The entity itself: labelled with its type ID.
                {'text': text[begin:finish], 'label': type_id},
            ]
            cursor = finish
        # Text after the last entity: label 0.
        pieces.append({'text': text[cursor:], 'label': 0})

        # Zero-length pieces appear when an entity starts exactly at the cursor; drop them.
        non_empty = list(filter(lambda piece: piece['text'], pieces))

        tokens, labels = self.create_tokens_and_labels(non_empty)
        return self.encoding_for_bert(tokens, labels, max_length)

In [9]:
import pprint

# Instantiate the training tokenizer from the pretrained checkpoint's vocabulary.
tokenizer = NerTokenizerForTrain.from_pretrained(MODEL_NAME)

# Sanity check: show one training sample followed by its encoded form.
tmp = dataset_train[1]
pprint.pprint(tmp)
tmp_encoding = tokenizer.encode_plus_tagged(
    text=tmp["text"],
    entities=tmp["entities"],
    max_length=MAX_WORDS_LEN,
)
pprint.pprint(tmp_encoding)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'NerTokenizerForTrain'.


{'curid': '2415078',
 'entities': [{'name': 'レッドフォックス株式会社', 'span': [0, 12], 'type_id': 2},
              {'name': '東京都千代田区', 'span': [14, 21], 'type_id': 5}],
 'text': 'レッドフォックス株式会社は、東京都千代田区に本社を置くITサービス企業である。'}
{'attention_mask': [1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
      

In [10]:
def collate_batch(batch):
    """Encode a list of raw samples and stack them into one dict of tensors.

    Args:
        batch: list of samples, each a dict with 'text' and 'entities'.

    Returns:
        dict mapping every key of the per-sample encodings to a tensor built
        from the list of that key's values across the batch.
    """
    per_sample = [
        tokenizer.encode_plus_tagged(
            text=ele["text"],
            entities=ele["entities"],
            max_length=MAX_WORDS_LEN,
        )
        for ele in batch
    ]

    # Transpose: list of {key: value} -> {key: [value, value, ...]}
    columns = {}
    for sample_encoding in per_sample:
        for key, value in sample_encoding.items():
            columns.setdefault(key, []).append(value)

    return {key: torch.tensor(values) for key, values in columns.items()}
    
# Create the data loaders. Training batches are reshuffled on each pass over the loader;
# the validation loader keeps a fixed order because shuffling brings no benefit
# for evaluation and only makes runs harder to compare.
train_dataloader = DataLoader(dataset_train, batch_size=35, shuffle=True, pin_memory=True, collate_fn=collate_batch)
val_dataloader = DataLoader(dataset_val, batch_size=100, shuffle=False, pin_memory=True, collate_fn=collate_batch)
train_len = len(dataset_train)
# NOTE: despite its name, test_len holds the size of the *validation* set.
test_len = len(dataset_val)

# Peek at one collated training batch.
next(iter(train_dataloader))

{'input_ids': tensor([[   2,  408,    5,  ...,    0,    0,    0],
         [   2,  521,    6,  ...,    0,    0,    0],
         [   2, 4006,    6,  ...,    0,    0,    0],
         ...,
         [   2, 1526,   19,  ...,    0,    0,    0],
         [   2,   70,   19,  ...,    0,    0,    0],
         [   2,  960,   19,  ...,    0,    0,    0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
 

In [11]:
# Load the pretrained model with a token-classification head.
# The number of labels is derived from the label dictionary instead of being
# hard-coded: label 0 ("not an entity") plus the IDs in type_id_dict.
model = BertForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=max(type_id_dict.values()) + 1,
).to(device)
model

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the m

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [12]:
# Adam optimizer over all model parameters for fine-tuning.
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)
# Enable the cuDNN auto-tuner.
torch.backends.cudnn.benchmark = True

for epoch_num in range(EPOCHS):

    # Per-batch losses of the current epoch; their running mean is what gets printed.
    total_loss_train = []
    total_loss_val = []

    # ---- Training pass ----
    time_s = time.perf_counter()
    model.train()
    for i, batch in enumerate(train_dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        # With return_dict=False and labels supplied, the first output is the loss.
        loss = model(input_ids=input_ids,
                     token_type_ids=None,
                     attention_mask=attention_mask,
                     labels=labels,
                     return_dict=False)[0]

        total_loss_train.append(loss.item())

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Progress is measured against the real number of batches, so it stays
        # correct even when the dataset size is not a multiple of the batch size.
        print(f"epoch:{epoch_num+1}({(i + 1) / len(train_dataloader):.2%}) | Train Loss: {np.mean(total_loss_train):.3f} | Elapsed: {(time.perf_counter() - time_s):.1f}s", end='\r')

    print()

    # ---- Validation pass (no gradient tracking) ----
    time_s = time.perf_counter()
    model.eval()
    with torch.no_grad():
        for i, batch in enumerate(val_dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            loss = model(input_ids=input_ids,
                         token_type_ids=None,
                         attention_mask=attention_mask,
                         labels=labels,
                         return_dict=False)[0]

            total_loss_val.append(loss.item())

            # Labelled "Val Loss": this line reports the validation loss, not the training loss.
            print(f"epoch:{epoch_num+1}({(i + 1) / len(val_dataloader):.2%}) | Val Loss: {np.mean(total_loss_val):.3f} | Elapsed: {(time.perf_counter() - time_s):.1f}s", end='\r')
    print()

epoch:1(100.00%) | Train Loss: 0.352 | Elapsed: 25.1s
epoch:1(100.00%) | Train Loss: 0.060 | Elapsed: 2.4s
epoch:2(100.00%) | Train Loss: 0.047 | Elapsed: 24.7s
epoch:2(100.00%) | Train Loss: 0.048 | Elapsed: 2.5s
epoch:3(100.00%) | Train Loss: 0.023 | Elapsed: 25.0s
epoch:3(100.00%) | Train Loss: 0.047 | Elapsed: 2.5s


In [13]:
# Make sure the output directory exists; torch.save does not create it.
os.makedirs('../models', exist_ok=True)
# Persist the whole fine-tuned model object (pickled), to be restored with torch.load.
torch.save(model, '../models/bert_jp_ner.pth')

In [18]:
# Restore the fine-tuned model saved earlier by this notebook.
# - map_location places the weights on the current device, so a checkpoint
#   saved on GPU can also be loaded on a CPU-only machine.
# - weights_only=False is required because the file is a pickled model object,
#   not a plain state_dict. Unpickling executes arbitrary code: only load
#   checkpoints you produced yourself or otherwise trust.
model = torch.load('../models/bert_jp_ner.pth', map_location=device, weights_only=False)
def predict(text):
    """Run the fine-tuned model on one sentence and return the predicted entities.

    Args:
        text: the sentence to analyse.

    Returns:
        dict with the input 'text' and a list 'entities'. Each entity is a dict:
          - 'name': surface string rebuilt from the predicted tokens
            (WordPiece continuation markers '##' are removed),
          - 'span': [start, end) *token* positions in the encoded sequence
            (position 0 is [CLS]); these are not character offsets,
          - 'type_id': the entity type *name* looked up in type_name_dict
            (the key name is kept for backward compatibility).
    """
    model.eval()
    with torch.no_grad():
        encoded = collate_batch([{'text': text, 'entities': []}])
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)
        # Inference only needs the logits, so no labels are passed and no loss
        # is computed; with return_dict=False the first output is then the logits.
        logits = model(input_ids=input_ids,
                       token_type_ids=None,
                       attention_mask=attention_mask,
                       return_dict=False)[0]
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    pred_labels = logits.argmax(dim=2).tolist()[0]

    # [CLS], [SEP] and [PAD] are never part of an entity: force their label to 0
    # so a stray prediction there cannot leak into (or create) an entity.
    structural = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}
    pred_labels = [
        0 if token in structural else label
        for token, label in zip(tokens, pred_labels)
    ]

    entities = []
    pos = 0
    # Consecutive tokens sharing the same non-zero label form one entity.
    for label, group in itertools.groupby(pred_labels):
        end = pos + len(list(group))
        if label != 0:
            # Strip the WordPiece '##' prefix so sub-word pieces join cleanly.
            name = "".join(
                token[2:] if token.startswith("##") else token
                for token in tokens[pos:end]
            )
            entities.append({
                "name": name,
                "span": [pos, end],
                "type_id": type_name_dict[label]
            })
        pos = end

    return { 'text': text, 'entities': entities }

# Try the predictor on a sample sentence and display the returned dict.
result = predict('レッドフォックス株式会社は、東京都千代田区に本社を置くITサービス企業である。')
result

{'text': 'レッドフォックス株式会社は、東京都千代田区に本社を置くITサービス企業である。',
 'entities': [{'name': 'レッドフォックス株式会社', 'span': [1, 4], 'type_id': '法人名'},
  {'name': '東京都千代田区', 'span': [6, 10], 'type_id': '地名'}]}