# Playing with pytorch-pretrained-BERT for Japanese language

## Load pretrained BERT model for Japanese

We now load [pretrained BERT model](http://nlp.ist.i.kyoto-u.ac.jp/index.php?BERT%E6%97%A5%E6%9C%AC%E8%AA%9EPretrained%E3%83%A2%E3%83%87%E3%83%AB) published by Kyoto University.

In [25]:
import torch
from pytorch_pretrained_bert import BasicTokenizer, BertTokenizer, BertModel, BertForMaskedLM
from pyknp import Juman

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

import logging
logging.basicConfig(level=logging.INFO)

jumanpp = Juman()

def tokenize(text):
    result = jumanpp.analysis(text)
    tokens = []
    for mrph in result.mrph_list():
        tokens.append(mrph.midasi)
    return ' '.join(tokens)
        
path_to_pretrained_model = '/Users/minhpqn/workspace/Japanese_L-12_H-768_A-12_E-30_BPE'

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained(path_to_pretrained_model, kyoto_bert=True)

# Tokenized input
text = "数学の最も普通の定義としては、「数および図形についての学問」というものがある。"
text = tokenize(text)
text = '[CLS] ' + text
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'

print(tokenized_text)

indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
tokens_tensor = tokens_tensor.to(device)

print(indexed_tokens)
print(tokens_tensor.size())

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file /Users/minhpqn/workspace/Japanese_L-12_H-768_A-12_E-30_BPE/vocab.txt


cpu
Use kyoto bert! text must be tokenized by juman++.
['[CLS]', '数学', 'の', '最も', '普通の', '定義', 'と', 'して', 'は', '、', '「', '数', 'お', '##よ', '##ひ', '図形', 'に', 'ついて', 'の', '学問', '」', 'と', 'いう', 'もの', 'か', 'ある', '。']
['[CLS]', '数学', 'の', '最も', '普通の', '定義', 'と', 'して', '[MASK]', '、', '「', '数', 'お', '##よ', '##ひ', '図形', 'に', 'ついて', 'の', '学問', '」', 'と', 'いう', 'もの', 'か', 'ある', '。']
[2, 2938, 5, 476, 7078, 1315, 12, 19, 4, 6, 24, 145, 273, 4141, 7360, 17201, 8, 130, 5, 5476, 25, 12, 56, 60, 90, 38, 7]
torch.Size([1, 27])


Let's see how to use ```BertModel``` to get hidden states.

In [12]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained(path_to_pretrained_model, cache_dir=None)
model.eval()
model.to(device)

INFO:pytorch_pretrained_bert.modeling:loading archive file /Users/minhpqn/workspace/Japanese_L-12_H-768_A-12_E-30_BPE
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 32006
}



Get hidden states of the input

In [20]:
# Predict hidden states features for each layer
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, output_all_encoded_layers=True)
assert len(encoded_layers) == 12
print(encoded_layers[0].size())

torch.Size([1, 27, 768])


Now, we use ```BertForMaskedLM``` to predict tokens.

In [22]:
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(path_to_pretrained_model, cache_dir=None)
model.eval()
model.to(device)

INFO:pytorch_pretrained_bert.modeling:loading archive file /Users/minhpqn/workspace/Japanese_L-12_H-768_A-12_E-30_BPE
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 32006
}

INFO:pytorch_pretrained_bert.modeling:Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32006, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertInterm

In [24]:
with torch.no_grad():
    predictions = model(tokens_tensor)

predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)

は
