# Set parameters

In [1]:
# Identifier of the pretrained checkpoint that is loaded for both the
# tokenizer and the classification model.
MODEL_NAME = "cl-tohoku/bert-base-japanese-v2"
# Directory used for training checkpoints and for the final saved model.
MODEL_OUTPUT_DIR = "./trained_model/"

# Hyperparameters forwarded to TrainingArguments.
MODEL_EPOCHS_NUM = 10
MODEL_BATCH_SIZE = 128  # used per device for both training and evaluation
MODEL_LEARNING_RATE = 1e-5

In [2]:
import os
import torch

# Restrict this process to the GPU with index 1.
# NOTE(review): presumably this has to run before the first CUDA call in the
# kernel to take effect — keep it ahead of any other torch.cuda usage.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Check that a CUDA device is usable with the restriction above.
torch.cuda.is_available()

True

# Load data

In [3]:
from datasets import load_dataset

# Load the CSV data located at "data" with the generic "csv" loader.
# The result is a DatasetDict; no split is specified here, and the rows end
# up under a single "train" split.
dataset = load_dataset("csv", data_files="data")
dataset

Using custom data configuration default-be7e0937d2f084e2
Found cached dataset csv (/home/bill/.cache/huggingface/datasets/csv/default-be7e0937d2f084e2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['本文', 'Level', '尊敬語', '謙譲語', '丁寧語', 'フィールド'],
        num_rows: 10007
    })
})

In [4]:
import pandas as pd

# Copy the train split into a pandas DataFrame; it is used for inspection and
# for fitting the label encoder further down.
df = pd.DataFrame(dataset["train"])
df.head()

Unnamed: 0,本文,Level,尊敬語,謙譲語,丁寧語,フィールド
0,御病気なさったそうですね。回復されたのでしょうか？,1,1,0,0,挨拶
1,本日は温かい気候ですね。,1,1,0,0,挨拶
2,行ってらっしゃいませ。お気を付けて。,1,1,0,0,挨拶
3,皆様、御揃いでお出かけになられるのですか？,1,1,0,0,挨拶
4,本日はお早いのですね、お散歩ですか？,1,1,0,0,挨拶


# Set tokenizer

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the pretrained checkpoint MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Smoke test: tokenize one sentence to inspect the fields the tokenizer emits.
tokenizer("本日は温かい気候ですね。")

{'input_ids': [2, 2828, 6855, 897, 3303, 13231, 15652, 12461, 895, 829, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Set encoder

In [6]:
# Peek at the label column. The double brackets keep it as a one-column
# DataFrame, which is the form handed to the encoder in the next cell.
df[["Level"]].head()

Unnamed: 0,Level
0,1
1,1
2,1
3,1
4,1


In [7]:
from sklearn.preprocessing import OneHotEncoder

# Fit a one-hot encoder on the "Level" column so every distinct level gets its
# own position in the label vector.
encoder = OneHotEncoder().fit(df[["Level"]])
# Show the level values the encoder discovered while fitting.
encoder.categories_

[array([1, 2, 3, 4])]

In [8]:
# Sanity check: encode a single level value. toarray() densifies the encoder
# output and [0] picks the only row.
encoder.transform(pd.DataFrame({"Level": [3]})).toarray()[0]

array([0., 0., 1., 0.])

# Preprocess data

In [9]:
def preprocess(dataslice):
  """Tokenize a batch of sentences and attach one-hot level labels.

  Args:
    dataslice: A batch of rows given as a mapping from column name to a list
      of values. The "本文" (text) and "Level" columns are read.

  Returns:
    The tokenizer output for the batch, extended with a "label" entry that
    holds one one-hot vector per row as produced by the fitted `encoder`.
  """
  # Truncate over-long texts to the tokenizer's maximum length so a single
  # long sentence cannot exceed the model's position limit during training.
  tokenized_inputs = tokenizer(dataslice["本文"], truncation=True)

  levels = dataslice["Level"]
  if len(levels) > 0:
    # Encode the whole batch with one encoder call instead of building a
    # DataFrame and calling transform() once per row.
    one_hot = encoder.transform(pd.DataFrame({"Level": levels})).toarray()
    labels = list(one_hot)  # one 1-D array per row, as before
  else:
    # Nothing to encode for an empty batch.
    labels = []

  tokenized_inputs["label"] = labels
  return tokenized_inputs

# Run preprocess over the whole dataset. batched=True hands preprocess column
# slices covering many rows at once rather than one row per call.
processed_dataset = dataset.map(preprocess, batched=True)
processed_dataset

  0%|          | 0/11 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['本文', 'Level', '尊敬語', '謙譲語', '丁寧語', 'フィールド', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 10007
    })
})

In [10]:
processed_dataset["train"][1]

{'本文': '本日は温かい気候ですね。',
 'Level': 1,
 '尊敬語': 1,
 '謙譲語': 0,
 '丁寧語': 0,
 'フィールド': '挨拶',
 'input_ids': [2, 2828, 6855, 897, 3303, 13231, 15652, 12461, 895, 829, 3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'label': [1.0, 0.0, 0.0, 0.0]}

# Set categories

In [11]:
# Level values in encoder order; later used to map a predicted class index
# back to its Level value.
categories = encoder.categories_[0]
categories

array([1, 2, 3, 4])

# Set model

In [12]:
from transformers import BertForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head that has
# one output per level category.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(categories))

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

# Split data

In [13]:
# Hold out 10% of the rows as the "test" split; the fixed seed makes the split
# repeatable across runs.
train_eval_dataset = processed_dataset["train"].train_test_split(test_size=0.1, seed=42)
train_eval_dataset

DatasetDict({
    train: Dataset({
        features: ['本文', 'Level', '尊敬語', '謙譲語', '丁寧語', 'フィールド', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 9006
    })
    test: Dataset({
        features: ['本文', 'Level', '尊敬語', '謙譲語', '丁寧語', 'フィールド', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 1001
    })
})

# Set trainer

In [14]:
from transformers import TrainingArguments

# Training configuration assembled from the constants in the parameter cell.
# The same per-device batch size is used for training and evaluation.
training_args = TrainingArguments(
  output_dir=MODEL_OUTPUT_DIR,
  learning_rate=MODEL_LEARNING_RATE,
  per_device_train_batch_size=MODEL_BATCH_SIZE,
  per_device_eval_batch_size=MODEL_BATCH_SIZE,
  num_train_epochs=MODEL_EPOCHS_NUM
)

In [15]:
from transformers import Trainer

# Wire the model, the training arguments and both data splits into a Trainer.
# The tokenizer is passed as well; the training log shows it being written
# alongside each checkpoint.
# NOTE(review): presumably it also drives padding of each batch — confirm
# against the installed transformers version.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_eval_dataset["train"],
  eval_dataset=train_eval_dataset["test"],
  tokenizer=tokenizer
)

# Train

In [16]:
# Toggle: set to False to skip fine-tuning when re-running the notebook.
train_model = True
if train_model:
  trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: 本文, 謙譲語, フィールド, 尊敬語, 丁寧語, Level. If 本文, 謙譲語, フィールド, 尊敬語, 丁寧語, Level are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9006
  Num Epochs = 10
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 710
  Number of trainable parameters = 111210244


Step,Training Loss
500,0.2956


Saving model checkpoint to ./trained_model/checkpoint-500
Configuration saved in ./trained_model/checkpoint-500/config.json
Model weights saved in ./trained_model/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./trained_model/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./trained_model/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [17]:
# Toggle: set to False to leave the contents of MODEL_OUTPUT_DIR untouched.
save_model = True
if save_model:
  # Persist the fine-tuned weights and model configuration.
  model.save_pretrained(MODEL_OUTPUT_DIR)
  # Persist the tokenizer files next to the model so the directory is
  # self-contained and can be loaded later without the original checkpoint.
  tokenizer.save_pretrained(MODEL_OUTPUT_DIR)

Configuration saved in ./trained_model/config.json
Model weights saved in ./trained_model/pytorch_model.bin


# Evaluate

In [18]:
# Reload the fine-tuned model from disk so evaluation runs against the saved
# artifact rather than the in-memory training object.
trained_model = BertForSequenceClassification.from_pretrained(MODEL_OUTPUT_DIR)

loading configuration file ./trained_model/config.json
Model config BertConfig {
  "_name_or_path": "cl-tohoku/bert-base-japanese-v2",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "tokenizer_class": "BertJapaneseTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  

In [19]:
# Raw sentences of the held-out split; these are the evaluation inputs.
test_texts = train_eval_dataset["test"]["本文"]
test_texts[:5]

['あなたの言葉も納得出来ます。',
 '不出来な息子ですが、よろしく指導してください。',
 '毎度！ご贔屓頂戴いたしまして、ありがとう存じます。',
 'お困りのようなので、お貸ししましょう。',
 'そのためには、安定した政府の樹立が不可欠だと考えています。']

In [20]:
# Tokenize every test sentence into one padded batch of PyTorch tensors.
# Truncation keeps any over-long sentence within the model's position limit.
model_input = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")
# Inference only: disable gradient tracking so no autograd graph is kept for
# the whole test set.
with torch.no_grad():
  logits = trained_model(**model_input).logits
logits[:5]

tensor([[-3.4256, -3.6581,  2.8488, -2.6851],
        [-3.1693, -2.7126,  2.9857, -3.5900],
        [ 2.0378, -1.8804, -3.9944, -4.5573],
        [-1.5672,  1.1138, -2.3519, -4.9037],
        [ 0.1852, -3.3480,  0.0877, -3.9275]], grad_fn=<SliceBackward0>)

In [21]:
from torch import nn

# Normalise each row of logits over the class dimension so the scores of one
# sentence sum to 1.
# NOTE(review): the saved model config reports
# problem_type "multi_label_classification"; confirm that softmax is the
# intended normalisation. The values are only used for display and argmax.
predicted_probabilities = logits.softmax(dim=-1)
predicted_probabilities[:5]

tensor([[0.0019, 0.0015, 0.9927, 0.0039],
        [0.0021, 0.0033, 0.9932, 0.0014],
        [0.9769, 0.0194, 0.0023, 0.0013],
        [0.0621, 0.9073, 0.0284, 0.0022],
        [0.5121, 0.0150, 0.4645, 0.0084]], grad_fn=<SliceBackward0>)

In [22]:
import numpy as np

# Position of the highest-scoring class for every test sentence.
top_class_indices = predicted_probabilities.detach().numpy().argmax(axis=1)
# Translate each class position back into its Level value.
predicted_labels = [categories[class_idx] for class_idx in top_class_indices]
predicted_labels[:5]

[3, 3, 1, 2, 1]

In [23]:
# Print the first ten predicted levels next to their sentences for a quick
# visual check.
for idx in range(10):
  print(f"{predicted_labels[idx]}: {test_texts[idx]}")

3: あなたの言葉も納得出来ます。
3: 不出来な息子ですが、よろしく指導してください。
1: 毎度！ご贔屓頂戴いたしまして、ありがとう存じます。
2: お困りのようなので、お貸ししましょう。
1: そのためには、安定した政府の樹立が不可欠だと考えています。
4: 電話口まで来てもらって悪いね。今大丈夫？
4: 弘法大師さんを信仰しているの？
2: 先輩から、お杯をいただいてしまったわ。
1: ご子息は？／ご子息は何人いらっしゃるんですか？
3: おじは、麻雀に夢中になってしまい落第してしまったそうです。


In [24]:
# Read the ground-truth column once; indexing the dataset column inside the
# loop would fetch the entire column again on every iteration.
true_labels = train_eval_dataset["test"]["Level"]
# Count predictions that match the ground-truth level.
corrects_num = sum(
  int(predicted == expected)
  for predicted, expected in zip(predicted_labels, true_labels)
)
# Accuracy on the held-out split.
corrects_num / len(predicted_labels)

0.7542457542457542