In [1]:
import os
import glob
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertJapaneseTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---------------------------------------------------------
# データの読み込み
# ---------------------------------------------------------
def load_corpus(corpus_dir):

  dir_list = []
  news_data = []
  dir_count = 0
  file_count = 0
  read_count = 0

  for f in os.listdir(path=corpus_dir):
    if os.path.isdir(os.path.join(corpus_dir, f)):
      dir_list.append(f)

  for i in range(len(dir_list)):

    if os.path.isdir(os.path.join(corpus_dir, dir_list[i])):
      read_count = 0
      for file_path in glob.glob(os.path.join(corpus_dir, dir_list[i], "*.txt")):
        if os.path.basename(file_path) != "LICENSE.txt":
          with open(file_path, "r", encoding="utf-8") as f:
            content = f.read().splitlines()[2:]       # 最初の 2 行はメタデータなのでスキップ
            text = "".join(content)
            text = text.translate(str.maketrans({"\n":"", "\r":"", "\t":"", "\u3000":""}))
            news_data.append([text, i])
            if (read_count >= 100):       # ディレクトリー当たりの読み込む最大ファイル数
              break
            read_count = read_count + 1
          file_count = file_count + 1
      dir_count = dir_count + 1

  print(dir_list)
  print(f"dir_count = {dir_count}")
  print(f"file_count = {file_count}")

  with open("./label.pkl", "wb") as f: 
    pickle.dump(dir_list, f)

  return news_data

news_data = load_corpus('./text')

train_data, eval_data = train_test_split(news_data, shuffle=True)
train_data = pd.DataFrame(train_data, columns=["text", "label"])
eval_data = pd.DataFrame(eval_data, columns=["text", "label"])

['dokujo-tsushin', 'it-life-hack', 'kaden-channel', 'livedoor-homme', 'movie-enter', 'peachy', 'smax', 'sports-watch', 'topic-news']
dir_count = 9
file_count = 900


In [3]:
# ---------------------------------------------------------
# データの保存
# ---------------------------------------------------------
csv_path = "./csv"

if not os.path.exists(csv_path):
  os.makedirs(csv_path)

train_data.to_csv(os.path.join(csv_path, "train_data.csv"), index=False, encoding='utf-8')
eval_data.to_csv(os.path.join(csv_path, "eval_data.csv"), index=False, encoding='utf-8')

In [4]:
# ---------------------------------------------------------
# トークナイズ
# ---------------------------------------------------------
def tokenize(batch):
  return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

tokenizer = BertJapaneseTokenizer.from_pretrained("tohoku-nlp/bert-base-japanese")
# tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)

train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 681/681 [00:03<00:00, 210.74 examples/s]
Map: 100%|██████████| 228/228 [00:01<00:00, 210.04 examples/s]


In [5]:
# ---------------------------------------------------------
# 学習と評価
# ---------------------------------------------------------
model = BertForSequenceClassification.from_pretrained("tohoku-nlp/bert-base-japanese", num_labels=9)
# model = BertForSequenceClassification.from_pretrained("cl-tohoku/bert-base-japanese", num_labels=9)
data_collator = DataCollatorWithPadding(tokenizer)

def compute_metrics(result):
  labels = result.label_ids
  preds = result.predictions.argmax(-1)
  acc = accuracy_score(labels, preds)
  return {
    "accuracy": acc
  }

training_args = TrainingArguments(
  output_dir = "./results",               # 結果を格納するディレクトリー
  logging_dir = "./logs",                 # 途中経過のログを格納するディレクトリー
  num_train_epochs = 2,                   # エポック数
  per_device_train_batch_size = 8,        # 訓練時のバッチサイズ
  per_device_eval_batch_size = 16,        # 評価時のバッチサイズ
  warmup_steps = 500,                     # 学習系数がこのステップ数で徐々に増加
  weight_decay = 0.01,                    # 重みの減衰率
  eval_strategy = "steps",                # 訓練中、一定のステップごとに評価
  save_safetensors = False                # safetensorsの使用を無効化
)

trainer = Trainer(
  model = model,                          # 使用するモデルを指定
  args = training_args,                   # TrainingArgumentsの設定
  compute_metrics = compute_metrics,      # 評価用の関数
  train_dataset = train_dataset,          # 訓練用のデータ
  eval_dataset = eval_dataset,            # 評価用のデータ
  data_collator = data_collator           # データコレーターを設定
)

trainer.train()
trainer.evaluate()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tohoku-nlp/bert-base-japanese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  2%|▏         | 3/172 [00:15<17:08,  6.09s/it]

KeyboardInterrupt: 

In [11]:
# ---------------------------------------------------------
# モデルの保存
# ---------------------------------------------------------
model_path = "./model"

if not os.path.exists(model_path):
  os.makedirs(model_path)

for param in model.parameters():
  param.data = param.data.contiguous()

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./model\\tokenizer_config.json',
 './model\\special_tokens_map.json',
 './model\\vocab.txt',
 './model\\added_tokens.json')