In [None]:
# This code was written by Taisei KANDA.
# Mount Google Drive at /content/drive (Colab only). The vocabulary, model
# configuration/weights and fold TSV files referenced later in this notebook
# are all read from paths under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Alternative (disabled): install transformers together with its "ja" extras.
# !pip install transformers["ja"]
!pip install transformers
# MeCab bindings and dictionaries. Author's note: ipadic and mecab-python3 do
# not work in this environment unless unidic-lite is installed as well.
!pip install mecab-python3
!pip install ipadic
!pip install unidic-lite
!pip install fugashi

In [None]:
# Standard library
import importlib.util
import os
import statistics
import time

# Third-party
import pandas as pd
from sklearn.model_selection import train_test_split
from tabulate import tabulate
import numpy as np
from transformers import BertTokenizer, BertModel, BertConfig, BertJapaneseTokenizer
import MeCab

import torch
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch import cuda

from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


In [None]:
# Dataset Definition
class CreateDataset(Dataset):
  """Torch dataset that word-segments raw texts and encodes them for BERT.

  Each item is a dict with:
    'ids'    : LongTensor of token ids, truncated/padded to ``max_len``
    'mask'   : LongTensor attention mask of the same length
    'labels' : float32 tensor built from ``y[index]``

  Args:
    X: Indexable collection of raw texts (list, array or pandas Series).
    y: Indexable collection of label rows; ``len(y)`` is the dataset size.
    tokenizer: Object exposing ``encode_plus`` (e.g. a transformers tokenizer).
    max_len: Length every encoded sequence is truncated/padded to.
    tagger: Optional object exposing ``parse(text)`` used to segment the text
      before tokenization. When omitted, the module-level ``tagger`` is looked
      up at item-access time (the behavior of the original implementation).
  """

  def __init__(self, X, y, tokenizer, max_len, tagger=None):
    self.X = X
    self.y = y
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.tagger = tagger

  def __len__(self):
    """Number of samples, defined by the number of label rows."""
    return len(self.y)

  def __getitem__(self, index):
    """Encode the ``index``-th text and return ids, mask and labels tensors."""
    # Positional lookup: `series[index]` on a pandas Series is label-based and
    # fails (or picks another row) when the index is not 0..n-1, e.g. after
    # filtering or a train/test split.
    if hasattr(self.X, 'iloc'):
      text = self.X.iloc[index]
    else:
      text = self.X[index]

    # Segment the text first; the author notes (2022/12/13) that [UNK] tokens
    # could be removed after switching to BertJapaneseTokenizer.
    # Falls back to the module-level tagger when none was injected.
    segmenter = self.tagger if self.tagger is not None else tagger
    sen = segmenter.parse(text)

    inputs = self.tokenizer.encode_plus(
      sen,
      add_special_tokens=True,
      max_length=self.max_len,
      truncation=True,        # replaces the deprecated pad_to_max_length=True
      padding="max_length"
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']

    return {
      'ids': torch.LongTensor(ids),
      'mask': torch.LongTensor(mask),
      # torch.as_tensor converts values; the legacy torch.Tensor(...) constructor
      # would interpret a bare int as a size and return uninitialized memory.
      'labels': torch.as_tensor(self.y[index], dtype=torch.float32)
    }

In [None]:
# Definition of BERT classification model
class BERTClass(torch.nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  Args:
    pretrained: Name or path passed to ``BertModel.from_pretrained``.
    pretrained_config: Name or path passed to ``BertConfig.from_pretrained``.
    drop_rate: Dropout probability applied before the linear head.
    otuput_size: Number of output logits. (The misspelling is kept so that
      existing keyword callers keep working.)
  """

  def __init__(self, pretrained, pretrained_config, drop_rate, otuput_size):
    super().__init__()
    self.config = BertConfig.from_pretrained(pretrained_config)
    self.bert = BertModel.from_pretrained(pretrained, config=self.config)
    self.drop = torch.nn.Dropout(drop_rate)
    # Size the head from the loaded config instead of hard-coding 768, so an
    # encoder with a different hidden width works too (BERT-base uses 768).
    self.fc = torch.nn.Linear(self.config.hidden_size, otuput_size)

  def forward(self, ids, mask):
    """Return the raw logits for a batch of token ids and attention masks."""
    # return_dict=False makes the encoder return a tuple (required on
    # transformers v4 for this unpacking); only the second element is used.
    _, out = self.bert(ids, attention_mask=mask, return_dict=False)
    out = self.fc(self.drop(out))
    return out

In [None]:
def calculate_loss_and_accuracy(model, loader, device, criterion=None):
  """Evaluate ``model`` on ``loader`` and return ``(loss, accuracy)``.

  Args:
    model: Callable module invoked as ``model(ids, mask)``; switched to eval mode.
    loader: Iterable of batches, each a dict with 'ids', 'mask' and 'labels' tensors.
    device: Device the batch tensors are moved to before the forward pass.
    criterion: Optional loss callable ``criterion(outputs, labels)``.

  Returns:
    Tuple ``(loss, accuracy)``. ``loss`` is the sum of the per-batch criterion
    values divided by the number of batches (0.0 when ``criterion`` is None).
    ``accuracy`` is the fraction of samples whose argmax over the outputs
    equals the argmax over the label row.

  Raises:
    ZeroDivisionError: If ``loader`` yields no batches.
  """
  model.eval()
  loss = 0.0
  total = 0
  correct = 0
  with torch.no_grad():
    for data in loader:
      # Move the batch to the target device
      ids = data['ids'].to(device)
      mask = data['mask'].to(device)
      labels = data['labels'].to(device)

      # Forward pass
      outputs = model(ids, mask)

      # Accumulate the loss only when a criterion was supplied
      if criterion is not None:
        loss += criterion(outputs, labels).item()

      # Compare predicted and true class indices directly on the tensors;
      # no host/NumPy round-trip is needed just to count matches.
      pred = torch.argmax(outputs, dim=-1)
      gold = torch.argmax(labels, dim=-1)
      total += gold.numel()
      correct += (pred == gold).sum().item()

  return loss / len(loader), correct / total


def calculate_loss_and_accuracy_test(model, loader, device, criterion=None):
  """Evaluate ``model`` on ``loader`` and return loss, accuracy and macro metrics.

  Args:
    model: Callable module invoked as ``model(ids, mask)``; switched to eval mode.
    loader: Iterable of batches, each a dict with 'ids', 'mask' and 'labels' tensors.
    device: Device the batch tensors are moved to before the forward pass.
    criterion: Optional loss callable ``criterion(outputs, labels)``.

  Returns:
    Tuple ``(loss, accuracy, acc, recall, precision, f1, logits_list)``:
      loss        - summed per-batch criterion values / number of batches
                    (0.0 when ``criterion`` is None)
      accuracy    - correct / total, counted manually
      acc         - ``accuracy_score`` over all samples
      recall, precision, f1 - macro-averaged scores over all samples
      logits_list - list with the raw model output tensor of every batch
                    (kept for ensemble soft voting)
  """
  model.eval()
  loss = 0.0
  total = 0
  correct = 0
  y_pred = []
  y_true = []

  logits_list = []

  with torch.no_grad():
    for data in loader:
      # Move the batch to the target device
      ids = data['ids'].to(device)
      mask = data['mask'].to(device)
      labels = data['labels'].to(device)

      # Forward pass
      outputs = model(ids, mask)

      # Keep the raw logits of this batch for ensemble soft voting
      logits_list.append(outputs)

      # Accumulate the loss only when a criterion was supplied
      if criterion is not None:
        loss += criterion(outputs, labels).item()

      # Predicted / true class index of every sample in the batch
      pred = torch.argmax(outputs, dim=-1).cpu().numpy()
      gold = torch.argmax(labels, dim=-1).cpu().numpy()

      # Extend with per-sample labels. Appending the whole per-batch arrays
      # built a nested structure that the sklearn metrics can only read as a
      # flat label vector when every batch holds exactly one sample.
      y_pred.extend(pred.tolist())
      y_true.extend(gold.tolist())

      total += len(gold)
      correct += int((pred == gold).sum())

  acc = accuracy_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred, average="macro")
  precision = precision_score(y_true, y_pred, average="macro")
  f1 = f1_score(y_true, y_pred, average="macro")

  return loss / len(loader), correct / total, acc, recall, precision, f1, logits_list


def train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, device=None,
                checkpoint_prefix=None, checkpoint_every=5):
  """Train ``model`` and return the per-epoch loss/accuracy logs.

  Args:
    dataset_train: Dataset used for optimization (shuffled every epoch).
    dataset_valid: Dataset evaluated after every epoch.
    batch_size: Mini-batch size for the training loader.
    model: Module invoked as ``model(ids, mask)``; moved to ``device``.
    criterion: Loss callable ``criterion(outputs, labels)``.
    optimizer: Optimizer already bound to ``model``'s parameters.
    num_epochs: Number of passes over ``dataset_train``.
    device: Device the model and batches are moved to.
    checkpoint_prefix: Prefix of the checkpoint file names
      (``f'{prefix}_checkpoint{epoch}.pt'``). When None, the notebook-level
      fold counter ``kf`` is used, as in the original implementation.
    checkpoint_every: Save a checkpoint every this many epochs; 0 or None
      disables checkpointing.

  Returns:
    Dict ``{'train': [[loss, acc], ...], 'valid': [[loss, acc], ...]}`` with
    one entry per epoch.
  """
  model.to(device)

  # Data loaders; the whole validation set is evaluated as a single batch.
  dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
  dataloader_valid = DataLoader(dataset_valid, batch_size=len(dataset_valid), shuffle=False)

  log_train = []
  log_valid = []
  for epoch in range(num_epochs):
    # Record start time
    s_time = time.time()

    # Training mode (the evaluation helper below switches the model to eval)
    model.train()
    for data in dataloader_train:
      ids = data['ids'].to(device)
      mask = data['mask'].to(device)
      labels = data['labels'].to(device)

      # Reset gradients accumulated by the previous step
      optimizer.zero_grad()

      # Forward pass, back-propagation and weight update
      outputs = model(ids, mask)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

    # Loss and accuracy on both splits after this epoch
    loss_train, acc_train = calculate_loss_and_accuracy(model, dataloader_train, device, criterion=criterion)
    loss_valid, acc_valid = calculate_loss_and_accuracy(model, dataloader_valid, device, criterion=criterion)
    log_train.append([loss_train, acc_train])
    log_valid.append([loss_valid, acc_valid])

    # Periodic checkpoint. The stored 'epoch' is zero-based while the file
    # name uses the one-based epoch number.
    if checkpoint_every and (epoch + 1) % checkpoint_every == 0:
      # Only touch the global `kf` when the caller gave no explicit prefix
      prefix = kf if checkpoint_prefix is None else checkpoint_prefix
      torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, f'{prefix}_checkpoint{epoch + 1}.pt')

    # Record end time
    e_time = time.time()

    # Output log
    print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f}, {(e_time - s_time):.4f}sec')

  return {'train': log_train, 'valid': log_valid}

In [None]:
# cross-validation
# Trains one model per fold (5 folds), evaluates it on that fold's test split
# and finally prints mean / population std-dev of recall, precision and F1.

# Tokenizer settings
# Locate the dictionary shipped inside the installed unidic-lite package rather
# than hard-coding a site-packages path tied to one Python version.
_unidic_spec = importlib.util.find_spec("unidic_lite")
if _unidic_spec is not None and _unidic_spec.origin is not None:
  UNIDIC_DIR = os.path.join(os.path.dirname(_unidic_spec.origin), "dicdir")
else:
  # Fall back to the location used originally
  UNIDIC_DIR = "/usr/local/lib/python3.10/dist-packages/unidic_lite/dicdir"
tagger = MeCab.Tagger(f"-Owakati -d {UNIDIC_DIR}")
# tknz = BertTokenizer("/content/drive/MyDrive/BERT-base_aozora-jawiki3m_unidic_bpe-32k_2m/vocab.txt")
tknz = BertJapaneseTokenizer("/content/drive/MyDrive/BERT-base_aozora-jawiki3m_unidic_bpe-32k_2m/vocab.txt")

# Specify a pre-trained model
pretrained_config = "/content/drive/MyDrive/BERT-base_aozora-jawiki3m_unidic_bpe-32k_2m/bert_config.json"
pretrained = "/content/drive/MyDrive/BERT-base_aozora-jawiki3m_unidic_bpe-32k_2m"

# Specify maximum sequence length
MAX_LEN = 512

# Category settings (these are also the names of the one-hot label columns)
categories = ["akutagawa", "izumi", "kikuchi", "mori", "natsume", "sasaki", "shimazaki","dazai", "okamoto","umino"]
# categories = ["suzuki", "kishi", "yoshida", "miyabe", "morimi", "ishida", "murakamiharuki", "murakami", "higashino", "minato"]

# Parameter settings
DROP_RATE = 0.4
OUTPUT_SIZE = len(categories)   # one logit per category (10)
BATCH_SIZE = 16
NUM_EPOCHS = 40
LEARNING_RATE = 2e-5

# Fold files: headerless TSVs whose columns are three metadata fields followed
# by one label column per category.
KFOLD_DIR = "/content/drive/MyDrive/bert/AA/kfold_tsvfile"
FOLD_COLUMNS = ["author", "label", "content"] + categories


def load_fold_split(split, fold):
  """Read the TSV of one split ('train', 'valid' or 'test') of one fold.

  Returns a DataFrame whose columns are named according to FOLD_COLUMNS.
  """
  path = f"{KFOLD_DIR}/5fold_yamaru_{split}/yamaru_10times20_{split}_c_kfold{fold}.tsv"
  # Alternative corpus (use together with the commented-out `categories` above):
  # path = f"{KFOLD_DIR}/5fold_yamaru_{split}_yanagi/10times20_{split}_yanagi_noIwai_kfold{fold}.tsv"
  frame = pd.read_csv(path, sep="\t", encoding="CP932", header=None)
  frame.columns = FOLD_COLUMNS
  return frame


# Device designation (does not change between folds)
device = 'cuda' if cuda.is_available() else 'cpu'

F1_test_list = []
recall_test_list = []
precision_test_list = []

# NOTE: the loop variable must stay named `kf`; train_model reads it as a
# global to build its checkpoint file names.
for kf in range(1, 6):

  # Loading data sets
  train = load_fold_split("train", kf)
  valid = load_fold_split("valid", kf)
  test = load_fold_split("test", kf)

  # Creating a Dataset
  dataset_train = CreateDataset(train['content'], train[categories].values, tknz, MAX_LEN)
  dataset_valid = CreateDataset(valid['content'], valid[categories].values, tknz, MAX_LEN)
  dataset_test = CreateDataset(test['content'], test[categories].values, tknz, MAX_LEN)

  # Model definition (a fresh model for every fold)
  model = BERTClass(pretrained, pretrained_config, DROP_RATE, OUTPUT_SIZE)

  # Definition of loss function
  criterion = torch.nn.BCEWithLogitsLoss()

  # Optimizer definition
  optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

  # Model training; `log` of the last fold is what the visualization cell plots
  log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, device=device)
  # Save the weights of the last epoch:
  # torch.save({'epoch': 40, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()},
  #   f'/content/drive/MyDrive/bert/AA/kfold_torchfile/AozoraWiki3m_yamaru_yanagi/{kf}_aozorawiki3m_yanagi_tknModi2_checkpoint40.pt')

  # Loaders for the final evaluation (one sample per batch)
  dataloader_train = DataLoader(dataset_train, batch_size=1, shuffle=False)
  dataloader_valid = DataLoader(dataset_valid, batch_size=1, shuffle=False)
  dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

  # Evaluate the test split ONCE and reuse the result; the model is in eval
  # mode there, so repeated passes only repeated identical work.
  # Result layout: (loss, accuracy, acc, recall, precision, f1, logits_list)
  test_result = calculate_loss_and_accuracy_test(model, dataloader_test, device)
  # For the other splits use, e.g.:
  # train_result = calculate_loss_and_accuracy_test(model, dataloader_train, device)
  # valid_result = calculate_loss_and_accuracy_test(model, dataloader_valid, device)
  recall_test = test_result[3]
  precision_test = test_result[4]
  f1_test = test_result[5]

  print(f'Accuracy  (Evaluation Data)：{test_result[1]:.3f}')
  print(f'Recall (Evaluation Data)：{recall_test:.3f}')
  print(f'Precision (Evaluation Data)：{precision_test:.3f}')
  print(f'F1 (Evaluation Data)：{f1_test:.3f}')

  recall_test_list.append(recall_test)
  precision_test_list.append(precision_test)
  F1_test_list.append(f1_test)


  # # for Ensemble soft voting
  # import torch.nn.functional as F

  # logits2 = test_result[6]

  # prob_list = []
  # for i in range(len(logits2)):
  #   prob = F.softmax(logits2[i], dim = -1)
  #   prob_list.append(prob.tolist()[0])

  # df_prob = pd.DataFrame(prob_list)
  # df_prob.to_csv(f"/content/drive/MyDrive/bert/AA/BERT_Prob/10times20_5fold_test/aozorawiki3m_test_prob/AozoraWiki3m_test{kf}_yanagi_tknModi2_prob.csv", index = None)


# Mean and population standard deviation over the folds: recall, precision, F1
print(statistics.mean(recall_test_list))
print(statistics.pstdev(recall_test_list))
print(statistics.mean(precision_test_list))
print(statistics.pstdev(precision_test_list))
print(statistics.mean(F1_test_list))
print(statistics.pstdev(F1_test_list))

In [None]:
# Log visualization
# Plots the per-epoch curves stored in `log` (the value returned by the most
# recent train_model call): loss on the left panel, accuracy on the right.
x_axis = list(range(1, len(log['train']) + 1))
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Each log entry is [loss, accuracy]; row 0 feeds the first panel, row 1 the second.
for panel_idx, y_label in enumerate(('loss', 'accuracy')):
  panel = ax[panel_idx]
  for split in ('train', 'valid'):
    panel.plot(x_axis, np.array(log[split]).T[panel_idx], label=split)
  panel.set_xlabel('epoch')
  panel.set_ylabel(y_label)
  panel.legend()

plt.show()