In [1]:
!pip install transformers==4.5.0 fugashi==1.1.0 ipadic==1.0.0 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 7.2 MB/s 
[?25hCollecting fugashi==1.1.0
  Downloading fugashi-1.1.0-cp37-cp37m-manylinux1_x86_64.whl (486 kB)
[K     |████████████████████████████████| 486 kB 56.4 MB/s 
[?25hCollecting ipadic==1.0.0
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 37.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 37.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 38.0 MB/s 
Building wheels for collected packages: ipadic, sacremoses
  Building wheel for ipadi

In [2]:
import numpy as np
import pandas as pd
import string
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, BertJapaneseTokenizer, BertModel
from torch import cuda
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from transformers import logging


In [47]:
# Mini-batch size handed to every DataLoader in this notebook.
batch_size = 16
# Token length every text is padded or truncated to by the tokenizer.
max_len = 512

In [48]:
# Read the training TSV (no header row) and keep only its first two columns,
# which are then named "text" and "label".
df_train = pd.read_csv(
    "./drive/MyDrive/Colab_Notebooks/data/IMDb/IMDb_train.tsv",
    sep="\t",
    header=None,
).iloc[:, :2]
df_train.columns = ["text", "label"]
print(df_train.shape)
df_train.head()

(25000, 2)


Unnamed: 0,text,label
0,The Unborn is a pretty good low-budget horror ...,1
1,Vincente Minnelli directed some of the most ce...,1
2,"The first time I saw this, I didn't laugh too ...",1
3,This is a great movie for all Generation X'ers...,1
4,I first saw this absolutely riveting documenta...,1


In [49]:
# Read the test TSV (no header row) and keep only its first two columns,
# which are then named "text" and "label".
df_test = pd.read_csv(
    "./drive/MyDrive/Colab_Notebooks/data/IMDb/IMDb_test.tsv",
    sep="\t",
    header=None,
).iloc[:, :2]
df_test.columns = ["text", "label"]
print(df_test.shape)
df_test.head()

(25000, 2)


Unnamed: 0,text,label
0,"Susan Sarandon is, for lack of a better word, ...",1
1,Seeing Laurel without Hardy in a film seems st...,1
2,I was recently at a sleepover birthday party w...,1
3,This movie took me by surprise. The opening cr...,1
4,A widely unknown strange little western with m...,1


In [50]:
# Class balance check: number of training rows per label value.
df_train["label"].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [51]:
# Median length of the raw training texts, measured in characters (len of each string).
df_train["text"].map(len).median()

979.0

In [52]:
# Character count of every raw training text.
df_train["text"].map(len)

0         639
1        4018
2         921
3        1582
4        1526
         ... 
24995     472
24996     751
24997     964
24998    6041
24999    1405
Name: text, Length: 25000, dtype: int64

# 前処理

In [53]:
# Translation table that turns every ASCII punctuation mark except "." and "," into a space.
_PUNCT_TO_SPACE = str.maketrans({p: " " for p in string.punctuation if p not in ".,"})


def preprocessing_text(text):
    """Normalise a raw review string before tokenisation.

    Steps, in order:
      1. HTML line-break tags (``<br />``, ``<br/>``, ``<br>``) become a single space.
      2. Every ``string.punctuation`` character other than "." and "," becomes a space.
      3. "." and "," are surrounded by spaces so they stand alone as separate words.

    Args:
        text: the raw text.

    Returns:
        The cleaned text. Runs of spaces are not collapsed.
    """
    # Substitute a space rather than the empty string: deleting the tag outright
    # glued the words on either side of it together ("great<br />But" -> "greatBut").
    text = re.sub(r'<br\s*/?>', ' ', text)

    # One pass over the string instead of one str.replace call per punctuation mark.
    text = text.translate(_PUNCT_TO_SPACE)

    # Pad the two surviving marks so that they are separated from neighbouring words.
    return text.replace(".", " . ").replace(",", " , ")


In [54]:
# Spot-check the cleaning function on the first training text only.
# Mapping it over the whole column here built a cleaned copy of every row and then
# discarded it; the dataset class already cleans each text when it encodes it.
preprocessing_text(df_train["text"][0])

'The Unborn is a pretty good low budget horror movie exploiting the fears associated with pregnancy .  It s very well acted by the always good Brooke Adams and b movie stalwart James Karen ,  although the supporting cast is pretty average for a b grader .  The music ,  by Gary Numan of all people ,  is good too .  Henry Dominic s script is quite intelligent for this sort of thing ,  although there is a hint of misogyny about it .  Rodman Fender s direction is merely adequate ,  and there are some unnecessary cheap scares .  If you re a fan of Adams ,  whose movie career is nowhere near as illustrious as it should be ,  check it out  she s great ,  as always . '

# データセット作成

In [55]:
class CreateDataset(Dataset):
    """Dataset that cleans and tokenises one text each time an item is requested.

    Args:
        X: indexable collection of raw texts.
        y: indexable collection of labels; its length is the dataset length.
        tokenizer: object exposing ``encode_plus``.
        max_len: value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, X, y, tokenizer, max_len):
        self.X, self.y = X, y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.y)

    def encode(self, tokenizer, text):
        """Clean ``text`` and return the tokenizer's ``encode_plus`` result for it."""
        cleaned = preprocessing_text(text)
        return tokenizer.encode_plus(
            cleaned,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

    def __getitem__(self, index):
        """Return a dict with the encoded ids, attention mask, label and raw text.

        ``ids`` and ``mask`` are each a one-element list holding a LongTensor, so a
        collated batch exposes the tensor at position 0 of that list.
        """
        text = self.X[index]
        label = self.y[index]
        encoded = self.encode(tokenizer=self.tokenizer, text=text)
        return {
            'ids': [torch.LongTensor(encoded['input_ids'])],
            'mask': [torch.LongTensor(encoded['attention_mask'])],
            'label': label,
            'text': text,
        }

In [56]:
# Tokenizer for the "bert-base-cased" checkpoint. An earlier version of this cell used
# BertJapaneseTokenizer with "cl-tohoku/bert-base-japanese-whole-word-masking" instead.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [57]:
# Pull the raw texts and their labels out of the training frame as arrays.
X = df_train["text"].values
y = df_train["label"].values

In [58]:
# Hold out 25% of the training rows for validation.
# random_state pins the shuffle so the split (and every metric computed from it) is the
# same after a kernel restart; stratify=y keeps the label ratio identical in both parts.
X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, train_size=0.75, random_state=42, stratify=y
)

print(len(X_train))
print(len(y_train))
print(len(X_eval))
print(len(y_eval))

# The test split comes from its own file and is used as-is.
X_test = df_test["text"].values
y_test = df_test["label"].values
print(len(X_test))
print(len(y_test))

18750
18750
6250
6250
25000
25000


In [59]:
# Wrap each split in a CreateDataset sharing the same tokenizer and padding length.
dataset_train, dataset_eval, dataset_test = (
    CreateDataset(texts, labels, tokenizer, max_len=max_len)
    for texts, labels in ((X_train, y_train), (X_eval, y_eval), (X_test, y_test))
)

# One size per line: train, eval, test.
print(len(dataset_train), len(dataset_eval), len(dataset_test), sep="\n")

18750
6250
25000


In [60]:
# Inspect one encoded training item (the dict returned by CreateDataset.__getitem__).
dataset_train[1]

{'ids': [tensor([  101, 11121,  1106,   140, 23156,  1179,  1118,  1103,  2161,  1110,
           5203,  1141,  1104,  1109, 17751, 22087, 15966,  1116,  1115,  2825,
           3374,  1146,  1106,  1109,  7267,   119,  1135,  6467,  2117, 20164,
          26271,   117,  2750, 15463, 21643,   117,  8835,  8540,  4808,   117,
           1105,  2096, 10176,   117,   170,   153,  5821,  8401, 14613,   119,
           1327,  1132,  1284, 20801,   146,  9471,  4302,   119,  9656,  1195,
           2372,  8123,  1114,   170,  6844,  7277,  6094,  2227,  1104, 16455,
           1105,   151, 17294,  2340,  1272,  1135,  3982, 10865,  1111,  4552,
            119,  4981,  6819,  1141,  2096,  1109,  1798, 22087, 15966,  1116,
            117,  1122,  1110,  1141,  1104,  1109,  1798,   157, 24657,  1468,
           1106,  8553,  1112,   170,  3921,   119, 11336,  8178,  2354,  4902,
           1111,  6064,   119,   102,     0,     0,     0,     0,     0,     0,
              0,     0,     0,   

In [61]:
# Only the training loader shuffles. Validation and test metrics do not depend on batch
# order, so those loaders iterate in dataset order, which keeps evaluation runs repeatable
# and lets collected predictions be lined up with the source rows.
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, pin_memory=True)
dataloader_eval = DataLoader(dataset_eval, batch_size=batch_size, shuffle=False, pin_memory=True)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, pin_memory=True)

# Bundle the loaders under the phase names the training loop iterates over.
dataloaders_dict = {"train": dataloader_train, "val": dataloader_eval}

In [62]:
# Sanity-check one training batch: size of the id tensor, the label tensor,
# and the token ids of the first example in the batch.
tmp = next(iter(dataloader_train))
print(tmp["ids"][0].size())
print(tmp["label"])
tmp["ids"][0][0]


torch.Size([16, 512])
tensor([1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0])


tensor([  101,   146,  1176,  1142,  1273,   170,  1974,   119,  1135,  1144,
          170,  7310,  8117,  1206,  1103,  5681,  1105,  3301,   170,  1642,
         1115,  1110,  2785,  8462,   117,  1103,  1642,  1104,  1103,  5250,
         3309,  6997,  1488,   119,  1109,  7631,   146,  1176,  1103,  1436,
         1649,  1108,  1103,  1236,  1115,  1103, 10919,  1402,  1108,  1167,
         1190,  1198,   170,  3582,  1111,  1103,  1642,   119,  1249,  1103,
         1401,  1500,  1103,  1488,  1103,  1642,  1104,  1117,  1676,   188,
         1266,  1107,  1103,  2350,  6941,  1116,  1104,  5144,  1161,   117,
         1103,  5290,  1104,  1447,  1105, 19971,  3316,  1126,  1593,  8854,
         9181,   119,  4434,  1108,  1177, 20731,  1115,   170,  3014, 10919,
         1125, 14511,  5415,  1105,  2764,   119,  8007,  1103,  1273,  1108,
         1304,  3903,   119,  1247,  1127,  4899,   117,  1649,   117,  1165,
         1122, 18691,  1181,  1113,  1315,  4105,   119,   119, 

In [63]:
# Pretrained "bert-base-cased" encoder. An earlier version of this cell loaded
# "cl-tohoku/bert-base-japanese-whole-word-masking" instead.
# NOTE(review): output_attentions / output_hidden_states ask every forward pass to also
# return attention maps and hidden states; the code visible in this notebook only reads
# result[0]. Confirm they are needed before keeping them.
model = BertModel.from_pretrained("bert-base-cased", output_attentions=True, output_hidden_states=True)

In [64]:
from torch import nn


class BertForLivedoor(nn.Module):
    '''BERT encoder followed by a linear head that classifies a text as positive or negative.

    Args:
        bert: encoder module to wrap. It must expose ``config.hidden_size`` and accept
            ``(input_ids, attention_mask=...)``. When omitted, the notebook-level
            ``model`` object is used, as before.
    '''

    def __init__(self, bert=None):
        super(BertForLivedoor, self).__init__()

        # BERT encoder; falls back to the module-level pretrained model.
        self.bert = model if bert is None else bert

        # Classification head: one linear layer from the encoder's hidden size to 2 classes.
        # Reading the size from the config avoids hard-coding 768.
        hidden_size = self.bert.config.hidden_size
        self.cls = nn.Linear(in_features=hidden_size, out_features=2)

        # Weight: small normal init. Bias: zeros. The previous
        # nn.init.normal_(bias, 0) drew the bias from N(0, 1) because std defaults to 1.
        nn.init.normal_(self.cls.weight, std=0.02)
        nn.init.zeros_(self.cls.bias)

    def forward(self, input_ids, attention_mask=None):
        '''Return class logits of shape [batch_size, 2].

        Args:
            input_ids: [batch_size, sequence_length] tensor of token ids.
            attention_mask: optional [batch_size, sequence_length] tensor, 1 for real
                tokens and 0 for padding. When omitted it is derived from ``input_ids``
                by treating the encoder's ``pad_token_id`` as padding, so padded
                positions are not attended to.
        '''
        if attention_mask is None:
            pad_token_id = getattr(self.bert.config, "pad_token_id", None)
            # Without a known pad id the encoder's own default (attend everywhere) is kept.
            if pad_token_id is not None:
                attention_mask = (input_ids != pad_token_id).long()

        # The first element of the encoder output is the per-token sequence output.
        result = self.bert(input_ids, attention_mask=attention_mask)

        # Vector of the first token ([CLS]) for every item in the batch: [batch_size, hidden_size].
        cls_vec = result[0][:, 0, :]

        return self.cls(cls_vec)

In [65]:
# Build the classifier and put it in training mode (train() returns the module itself).
net = BertForLivedoor().train()

print('ネットワーク設定完了')

ネットワーク設定完了


In [66]:
# Compute gradients only for the last BertLayer and the added classification head.

# 1. Freeze every parameter of the network.
for param in net.parameters():
    param.requires_grad = False

# 2-3. Re-enable gradients for the final encoder layer, then for the classifier.
for module in (net.bert.encoder.layer[-1], net.cls):
    for param in module.parameters():
        param.requires_grad = True

In [67]:
# 最適化手法の設定
import torch.optim as optim


# Two parameter groups: the last BERT layer is fine-tuned with a smaller learning rate
# than the newly added classification head.
optimizer = optim.Adam(
    [
        dict(params=net.bert.encoder.layer[-1].parameters(), lr=5e-5),
        dict(params=net.cls.parameters(), lr=1e-4),
    ]
)

# Loss function: CrossEntropyLoss applies log-softmax to the logits and then
# computes the negative log-likelihood loss.
criterion = nn.CrossEntropyLoss()

In [68]:
# モデルを学習させる関数を作成


def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    """Run alternating training and validation passes and return the network.

    Args:
        net: module called as ``net(input_ids)`` that returns per-class scores.
        dataloaders_dict: dict with ``"train"`` and ``"val"`` loaders. Each batch is a
            dict whose ``"ids"`` entry holds the id tensor at position 0 and whose
            ``"label"`` entry is the label tensor.
        criterion: loss called as ``criterion(outputs, labels)``; assumed to return the
            mean over the batch (the epoch loss is re-weighted by batch size on that basis).
        optimizer: optimizer stepped once per training batch.
        num_epochs: number of train + validation rounds.

    Returns:
        The same ``net`` object, moved to the selected device.
    """
    # Use the GPU when one is available.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("使用デバイス：", device)
    print('-----start-------')

    net.to(device)

    # Let cuDNN pick the fastest kernels; worthwhile because input shapes are fixed.
    torch.backends.cudnn.benchmark = True

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                net.train()
            else:
                net.eval()

            epoch_loss = 0.0     # sum of per-sample losses in this phase
            epoch_corrects = 0   # number of correct predictions in this phase
            n_seen = 0           # number of samples actually processed

            for iteration, batch in enumerate(dataloaders_dict[phase], start=1):
                inputs = batch["ids"][0].to(device)
                labels = batch["label"].to(device)

                # Real size of this batch. The last batch of an epoch is usually smaller
                # than the loader's nominal batch_size, and weighting by the nominal
                # size skewed both the epoch loss and the logged accuracy.
                n_samples = labels.size(0)

                optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                n_correct = torch.sum(preds == labels).item()

                # Log the training loss and this batch's accuracy every 10 iterations.
                if is_train and iteration % 10 == 0:
                    acc = n_correct / n_samples
                    print('イテレーション {} || Loss: {:.4f} || 10iter. || 本イテレーションの正解率：{}'.format(
                        iteration, loss.item(), acc))

                epoch_loss += loss.item() * n_samples
                epoch_corrects += n_correct
                n_seen += n_samples

            # Average over the samples processed; max() keeps an empty loader from
            # raising a division error.
            denom = max(n_seen, 1)
            epoch_loss = epoch_loss / denom
            epoch_acc = epoch_corrects / denom

            print('Epoch {}/{} | {:^5} |  Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, num_epochs,
                                                                           phase, epoch_loss, epoch_acc))

    return net

In [None]:
# Run training and validation, keeping the returned network for evaluation.
num_epochs = 3
net_trained = train_model(net, dataloaders_dict,
                          criterion, optimizer, num_epochs=num_epochs)


使用デバイス： cuda:0
-----start-------
イテレーション 10 || Loss: 0.6704 || 10iter. || 本イテレーションの正解率：0.625
イテレーション 20 || Loss: 0.7373 || 10iter. || 本イテレーションの正解率：0.5
イテレーション 30 || Loss: 0.6584 || 10iter. || 本イテレーションの正解率：0.5625
イテレーション 40 || Loss: 0.8347 || 10iter. || 本イテレーションの正解率：0.375
イテレーション 50 || Loss: 0.6714 || 10iter. || 本イテレーションの正解率：0.5625
イテレーション 60 || Loss: 0.6700 || 10iter. || 本イテレーションの正解率：0.4375
イテレーション 70 || Loss: 0.6947 || 10iter. || 本イテレーションの正解率：0.4375
イテレーション 80 || Loss: 0.6777 || 10iter. || 本イテレーションの正解率：0.6875
イテレーション 90 || Loss: 0.6885 || 10iter. || 本イテレーションの正解率：0.5
イテレーション 100 || Loss: 0.6169 || 10iter. || 本イテレーションの正解率：0.625
イテレーション 110 || Loss: 0.6189 || 10iter. || 本イテレーションの正解率：0.6875
イテレーション 120 || Loss: 0.5622 || 10iter. || 本イテレーションの正解率：0.8125
イテレーション 130 || Loss: 0.4013 || 10iter. || 本イテレーションの正解率：0.75
イテレーション 140 || Loss: 0.3209 || 10iter. || 本イテレーションの正解率：0.9375
イテレーション 150 || Loss: 0.3570 || 10iter. || 本イテレーションの正解率：0.9375
イテレーション 160 || Loss: 0.2851 || 10iter. || 本イテレーションの正解率：1.

In [None]:
from tqdm import tqdm

# Measure accuracy on the test data.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net_trained.eval()      # evaluation mode
net_trained.to(device)  # move to the GPU when one is available

# Running count of correct predictions.
epoch_corrects = 0

# Labels and predictions collected for the metric computations.
labels_all = []
preds_all = []

for batch in tqdm(dataloader_test):
    # Each batch is a dict; the id tensor sits at position 0 of its "ids" entry.
    inputs = batch["ids"][0].to(device)
    labels = batch["label"].to(device)

    # Forward pass without gradient tracking. No loss is computed here: the loop
    # only needs the predicted class, and the loss value was never used.
    with torch.set_grad_enabled(False):
        outputs = net_trained(inputs)
        _, preds = torch.max(outputs, 1)
        epoch_corrects += torch.sum(preds == labels.data)

        labels_all.extend(batch["label"].to('cpu').detach().numpy())
        preds_all.extend(preds.to('cpu').detach().numpy())

# Accuracy over the whole test set.
epoch_acc = epoch_corrects.double() / len(dataloader_test.dataset)

print('テストデータ{}個での正解率：{:.4f}'.format(len(dataloader_test.dataset), epoch_acc))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Accuracy, precision, recall and F1 computed from the collected labels and predictions.
accuracy = accuracy_score(labels_all, preds_all)
print(f"正解率: {accuracy:.3f}")
precision = precision_score(labels_all, preds_all)
print(f"適合率: {precision:.3f}")
recall = recall_score(labels_all, preds_all)
print(f"再現率: {recall:.3f}")
f1 = f1_score(labels_all, preds_all)
print(f"F1: {f1:.3f}")