
# BERT 感情分析　サンプル


参考記事:
https://www.analyticsvidhya.com/blog/2021/05/all-you-need-to-know-about-bert/


前提環境:　Google Colab
動作確認: 2022-03-13

## 環境準備

### transformersの導入

In [None]:
!pip install transformers | tail -n 1

In [None]:
# Initialize the random seed
import torch
#import random

SEED = 1111
# Seed PyTorch's random number generator with the fixed SEED
torch.manual_seed(SEED)
# Set cuDNN's deterministic flag
torch.backends.cudnn.deterministic = True

### BertTokenizerのインポート

In [None]:
# Create the tokenizer instance
# The target model is 'bert-base-uncased'
#from pytorch_transformers import BertTokenizer
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Check what tokenizer.tokenize produces for a sample sentence
tokens = tokenizer.tokenize("What's going on?")
print(tokens)

In [None]:
# Check what convert_tokens_to_ids produces for the tokens above
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

In [None]:
# BERT-specific special tokens (read from the tokenizer)
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
print(cls_token, sep_token, pad_token, unk_token)

In [None]:
# The same special tokens expressed as ids
cls_token_idx = tokenizer.cls_token_id
sep_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id
print(cls_token_idx, sep_token_idx, pad_token_idx, unk_token_idx)

### 学習データの定義

In [None]:
# Tokenization function for the input text
def tokenize(sentence):
    """Split ``sentence`` with the module-level tokenizer and truncate it.

    At most 252 tokens (254 - 2) are kept.

    Args:
        sentence: Raw input text.

    Returns:
        The (possibly truncated) list of tokens.
    """
    max_tokens = 254 - 2
    return tokenizer.tokenize(sentence)[:max_tokens]

In [None]:
# Define the data structure of the training data

# After the torchtext version upgrade, the "legacy" namespace is required
from torchtext.legacy import data

# Input data
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  # The tokenization function defined above
                  tokenize = tokenize,
                  # As preprocessing, convert each token to its id
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = cls_token_idx,
                  eos_token = sep_token_idx,
                  pad_token = pad_token_idx)
 
# Ground-truth label
LABEL = data.LabelField()

In [None]:
# Load the data
from torchtext.legacy import datasets
# IMDB is a dataset of movie reviews used to train sentiment-analysis (positive/negative) models
# Loading takes about 20 minutes
# NOTE(review): the second returned split is named valid_data here; it is
# presumably IMDB's test split - confirm against the torchtext documentation.
train_data, valid_data = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# Training on every example takes a long time, so keep only 1/10 of each split
import random

# random.seed() returns None, so passing its return value as random_state
# handed None to split() and was reproducible only through the side effect of
# seeding the global generator. Seed once and pass the captured generator
# state explicitly instead, so the dependency on SEED is visible.
random.seed(SEED)
split_state = random.getstate()
train_data1, train_data2 = train_data.split(random_state=split_state, split_ratio=0.1)
valid_data1, valid_data2 = valid_data.split(random_state=split_state, split_ratio=0.1)

In [None]:
# Build the label vocabulary from the subsampled training split
LABEL.build_vocab(train_data1)

In [None]:
# Batch size used for training
BATCH_SIZE = 16

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Define the training and validation iterators over the subsampled splits
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data1, valid_data1), 
    batch_size = BATCH_SIZE, 
    device = device)

In [None]:
# Take a look at one training example (index 1 of the full training set)
idx0 = train_data[1].text
print(idx0)

In [None]:
# Convert the ids back to their token representation
text0 = tokenizer.convert_ids_to_tokens(idx0)
print(text0)

## 学習の準備

### モデルの定義

bert-base-uncased:  
12層、隠れ層768次元、12ヘッド、パラメータ数110M。  
小文字化した英語テキストで事前学習されたモデル。

https://torch.classcat.com/2021/05/16/huggingface-transformers-4-6-pretrained-models/

In [None]:
# Load the pretrained model
from transformers import BertModel
bert = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# Print the loaded model object
print(bert)

In [None]:
# モデルの定義
# 事前学習済みモデルの後段に線形関数を追加し、この出力で感情分析をする
import torch.nn as nn

class BERTSentiment(nn.Module):
    """Sentiment classifier: a BERT encoder followed by a linear output layer.

    Args:
        bert: Encoder module. ``bert.config.to_dict()`` must contain
            ``hidden_size``. Calling the encoder must return a sequence whose
            element 1 is used as the sentence embedding
            ([batch size, emb dim]).
        output_dim: Number of output classes.
    """

    def __init__(self,
                 bert,
                 output_dim):

        super().__init__()

        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']
        self.out = nn.Linear(embedding_dim, output_dim)
        # Padding id used to build the attention mask in forward().
        # A plain attribute (not a parameter or buffer), so state_dict
        # contents are unchanged. None disables masking.
        self.pad_token_id = getattr(bert.config, 'pad_token_id', None)

    def forward(self, text):
        """Return class scores for a batch of token-id sequences.

        Args:
            text: Integer tensor of token ids, [batch size, sent len].

        Returns:
            Tensor of shape [batch size, out dim].
        """
        #text = [batch size, sent len]

        # Batches are padded to a common length. Without a mask the encoder
        # attends to the padding tokens too, so the result would depend on
        # how much padding a sentence happens to receive.
        attention_mask = None
        if self.pad_token_id is not None:
            attention_mask = text.ne(self.pad_token_id).long()

        #embedded = [batch size, emb dim]
        embedded = self.bert(text, attention_mask=attention_mask)[1]

        #output = [batch size, out dim]
        output = self.out(embedded)

        return output

In [None]:
# Create the model instance
# The output dimension is 2 because this is sentiment analysis

OUTPUT_DIM = 2

model = BERTSentiment(bert,
                     OUTPUT_DIM).to(device)

In [None]:
# Check the number of model parameters
def count_parameters(model):
    """Return the total element count of parameters with ``requires_grad`` set.

    Args:
        model: Object exposing ``parameters()``.

    Returns:
        Sum of ``numel()`` over every parameter whose ``requires_grad`` is true.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count, formatted with thousands separators
print(f'The model has {count_parameters(model):,} trainable parameters')

### 最適化関数、損失関数の定義など

In [None]:
import torch.optim as optim
from transformers import get_constant_schedule_with_warmup
# Optimizer.
# transformers' own AdamW is deprecated in favour of torch.optim.AdamW, so the
# PyTorch implementation is used. weight_decay is set to 0.0 explicitly
# because the transformers class defaulted to 0.0 while torch.optim.AdamW
# defaults to 0.01; this keeps the update rule the same as before.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# Loss function
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
# Scheduler definition
def get_scheduler(optimizer, warmup_steps):
    """Build the learning-rate scheduler for ``optimizer``.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        warmup_steps: Passed on as ``num_warmup_steps``.

    Returns:
        The object returned by ``get_constant_schedule_with_warmup``.
    """
    return get_constant_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
    )

In [None]:
# Accuracy computation

def categorical_accuracy(preds, y):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: Score tensor; the argmax is taken along dim 1.
        y: Tensor of class indices, one per row of ``preds``.

    Returns:
        Scalar tensor: number of matches divided by ``len(y)``.
    """
    predicted_classes = preds.argmax(dim=1)
    hits = predicted_classes.eq(y).float()
    return hits.sum() / len(y)

In [None]:
# ステータスバー表示用
from tqdm.notebook import tqdm

### 学習用関数trainの定義

In [None]:
def train(model, iterator, optimizer, criterion, scheduler):
    """Run one training pass over ``iterator``.

    For every batch: zero the gradients, run the model on ``batch.text``,
    compute loss and accuracy against ``batch.label``, back-propagate, then
    step the optimizer and the scheduler.

    Args:
        model: Model to train; it is switched to training mode.
        iterator: Iterable of batches exposing ``text`` and ``label``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss function called as ``criterion(predictions, label)``.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        Tuple ``(loss, accuracy)``, each the sum of per-batch values divided
        by ``len(iterator)``.
    """
    model.train()
    running_loss = 0
    running_acc = 0

    for batch in tqdm(iterator):
        # Start every step from zeroed gradients.
        optimizer.zero_grad()
        # Release unoccupied cached GPU memory before the forward pass.
        torch.cuda.empty_cache()

        text, label = batch.text, batch.label
        logits = model(text)
        step_loss = criterion(logits, label)
        step_acc = categorical_accuracy(logits, label)

        step_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += step_loss.item()
        running_acc += step_acc.item()

    num_batches = len(iterator)
    return running_loss / num_batches, running_acc / num_batches

### 検証用関数evaluateの定義

In [None]:
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over ``iterator`` without updating the model.

    Args:
        model: Model to evaluate; it is switched to evaluation mode.
        iterator: Iterable of batches exposing ``text`` and ``label``.
        criterion: Loss function called as ``criterion(predictions, label)``.

    Returns:
        Tuple ``(loss, accuracy)``, each the sum of per-batch values divided
        by ``len(iterator)``.
    """
    model.eval()
    loss_total = 0
    acc_total = 0

    # No gradients are needed while evaluating.
    with torch.no_grad():
        for batch in tqdm(iterator):
            text, label = batch.text, batch.label
            logits = model(text)
            batch_loss = criterion(logits, label)
            batch_acc = categorical_accuracy(logits, label)
            loss_total += batch_loss.item()
            acc_total += batch_acc.item()

    num_batches = len(iterator)
    return loss_total / num_batches, acc_total / num_batches

### 処理時間計算用関数　epoch_timeの定義

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

## 学習

In [None]:
# Initialize the training variables
import math
N_EPOCHS = 3
# Take the sample count from the subsampled training split instead of a
# hard-coded number, so it stays correct if the split ratio changes.
train_data_len = len(train_data1)

warmup_percent = 0.2
# The scheduler is stepped once per batch. Each epoch has
# ceil(train_data_len / BATCH_SIZE) batches (the last one may be partial),
# so round up per epoch before multiplying by the number of epochs.
steps_per_epoch = math.ceil(train_data_len / BATCH_SIZE)
total_steps = N_EPOCHS * steps_per_epoch
warmup_steps = int(total_steps*warmup_percent)
scheduler = get_scheduler(optimizer, warmup_steps)

# Lowest validation loss seen so far; any real loss compares below infinity
best_valid_loss = float('inf')

In [None]:
# Training
# Training takes about 5 minutes per epoch, about 15 minutes in total (on Google Colab with a GPU)
for epoch in range(N_EPOCHS):

    start_time = time.time()
    # Train for one epoch on the training iterator
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, scheduler)
    # Evaluate on the validation data
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # Compute the elapsed time (covers both training and evaluation)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Save the model whenever the validation loss is the best seen so far
    # NOTE(review): the checkpoint is named 'bert-nli.pt' although this is a
    # sentiment model - confirm whether the filename is intentional.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bert-nli.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')