# 感情分析

In [1]:
import torch

print(torch.__version__)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

2.0.0+cu117
cpu


## Fine-tuning

In [8]:
from transformers import BertForSequenceClassification, BertTokenizerFast, BertJapaneseTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

base_model_name = 'cl-tohoku/bert-base-japanese-v3'
# base_model_name = 'cl-tohoku/bert-large-japanese-v2'
prefix = base_model_name.split('/')[-1]

id2label = {0: "NEGATIVE", 1: "POSITIVE", 2: "NEUTRAL"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1, "NEUTRAL": 2}

model = BertForSequenceClassification.from_pretrained(
    base_model_name, num_labels=3, id2label=id2label, label2id=label2id)
model = model.to(device)
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-large-japanese-v2')

Downloading (…)lve/main/config.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/447M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [4]:
from datasets import load_dataset

dataset = load_dataset('dataset_loader.py', name='sentiment_dataset')

In [5]:
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

train_dataset, test_dataset = dataset['train'].map(tokenize, batched=True), dataset['test'].map(tokenize, batched=True)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/649 [00:00<?, ? examples/s]

Map:   0%|          | 0/324 [00:00<?, ? examples/s]

In [6]:
train_dataset, test_dataset

(Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 649
 }),
 Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 324
 }))

In [36]:
# Poetryが入っていないとログが出力されないので注意
# !pip install tensorboard/ poetry add tensorboard

# トレーニングの設定
training_args = TrainingArguments(
    output_dir='./results',             # 出力フォルダ
    logging_dir='./logs',               # ログ保存フォルダ
    num_train_epochs=50,               # エポック数
    per_device_train_batch_size=1,      # 訓練のバッチサイズ (GPU数によって変える) 8, 1
    per_device_eval_batch_size=4,      # 評価のバッチサイズ (GPU数によって変える) 16 ,4
    gradient_accumulation_steps=2,      # accumulate gradients over 2 batches (GPU数によって変える)
    warmup_steps=500,                   # 学習率スケジューラのウォームアップステップ数
    weight_decay=0.01,                  # 重み減衰の強さ
    save_steps=1000,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    prediction_loss_only=True,
)

training_args.output_dir = f'./results_{prefix}_{training_args.num_train_epochs}_v1' # 出力フォルダ

# トレーナーの初期化とトレーニング開始
trainer = Trainer(
    model=model,                        # モデル
    args=training_args,                 # 訓練引数
    train_dataset=train_dataset,        # 訓練データセット
    eval_dataset=test_dataset,          # 評価データセット
    compute_metrics=compute_metrics
)

# チェックポイントから学習を再開したいとき
# trainer.train(ignore_keys_for_eval=['last_hidden_state', 'hidden_states', 'attentions'],
            #   resume_from_checkpoint=True)

trainer.train()



  0%|          | 0/972 [00:00<?, ?it/s]

{'loss': 1.0619, 'learning_rate': 1e-05, 'epoch': 0.31}


  0%|          | 0/81 [00:00<?, ?it/s]

{'eval_loss': 0.8175802230834961, 'eval_runtime': 26.04, 'eval_samples_per_second': 12.442, 'eval_steps_per_second': 3.111, 'epoch': 0.31}
{'loss': 0.8521, 'learning_rate': 2e-05, 'epoch': 0.62}


  0%|          | 0/81 [00:00<?, ?it/s]

{'eval_loss': 1.250738501548767, 'eval_runtime': 29.7627, 'eval_samples_per_second': 10.886, 'eval_steps_per_second': 2.722, 'epoch': 0.62}
{'loss': 0.9761, 'learning_rate': 3e-05, 'epoch': 0.92}


  0%|          | 0/81 [00:00<?, ?it/s]

{'eval_loss': 0.9984952211380005, 'eval_runtime': 26.678, 'eval_samples_per_second': 12.145, 'eval_steps_per_second': 3.036, 'epoch': 0.92}
{'loss': 1.0181, 'learning_rate': 4e-05, 'epoch': 1.23}


  0%|          | 0/81 [00:00<?, ?it/s]

{'eval_loss': 1.3086249828338623, 'eval_runtime': 30.0851, 'eval_samples_per_second': 10.769, 'eval_steps_per_second': 2.692, 'epoch': 1.23}
{'loss': 1.1165, 'learning_rate': 5e-05, 'epoch': 1.54}


  0%|          | 0/81 [00:00<?, ?it/s]

{'eval_loss': 0.8654221892356873, 'eval_runtime': 32.9724, 'eval_samples_per_second': 9.826, 'eval_steps_per_second': 2.457, 'epoch': 1.54}
{'loss': 0.9662, 'learning_rate': 3.940677966101695e-05, 'epoch': 1.85}


  0%|          | 0/81 [00:00<?, ?it/s]

{'eval_loss': 1.2588351964950562, 'eval_runtime': 50.4202, 'eval_samples_per_second': 6.426, 'eval_steps_per_second': 1.606, 'epoch': 1.85}
{'loss': 1.2417, 'learning_rate': 2.88135593220339e-05, 'epoch': 2.16}


  0%|          | 0/81 [00:00<?, ?it/s]

{'eval_loss': 1.290429711341858, 'eval_runtime': 51.2295, 'eval_samples_per_second': 6.324, 'eval_steps_per_second': 1.581, 'epoch': 2.16}
{'loss': 1.1504, 'learning_rate': 1.8220338983050846e-05, 'epoch': 2.47}


  0%|          | 0/81 [00:00<?, ?it/s]

{'eval_loss': 1.0374375581741333, 'eval_runtime': 39.9298, 'eval_samples_per_second': 8.114, 'eval_steps_per_second': 2.029, 'epoch': 2.47}
{'loss': 1.0809, 'learning_rate': 7.627118644067798e-06, 'epoch': 2.77}


  0%|          | 0/81 [00:00<?, ?it/s]

{'eval_loss': 1.0978227853775024, 'eval_runtime': 47.0998, 'eval_samples_per_second': 6.879, 'eval_steps_per_second': 1.72, 'epoch': 2.77}
{'train_runtime': 2006.6417, 'train_samples_per_second': 0.97, 'train_steps_per_second': 0.484, 'train_loss': 1.050971670896428, 'epoch': 3.0}


TrainOutput(global_step=972, training_loss=1.050971670896428, metrics={'train_runtime': 2006.6417, 'train_samples_per_second': 0.97, 'train_steps_per_second': 0.484, 'train_loss': 1.050971670896428, 'epoch': 3.0})

In [38]:
num_training_steps = train_dataset.num_rows / (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs

num_training_steps

973.5

In [37]:
trainer.evaluate(test_dataset)

  0%|          | 0/81 [00:00<?, ?it/s]

{'eval_loss': 1.0628305673599243,
 'eval_runtime': 58.2609,
 'eval_samples_per_second': 5.561,
 'eval_steps_per_second': 1.39,
 'epoch': 3.0}

In [39]:
trainer.save_state()
trainer.save_model()

In [40]:
training_args.output_dir

'./results_bert-large-japanese-v2_3'

In [None]:
# poetry shell
# tensorboard --logdir ./datasets/sentiment_classification/logs

## 保存したモデルで予測する

In [2]:
from transformers import BertForSequenceClassification, BertTokenizerFast, BertJapaneseTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')
dataset = load_dataset('dataset_loader.py', name='sentiment_dataset')

def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

eval_dataset =  dataset['validation'].map(tokenize, batched=True)
eval_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/325 [00:00<?, ? examples/s]

In [5]:
# 保存したモデルを読み込む
model_path = 'results/'
model = BertForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=model
)

In [12]:
predictions = trainer.predict(eval_dataset)
predictions

In [11]:
eval_dataset.to_pandas()

Unnamed: 0,text,label,input_ids,token_type_ids,attention_mask
0,以前より高くなっている 。,0,"[2, 13204, 12505, 14031, 12493, 456, 12483, 38...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
1,小岩井の生乳100％ヨーグルトを安くしてくださって嬉しいです これからも買い続けるのでお値段...,1,"[2, 1829, 7718, 7641, 464, 3904, 7507, 12915, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,ＱＵＩＣpayで支払いで10%OＦＦにするならば、majicaにチャージして支払いをした場合...,1,"[2, 64, 7100, 13952, 7085, 13243, 457, 19567, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,9月の特売品がたくさん有って良かったです!,1,"[2, 40, 2806, 464, 3720, 7932, 1286, 430, 2227...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,是非気軽にいただける飲食店を増やしてほしいです！フードコートでも店舗でも。,1,"[2, 32039, 3235, 7316, 461, 27184, 12685, 1901...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...
320,カツカレーライスを、購入したのだけど、フォークと割り箸しか無くて、残念ですスプーンも有ると嬉...,0,"[2, 21804, 19487, 25281, 500, 384, 13929, 441,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
321,ナシ。,2,"[2, 546, 7033, 385, 3, 0, 0, 0, 0, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
322,手首を痛めているので、店員さんにカゴの移動をお願いしたら、笑顔で運んでくれ､とても感じ良かっ...,1,"[2, 29091, 500, 24031, 456, 12483, 464, 457, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
323,会計時にSSさんがLINEのお友達登録を勧めて下さりスマホの登録の方法も丁寧に教えてくれまし...,1,"[2, 17643, 2734, 461, 16104, 13038, 430, 25426...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
