# 感情分析

In [1]:
import torch

print(torch.__version__)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

2.0.1+cpu
cpu


## Fine-tuning

In [2]:
from transformers import BertForSequenceClassification, BertTokenizerFast, BertJapaneseTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# 日本語版(東北大BERT-base)
model = BertForSequenceClassification.from_pretrained('cl-tohoku/bert-base-japanese-v3', num_labels=3)
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from datasets import load_dataset

dataset = load_dataset('dataset_loader.py', name='sentiment_dataset')

In [4]:
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

train_dataset, test_dataset, eval_dataset = dataset['train'].map(tokenize, batched=True), dataset['test'].map(tokenize, batched=True), dataset['validation'].map(tokenize, batched=True)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
eval_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [5]:
train_dataset, test_dataset, eval_dataset

(Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 649
 }),
 Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 324
 }),
 Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 325
 }))

In [60]:
# トレーニングの設定
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # accumulate gradients over 2 batches
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# トレーナーの初期化とトレーニング開始
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# チェックポイントから学習を再開したいときと
# trainer.train(ignore_keys_for_eval=['last_hidden_state', 'hidden_states', 'attentions'],
            #   resume_from_checkpoint=True)

trainer.train()



  0%|          | 0/243 [00:00<?, ?it/s]

{'train_runtime': 65.1361, 'train_samples_per_second': 29.891, 'train_steps_per_second': 3.731, 'train_loss': 0.7231104344497492, 'epoch': 2.98}


TrainOutput(global_step=243, training_loss=0.7231104344497492, metrics={'train_runtime': 65.1361, 'train_samples_per_second': 29.891, 'train_steps_per_second': 3.731, 'train_loss': 0.7231104344497492, 'epoch': 2.98})

In [None]:
predictions = trainer.predict(test_dataset)
predictions

In [45]:
trainer.save_state()
trainer.save_model()

## 保存したモデルで予測する

In [2]:
from transformers import BertForSequenceClassification, BertTokenizerFast, BertJapaneseTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')
dataset = load_dataset('dataset_loader.py', name='sentiment_dataset')

def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

eval_dataset =  dataset['validation'].map(tokenize, batched=True)
eval_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/325 [00:00<?, ? examples/s]

In [5]:
# 保存したモデルを読み込む
model_path = 'results/'
model = BertForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(
    model=model
)

In [12]:
predictions = trainer.predict(eval_dataset)
predictions

In [11]:
eval_dataset.to_pandas()

Unnamed: 0,text,label,input_ids,token_type_ids,attention_mask
0,以前より高くなっている 。,0,"[2, 13204, 12505, 14031, 12493, 456, 12483, 38...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
1,小岩井の生乳100％ヨーグルトを安くしてくださって嬉しいです これからも買い続けるのでお値段...,1,"[2, 1829, 7718, 7641, 464, 3904, 7507, 12915, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,ＱＵＩＣpayで支払いで10%OＦＦにするならば、majicaにチャージして支払いをした場合...,1,"[2, 64, 7100, 13952, 7085, 13243, 457, 19567, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,9月の特売品がたくさん有って良かったです!,1,"[2, 40, 2806, 464, 3720, 7932, 1286, 430, 2227...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,是非気軽にいただける飲食店を増やしてほしいです！フードコートでも店舗でも。,1,"[2, 32039, 3235, 7316, 461, 27184, 12685, 1901...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...
320,カツカレーライスを、購入したのだけど、フォークと割り箸しか無くて、残念ですスプーンも有ると嬉...,0,"[2, 21804, 19487, 25281, 500, 384, 13929, 441,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
321,ナシ。,2,"[2, 546, 7033, 385, 3, 0, 0, 0, 0, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
322,手首を痛めているので、店員さんにカゴの移動をお願いしたら、笑顔で運んでくれ､とても感じ良かっ...,1,"[2, 29091, 500, 24031, 456, 12483, 464, 457, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
323,会計時にSSさんがLINEのお友達登録を勧めて下さりスマホの登録の方法も丁寧に教えてくれまし...,1,"[2, 17643, 2734, 461, 16104, 13038, 430, 25426...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
