# 感情分析

In [9]:
import torch

print(torch.__version__)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

2.0.1+cu117
cuda:0


In [10]:
from transformers import BertForSequenceClassification, BertTokenizerFast, BertJapaneseTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# 日本語版(東北大BERT-base)
model = BertForSequenceClassification.from_pretrained('cl-tohoku/bert-base-japanese-v3', num_labels=3)
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from datasets import load_dataset

dataset = load_dataset('dataset_loader.py', name='sentiment_dataset')

In [24]:
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

train_dataset, test_dataset = dataset['train'].map(tokenize, batched=True), dataset['test'].map(tokenize, batched=True)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/1039 [00:00<?, ? examples/s]

Map:   0%|          | 0/260 [00:00<?, ? examples/s]

In [25]:
train_dataset, test_dataset

(Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 1039
 }),
 Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 260
 }))

In [27]:
# トレーニングの設定
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # accumulate gradients over 2 batches
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# トレーナーの初期化とトレーニング開始
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()



  0%|          | 0/390 [00:00<?, ?it/s]

{'train_runtime': 103.9203, 'train_samples_per_second': 29.994, 'train_steps_per_second': 3.753, 'train_loss': 0.5935268695537861, 'epoch': 3.0}


TrainOutput(global_step=390, training_loss=0.5935268695537861, metrics={'train_runtime': 103.9203, 'train_samples_per_second': 29.994, 'train_steps_per_second': 3.753, 'train_loss': 0.5935268695537861, 'epoch': 3.0})

In [45]:
trainer.save_state()
trainer.save_model()

In [31]:
import pandas as pd
valid_df = pd.read_csv('valid.csv')

In [47]:
predictions = trainer.predict(test_dataset)

  0%|          | 0/17 [00:00<?, ?it/s]

In [48]:
predictions

PredictionOutput(predictions=array([[-1.9655111 , -1.2986985 ,  4.169394  ],
       [-2.0385394 ,  5.1809254 , -0.8070311 ],
       [-2.1409342 ,  5.070004  , -0.45958045],
       [-1.9050606 ,  3.775023  , -0.5910518 ],
       [-0.44389388, -0.90765095,  1.0388612 ],
       [ 2.5747912 , -3.14888   , -1.5317324 ],
       [ 1.0897841 , -3.409898  ,  0.54566747],
       [-1.8271153 ,  3.3134055 , -0.2808682 ],
       [-1.7967771 ,  4.0041804 , -0.7487513 ],
       [ 2.8475056 , -2.9783704 , -2.173442  ],
       [-1.2418618 ,  2.5220127 ,  0.53816503],
       [-1.9050539 ,  3.9254553 , -0.14849323],
       [-1.5194582 ,  3.2830365 ,  0.2885439 ],
       [-1.8286268 ,  5.0385675 , -0.77411944],
       [-1.7477475 , -1.0836008 ,  4.0378585 ],
       [ 2.6249578 , -3.2748709 , -1.2870512 ],
       [-1.7763867 ,  4.7867723 , -0.5169575 ],
       [-1.7274085 ,  4.3119264 , -0.8143187 ],
       [-1.2560631 ,  4.306547  , -0.45434156],
       [-1.0653136 ,  1.3445514 ,  0.7005827 ],
       [ 1.