In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from datasets import load_dataset


In [2]:
from datasets.utils.logging import disable_progress_bar
import datasets
import transformers
import logging
import os
import warnings


# Silence the datasets library: no progress bars, errors-only logging
disable_progress_bar()
datasets.logging.set_verbosity_error()

# Silence transformers, both through its own verbosity API and through
# the standard-library logger of the same name
transformers.logging.set_verbosity_error()
logging.getLogger("transformers").setLevel(logging.ERROR)

# Ignore every FutureWarning (the filter is process-wide, not limited to transformers)
warnings.simplefilter("ignore", category=FutureWarning)

# Put Weights & Biases into disabled mode through its environment variable
os.environ["WANDB_MODE"] = "disabled"


In [3]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# 1. Load the Financial PhraseBank dataset ("sentences_allagree" configuration)
dataset = load_dataset("takala/financial_phrasebank", "sentences_allagree")

# Show which splits and columns were loaded
print(dataset)

# Hold out 20% of the "train" split as a test set; the fixed seed keeps the split repeatable
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = dataset["train"], dataset["test"]

# 2. Load the BERT tokenizer
model_name = "bert-base-uncased"  # or "ProsusAI/finbert" (suited to the financial domain)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenization helper applied to the datasets
def tokenize_function(examples):
    """Tokenize the ``sentence`` field of a batch of examples.

    Args:
        examples: Mapping with a ``'sentence'`` entry holding the text(s) to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences,
        truncated and padded to a fixed length of 128.
    """
    options = {"truncation": True, "padding": "max_length", "max_length": 128}
    sentences = examples['sentence']
    return tokenizer(sentences, **options)

# Tokenize both splits in batches, then drop the raw-text column that is no longer needed
train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(["sentence"])
test_dataset = test_dataset.map(tokenize_function, batched=True).remove_columns(["sentence"])

# Make both datasets return PyTorch tensors (set_format is applied in place, no reassignment)
for split in (train_dataset, test_dataset):
    split.set_format("torch")

# 3. Load the pretrained BERT model with a 3-class classification head.
# The classification head is not part of the pretrained checkpoint and is
# randomly initialised when the model is created. Trainer only seeds the RNGs
# later, in its constructor, so without an explicit seed here every fresh
# kernel would start from different head weights and the run would not be
# reproducible.
from transformers import set_seed

set_seed(42)  # same value as the split seed and as `seed` in TrainingArguments below
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# 4. Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",    # evaluate once per epoch
    save_strategy="epoch",          # save a checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,    # reload the best checkpoint when training finishes
    seed=42,                        # explicit, so it visibly matches set_seed above
)


# 5. Build the Trainer; the held-out test split is used as the evaluation set
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# 6. Run fine-tuning
trainer.train()

# 7. Save the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./sentiment_model")

print("模型訓練完成，已保存至 ./sentiment_model")


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2264
    })
})
{'loss': 0.985, 'learning_rate': 1.9415204678362573e-05, 'epoch': 0.09}
{'loss': 0.7861, 'learning_rate': 1.8830409356725147e-05, 'epoch': 0.18}
{'loss': 0.6907, 'learning_rate': 1.824561403508772e-05, 'epoch': 0.26}
{'loss': 0.6417, 'learning_rate': 1.7660818713450293e-05, 'epoch': 0.35}
{'loss': 0.6017, 'learning_rate': 1.7076023391812867e-05, 'epoch': 0.44}
{'loss': 0.5637, 'learning_rate': 1.649122807017544e-05, 'epoch': 0.53}
{'loss': 0.4817, 'learning_rate': 1.5906432748538013e-05, 'epoch': 0.61}
{'loss': 0.5114, 'learning_rate': 1.5321637426900587e-05, 'epoch': 0.7}
{'loss': 0.5408, 'learning_rate': 1.4736842105263159e-05, 'epoch': 0.79}
{'loss': 0.4034, 'learning_rate': 1.4152046783625733e-05, 'epoch': 0.88}
{'loss': 0.3376, 'learning_rate': 1.3567251461988304e-05, 'epoch': 0.96}
{'eval_loss': 0.3035512864589691, 'eval_runtime': 28.6369, 'eval_samples_per_second': 15.819

In [4]:
# Sample sentences; the trailing comment is the sentiment each one is expected to receive
test_texts = [
    "The company's profit has increased significantly this quarter.",  # Positive
    "The increase in costs negatively affected the revenue.",          # Negative
    "The company's performance remained stable."                       # Neutral
]

# Tokenize the sample sentences, padding to the longest sentence in the batch
inputs = tokenizer(test_texts, return_tensors="pt", truncation=True, padding=True, max_length=128)

# Training may have left the model on an accelerator (GPU/MPS). The inputs must
# live on the same device, otherwise the forward pass fails with a device mismatch.
inputs = inputs.to(model.device)

# Inference: eval mode switches off dropout, no_grad skips gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class per sentence. Move the result to the CPU first because
# Tensor.numpy() only works on CPU tensors.
predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()

# Class index -> sentiment name
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
predicted_labels = [label_map[int(pred)] for pred in predictions]

print(predicted_labels)


['Positive', 'Negative', 'Positive']
