In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from datasets import load_dataset
import warnings
warnings.filterwarnings("ignore")

In [2]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# 1. 載入 Financial PhraseBank 資料集
dataset = load_dataset("takala/financial_phrasebank", "sentences_allagree")

# 檢查資料結構
print(dataset)

# 分割資料集為訓練集和測試集
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# 2. 載入 BERT Tokenizer
model_name = "bert-base-uncased"  # 或 "ProsusAI/finbert"（適合金融領域）
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize 資料
def tokenize_function(examples):
    return tokenizer(examples['sentence'], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# 移除多餘欄位
train_dataset = train_dataset.remove_columns(["sentence"])
test_dataset = test_dataset.remove_columns(["sentence"])

# 設定格式為 PyTorch tensors
train_dataset.set_format("torch")
test_dataset.set_format("torch")

# 3. 載入預訓練的 BERT 模型
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# 4. 訓練參數設定
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",    # 每個 epoch 評估一次
    save_strategy="epoch",          # 每個 epoch 保存模型
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True     # 訓練結束後載入最佳模型
)


# 5. 設定 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# 6. 開始訓練
trainer.train()

# 7. 保存模型
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

print("模型訓練完成，已保存至 ./sentiment_model")


Found cached dataset financial_phrasebank (C:/Users/miche/.cache/huggingface/datasets/takala___financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at C:\Users\miche\.cache\huggingface\datasets\takala___financial_phrasebank\sentences_allagree\1.0.0\550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141\cache-149e9ab1ee378ec4.arrow and C:\Users\miche\.cache\huggingface\datasets\takala___financial_phrasebank\sentences_allagree\1.0.0\550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141\cache-e7b02ae1a3031edc.arrow


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2264
    })
})


Loading cached processed dataset at C:\Users\miche\.cache\huggingface\datasets\takala___financial_phrasebank\sentences_allagree\1.0.0\550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141\cache-b9cdef0daec75b53.arrow
Loading cached processed dataset at C:\Users\miche\.cache\huggingface\datasets\takala___financial_phrasebank\sentences_allagree\1.0.0\550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141\cache-574dd26729716385.arrow
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. ini

  0%|          | 0/342 [00:00<?, ?it/s]

{'loss': 0.9645, 'learning_rate': 1.9415204678362573e-05, 'epoch': 0.09}
{'loss': 0.765, 'learning_rate': 1.8830409356725147e-05, 'epoch': 0.18}
{'loss': 0.6619, 'learning_rate': 1.824561403508772e-05, 'epoch': 0.26}
{'loss': 0.6352, 'learning_rate': 1.7660818713450293e-05, 'epoch': 0.35}
{'loss': 0.5417, 'learning_rate': 1.7076023391812867e-05, 'epoch': 0.44}
{'loss': 0.4724, 'learning_rate': 1.649122807017544e-05, 'epoch': 0.53}
{'loss': 0.4122, 'learning_rate': 1.5906432748538013e-05, 'epoch': 0.61}
{'loss': 0.3753, 'learning_rate': 1.5321637426900587e-05, 'epoch': 0.7}
{'loss': 0.3414, 'learning_rate': 1.4736842105263159e-05, 'epoch': 0.79}
{'loss': 0.2806, 'learning_rate': 1.4152046783625733e-05, 'epoch': 0.88}
{'loss': 0.2142, 'learning_rate': 1.3567251461988304e-05, 'epoch': 0.96}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.2515396177768707, 'eval_runtime': 33.783, 'eval_samples_per_second': 13.409, 'eval_steps_per_second': 0.858, 'epoch': 1.0}
{'loss': 0.1997, 'learning_rate': 1.2982456140350879e-05, 'epoch': 1.05}
{'loss': 0.143, 'learning_rate': 1.239766081871345e-05, 'epoch': 1.14}
{'loss': 0.0933, 'learning_rate': 1.1812865497076024e-05, 'epoch': 1.23}
{'loss': 0.1517, 'learning_rate': 1.1228070175438597e-05, 'epoch': 1.32}
{'loss': 0.0587, 'learning_rate': 1.0643274853801172e-05, 'epoch': 1.4}
{'loss': 0.134, 'learning_rate': 1.0058479532163743e-05, 'epoch': 1.49}
{'loss': 0.1667, 'learning_rate': 9.473684210526315e-06, 'epoch': 1.58}
{'loss': 0.1041, 'learning_rate': 8.888888888888888e-06, 'epoch': 1.67}
{'loss': 0.117, 'learning_rate': 8.304093567251463e-06, 'epoch': 1.75}
{'loss': 0.0624, 'learning_rate': 7.719298245614036e-06, 'epoch': 1.84}
{'loss': 0.0436, 'learning_rate': 7.134502923976608e-06, 'epoch': 1.93}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.1892276406288147, 'eval_runtime': 30.9964, 'eval_samples_per_second': 14.615, 'eval_steps_per_second': 0.936, 'epoch': 2.0}
{'loss': 0.0392, 'learning_rate': 6.549707602339181e-06, 'epoch': 2.02}
{'loss': 0.0366, 'learning_rate': 5.964912280701755e-06, 'epoch': 2.11}
{'loss': 0.0392, 'learning_rate': 5.380116959064328e-06, 'epoch': 2.19}
{'loss': 0.0341, 'learning_rate': 4.7953216374269005e-06, 'epoch': 2.28}
{'loss': 0.0538, 'learning_rate': 4.210526315789474e-06, 'epoch': 2.37}
{'loss': 0.0218, 'learning_rate': 3.625730994152047e-06, 'epoch': 2.46}
{'loss': 0.0454, 'learning_rate': 3.04093567251462e-06, 'epoch': 2.54}
{'loss': 0.0645, 'learning_rate': 2.456140350877193e-06, 'epoch': 2.63}
{'loss': 0.0374, 'learning_rate': 1.8713450292397662e-06, 'epoch': 2.72}
{'loss': 0.0679, 'learning_rate': 1.2865497076023392e-06, 'epoch': 2.81}
{'loss': 0.0629, 'learning_rate': 7.017543859649123e-07, 'epoch': 2.89}
{'loss': 0.0115, 'learning_rate': 1.1695906432748539e-07, 'epoch':

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.16409093141555786, 'eval_runtime': 32.9371, 'eval_samples_per_second': 13.754, 'eval_steps_per_second': 0.88, 'epoch': 3.0}
{'train_runtime': 1358.0614, 'train_samples_per_second': 4.001, 'train_steps_per_second': 0.252, 'train_loss': 0.21798429997060556, 'epoch': 3.0}
模型訓練完成，已保存至 ./sentiment_model


In [3]:
# 測試句子
test_texts = [
    "The company's profit has increased significantly this quarter.",  # Positive
    "The increase in costs negatively affected the revenue.",          # Negative
    "The company's performance remained stable."                       # Neutral
]

# Tokenize 測試句子
inputs = tokenizer(test_texts, return_tensors="pt", truncation=True, padding=True, max_length=128)

# 模型推論
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# 取得預測結果
predictions = torch.argmax(outputs.logits, dim=1).numpy()

# 標籤對應
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
predicted_labels = [label_map[pred] for pred in predictions]

print(predicted_labels)


['Positive', 'Negative', 'Positive']
