### 下載 dataset

In [1]:
import os
from huggingface_hub import snapshot_download
 
# 下載資料集
snapshot_download(repo_id="takala/financial_phrasebank", repo_type="dataset",
                  cache_dir="",
                  local_dir_use_symlinks=False, resume_download=True,
                  token='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')

  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files: 100%|██████████| 4/4 [00:00<?, ?it/s]


'datasets--takala--financial_phrasebank\\snapshots\\1484d06fe7af23030c7c977b12556108d1f67039'

### BERT Train

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
import torch

In [3]:
# 檢查 GPU 是否可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 讀取數據
def load_data(file_path):
    sentences = []
    labels = []
    label_mapping = {"positive": 2, "neutral": 1, "negative": 0}
    
    with open(file_path, "r",encoding='latin1') as file:
        for line in file:
            # 分割句子與情感標籤
            sentence, sentiment = line.strip().rsplit("@", 1)
            sentences.append(sentence)
            labels.append(label_mapping[sentiment])
    
    return pd.DataFrame({"sentence": sentences, "label": labels})

# 加載並處理資料
file_path = "./datasets--takala--financial_phrasebank/data/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"  # 替換為正確的檔案路徑
data = load_data(file_path)

# 分割資料集
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["sentence"], data["label"], test_size=0.2, random_state=42
)

# 轉換為Datasets格式
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

# 初始化 BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)  # 將模型移至 GPU

# Tokenization
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# 設置格式為 PyTorch tensors
train_dataset = train_dataset.with_format("torch")
val_dataset = val_dataset.with_format("torch")

# 訓練參數設置
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=10,
    report_to="none",  # 禁用默認的報告器（例如 wandb）
    fp16=True  # 啟用 16 位浮點數計算（混合精度訓練）
)

# 定義訓練器
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# 開始訓練
trainer.train()

# 模型評估
trainer.evaluate()

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1811/1811 [00:00<00:00, 2545.52 examples/s]
Map: 100%|██████████| 453/453 [00:00<00:00, 2624.95 examples/s]
  trainer = Trainer(
  3%|▎         | 10/342 [00:07<02:23,  2.32it/s]

{'loss': 0.9191, 'grad_norm': 5.645734786987305, 'learning_rate': 4.868421052631579e-05, 'epoch': 0.09}


  6%|▌         | 20/342 [00:13<02:14,  2.39it/s]

{'loss': 0.6572, 'grad_norm': 6.836689472198486, 'learning_rate': 4.722222222222222e-05, 'epoch': 0.18}


  9%|▉         | 30/342 [00:19<02:09,  2.41it/s]

{'loss': 0.5327, 'grad_norm': 6.590019702911377, 'learning_rate': 4.576023391812866e-05, 'epoch': 0.26}


 12%|█▏        | 40/342 [00:25<02:05,  2.40it/s]

{'loss': 0.5532, 'grad_norm': 9.049614906311035, 'learning_rate': 4.429824561403509e-05, 'epoch': 0.35}


 15%|█▍        | 50/342 [00:31<02:01,  2.39it/s]

{'loss': 0.3361, 'grad_norm': 5.400299549102783, 'learning_rate': 4.283625730994152e-05, 'epoch': 0.44}


 18%|█▊        | 60/342 [00:37<01:58,  2.38it/s]

{'loss': 0.2801, 'grad_norm': 7.022729873657227, 'learning_rate': 4.137426900584795e-05, 'epoch': 0.53}


 20%|██        | 70/342 [00:45<02:00,  2.26it/s]

{'loss': 0.21, 'grad_norm': 3.3065104484558105, 'learning_rate': 3.991228070175439e-05, 'epoch': 0.61}


 23%|██▎       | 80/342 [00:51<01:49,  2.39it/s]

{'loss': 0.1157, 'grad_norm': 6.795736789703369, 'learning_rate': 3.845029239766082e-05, 'epoch': 0.7}


 26%|██▋       | 90/342 [00:57<01:45,  2.39it/s]

{'loss': 0.1533, 'grad_norm': 4.032856464385986, 'learning_rate': 3.6988304093567254e-05, 'epoch': 0.79}


 29%|██▉       | 100/342 [01:05<01:45,  2.30it/s]

{'loss': 0.1464, 'grad_norm': 6.182636737823486, 'learning_rate': 3.5526315789473684e-05, 'epoch': 0.88}


 32%|███▏      | 110/342 [01:12<01:41,  2.29it/s]

{'loss': 0.2537, 'grad_norm': 0.42610445618629456, 'learning_rate': 3.406432748538012e-05, 'epoch': 0.96}


                                                 
 33%|███▎      | 114/342 [01:18<02:02,  1.85it/s]

{'eval_loss': 0.1049937978386879, 'eval_runtime': 3.1675, 'eval_samples_per_second': 143.016, 'eval_steps_per_second': 9.156, 'epoch': 1.0}


 35%|███▌      | 120/342 [01:21<02:06,  1.75it/s]

{'loss': 0.1076, 'grad_norm': 0.323152631521225, 'learning_rate': 3.274853801169591e-05, 'epoch': 1.05}


 38%|███▊      | 130/342 [01:27<01:30,  2.34it/s]

{'loss': 0.1228, 'grad_norm': 12.573695182800293, 'learning_rate': 3.128654970760234e-05, 'epoch': 1.14}


 41%|████      | 140/342 [01:35<01:30,  2.24it/s]

{'loss': 0.0342, 'grad_norm': 0.12160547077655792, 'learning_rate': 2.9824561403508772e-05, 'epoch': 1.23}


 44%|████▍     | 150/342 [01:42<01:23,  2.31it/s]

{'loss': 0.1393, 'grad_norm': 0.16723516583442688, 'learning_rate': 2.8362573099415208e-05, 'epoch': 1.32}


 47%|████▋     | 160/342 [01:48<01:16,  2.37it/s]

{'loss': 0.0409, 'grad_norm': 0.20046114921569824, 'learning_rate': 2.6900584795321637e-05, 'epoch': 1.4}


 50%|████▉     | 170/342 [01:54<01:12,  2.37it/s]

{'loss': 0.1311, 'grad_norm': 0.11337663978338242, 'learning_rate': 2.5438596491228074e-05, 'epoch': 1.49}


 53%|█████▎    | 180/342 [02:01<01:10,  2.29it/s]

{'loss': 0.1002, 'grad_norm': 5.323081970214844, 'learning_rate': 2.3976608187134503e-05, 'epoch': 1.58}


 56%|█████▌    | 190/342 [02:08<01:07,  2.26it/s]

{'loss': 0.1361, 'grad_norm': 0.0889088436961174, 'learning_rate': 2.2514619883040936e-05, 'epoch': 1.67}


 58%|█████▊    | 200/342 [02:15<01:00,  2.36it/s]

{'loss': 0.0598, 'grad_norm': 1.974332332611084, 'learning_rate': 2.105263157894737e-05, 'epoch': 1.75}


 61%|██████▏   | 210/342 [02:21<00:56,  2.35it/s]

{'loss': 0.0997, 'grad_norm': 1.0070358514785767, 'learning_rate': 1.9590643274853802e-05, 'epoch': 1.84}


 64%|██████▍   | 220/342 [02:27<00:50,  2.40it/s]

{'loss': 0.1034, 'grad_norm': 0.5035066604614258, 'learning_rate': 1.8128654970760235e-05, 'epoch': 1.93}


                                                 
 67%|██████▋   | 228/342 [02:36<00:45,  2.51it/s]

{'eval_loss': 0.11506521701812744, 'eval_runtime': 3.031, 'eval_samples_per_second': 149.456, 'eval_steps_per_second': 9.568, 'epoch': 2.0}


 67%|██████▋   | 230/342 [02:37<01:54,  1.02s/it]

{'loss': 0.0419, 'grad_norm': 2.66349720954895, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.02}


 70%|███████   | 240/342 [02:43<00:43,  2.34it/s]

{'loss': 0.0102, 'grad_norm': 0.0621904581785202, 'learning_rate': 1.5204678362573099e-05, 'epoch': 2.11}


 73%|███████▎  | 250/342 [02:49<00:37,  2.48it/s]

{'loss': 0.0154, 'grad_norm': 1.4612834453582764, 'learning_rate': 1.3742690058479531e-05, 'epoch': 2.19}


 76%|███████▌  | 260/342 [02:56<00:33,  2.43it/s]

{'loss': 0.0431, 'grad_norm': 0.03354426845908165, 'learning_rate': 1.2280701754385964e-05, 'epoch': 2.28}


 79%|███████▉  | 270/342 [03:04<00:30,  2.36it/s]

{'loss': 0.0198, 'grad_norm': 0.03577283397316933, 'learning_rate': 1.0818713450292397e-05, 'epoch': 2.37}


 82%|████████▏ | 280/342 [03:09<00:24,  2.48it/s]

{'loss': 0.0342, 'grad_norm': 0.4569975435733795, 'learning_rate': 9.35672514619883e-06, 'epoch': 2.46}


 85%|████████▍ | 290/342 [03:15<00:20,  2.49it/s]

{'loss': 0.0812, 'grad_norm': 0.24573484063148499, 'learning_rate': 7.894736842105263e-06, 'epoch': 2.54}


 88%|████████▊ | 300/342 [03:27<00:19,  2.10it/s]

{'loss': 0.017, 'grad_norm': 0.07203696668148041, 'learning_rate': 6.432748538011696e-06, 'epoch': 2.63}


 91%|█████████ | 310/342 [03:40<00:15,  2.03it/s]

{'loss': 0.0716, 'grad_norm': 0.03813231736421585, 'learning_rate': 4.970760233918129e-06, 'epoch': 2.72}


 94%|█████████▎| 320/342 [03:46<00:08,  2.48it/s]

{'loss': 0.0407, 'grad_norm': 0.18236462771892548, 'learning_rate': 3.5087719298245615e-06, 'epoch': 2.81}


 96%|█████████▋| 330/342 [04:03<00:06,  1.83it/s]

{'loss': 0.0177, 'grad_norm': 0.07016615569591522, 'learning_rate': 2.0467836257309943e-06, 'epoch': 2.89}


 99%|█████████▉| 340/342 [04:12<00:00,  2.12it/s]

{'loss': 0.0271, 'grad_norm': 0.053797993808984756, 'learning_rate': 5.847953216374269e-07, 'epoch': 2.98}


                                                 
100%|██████████| 342/342 [04:21<00:00,  1.31it/s]


{'eval_loss': 0.12069191783666611, 'eval_runtime': 3.4001, 'eval_samples_per_second': 133.229, 'eval_steps_per_second': 8.529, 'epoch': 3.0}
{'train_runtime': 261.1558, 'train_samples_per_second': 20.804, 'train_steps_per_second': 1.31, 'train_loss': 0.16529753169573147, 'epoch': 3.0}


100%|██████████| 29/29 [00:03<00:00,  8.80it/s]


{'eval_loss': 0.12069191783666611,
 'eval_runtime': 3.3123,
 'eval_samples_per_second': 136.764,
 'eval_steps_per_second': 8.755,
 'epoch': 3.0}

### BERT Val

In [11]:
model
test_texts = [
"The company's profit has increased significantly this quarter.",
"The increase in costs negatively affected the revenue.",
"The company's performance remianed stable."
]

test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt").to(device)

outputs = model(**test_encodings)

preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

label_map = {0:"Negative",1:"Neutral", 2:"Postive"}
predicted_labels = [label_map[pred] for pred in preds]
print(predicted_labels)

['Postive', 'Negative', 'Postive']
