# Transformer模型微調

本筆記本實作和微調Transformer模型：
- DistilBERT: 輕量化BERT模型
- RoBERTa: 強化版BERT模型

比較不同Transformer架構的性能表現。

In [None]:
# 導入必要套件
import sys
sys.path.append('../')

import torch
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# 自定義模組
from src.data import IMDBDataLoader, TextPreprocessor, IMDBDataset
from src.models.transformers import (
    DistilBERTClassifier, RoBERTaClassifier, 
    TransformerTrainer, TransformerModelManager
)
from src.evaluation.evaluator import ModelEvaluator
from src.utils.logger import logger

# Seed NumPy and PyTorch with the same fixed value so runs are repeatable
np.random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"使用設備: {device}")
print("套件載入完成！")

In [None]:
# Load the IMDB data and carve out small train / validation / test subsets
logger.info("載入IMDB資料集...")

data_loader = IMDBDataLoader(cache_dir="../data/raw")
train_texts, train_labels, test_texts, test_labels = data_loader.load_data()


def _stratified_subset(texts, labels, size, seed=42):
    """Return a label-stratified random subset of at most ``size`` samples.

    Taking the first ``size`` rows is only representative when the loader has
    already shuffled the data. IMDB is commonly stored grouped by label, in
    which case a head slice can contain a single class. Sampling with
    stratification keeps the class proportions regardless of storage order.

    Args:
        texts: Sequence of review texts.
        labels: Sequence of labels aligned with ``texts``.
        size: Maximum number of samples to keep.
        seed: Random state passed to ``train_test_split`` for reproducibility.

    Returns:
        A ``(texts, labels)`` pair. The inputs are returned unchanged when
        they already hold ``size`` samples or fewer.
    """
    if len(texts) <= size:
        return texts, labels
    kept_texts, _, kept_labels, _ = train_test_split(
        texts, labels, train_size=size, stratify=labels, random_state=seed
    )
    return kept_texts, kept_labels


# Use small subsets to keep fine-tuning time short; raise the limits to
# train and evaluate on more of the dataset.
train_texts, train_labels = _stratified_subset(train_texts, train_labels, 5000)
test_texts, test_labels = _stratified_subset(test_texts, test_labels, 1000)

# Hold out 20% of the training subset for validation, stratified by label
train_texts_final, val_texts, train_labels_final, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, stratify=train_labels, random_state=42
)

print(f"訓練集: {len(train_texts_final)}")
print(f"驗證集: {len(val_texts)}")
print(f"測試集: {len(test_texts)}")

In [None]:
# Light-touch cleanup only: HTML and URLs are stripped, while casing,
# negations and punctuation are left as they are for the Transformer tokenizer.
# NOTE(review): lowercase=False is paired with the 'distilbert-base-uncased'
# tokenizer used for the datasets in this notebook - confirm the flag matters.
cleaning_options = {
    'remove_html': True,
    'remove_urls': True,
    'lowercase': False,
    'handle_negations': False,
    'remove_punctuation': False,
}
preprocessor = TextPreprocessor(**cleaning_options)

# Clean the three splits in the same order: train, validation, test
train_texts_clean, val_texts_clean, test_texts_clean = (
    preprocessor.preprocess_batch(split)
    for split in (train_texts_final, val_texts, test_texts)
)

print("文本預處理完成！")
print(f"範例文本: {train_texts_clean[0][:200]}...")

In [None]:
# Settings shared by every BERT-style dataset and loader below
batch_size = 16  # deliberately small batch for the Transformer model
max_length = 256


def _build_dataset(texts, labels):
    """Wrap one split in an IMDBDataset using the shared tokenizer settings."""
    return IMDBDataset(
        texts, labels,
        tokenizer_name='distilbert-base-uncased',
        max_length=max_length, is_bert_like=True
    )


train_dataset = _build_dataset(train_texts_clean, train_labels_final)
val_dataset = _build_dataset(val_texts_clean, val_labels)
test_dataset = _build_dataset(test_texts_clean, test_labels)

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

print(f"訓練批次數: {len(train_loader)}")
print(f"驗證批次數: {len(val_loader)}")

In [None]:
# Fine-tune the DistilBERT classifier
logger.info("開始訓練DistilBERT模型...")

model_manager = TransformerModelManager(device=device)

# Build the DistilBERT model through the manager
distilbert_model = model_manager.create_model(
    'distilbert', num_labels=2, max_length=max_length
)

# The trainer receives the model, both loaders and the output directory
distilbert_trainer = TransformerTrainer(
    model=distilbert_model,
    train_loader=train_loader,
    val_loader=val_loader,
    output_dir='../experiments/models',
)

# One epoch count feeds both the optimizer/scheduler setup and the training
# call so the two always agree
n_epochs = 3
optimizer_settings = {
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'num_epochs': n_epochs,
    'warmup_ratio': 0.1,
}
distilbert_trainer.setup_optimizer_and_scheduler(**optimizer_settings)

# Run training and keep the returned history for plotting
distilbert_history = distilbert_trainer.train(
    epochs=n_epochs, model_name='distilbert_imdb'
)

print("DistilBERT訓練完成！")

In [None]:
# 繪製DistilBERT訓練歷史
def plot_transformer_history(history, model_name):
    """Plot loss, validation accuracy and learning rate from a training history.

    Draws three side-by-side panels on a single figure and shows it.

    Args:
        history: Mapping providing the series 'train_loss', 'val_loss',
            'val_accuracy' and 'learning_rate'.
        model_name: Name used as the prefix of every panel title.
    """
    # One entry per panel: (title suffix, y-axis label, curves). Each curve is
    # (history key, legend label, extra keyword arguments for plot()).
    panels = (
        ('Loss', 'Loss', (
            ('train_loss', 'Train Loss', {'marker': 'o'}),
            ('val_loss', 'Val Loss', {'marker': 's'}),
        )),
        ('Validation Accuracy', 'Accuracy', (
            ('val_accuracy', 'Val Accuracy', {'marker': 'o', 'color': 'green'}),
        )),
        ('Learning Rate', 'Learning Rate', (
            ('learning_rate', 'Learning Rate', {'marker': 'd', 'color': 'red'}),
        )),
    )

    _, axes = plt.subplots(1, len(panels), figsize=(15, 4))

    for ax, (title, y_label, curves) in zip(axes, panels):
        for key, legend_label, plot_kwargs in curves:
            ax.plot(history[key], label=legend_label, **plot_kwargs)
        ax.set_title(f'{model_name} - {title}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Show the training curves recorded for DistilBERT
plot_transformer_history(history=distilbert_history, model_name='DistilBERT')

In [None]:
# Evaluate the fine-tuned DistilBERT on the test subset
logger.info("評估DistilBERT模型...")

evaluator = ModelEvaluator()

# Reload the saved model through the trainer (the original note calls this
# the best model) before predicting
best_model_dir = '../experiments/models/distilbert_imdb'
distilbert_trainer.load_model(best_model_dir)

# Predict directly from the cleaned test texts
distilbert_predictions, distilbert_probs = distilbert_model.predict(test_texts_clean)

# Score the predictions against the test labels
distilbert_results = evaluator.evaluate_classification(
    test_labels,
    distilbert_predictions,
    distilbert_probs,
    'DistilBERT',
)

print(f"DistilBERT測試準確率: {distilbert_results['accuracy']:.4f}")
print(f"DistilBERT F1分數: {distilbert_results['f1_score']:.4f}")

# AUC-ROC is printed only when the evaluator included it in the results
if 'auc_roc' in distilbert_results:
    print(f"DistilBERT AUC-ROC: {distilbert_results['auc_roc']:.4f}")

In [None]:
# Performance summary for the evaluated models
banner = "=" * 60
print("\n" + banner)
print("TRANSFORMER模型性能總結")
print(banner)

# Comparison table built from everything the evaluator has recorded
comparison_df = evaluator.compare_models(evaluator.evaluation_results)
print(comparison_df)

# Draw the confusion matrix when the DistilBERT results contain one
if 'confusion_matrix' in distilbert_results:
    confusion = distilbert_results['confusion_matrix']
    fig = evaluator.plot_confusion_matrix(confusion, 'DistilBERT')
    plt.show()

# Text report generated from the same recorded results
report = evaluator.generate_evaluation_report(evaluator.evaluation_results)
print("\n" + "=" * 50)
print(report)

In [None]:
# Demo predictions on a few hand-written reviews
sample_texts = [
    "This movie was absolutely fantastic! Great acting and amazing plot.",
    "Terrible movie, waste of time. Poor acting and boring story.",
    "The film was okay, nothing special but not bad either."
]

print("DistilBERT預測示例:")
print("-" * 50)

predictions, probabilities = distilbert_model.predict(sample_texts)

for idx, review in enumerate(sample_texts):
    predicted_class = predictions[idx]
    class_probs = probabilities[idx]

    # Class 1 is shown as positive; any other value is shown as negative
    if predicted_class == 1:
        pred_label = "正面"
    else:
        pred_label = "負面"

    # Confidence is the probability assigned to the predicted class
    confidence = class_probs[predicted_class]

    print(f"文本: {review}")
    print(f"預測: {pred_label} (信心度: {confidence:.3f})")
    print(f"機率分佈: 負面={class_probs[0]:.3f}, 正面={class_probs[1]:.3f}")
    print()