# 深度學習模型實作

本筆記本實作和訓練深度學習模型：
- TextCNN: 卷積神經網路文本分類
- BiLSTM: 雙向長短期記憶網路
- GRU: 門控循環單元

比較不同架構的性能表現。

In [None]:
# 導入必要套件
import sys
sys.path.append('../')

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter

# 自定義模組
from src.data import IMDBDataLoader, TextPreprocessor, IMDBDataset
from src.models.deep_learning import TextCNN, BiLSTM, GRUClassifier, DeepLearningModelManager
from src.training.trainer import DeepLearningTrainer
from src.evaluation.evaluator import ModelEvaluator
from src.utils.logger import logger

# Seed the NumPy and PyTorch random number generators with the same fixed value
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"使用設備: {device}")
print("套件載入完成！")

In [None]:
# Load the IMDB dataset and clean the raw review text
logger.info("載入IMDB資料集...")

data_loader = IMDBDataLoader(cache_dir="../data/raw")
train_texts, train_labels, test_texts, test_labels = data_loader.load_data()

# Hold out 20% of the training data, stratified by label, as a validation set
(
    train_texts_final,
    val_texts,
    train_labels_final,
    val_labels,
) = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels,
)

# One preprocessor instance is applied to every split
preprocessor = TextPreprocessor(
    remove_html=True, remove_urls=True, lowercase=True, handle_negations=True
)

# Clean the splits in train -> validation -> test order
train_texts_clean, val_texts_clean, test_texts_clean = (
    preprocessor.preprocess_batch(split)
    for split in (train_texts_final, val_texts, test_texts)
)

print(f"訓練集: {len(train_texts_clean)}")
print(f"驗證集: {len(val_texts_clean)}")
print(f"測試集: {len(test_texts_clean)}")

In [None]:
# 建立詞彙表和數值化
def build_vocab(texts, max_vocab_size=10000):
    """Build a word-to-index vocabulary from whitespace-tokenized texts.

    Index 0 is reserved for ``'<PAD>'`` and index 1 for ``'<UNK>'``; the
    remaining slots are given to the most frequent words, in descending
    frequency order.

    Args:
        texts: Iterable of strings; each is tokenized with ``str.split()``.
        max_vocab_size: Upper bound on the vocabulary size, including the
            two reserved tokens.

    Returns:
        A dict mapping each token to a unique integer id in
        ``range(len(vocab))``.
    """
    vocab = {'<PAD>': 0, '<UNK>': 1}

    word_counts = Counter()
    for text in texts:
        word_counts.update(text.split())

    # A corpus token that spells a reserved token must not be re-inserted:
    # doing so would overwrite its reserved id and, because the dict would
    # not grow, hand the same id to the next word as well.
    for reserved in vocab:
        word_counts.pop(reserved, None)

    # Slots left once the reserved tokens are accounted for (never negative).
    free_slots = max(max_vocab_size - len(vocab), 0)
    for word, _ in word_counts.most_common(free_slots):
        vocab[word] = len(vocab)

    return vocab

def texts_to_sequences(texts, vocab, max_length=256):
    """Encode texts as a fixed-width matrix of token ids.

    Each text is tokenized with ``str.split()``, truncated to at most
    ``max_length`` tokens, mapped through ``vocab`` (unknown words map to
    the ``'<UNK>'`` id), and right-padded with the ``'<PAD>'`` id.

    Args:
        texts: Iterable of strings to encode.
        vocab: Mapping from token to integer id; must contain the
            ``'<PAD>'`` and ``'<UNK>'`` keys.
        max_length: Number of columns in the returned matrix.

    Returns:
        An ``int64`` NumPy array of shape ``(len(texts), max_length)``.
        Empty input yields an array of shape ``(0, max_length)``.

    Raises:
        KeyError: If ``vocab`` lacks ``'<PAD>'`` or ``'<UNK>'``.
    """
    # Resolve the special ids once instead of once per token/row.
    unk_id = vocab['<UNK>']
    pad_id = vocab['<PAD>']

    sequences = []
    for text in texts:
        # Truncate before mapping so overlong texts do no wasted lookups.
        token_ids = [vocab.get(word, unk_id) for word in text.split()[:max_length]]
        # Right-pad short rows; a no-op when the row is already full.
        token_ids.extend([pad_id] * (max_length - len(token_ids)))
        sequences.append(token_ids)

    # The explicit dtype and reshape keep the result a 2-D integer matrix
    # even for empty input, where np.array([]) alone would be a 1-D float array.
    return np.array(sequences, dtype=np.int64).reshape(len(sequences), max_length)

# Build the vocabulary from the cleaned training texts only
max_length = 256
vocab = build_vocab(train_texts_clean, max_vocab_size=10000)
vocab_size = len(vocab)

print(f"詞彙表大小: {vocab_size}")
print(f"最大序列長度: {max_length}")

# Encode every split with the same vocabulary and sequence length
X_train, X_val, X_test = (
    texts_to_sequences(split, vocab, max_length)
    for split in (train_texts_clean, val_texts_clean, test_texts_clean)
)

# Convert the label collections to NumPy arrays
y_train, y_val, y_test = (
    np.array(labels)
    for labels in (train_labels_final, val_labels, test_labels)
)

print(f"訓練資料形狀: {X_train.shape}")

In [None]:
# Wrap the encoded splits in TensorDatasets and batch them with DataLoaders
from torch.utils.data import TensorDataset

batch_size = 32

# Convert the NumPy arrays to int64 tensors. torch.tensor(..., dtype=torch.long)
# is used instead of the legacy torch.LongTensor(...) type constructor: it
# always copies the data and casts explicitly to int64, whatever integer dtype
# the source arrays have.
train_dataset = TensorDataset(
    torch.tensor(X_train, dtype=torch.long),
    torch.tensor(y_train, dtype=torch.long),
)
val_dataset = TensorDataset(
    torch.tensor(X_val, dtype=torch.long),
    torch.tensor(y_val, dtype=torch.long),
)
test_dataset = TensorDataset(
    torch.tensor(X_test, dtype=torch.long),
    torch.tensor(y_test, dtype=torch.long),
)

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"訓練批次數: {len(train_loader)}")
print(f"驗證批次數: {len(val_loader)}")
print(f"驗證批次數: {len(val_loader)}")

In [None]:
# Train the TextCNN model
logger.info("訓練TextCNN模型...")

# The manager is created once here and reused by the BiLSTM cell below
model_manager = DeepLearningModelManager(device=device)

# Create the TextCNN model
# NOTE(review): presumably num_filters is the filter count per entry in
# filter_sizes -- confirm against TextCNN in src.models.deep_learning.
textcnn_model = model_manager.create_model(
    'textcnn', 
    vocab_size=vocab_size,
    embed_dim=128,
    num_filters=100,
    filter_sizes=[3, 4, 5]
)

# Create the trainer
textcnn_trainer = DeepLearningTrainer(
    model=textcnn_model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    output_dir='../experiments/models'
)

# Configure the optimizer and learning-rate scheduler
# NOTE(review): step_size=5 / gamma=0.5 presumably means "multiply the LR by
# 0.5 every 5 epochs" -- confirm against DeepLearningTrainer.
textcnn_trainer.setup_optimizer_and_scheduler(
    learning_rate=1e-3,
    optimizer_type='adam',
    scheduler_type='step',
    step_size=5,
    gamma=0.5
)

# Train the model
# The evaluation cell later loads '../experiments/models/textcnn_best.pth',
# so the checkpoint is presumably written under output_dir using model_name.
textcnn_history = textcnn_trainer.train(
    epochs=10,
    early_stopping_patience=3,
    model_name='textcnn_best'
)

print("TextCNN訓練完成！")

In [None]:
# Train the BiLSTM model
logger.info("訓練BiLSTM模型...")

# Create the BiLSTM model (reuses model_manager from the TextCNN cell, so
# that cell must have been run first)
bilstm_model = model_manager.create_model(
    'bilstm',
    vocab_size=vocab_size,
    embed_dim=128,
    hidden_dim=64,
    num_layers=2
)

# Create the trainer; same loaders, device and output directory as TextCNN
bilstm_trainer = DeepLearningTrainer(
    model=bilstm_model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    output_dir='../experiments/models'
)

# Configure the optimizer and learning-rate scheduler
# NOTE(review): unlike the TextCNN cell, step_size and gamma are not passed
# here, so whatever defaults setup_optimizer_and_scheduler defines apply.
# Confirm they match (step_size=5, gamma=0.5) if the comparison is meant to
# use an identical schedule.
bilstm_trainer.setup_optimizer_and_scheduler(
    learning_rate=1e-3,
    optimizer_type='adam',
    scheduler_type='step'
)

# Train the model
# The evaluation cell later loads '../experiments/models/bilstm_best.pth',
# so the checkpoint is presumably written under output_dir using model_name.
bilstm_history = bilstm_trainer.train(
    epochs=10,
    early_stopping_patience=3,
    model_name='bilstm_best'
)

print("BiLSTM訓練完成！")

In [None]:
# 繪製訓練歷史
def plot_training_history(history, model_name):
    """Show side-by-side loss and accuracy curves for one training run.

    Args:
        history: Mapping with the keys ``'train_loss'``, ``'val_loss'``,
            ``'train_acc'`` and ``'val_acc'``, each holding a sequence of
            values to plot.
        model_name: Label used as the prefix of both subplot titles.
    """
    _, axes = plt.subplots(1, 2, figsize=(12, 4))

    # One row per subplot: (train key, train label, val key, val label, metric)
    panels = (
        ('train_loss', 'Train Loss', 'val_loss', 'Val Loss', 'Loss'),
        ('train_acc', 'Train Acc', 'val_acc', 'Val Acc', 'Accuracy'),
    )

    for ax, (train_key, train_label, val_key, val_label, metric) in zip(axes, panels):
        ax.plot(history[train_key], label=train_label)
        ax.plot(history[val_key], label=val_label)
        ax.set_title(f'{model_name} - {metric}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()

# Plot the training curves of both models; requires textcnn_history and
# bilstm_history from the two training cells above
plot_training_history(textcnn_history, 'TextCNN')
plot_training_history(bilstm_history, 'BiLSTM')

In [None]:
# Evaluate both models on the test set and compare them
evaluator = ModelEvaluator()

# Load the best checkpoint of each model for testing; the file names match
# the model_name values passed to train() in the training cells
textcnn_trainer.load_model('../experiments/models/textcnn_best.pth')
bilstm_trainer.load_model('../experiments/models/bilstm_best.pth')

# Predict on the test set; predict() returns a (predictions, probabilities) pair
textcnn_preds, textcnn_probs = textcnn_trainer.predict(test_loader)
bilstm_preds, bilstm_probs = bilstm_trainer.predict(test_loader)

# Score the predictions against y_test. test_loader is built with
# shuffle=False, so predictions are expected in the same order as y_test.
textcnn_results = evaluator.evaluate_classification(
    y_test, textcnn_preds, textcnn_probs, 'TextCNN'
)

bilstm_results = evaluator.evaluate_classification(
    y_test, bilstm_preds, bilstm_probs, 'BiLSTM'
)

# Compare the models
# NOTE(review): evaluator.evaluation_results is presumably filled in by the
# evaluate_classification calls above -- confirm against ModelEvaluator.
comparison_df = evaluator.compare_models(evaluator.evaluation_results)
print("深度學習模型性能比較:")
print(comparison_df)

# Generate the evaluation report
report = evaluator.generate_evaluation_report(evaluator.evaluation_results)
print("\n" + "="*50)
print(report)