# 傳統機器學習基線模型

本筆記本實作和比較多種傳統機器學習算法：
- 邏輯回歸 (Logistic Regression)
- 支持向量機 (SVM)
- 樸素貝葉斯 (Naive Bayes)
- 隨機森林 (Random Forest)

使用 TF-IDF 進行特徵提取，並以網格搜索（Grid Search）優化超參數。

In [None]:
# 導入必要套件
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# 自定義模組
from src.data import IMDBDataLoader, TextPreprocessor
from src.models.baseline import BaselineModelManager
from src.evaluation.evaluator import ModelEvaluator
from src.utils.logger import logger
from src.utils.config import ProjectConfig

# Seed NumPy's global random number generator so NumPy-based randomness
# is repeatable from run to run
np.random.seed(seed=42)

print("套件載入完成！")

In [None]:
# Load the dataset and carve a validation split out of the training data
logger.info("載入IMDB資料集...")

# Fetch the train/test texts and labels through the project data loader
data_loader = IMDBDataLoader(cache_dir="../data/raw")
(
    train_texts,
    train_labels,
    test_texts,
    test_labels,
) = data_loader.load_data()

# Hold out 20% of the training data as a validation set. Stratifying on the
# labels keeps the class proportions the same in both parts, and the fixed
# random_state makes the split reproducible.
(
    train_texts_final,
    val_texts,
    train_labels_final,
    val_labels,
) = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    random_state=42,
)

# Report the size of every split
print(
    f"最終訓練集大小: {len(train_texts_final)}",
    f"驗證集大小: {len(val_texts)}",
    f"測試集大小: {len(test_texts)}",
    sep="\n",
)

In [None]:
# Text preprocessing
logger.info("進行文本預處理...")

# Punctuation removal is switched off so punctuation is still present in the
# text that is later handed to TF-IDF
preprocessor = TextPreprocessor(
    lowercase=True,
    remove_html=True,
    remove_urls=True,
    handle_negations=True,
    remove_punctuation=False,
)

# Run the same preprocessing over the train, validation and test texts
# (in that order)
train_texts_clean, val_texts_clean, test_texts_clean = [
    preprocessor.preprocess_batch(texts)
    for texts in (train_texts_final, val_texts, test_texts)
]

# Show the first 100 characters of one sample before and after cleaning.
# NOTE(review): `[0]` assumes positional (list-like) indexing — confirm that
# load_data() does not return pandas Series.
print(
    "文本預處理完成！",
    f"預處理前範例: {train_texts_final[0][:100]}...",
    f"預處理後範例: {train_texts_clean[0][:100]}...",
    sep="\n",
)

In [None]:
# Train the baseline models
logger.info("開始訓練基線模型...")

model_manager = BaselineModelManager(models_dir="../experiments/models")

# Fit every model on the cleaned training texts, with grid search enabled
training_results = model_manager.train_all_models(
    train_texts_clean,
    train_labels_final,
    use_grid_search=True,
)

# Summarise the outcome per model: a result that carries an 'error' entry is
# reported as a failure, anything else prints its CV score and best parameters
for model_name, result in training_results.items():
    if 'error' in result:
        print(f"\n{model_name}: 訓練失敗 - {result['error']}")
        continue

    print(f"\n{model_name}:")
    print(f"  CV分數: {result['cv_score']:.4f}")
    print(f"  最佳參數: {result['best_params']}")

In [None]:
# Evaluate the trained models on the validation set
logger.info("在驗證集上評估模型...")

evaluator = ModelEvaluator()
validation_results = model_manager.evaluate_all_models(val_texts_clean, val_labels)

# Pass each successful model's predictions to the evaluator for detailed
# analysis
for model_name, result in validation_results.items():
    if 'error' in result:
        # Report the failure instead of skipping it silently; otherwise the
        # model is simply absent from the evaluator's results with no
        # indication of why
        print(f"\n{model_name}: 評估失敗 - {result['error']}")
        continue

    evaluator.evaluate_classification(
        val_labels,
        result['predictions'],
        result.get('probabilities'),  # None when the result has no such entry
        model_name,
    )

print("驗證集評估完成！")

In [None]:
# Compare the evaluated models and visualise the results
comparison_df = evaluator.compare_models(evaluator.evaluation_results)
print("模型性能比較:", comparison_df, sep="\n")

# Comparison chart
fig = evaluator.plot_model_comparison(comparison_df)
plt.show()

# ROC curves: drawn only when at least one stored result has an 'auc_roc' entry
if any(
    'auc_roc' in metrics
    for metrics in evaluator.evaluation_results.values()
):
    fig_roc = evaluator.plot_roc_curves(evaluator.evaluation_results)
    plt.show()

In [None]:
# Evaluate the best model on the test set
# NOTE(review): taking row 0 assumes comparison_df is ordered best-first —
# confirm against compare_models().
best_model_name = comparison_df.iloc[0]['Model']
logger.info(f"最佳模型: {best_model_name}")

# Score the selected model on the cleaned test texts
test_result = model_manager.evaluate_model(best_model_name, test_texts_clean, test_labels)
print(
    f"\n{best_model_name} 測試集結果:",
    f"準確率: {test_result['accuracy']:.4f}",
    sep="\n",
)

# Confusion matrix for the test-set predictions
fig_cm = evaluator.plot_confusion_matrix(
    test_result['confusion_matrix'],
    best_model_name,
)
plt.show()

# Evaluation report
# NOTE(review): the report is built from evaluator.evaluation_results; this
# cell never passes test_result to the evaluator — confirm which split the
# report is meant to cover.
report = evaluator.generate_evaluation_report(evaluator.evaluation_results)
print("\n" + "="*50, report, sep="\n")