# SVC 与线性核：实现方式对比

## 概述

scikit-learn 提供了多种线性 SVM 分类器实现，本 notebook 对比分析它们的差异、适用场景和性能特点。

### 三种主要实现

| 实现 | 核心库 | 优化目标 | 适用规模 |
|------|--------|----------|----------|
| `SVC(kernel='linear')` | libsvm | 对偶问题 | 小规模 |
| `LinearSVC` | liblinear | 原始/对偶 | 中大规模 |
| `SGDClassifier(loss='hinge')` | scikit-learn 自有实现 | 原始问题 (随机梯度下降求解) | 超大规模 |

### 数学背景

线性 SVM 的优化目标：

$$
\min_{w,b} \frac{1}{2}\|w\|^2 + C\sum_{i=1}^{n}\max(0, 1 - y_i(w^\top x_i + b))
$$

其中 $y_i \in \{-1, +1\}$ 为类别标签（代码中的 0/1 标签在公式里对应 $-1/+1$），$\max(0, 1 - y_i(w^\top x_i + b))$ 是 hinge 损失函数，$C$ 控制误分类惩罚相对于间隔项 $\frac{1}{2}\|w\|^2$ 的权重。

## 1. 环境配置

In [None]:
# =============================================================================
# 导入库
# =============================================================================
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification, load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time
import warnings
# Silence every library warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator.
np.random.seed(42)

# Matplotlib defaults: a font fallback chain that includes CJK-capable fonts
# (the plots use Chinese labels), an ASCII minus sign, and the default
# figure size / resolution.
plt.rcParams.update({
    'font.sans-serif': ['Arial Unicode MS', 'SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
    'figure.figsize': (10, 6),
    'figure.dpi': 100,
})

## 2. 数据准备

In [None]:
# =============================================================================
# Build the binary-classification dataset
# =============================================================================

# Synthetic 2-D problem: both features informative, one cluster per class,
# class_sep=1.5, fixed seed.
X, y = make_classification(
    n_samples=500,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    class_sep=1.5,
    random_state=42,
)

# Hold out 20% of the samples as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features: the scaler is fitted on the training split only and
# the same transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"训练集: {X_train.shape[0]} 样本, {X_train.shape[1]} 特征")
print(f"测试集: {X_test.shape[0]} 样本")
print(f"类别分布: 0类 = {np.count_nonzero(y_train == 0)}, 1类 = {np.count_nonzero(y_train == 1)}")

In [None]:
# =============================================================================
# Visualize the training data before and after standardization
# =============================================================================

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: (points to draw, x label, y label, panel title).
panels = [
    (X_train, '特征 1', '特征 2', '原始数据分布'),
    (X_train_scaled, '特征 1 (标准化)', '特征 2 (标准化)', '标准化后的数据分布'),
]

for ax, (points, x_label, y_label, heading) in zip(axes, panels):
    # Points are colored by their training label.
    scatter = ax.scatter(points[:, 0], points[:, 1], c=y_train,
                         cmap='RdYlBu', alpha=0.7, edgecolors='white', s=50)
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(heading, fontsize=14)
    ax.grid(True, alpha=0.3)
    plt.colorbar(scatter, ax=ax, label='类别')

plt.tight_layout()
plt.show()

## 3. 三种线性 SVM 实现对比

In [None]:
# =============================================================================
# Define, train and evaluate the three linear SVM classifiers
# =============================================================================

# Shared regularization strength used by all three models.
C = 1.0

# Model zoo, keyed by display name.
models = {
    'SVC (linear kernel)': SVC(kernel='linear', C=C, random_state=42),
    'LinearSVC': LinearSVC(C=C, loss='hinge', max_iter=10000, random_state=42),
    'SGDClassifier': SGDClassifier(
        loss='hinge',           # hinge loss = SVM
        # alpha ≈ 1/(n*C): chosen so the regularization roughly matches the
        # C used by the other two models.
        alpha=1/(len(X_train) * C),
        max_iter=1000,
        tol=1e-3,
        random_state=42
    )
}

# One dict per model: name, train/test accuracy, fit time (seconds) and the
# fitted estimator itself (reused by later cells).
results = []

for name, model in models.items():
    # Time only the fit. perf_counter() is monotonic and high-resolution,
    # which matters because these fits take only milliseconds; time.time()
    # can be too coarse and may jump if the system clock is adjusted.
    start_time = time.perf_counter()
    model.fit(X_train_scaled, y_train)
    train_time = time.perf_counter() - start_time

    # Predict on both splits.
    y_pred_train = model.predict(X_train_scaled)
    y_pred_test = model.predict(X_test_scaled)

    # Score both splits.
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)

    results.append({
        'name': name,
        'train_acc': train_acc,
        'test_acc': test_acc,
        'train_time': train_time,
        'model': model
    })

# Print the comparison table (fit time is converted to milliseconds).
print("="*70)
print("三种线性 SVM 实现性能对比")
print("="*70)
print(f"{'模型':<25} {'训练准确率':<15} {'测试准确率':<15} {'训练时间(ms)':<15}")
print("-"*70)
for r in results:
    print(f"{r['name']:<25} {r['train_acc']:<15.4f} {r['test_acc']:<15.4f} {r['train_time']*1000:<15.2f}")

## 4. 决策边界可视化

In [None]:
# =============================================================================
# 可视化决策边界
# =============================================================================

def plot_decision_boundary(ax, model, X, y, title):
    """Draw a fitted 2-D classifier's decision regions, boundary and margins.

    The predicted class is evaluated on a dense grid covering the data (with
    0.5 padding) and drawn as filled regions. If the model exposes
    ``decision_function`` and it returns one score per grid point, the
    boundary (score 0, solid) and the two margins (score -1 / +1, dashed) are
    contoured from those scores. Otherwise only the boundary is drawn, as the
    0.5 level between the predicted labels 0 and 1.

    Args:
        ax: Axes object to draw on.
        model: Fitted classifier providing ``predict`` (and optionally
            ``decision_function``) for two-column inputs.
        X: Array whose first two columns are the plotted features.
        y: Labels aligned with ``X``; samples labelled 0 and 1 are plotted.
        title: Title for the axes.
    """
    # Evaluation grid spanning the data range plus padding.
    step = 0.02
    pad = 0.5
    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # Filled decision regions from the predicted labels.
    Z = np.asarray(model.predict(grid_points)).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')

    # Prefer continuous scores: they give a smooth boundary and the margins.
    scores = None
    if hasattr(model, 'decision_function'):
        scores = np.asarray(model.decision_function(grid_points))
        if scores.ndim != 1:
            # Per-class score matrix: there is no single margin pair to draw.
            scores = None

    if scores is not None:
        ax.contour(xx, yy, scores.reshape(xx.shape), levels=[-1, 0, 1],
                   colors='black', linewidths=[1, 2, 1],
                   linestyles=['--', '-', '--'])
    else:
        # Fallback: boundary only, halfway between labels 0 and 1.
        ax.contour(xx, yy, Z, levels=[0.5], colors='black', linewidths=2)

    # Training points, one marker style per class.
    ax.scatter(X[y==0, 0], X[y==0, 1], c='red', marker='o',
               edgecolors='white', s=50, label='类别 0')
    ax.scatter(X[y==1, 0], X[y==1, 1], c='blue', marker='s',
               edgecolors='white', s=50, label='类别 1')

    ax.set_xlabel('特征 1', fontsize=11)
    ax.set_ylabel('特征 2', fontsize=11)
    ax.set_title(title, fontsize=12)
    ax.legend(loc='upper right', fontsize=9)
    ax.grid(True, alpha=0.3)

# One panel per trained model, side by side.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for panel, entry in zip(axes, results):
    # The panel title carries the model name and its test accuracy.
    panel_title = f"{entry['name']}\n(Acc: {entry['test_acc']:.3f})"
    plot_decision_boundary(panel, entry['model'], X_train_scaled, y_train,
                           panel_title)

plt.suptitle('线性 SVM 分类器决策边界对比', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 5. 模型参数分析

In [None]:
# =============================================================================
# Inspect the learned parameters (weights and bias)
# =============================================================================

rule = "=" * 60
print(rule)
print("模型参数对比")
print(rule)

for entry in results:
    clf = entry['model']
    print(f"\n{entry['name']}:")

    # Weight vector: the first two coefficients are printed.
    if hasattr(clf, 'coef_'):
        w = clf.coef_.ravel()
        print(f"  权重 (w): [{w[0]:.4f}, {w[1]:.4f}]")

    # Bias: first element of intercept_ when it is non-empty.
    if hasattr(clf, 'intercept_'):
        if len(clf.intercept_) > 0:
            b = clf.intercept_[0]
        else:
            b = clf.intercept_
        print(f"  偏置 (b): {b:.4f}")

    # Only estimators exposing support_ (SVC here) report support vectors.
    if hasattr(clf, 'support_'):
        print(f"  支持向量数: {len(clf.support_)}")

# Grouped bar chart: one group per weight, one bar per model.
fig, ax = plt.subplots(figsize=(10, 6))

bar_width = 0.25
positions = np.arange(2)

for offset, entry in enumerate(results):
    clf = entry['model']
    if not hasattr(clf, 'coef_'):
        continue
    ax.bar(positions + offset * bar_width, clf.coef_.ravel(), bar_width,
           label=entry['name'], alpha=0.8)

# Ticks sit under the middle bar of each three-bar group.
ax.set_xticks(positions + bar_width)
ax.set_xticklabels(['权重 w1', '权重 w2'])
ax.set_ylabel('权重值')
ax.set_title('三种实现的权重对比')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

plt.tight_layout()
plt.show()

## 6. C 参数影响分析

In [None]:
# =============================================================================
# Effect of the regularization parameter C on the decision regions
# =============================================================================

C_values = [0.01, 0.1, 1.0, 10.0, 100.0]

fig, axes = plt.subplots(1, 5, figsize=(20, 4))

# The evaluation grid depends only on the training data, so build it once
# instead of once per C value.
h = 0.05
x_min, x_max = X_train_scaled[:, 0].min() - 0.5, X_train_scaled[:, 0].max() + 0.5
y_min, y_max = X_train_scaled[:, 1].min() - 0.5, X_train_scaled[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
grid_points = np.c_[xx.ravel(), yy.ravel()]

# The loop variable is deliberately NOT named `C`: that would overwrite the
# notebook-global C = 1.0 defined with the three models and leave it at
# 100.0 for any cell executed afterwards.
for ax, c_value in zip(axes, C_values):
    # Fit one hinge-loss LinearSVC per C value.
    model = LinearSVC(C=c_value, loss='hinge', max_iter=10000, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Test accuracy, shown in the panel title.
    acc = accuracy_score(y_test, model.predict(X_test_scaled))

    # Decision regions on the shared grid, with training points on top.
    Z = model.predict(grid_points).reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    ax.scatter(X_train_scaled[:, 0], X_train_scaled[:, 1], c=y_train,
               cmap='RdYlBu', edgecolors='white', s=30)
    ax.set_title(f'C = {c_value}\nAcc = {acc:.3f}')
    ax.set_xlabel('特征 1')
    ax.grid(True, alpha=0.3)

axes[0].set_ylabel('特征 2')
plt.suptitle('正则化参数 C 对决策边界的影响', fontsize=14, y=1.05)
plt.tight_layout()
plt.show()

print("\nC 参数总结:")
print("- C 较小: 强正则化，间隔更宽，允许更多误分类")
print("- C 较大: 弱正则化，间隔更窄，更严格地分类训练样本")

## 7. 大规模数据扩展性测试

In [None]:
# =============================================================================
# Training time versus dataset size
# =============================================================================

def _fit_seconds(estimator, X_fit, y_fit):
    """Return the elapsed seconds spent in ``estimator.fit(X_fit, y_fit)``.

    Uses time.perf_counter(), which is monotonic and high-resolution, so
    short fits are measured reliably and clock adjustments cannot distort
    the result.
    """
    start = time.perf_counter()
    estimator.fit(X_fit, y_fit)
    return time.perf_counter() - start


sample_sizes = [100, 500, 1000, 2000, 5000]
timing_results = {name: [] for name in ['SVC (linear)', 'LinearSVC', 'SGDClassifier']}

for n_samples in sample_sizes:
    # Synthetic 20-feature problem of the requested size, standardized.
    X_scale, y_scale = make_classification(
        n_samples=n_samples,
        n_features=20,
        n_informative=10,
        n_redundant=5,
        random_state=42
    )
    X_scale = StandardScaler().fit_transform(X_scale)

    # SVC is skipped above 2000 samples (too slow); NaN keeps the series
    # aligned with sample_sizes.
    if n_samples <= 2000:
        timing_results['SVC (linear)'].append(
            _fit_seconds(SVC(kernel='linear', C=1.0), X_scale, y_scale)
        )
    else:
        timing_results['SVC (linear)'].append(np.nan)

    # LinearSVC
    timing_results['LinearSVC'].append(
        _fit_seconds(LinearSVC(C=1.0, max_iter=10000), X_scale, y_scale)
    )

    # SGDClassifier
    timing_results['SGDClassifier'].append(
        _fit_seconds(SGDClassifier(loss='hinge', max_iter=1000), X_scale, y_scale)
    )

# Log-log plot of fit time against sample count, one line per implementation.
fig, ax = plt.subplots(figsize=(10, 6))

for name, times in timing_results.items():
    ax.plot(sample_sizes, times, 'o-', linewidth=2, markersize=8, label=name)

ax.set_xlabel('样本数量', fontsize=12)
ax.set_ylabel('训练时间 (秒)', fontsize=12)
ax.set_title('不同实现的扩展性对比', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_xscale('log')
ax.set_yscale('log')

plt.tight_layout()
plt.show()

print("\n扩展性总结:")
print("- SVC: O(n²) ~ O(n³) 复杂度，适合小数据集")
print("- LinearSVC: O(n) 复杂度，适合中大规模数据")
print("- SGDClassifier: O(n) 复杂度，适合超大规模和流式数据")

## 8. 单元测试

In [None]:
# =============================================================================
# 单元测试
# =============================================================================

def run_tests():
    """Run the notebook's smoke tests and print a pass/fail report.

    Depends on notebook globals created by earlier cells: the raw and scaled
    train/test splits (``X_train``, ``X_test``, ``X_train_scaled``,
    ``X_test_scaled``, ``y_train``, ``y_test``) and the ``models`` dict, whose
    estimators must already be fitted because checks 4 and 5 only predict.

    Returns:
        bool: True when every check passed, False otherwise.
    """

    # Each check returns the detail text to show on success and signals
    # failure by raising (typically an AssertionError).

    def check_svc():
        clf = SVC(kernel='linear', C=1.0)
        clf.fit(X_train_scaled, y_train)
        assert hasattr(clf, 'support_'), "SVC 未正确训练"
        return f"支持向量数: {len(clf.support_)}"

    def check_linear_svc():
        clf = LinearSVC(C=1.0, max_iter=10000)
        clf.fit(X_train_scaled, y_train)
        assert hasattr(clf, 'coef_'), "LinearSVC 未正确训练"
        return ""

    def check_sgd():
        clf = SGDClassifier(loss='hinge', max_iter=1000)
        clf.fit(X_train_scaled, y_train)
        assert hasattr(clf, 'coef_'), "SGDClassifier 未正确训练"
        return ""

    def check_prediction_shape():
        for name, m in models.items():
            pred = m.predict(X_test_scaled)
            assert pred.shape == y_test.shape, f"{name} 预测维度错误"
        return ""

    def check_accuracy():
        for name, m in models.items():
            acc = accuracy_score(y_test, m.predict(X_test_scaled))
            assert acc > 0.5, f"{name} 准确率过低: {acc}"
        return ""

    def check_pipeline():
        # The pipeline scales internally, so it is fed the raw features.
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('svc', LinearSVC(C=1.0, max_iter=10000))
        ])
        pipe.fit(X_train, y_train)
        pred = pipe.predict(X_test)
        assert pred.shape == y_test.shape
        return ""

    checks = [
        ("SVC 线性核训练", check_svc),
        ("LinearSVC 训练", check_linear_svc),
        ("SGDClassifier 训练", check_sgd),
        ("预测输出格式", check_prediction_shape),
        ("准确率合理性", check_accuracy),
        ("Pipeline 集成", check_pipeline),
    ]

    # Run every check; a failing one is recorded with its message and does
    # not stop the remaining checks.
    test_results = []
    for label, check in checks:
        try:
            detail = check()
        except Exception as exc:
            test_results.append((label, False, str(exc)))
        else:
            test_results.append((label, True, detail))

    # Report.
    rule = "=" * 60
    print(rule)
    print("单元测试结果")
    print(rule)

    for label, ok, detail in test_results:
        status = "✓ 通过" if ok else "✗ 失败"
        print(f"{status} | {label}")
        if detail:
            print(f"       {detail}")

    passed = sum(1 for _, ok, _ in test_results if ok)

    print(rule)
    print(f"总计: {passed}/{len(test_results)} 测试通过")
    print(rule)

    return passed == len(test_results)

# Run the smoke tests; all_passed is True only when every check succeeded.
all_passed = run_tests()

## 9. 知识总结

### 三种实现的选择指南

| 场景 | 推荐实现 | 原因 |
|------|----------|------|
| 样本数 < 10,000 | `SVC(kernel='linear')` | 稳定可靠，支持向量信息丰富 |
| 10,000 < 样本数 < 100,000 | `LinearSVC` | 训练快速，内存效率高 |
| 样本数 > 100,000 或流式数据 | `SGDClassifier` | 支持增量学习，内存占用低 |
| 需要概率输出 | `SVC(probability=True)` | 支持 Platt scaling |
| 需要自定义损失 | `SGDClassifier` | 支持多种损失函数 |

### 关键参数

1. **C (正则化参数)**
   - 控制间隔宽度与误分类惩罚的权衡
   - 通常在 $[10^{-3}, 10^3]$ 范围内网格搜索

2. **loss (损失函数)**
   - `hinge`: 标准 SVM (LinearSVC/SGDClassifier)
   - `squared_hinge`: 平方 hinge，梯度更平滑 (LinearSVC 默认)

3. **dual (对偶问题)**
   - `dual=True`: 样本数 < 特征数时更高效
   - `dual=False`: 样本数 > 特征数时更高效

### 最佳实践

1. 始终对特征进行标准化
2. 使用 Pipeline 封装预处理和模型
3. 通过交叉验证选择最佳 C 值
4. 大数据集优先考虑 LinearSVC 或 SGDClassifier