# 支持向量机线性分类器入门

## 理论基础

支持向量机 (SVM) 是一种强大的监督学习算法，核心思想是寻找一个最优超平面来分隔不同类别的数据点。

### 核心概念

1. **超平面 (Hyperplane)**: 在 n 维空间中，超平面是 n-1 维的决策边界
   - 二维空间中是一条直线
   - 三维空间中是一个平面
   - 数学表达: $w^\top x + b = 0$

2. **支持向量 (Support Vectors)**: 距离超平面最近的样本点
   - 决定超平面位置的关键样本
   - 移除非支持向量的样本不影响决策边界

3. **间隔 (Margin)**: 超平面到最近样本点的距离
   - 几何间隔: 在规范化条件 $|w^\top x + b| = 1$ (对支持向量成立) 下，$\gamma = \frac{1}{\|w\|}$；两条间隔边界之间的总宽度为 $\frac{2}{\|w\|}$
   - SVM 的目标是最大化间隔

### 硬间隔 vs 软间隔

- **硬间隔**: 要求所有样本被正确分类，不允许任何违规
- **软间隔**: 允许部分样本违规，通过参数 C 控制权衡

## 1. 环境配置

In [None]:
# =============================================================================
# 导入必要的库
# =============================================================================
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
# Ignore every warning so library warnings do not clutter the cell output.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random number generator.
np.random.seed(42)

# Matplotlib defaults for the whole notebook: a sans-serif font fallback list
# (presumably chosen so the Chinese labels render), the ASCII hyphen for
# negative tick labels, and the default figure size and DPI.
plt.rcParams.update({
    'font.sans-serif': ['Arial Unicode MS', 'SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
    'figure.figsize': (10, 6),
    'figure.dpi': 100,
})

## 2. 数据准备

使用经典的鸢尾花数据集进行二分类任务。

In [None]:
# =============================================================================
# Load and prepare the iris data set
# =============================================================================

# Load the data
iris = datasets.load_iris()

# Keep only feature columns 2 and 3 (petal length, petal width)
X = iris.data[:, [2, 3]]

# Binary target: 1.0 where the original class is 2 (Virginica), else 0.0
y = (iris.target == 2).astype(np.float64)

# Data set summary
n_samples, n_features = X.shape
first_feature, second_feature = iris.feature_names[2], iris.feature_names[3]
n_negative = np.count_nonzero(y == 0)
n_positive = np.count_nonzero(y == 1)

header_rule = "=" * 50
print(header_rule)
print("数据集信息")
print(header_rule)
print(f"样本数量: {n_samples}")
print(f"特征数量: {n_features}")
print(f"特征名称: {first_feature}, {second_feature}")
print(f"\n类别分布:")
print(f"  非 Virginica (0): {n_negative:.0f} 样本")
print(f"  Virginica (1): {n_positive:.0f} 样本")

# Hold out 20% of the samples as a test set; stratify=y keeps the class
# ratio the same in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\n训练集: {len(X_train)} 样本")
print(f"测试集: {len(X_test)} 样本")

In [None]:
# =============================================================================
# Visualise the raw data
# =============================================================================

fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(14, 5))

# One entry per class:
# (label value, colour, marker, scatter legend label, histogram legend label)
class_specs = [
    (0, 'steelblue', 'o', '非 Virginica', '非 Virginica - 花瓣长度'),
    (1, 'coral', 's', 'Virginica', 'Virginica - 花瓣长度'),
]

# Left panel: petal length vs. petal width, one scatter series per class.
# Right panel: histogram of petal length (column 0), one per class.
for label_value, color, marker, scatter_label, hist_label in class_specs:
    members = X[y == label_value]
    ax_scatter.scatter(members[:, 0], members[:, 1], c=color, marker=marker,
                       edgecolors='white', s=60, label=scatter_label)
    ax_hist.hist(members[:, 0], bins=15, alpha=0.6, color=color,
                 label=hist_label)

ax_scatter.set_xlabel('花瓣长度 (cm)', fontsize=12)
ax_scatter.set_ylabel('花瓣宽度 (cm)', fontsize=12)
ax_scatter.set_title('鸢尾花数据分布', fontsize=14)

ax_hist.set_xlabel('花瓣长度 (cm)', fontsize=12)
ax_hist.set_ylabel('频数', fontsize=12)
ax_hist.set_title('花瓣长度分布', fontsize=14)

# Shared finishing touches for both panels
for panel in (ax_scatter, ax_hist):
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. 构建 SVM 分类器

In [None]:
# =============================================================================
# Build the standardisation + SVM pipeline
# =============================================================================

# Important: SVMs are sensitive to feature scale, so standardise first.
# Step 1 rescales each feature to zero mean and unit standard deviation.
scaler_step = ('scaler', StandardScaler())

# Step 2 is the linear SVM classifier.
#   C=1          regularisation parameter: trade-off between margin width
#                and margin violations
#   loss='hinge' the standard hinge loss
svc_step = (
    'linear_svc',
    LinearSVC(C=1, loss='hinge', max_iter=10000, random_state=42),
)

svm_clf = Pipeline([scaler_step, svc_step])

# Fit the scaler and the classifier on the training split
svm_clf.fit(X_train, y_train)

print("Pipeline 结构:")
for step_name, estimator in svm_clf.steps:
    print(f"  {step_name}: {type(estimator).__name__}")

# The classifier is fitted on the scaler's output, so w and b below are
# expressed in standardised feature space, not in centimetres.
fitted_svc = svm_clf.named_steps['linear_svc']
print(f"\n模型参数:")
print(f"  权重 (w): {fitted_svc.coef_.flatten()}")
print(f"  偏置 (b): {fitted_svc.intercept_[0]:.4f}")

In [None]:
# =============================================================================
# Model predictions
# =============================================================================

# Hand-picked samples as [petal length, petal width]
test_samples = [
    [5.5, 1.7],  # long, wide petal
    [2.0, 0.5],  # short, narrow petal
    [4.0, 1.3],  # medium size
]

table_rule = "-" * 60

print("预测结果:")
print(table_rule)
print(f"{'花瓣长度':<12} {'花瓣宽度':<12} {'预测类别':<15} {'决策值':<12}")
print(table_rule)

# One table row per sample: the raw features, the predicted class name and
# the signed decision-function value.
for petal_length, petal_width in test_samples:
    features = [[petal_length, petal_width]]
    predicted_label = svm_clf.predict(features)[0]
    score = svm_clf.decision_function(features)[0]
    if predicted_label == 1:
        class_name = "Virginica"
    else:
        class_name = "非 Virginica"
    print(f"{petal_length:<12} {petal_width:<12} {class_name:<15} {score:<12.4f}")

print(table_rule)
print("\n注: 决策值 > 0 预测为正类 (Virginica)，< 0 预测为负类")

## 4. 决策边界可视化

In [None]:
# =============================================================================
# Visualise the decision boundary and the margins
# =============================================================================

# Build a 200 x 200 grid covering the data range plus 0.5 padding per side
grid_pad = 0.5
x0_min, x0_max = X[:, 0].min() - grid_pad, X[:, 0].max() + grid_pad
x1_min, x1_max = X[:, 1].min() - grid_pad, X[:, 1].max() + grid_pad
x0_axis = np.linspace(x0_min, x0_max, 200)
x1_axis = np.linspace(x1_min, x1_max, 200)
xx0, xx1 = np.meshgrid(x0_axis, x1_axis)

# Flatten the grid into an (n_points, 2) feature matrix, then evaluate the
# predicted class and the decision-function value at every grid point
X_grid = np.column_stack([xx0.ravel(), xx1.ravel()])
y_pred = svm_clf.predict(X_grid).reshape(xx0.shape)
decision_values = svm_clf.decision_function(X_grid).reshape(xx0.shape)

fig, ax = plt.subplots(figsize=(12, 8))

# Shade the predicted regions, then draw the decision boundary (level 0)
# and the two margin lines (levels -1 and +1)
ax.contourf(xx0, xx1, y_pred, alpha=0.3, cmap='RdYlGn')
contours = ax.contour(
    xx0, xx1, decision_values,
    levels=[-1, 0, 1],
    colors=['blue', 'black', 'blue'],
    linestyles=['--', '-', '--'],
    linewidths=[1.5, 2, 1.5],
)
level_labels = {-1: 'margin=-1', 0: 'decision boundary', 1: 'margin=+1'}
ax.clabel(contours, fmt=level_labels, inline=True, fontsize=9)

# Plot every sample in X (training and test alike), one series per class
for label_value, color, marker, legend_label in (
    (0, 'red', 'o', '非 Virginica'),
    (1, 'green', 's', 'Virginica'),
):
    members = X[y == label_value]
    ax.scatter(members[:, 0], members[:, 1], c=color, marker=marker,
               edgecolors='black', s=60, label=legend_label)

# Highlight the hand-picked test samples with yellow stars
for star_x, star_y in test_samples:
    ax.scatter(star_x, star_y, c='yellow', marker='*',
               s=200, edgecolors='black', linewidths=2, zorder=5)

ax.set_xlabel('花瓣长度 (cm)', fontsize=12)
ax.set_ylabel('花瓣宽度 (cm)', fontsize=12)
ax.set_title('SVM 线性分类器 - 决策边界与间隔', fontsize=14)
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. 模型评估

In [None]:
# =============================================================================
# Evaluate model performance
# =============================================================================

# Predictions and accuracy on both splits
y_pred_train = svm_clf.predict(X_train)
train_acc = accuracy_score(y_train, y_pred_train)

y_pred_test = svm_clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred_test)

# 5-fold cross-validation over the full data set (X, y)
cv_scores = cross_val_score(svm_clf, X, y, cv=5)
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2  # reported as +/- two standard deviations

section_rule = "=" * 60
print(section_rule)
print("模型性能评估")
print(section_rule)
print(f"\n训练集准确率: {train_acc:.4f}")
print(f"测试集准确率: {test_acc:.4f}")
print(f"\n5-折交叉验证:")
print(f"  各折准确率: {cv_scores}")
print(f"  平均准确率: {cv_mean:.4f} (+/- {cv_spread:.4f})")

# Per-class precision / recall / F1 on the test split
class_names = ['非 Virginica', 'Virginica']
print(f"\n分类报告 (测试集):")
print(classification_report(y_test, y_pred_test, target_names=class_names))

# Confusion matrix on the test split
cm = confusion_matrix(y_test, y_pred_test)
print(f"混淆矩阵:")
print(cm)

In [None]:
# =============================================================================
# Visualise the confusion matrix
# =============================================================================

fig, (ax_cm, ax_scores) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: confusion matrix heat map
tick_labels = ['非 Virginica', 'Virginica']
im = ax_cm.imshow(cm, interpolation='nearest', cmap='Blues')
fig.colorbar(im, ax=ax_cm)
ax_cm.set_xticks([0, 1])
ax_cm.set_yticks([0, 1])
ax_cm.set_xticklabels(tick_labels)
ax_cm.set_yticklabels(tick_labels)
ax_cm.set_xlabel('预测类别')
ax_cm.set_ylabel('真实类别')
ax_cm.set_title('混淆矩阵')

# Write each count into its cell; use white text on cells above half of the
# largest count and black text otherwise.
half_max = cm.max() / 2
for row, col in np.ndindex(2, 2):
    count = cm[row, col]
    text_color = "white" if count > half_max else "black"
    ax_cm.text(col, row, f"{count:d}",
               ha="center", va="center", fontsize=16, color=text_color)

# Right panel: distribution of decision-function values on the training
# split, one normalised histogram per true class
decision_train = svm_clf.decision_function(X_train)
for label_value, color, legend_label in (
    (0, 'steelblue', '非 Virginica'),
    (1, 'coral', 'Virginica'),
):
    ax_scores.hist(decision_train[y_train == label_value], bins=20, alpha=0.6,
                   color=color, label=legend_label, density=True)

# The classifier switches class where the decision value crosses zero
ax_scores.axvline(x=0, color='black', linestyle='--', linewidth=2, label='决策边界')
ax_scores.set_xlabel('决策函数值', fontsize=12)
ax_scores.set_ylabel('密度', fontsize=12)
ax_scores.set_title('决策函数值分布', fontsize=14)
ax_scores.legend()
ax_scores.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. 正则化参数 C 的影响

In [None]:
# =============================================================================
# Effect of the C parameter on the decision boundary
# =============================================================================

C_values = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

# One subplot per C value, laid out 2 x 3 and walked in row-major order
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# Per-class marker styling for the data points: (label value, colour, marker)
point_styles = ((0, 'red', 'o'), (1, 'green', 's'))

for ax, C in zip(axes, C_values):
    # Fit a fresh scaler + linear SVM pipeline for this C
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('svc', LinearSVC(C=C, loss='hinge', max_iter=10000, random_state=42)),
    ])
    model.fit(X_train, y_train)

    # Test-set accuracy, shown in the subplot title
    acc = accuracy_score(y_test, model.predict(X_test))

    # Predicted regions over the grid built for the decision-boundary plot
    y_grid = model.predict(X_grid).reshape(xx0.shape)
    ax.contourf(xx0, xx1, y_grid, alpha=0.3, cmap='RdYlGn')

    # Decision boundary (level 0) and margin lines (levels -1 / +1)
    decision = model.decision_function(X_grid).reshape(xx0.shape)
    ax.contour(xx0, xx1, decision,
               levels=[-1, 0, 1],
               colors=['blue', 'black', 'blue'],
               linestyles=['--', '-', '--'])

    # All samples, one series per class
    for label_value, color, marker in point_styles:
        members = X[y == label_value]
        ax.scatter(members[:, 0], members[:, 1], c=color, marker=marker,
                   edgecolors='black', s=30)

    ax.set_title(f'C = {C}\nAcc = {acc:.3f}', fontsize=12)
    ax.set_xlabel('花瓣长度')
    ax.set_ylabel('花瓣宽度')
    ax.grid(True, alpha=0.3)

plt.suptitle('正则化参数 C 对决策边界的影响', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

# Printed take-aways
for summary_line in (
    "\nC 参数总结:",
    "- C 很小 (强正则化): 间隔很宽，模型简单，可能欠拟合",
    "- C 很大 (弱正则化): 间隔很窄，模型复杂，可能过拟合",
    "- 最佳 C 值通常通过交叉验证确定",
):
    print(summary_line)

## 7. 单元测试

In [None]:
# =============================================================================
# 单元测试
# =============================================================================

def run_tests():
    """Run six smoke tests on the data set and a scaler + LinearSVC pipeline.

    The checks run in order: data loading, pipeline training, prediction
    output, decision function, accuracy threshold and cross-validation.
    Each check is wrapped so that a failure is recorded rather than aborting
    the run, and a summary table is printed at the end.

    Checks 3, 4 and 6 reuse the pipeline fitted by check 2, and check 5
    reuses the predictions made by check 3. When such a prerequisite is
    missing, the dependent check is recorded as failed with a message naming
    the prerequisite instead of surfacing an unrelated unbound-variable
    error.

    Reads the module-level ``X``, ``y``, ``X_train``, ``X_test``,
    ``y_train`` and ``y_test``.

    Returns:
        bool: True when every check passed, False otherwise.
    """
    test_results = []

    # Artefacts shared between checks; each stays None until the check that
    # produces it has succeeded.
    test_pipe = None
    pred = None

    def _reason(exc):
        # A bare ``assert`` raises AssertionError with an empty message; fall
        # back to the exception type so a failure never reports a blank reason.
        return str(exc) or type(exc).__name__

    # Test 1: data loading
    try:
        iris_test = datasets.load_iris()
        assert iris_test['data'].shape == (150, 4)
        assert iris_test['target'].shape == (150,)
        test_results.append(("数据加载", True, ""))
    except Exception as e:
        test_results.append(("数据加载", False, _reason(e)))

    # Test 2: pipeline training
    try:
        candidate = Pipeline([
            ('scaler', StandardScaler()),
            # random_state matches the other LinearSVC instances in this file
            ('svc', LinearSVC(C=1, max_iter=10000, random_state=42))
        ])
        candidate.fit(X_train, y_train)
        assert hasattr(candidate.named_steps['svc'], 'coef_')
        # Publish the pipeline only after it has been fitted successfully
        test_pipe = candidate
        test_results.append(("Pipeline 训练", True, ""))
    except Exception as e:
        test_results.append(("Pipeline 训练", False, _reason(e)))

    # Test 3: prediction output
    try:
        if test_pipe is None:
            raise RuntimeError("前置测试 'Pipeline 训练' 未通过")
        candidate_pred = test_pipe.predict(X_test)
        assert candidate_pred.shape == y_test.shape
        assert set(candidate_pred).issubset({0, 1})
        pred = candidate_pred
        test_results.append(("预测输出", True, ""))
    except Exception as e:
        test_results.append(("预测输出", False, _reason(e)))

    # Test 4: decision function
    try:
        if test_pipe is None:
            raise RuntimeError("前置测试 'Pipeline 训练' 未通过")
        decision = test_pipe.decision_function(X_test)
        assert decision.shape == (len(X_test),)
        test_results.append(("决策函数", True, ""))
    except Exception as e:
        test_results.append(("决策函数", False, _reason(e)))

    # Test 5: accuracy is above a sanity threshold
    try:
        if pred is None:
            raise RuntimeError("前置测试 '预测输出' 未通过")
        acc = accuracy_score(y_test, pred)
        assert acc > 0.7, f"准确率过低: {acc}"
        test_results.append(("准确率合理性", True, f"Acc = {acc:.4f}"))
    except Exception as e:
        test_results.append(("准确率合理性", False, _reason(e)))

    # Test 6: cross-validation
    try:
        if test_pipe is None:
            raise RuntimeError("前置测试 'Pipeline 训练' 未通过")
        cv = cross_val_score(test_pipe, X, y, cv=3)
        assert len(cv) == 3
        assert all(0 <= s <= 1 for s in cv)
        test_results.append(("交叉验证", True, f"Mean CV = {cv.mean():.4f}"))
    except Exception as e:
        test_results.append(("交叉验证", False, _reason(e)))

    # Print the report
    print("="*60)
    print("单元测试结果")
    print("="*60)

    passed = 0
    for name, success, msg in test_results:
        status = "✓ 通过" if success else "✗ 失败"
        passed += int(success)
        print(f"{status} | {name}")
        if msg:
            print(f"       {msg}")

    print("="*60)
    print(f"总计: {passed}/{len(test_results)} 测试通过")
    print("="*60)

    return passed == len(test_results)

# Run the tests; all_passed is True only when every check passed
all_passed = run_tests()

## 8. 知识总结

### SVM 核心要点

1. **最大间隔原理**
   - SVM 寻找使间隔最大化的超平面
   - 只有支持向量决定边界位置
   - 间隔越大，泛化能力通常越好

2. **关键参数**
   - `C`: 正则化参数
     - C 大: 严格分类，小间隔，可能过拟合
     - C 小: 宽松分类，大间隔，可能欠拟合
   - `kernel`: 核函数类型 ('linear', 'rbf', 'poly')，仅适用于 `SVC` / `SVR`，`LinearSVC` 没有该参数
   - `gamma`: RBF 核的参数，同样仅适用于 `SVC` / `SVR`

3. **使用建议**
   - 必须标准化特征 (SVM 对尺度敏感)
   - 使用 Pipeline 封装预处理和模型
   - 通过交叉验证选择最佳参数

4. **常用实现类**
   - `LinearSVC`: 线性 SVM，训练快
   - `SVC`: 支持核技巧，更灵活
   - `SVR`: SVM 回归

### 参考文献

- Cortes, C., & Vapnik, V. (1995). Support-Vector Networks
- Schölkopf, B., & Smola, A. J. (2002). Learning with Kernels