# 基于多项式特征的 SVM 非线性分类

## 理论背景

当数据在原始特征空间中线性不可分时，可以通过添加多项式特征将数据映射到高维空间，在高维空间中实现线性分离。

### 多项式特征映射

对于二维特征 $(x_1, x_2)$，三阶多项式特征映射为：

$$\phi(x_1, x_2) = (1, x_1, x_2, x_1^2, x_1 x_2, x_2^2, x_1^3, x_1^2 x_2, x_1 x_2^2, x_2^3)$$

### 优点与缺点

**优点：**
- 直观易懂
- 可以使用线性分类器
- 特征可解释

**缺点：**
- 特征数量爆炸性增长
- 高阶多项式容易过拟合
- 计算和内存开销大

## 1. 环境配置

In [None]:
# =============================================================================
# 导入必要的库
# =============================================================================
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator.
np.random.seed(42)

# Matplotlib defaults: sans-serif font fallback list, axes.unicode_minus
# switched off, and the default figure size / DPI.
plt.rcParams.update({
    'font.sans-serif': ['Arial Unicode MS', 'SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
    'figure.figsize': (10, 6),
    'figure.dpi': 100,
})

## 2. 数据准备

使用 `make_moons` 生成线性不可分（需要非线性决策边界才能分开）的月牙形数据集。

In [None]:
# =============================================================================
# Generate the two-moons dataset
# =============================================================================

X, y = make_moons(n_samples=200, noise=0.15, random_state=42)

# Hold out 20% of the samples as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Count the members of each class with a vectorised reduction.
n_class0 = (y == 0).sum()
n_class1 = (y == 1).sum()

# Emit the summary in one call; the lines match the individual prints.
summary_lines = [
    "数据集信息:",
    f"  总样本数: {len(X)}",
    f"  训练集: {len(X_train)} 样本",
    f"  测试集: {len(X_test)} 样本",
    f"  特征维度: {X.shape[1]}",
    f"  类别分布: 类0 = {n_class0}, 类1 = {n_class1}",
]
print("\n".join(summary_lines))

In [None]:
# =============================================================================
# Scatter plot of the two-moons data
# =============================================================================

fig, ax = plt.subplots(figsize=(10, 6))

# (class label, colour, marker, legend text) for each class drawn.
class_styles = [
    (0, 'steelblue', 'o', '类别 0'),
    (1, 'coral', 's', '类别 1'),
]
for cls, colour, marker, legend_text in class_styles:
    members = X[y == cls]
    ax.scatter(members[:, 0], members[:, 1], c=colour, marker=marker,
               edgecolors='white', s=60, label=legend_text)

ax.set_xlabel('特征 1', fontsize=12)
ax.set_ylabel('特征 2', fontsize=12)
ax.set_title('月牙形数据集 (make_moons) - 非线性可分', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n观察: 两个月牙形类别交错分布，无法用一条直线分开")

## 3. 线性 SVM 的局限性

In [None]:
# =============================================================================
# Baseline: a linear SVM on the raw two-dimensional features
# =============================================================================

# Standardise the features, then fit a hinge-loss linear SVM.
linear_svm = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm', LinearSVC(C=1, loss='hinge', max_iter=10000, random_state=42)),
])
linear_svm.fit(X_train, y_train)

# Score the baseline on the held-out split.
linear_test_pred = linear_svm.predict(X_test)
linear_acc = accuracy_score(y_test, linear_test_pred)
print(f"线性 SVM 测试准确率: {linear_acc:.4f}")

# Decision-boundary plotting helper
def plot_decision_boundary(model, X, y, ax, title):
    """Shade a classifier's predicted regions and overlay the samples.

    A regular grid is laid over the bounding box of the first two columns
    of ``X`` (padded by 0.5 on every side, spacing 0.02), ``model.predict``
    is evaluated on every grid point, and the predictions are drawn as a
    filled contour. Samples labelled 0 and 1 are scattered on top.

    Args:
        model: Object whose ``predict`` accepts an (n_points, 2) array.
        X: 2-D array; columns 0 and 1 are the plotted coordinates.
        y: Label array aligned with the rows of ``X``.
        ax: Axes-like object to draw on.
        title: Title text for the axes.
    """
    step = 0.02
    margin = 0.5
    first, second = X[:, 0], X[:, 1]

    grid_x, grid_y = np.meshgrid(
        np.arange(first.min() - margin, first.max() + margin, step),
        np.arange(second.min() - margin, second.max() + margin, step),
    )

    # One prediction per grid point, folded back into the grid's shape.
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    regions = model.predict(grid_points).reshape(grid_x.shape)
    ax.contourf(grid_x, grid_y, regions, alpha=0.3, cmap='RdYlBu')

    for label, colour, marker in ((0, 'steelblue', 'o'), (1, 'coral', 's')):
        mask = y == label
        ax.scatter(X[mask, 0], X[mask, 1], c=colour, marker=marker,
                   edgecolors='white', s=40)

    ax.set_title(title, fontsize=12)
    ax.grid(True, alpha=0.3)

# Draw the baseline's decision regions over the full dataset.
fig, ax = plt.subplots(figsize=(10, 6))
linear_title = f'线性 SVM (准确率: {linear_acc:.2%})'
plot_decision_boundary(linear_svm, X, y, ax, linear_title)
ax.set_xlabel('特征 1')
ax.set_ylabel('特征 2')
plt.tight_layout()
plt.show()

print("\n结论: 线性 SVM 无法很好地分离月牙形数据")

## 4. 多项式特征 + 线性 SVM

In [None]:
# =============================================================================
# Polynomial features + linear SVM pipeline
# =============================================================================

# Expand to degree-3 polynomial terms, standardise, then fit a linear SVM.
# NOTE(review): PolynomialFeatures is left at its defaults here; if that
# includes a constant bias column, scaling presumably zeroes it out.
# Confirm whether include_bias=False would be preferable.
polynomial_svm_clf = Pipeline(steps=[
    ('poly_features', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('svm_clf', LinearSVC(C=5, loss='hinge', max_iter=10000, random_state=42)),
])
polynomial_svm_clf.fit(X_train, y_train)

# Accuracy on both splits.
poly_train_pred = polynomial_svm_clf.predict(X_train)
poly_test_pred = polynomial_svm_clf.predict(X_test)
poly_acc_train = accuracy_score(y_train, poly_train_pred)
poly_acc_test = accuracy_score(y_test, poly_test_pred)

print("多项式特征 SVM 性能:")
print(f"  训练准确率: {poly_acc_train:.4f}")
print(f"  测试准确率: {poly_acc_test:.4f}")

# Report how many columns the polynomial expansion produced.
poly_features = polynomial_svm_clf.named_steps['poly_features']
print("\n特征维度变化:")
print(f"  原始特征数: {X.shape[1]}")
print(f"  多项式特征数: {poly_features.n_output_features_}")

In [None]:
# =============================================================================
# Side-by-side comparison: linear SVM vs polynomial-feature SVM
# =============================================================================

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One (fitted model, panel title) pair per subplot, left to right.
panels = [
    (linear_svm, f'线性 SVM\n准确率: {linear_acc:.2%}'),
    (polynomial_svm_clf,
     f'多项式特征 SVM (degree=3)\n准确率: {poly_acc_test:.2%}'),
]
for panel_ax, (clf, panel_title) in zip(axes, panels):
    plot_decision_boundary(clf, X, y, panel_ax, panel_title)
    panel_ax.set_xlabel('特征 1')
    panel_ax.set_ylabel('特征 2')

plt.suptitle('线性 vs 多项式特征 SVM', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 5. 多项式阶数分析

In [None]:
# =============================================================================
# Effect of the polynomial degree
# =============================================================================

degrees = [1, 2, 3, 4, 5, 6]
results = []

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# One subplot per degree: fit, score, record, and draw the boundary.
for panel_ax, degree in zip(axes, degrees):
    model = Pipeline(steps=[
        ('poly', PolynomialFeatures(degree=degree)),
        ('scaler', StandardScaler()),
        ('svm', LinearSVC(C=5, loss='hinge', max_iter=10000, random_state=42)),
    ])
    model.fit(X_train, y_train)

    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, model.predict(X_test))
    n_features = model.named_steps['poly'].n_output_features_

    results.append(dict(
        degree=degree,
        train_acc=train_acc,
        test_acc=test_acc,
        n_features=n_features,
    ))

    panel_title = f'Degree={degree}\nAcc={test_acc:.2%}, Features={n_features}'
    plot_decision_boundary(model, X, y, panel_ax, panel_title)

plt.suptitle('多项式阶数对决策边界的影响', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

# Plain-text results table.
heavy_rule = "=" * 70
print("\n" + heavy_rule)
print("不同多项式阶数的性能对比")
print(heavy_rule)
print(f"{'阶数':<8} {'特征数':<12} {'训练准确率':<15} {'测试准确率':<15}")
print("-" * 70)
for row in results:
    print(f"{row['degree']:<8} {row['n_features']:<12} {row['train_acc']:<15.4f} {row['test_acc']:<15.4f}")

In [None]:
# =============================================================================
# Accuracy and feature count as functions of the degree
# =============================================================================

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Pull each plotted series out of the results records once.
degree_axis = [row['degree'] for row in results]
train_curve = [row['train_acc'] for row in results]
test_curve = [row['test_acc'] for row in results]
feature_counts = [row['n_features'] for row in results]

# Left panel: train/test accuracy against degree.
acc_ax = axes[0]
acc_ax.plot(degree_axis, train_curve, 'o-', linewidth=2, markersize=8,
            label='训练准确率')
acc_ax.plot(degree_axis, test_curve, 's-', linewidth=2, markersize=8,
            label='测试准确率')
acc_ax.set_xlabel('多项式阶数', fontsize=12)
acc_ax.set_ylabel('准确率', fontsize=12)
acc_ax.set_title('准确率 vs 多项式阶数', fontsize=14)
acc_ax.legend()
acc_ax.grid(True, alpha=0.3)
acc_ax.set_xticks(degree_axis)

# Right panel: number of generated features against degree.
count_ax = axes[1]
count_ax.bar(degree_axis, feature_counts, color='steelblue', alpha=0.7,
             edgecolor='white')
count_ax.set_xlabel('多项式阶数', fontsize=12)
count_ax.set_ylabel('特征数量', fontsize=12)
count_ax.set_title('特征维度爆炸', fontsize=14)
count_ax.grid(True, alpha=0.3, axis='y')

# Print each bar's value just above it.
for deg, count in zip(degree_axis, feature_counts):
    count_ax.text(deg, count + 1, str(count), ha='center', fontsize=10)

plt.tight_layout()
plt.show()

print("\n观察:")
print("- 阶数过低: 欠拟合，无法捕获非线性模式")
print("- 阶数过高: 特征数爆炸，可能过拟合，计算开销大")
print("- 需要在模型复杂度和泛化能力之间找平衡")

## 6. 超参数调优

In [None]:
# =============================================================================
# Grid search over polynomial degree and SVM regularisation strength
# =============================================================================

# Keys use the "<step name>__<parameter>" form to address pipeline steps.
param_grid = {
    'poly__degree': [2, 3, 4],
    'svm__C': [0.1, 1, 10],
}

# Pipeline whose degree and C are filled in by the search.
base_model = Pipeline(steps=[
    ('poly', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('svm', LinearSVC(loss='hinge', max_iter=10000, random_state=42)),
])

# 5-fold cross-validated search scored by accuracy.
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

print("网格搜索结果:")
print(f"  最佳参数: {grid_search.best_params_}")
print(f"  最佳交叉验证准确率: {grid_search.best_score_:.4f}")

# Score the selected estimator on the held-out split.
best_model = grid_search.best_estimator_
best_test_pred = best_model.predict(X_test)
best_acc = accuracy_score(y_test, best_test_pred)
print(f"  测试集准确率: {best_acc:.4f}")

## 7. 单元测试

In [None]:
# =============================================================================
# 单元测试
# =============================================================================

def run_tests():
    """Run the notebook's sanity checks and print a pass/fail report.

    Every check builds the inputs it needs, so a failure in one check is
    reported on its own rather than resurfacing as a NameError in the
    checks that follow. Random seeds are fixed so repeated runs of the
    cell produce the same outcome.

    Relies on the module-level ``X_train``, ``y_train``, ``X_test``,
    ``y_test`` and ``linear_acc`` created by earlier cells.

    Returns:
        bool: True when every check passed, False otherwise.
    """

    def _sample_moons():
        # Small seeded dataset used by the data and feature checks.
        return make_moons(n_samples=100, noise=0.1, random_state=42)

    def _fit_poly_pipeline():
        # Degree-3 polynomial features -> scaling -> linear SVM, fitted on
        # the training split. Seeded so every check sees the same model.
        pipe = Pipeline([
            ('poly', PolynomialFeatures(degree=3)),
            ('scaler', StandardScaler()),
            ('svm', LinearSVC(max_iter=10000, random_state=42)),
        ])
        pipe.fit(X_train, y_train)
        return pipe

    def _check_data_generation():
        X_t, y_t = _sample_moons()
        assert X_t.shape == (100, 2)
        assert y_t.shape == (100,)
        return ""

    def _check_polynomial_features():
        X_t, _ = _sample_moons()
        X_poly = PolynomialFeatures(degree=3).fit_transform(X_t)
        # Two inputs at degree 3 give 10 terms: 1, x1, x2, x1^2, x1*x2, ...
        assert X_poly.shape[1] == 10
        return f"特征数: {X_poly.shape[1]}"

    def _check_pipeline_training():
        pipe = _fit_poly_pipeline()
        assert hasattr(pipe.named_steps['svm'], 'coef_')
        return ""

    def _check_prediction_output():
        pred = _fit_poly_pipeline().predict(X_test)
        assert pred.shape == y_test.shape
        return ""

    def _check_accuracy_gain():
        pred = _fit_poly_pipeline().predict(X_test)
        poly_acc = accuracy_score(y_test, pred)
        # The polynomial pipeline is expected to beat the linear baseline.
        assert poly_acc > linear_acc, "多项式 SVM 应该优于线性 SVM"
        return f"线性:{linear_acc:.2%} < 多项式:{poly_acc:.2%}"

    # (report name, check) pairs; each check returns an optional detail
    # message and raises on failure.
    checks = [
        ("数据生成", _check_data_generation),
        ("多项式特征", _check_polynomial_features),
        ("Pipeline 训练", _check_pipeline_training),
        ("预测输出", _check_prediction_output),
        ("准确率提升", _check_accuracy_gain),
    ]

    test_results = []
    for name, check in checks:
        # Broad catch on purpose: this is the runner's boundary, and any
        # error in a check must be recorded as that check's failure.
        try:
            detail = check()
        except Exception as e:
            test_results.append((name, False, str(e)))
        else:
            test_results.append((name, True, detail))

    # Report
    print("="*60)
    print("单元测试结果")
    print("="*60)

    passed = 0
    for name, success, msg in test_results:
        status = "✓ 通过" if success else "✗ 失败"
        passed += int(success)
        print(f"{status} | {name}")
        if msg:
            print(f"       {msg}")

    print("="*60)
    print(f"总计: {passed}/{len(test_results)} 测试通过")
    print("="*60)

    return passed == len(test_results)

# Run the checks; all_passed is True only if every check succeeded.
all_passed = run_tests()

## 8. 知识总结

### 多项式特征方法

1. **核心思想**
   - 显式构造高阶多项式特征
   - 在高维空间中使用线性分类器
   - 将非线性问题转化为线性问题

2. **优点**
   - 直观，特征可解释
   - 可以使用高效的线性求解器
   - 适合低维数据

3. **缺点**
   - 特征数随阶数和原始特征维度呈组合式快速增长（$n$ 个特征、$d$ 阶共 $\binom{n+d}{d}$ 项，例如 $n=2, d=3$ 时为 10 项）
   - 内存和计算开销大
   - 高阶容易过拟合

4. **替代方案: 核技巧**
   - 使用多项式核 `SVC(kernel='poly')` 
   - 无需显式构造高维特征
   - 通过核函数隐式计算

### 参数选择建议

- `degree`: 从低阶开始 (2-3)，通过交叉验证确定
- `C`: 较大的 C 可能导致过拟合，需要调优
- 对于高维数据，优先考虑核方法而非显式多项式特征