# 经典控制环境深度解析

---

## 核心思想

经典控制环境是一组基于**控制论经典问题**设计的低维环境。它们具有简单的状态空间和明确的物理意义，是验证强化学习算法的理想测试平台。

## 本节内容

1. **CartPole**: 倒立摆平衡 - 欠驱动系统控制
2. **MountainCar**: 爬山车 - 稀疏奖励与探索
3. **Acrobot**: 双摆 - 欠驱动摆动控制（本节仅在第 4 部分的环境对比表中列出，未单独展开）
4. **Pendulum**: 单摆 - 连续控制入门

---

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle, Circle, FancyBboxPatch
from matplotlib.lines import Line2D
import matplotlib.animation as animation
from IPython.display import HTML, display
from collections import deque

# Global plotting defaults shared by every figure in this notebook.
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11

# The figures below use Chinese titles and axis labels. Matplotlib's default
# sans-serif font has no CJK glyphs and would draw them as empty boxes, so put
# common CJK-capable fonts first. Names that are not installed are skipped
# during font lookup, and the original defaults stay at the end as fallback.
plt.rcParams['font.sans-serif'] = [
    'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Noto Sans CJK SC',
    'WenQuanYi Micro Hei', 'Arial Unicode MS',
] + list(plt.rcParams['font.sans-serif'])
# Many CJK fonts lack U+2212 (typographic minus); use the ASCII hyphen on axes.
plt.rcParams['axes.unicode_minus'] = False

print(f"Gymnasium 版本: {gym.__version__}")

---

## 1. CartPole - 倒立摆平衡

### 物理系统

CartPole 是一个**欠驱动系统**：通过移动小车来平衡倒立的摆杆。

### 运动方程

小车-摆杆系统的运动方程（拉格朗日力学推导）：

$$\ddot{\theta} = \frac{g\sin\theta + \cos\theta \cdot \frac{-F - m_p l \dot{\theta}^2 \sin\theta}{m_c + m_p}}{l\left(\frac{4}{3} - \frac{m_p \cos^2\theta}{m_c + m_p}\right)}$$

$$\ddot{x} = \frac{F + m_p l (\dot{\theta}^2 \sin\theta - \ddot{\theta}\cos\theta)}{m_c + m_p}$$

其中：
- $x$: 小车位置
- $\theta$: 摆杆与垂直方向的夹角
- $F$: 施加在小车上的力 (±10N)
- $m_c = 1.0$ kg: 小车质量
- $m_p = 0.1$ kg: 摆杆质量
- $l = 0.5$ m: 摆杆半长
- $g = 9.8$ m/s²: 重力加速度

In [None]:
# CartPole-v1 at a glance: observation bounds, actions, termination, reward.
env = gym.make("CartPole-v1")

banner = "=" * 70
print(banner)
print("CartPole-v1 环境详解")
print(banner)

# Observation space: one (label, low, high, unit) row per state component,
# with the bounds read straight from the environment.
print("\n【状态空间】")
obs_low, obs_high = env.observation_space.low, env.observation_space.high
labels_and_units = [
    ("小车位置 x", "m"),
    ("小车速度 ẋ", "m/s"),
    ("摆杆角度 θ", "rad"),
    ("摆杆角速度 θ̇", "rad/s"),
]
state_vars = [
    (label, obs_low[k], obs_high[k], unit_label)
    for k, (label, unit_label) in enumerate(labels_and_units)
]
print(f"{'变量':<15} {'下界':>15} {'上界':>15} {'单位':>10}")
print("-" * 60)
for name, low, high, unit in state_vars:
    print(f"{name:<15} {low:>15.2f} {high:>15.2f} {unit:>10}")

# The remaining sections are fixed text: action space, termination, reward.
for line in (
    "\n【动作空间】",
    "类型: Discrete(2)",
    "  0: 向左施加 -10N 的力",
    "  1: 向右施加 +10N 的力",
    "\n【终止条件】",
    "  1. |θ| > 12° (约 0.2095 rad)",
    "  2. |x| > 2.4 m",
    "  3. 步数 > 500 (截断)",
    "\n【奖励设计】",
    "  每存活一步: +1",
    "  目标: 累积奖励 ≥ 475",
):
    print(line)

env.close()

In [None]:
# Visualize the CartPole system: a hand-drawn schematic (left panel) and
# angle/angular-velocity phase trajectories from real rollouts (right panel).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: schematic of the system
ax1 = axes[0]
ax1.set_xlim(-3, 3)
ax1.set_ylim(-0.5, 3)
ax1.set_aspect('equal')

# Ground line plus a shaded strip just below it
ax1.axhline(y=0, color='brown', linewidth=4)
ax1.fill_between([-3, 3], [-0.3], [0], color='brown', alpha=0.3)

# Cart body
cart = FancyBboxPatch((-0.4, 0.05), 0.8, 0.3, boxstyle="round,pad=0.02",
                       facecolor='steelblue', edgecolor='navy', linewidth=2)
ax1.add_patch(cart)

# Wheels
for wx in [-0.25, 0.25]:
    wheel = Circle((wx, 0.05), 0.08, facecolor='gray', edgecolor='black')
    ax1.add_patch(wheel)

# Pole, drawn from the top of the cart (y = 0.35) and tilted by theta
theta = 0.2  # example angle
pole_length = 2.0
pole_x = pole_length * np.sin(theta)
pole_y = pole_length * np.cos(theta)
ax1.plot([0, pole_x], [0.35, 0.35 + pole_y], 'r-', linewidth=10, solid_capstyle='round')
ax1.plot(pole_x, 0.35 + pole_y, 'ro', markersize=18)

# Angle marker: dashed arc from the pole direction up to the vertical
arc_angles = np.linspace(np.pi/2 - theta, np.pi/2, 20)
arc_r = 0.6
ax1.plot(arc_r * np.cos(arc_angles), 0.35 + arc_r * np.sin(arc_angles), 'g--', linewidth=2)
ax1.annotate('θ', (0.25, 0.8), fontsize=16, color='green', fontweight='bold')

# Force arrow
ax1.annotate('', xy=(1.2, 0.2), xytext=(0.5, 0.2),
             arrowprops=dict(arrowstyle='->', color='orange', lw=3))
ax1.text(0.85, 0.35, 'F', fontsize=14, color='orange', fontweight='bold')

# x-axis arrow and label
ax1.annotate('', xy=(2.5, 0), xytext=(-2.5, 0),
             arrowprops=dict(arrowstyle='->', color='black', lw=1))
ax1.text(2.3, -0.2, 'x', fontsize=12)

ax1.set_title('CartPole 系统示意图', fontsize=14, fontweight='bold')
ax1.axis('off')

# Right panel: phase plane
ax2 = axes[1]

# Generate phase-trajectory data: five seeded rollouts of at most 200 steps
env = gym.make("CartPole-v1")
trajectories = []

for seed in range(5):
    obs, _ = env.reset(seed=seed)
    traj = [obs]
    for _ in range(200):
        # Simple policy: push towards the side the pole is leaning to
        action = 1 if obs[2] > 0 else 0
        obs, _, terminated, truncated, _ = env.step(action)
        traj.append(obs)
        if terminated or truncated:
            break
    trajectories.append(np.array(traj))
env.close()

# Plot the phase trajectories (angle vs. angular velocity);
# a circle marks the first state and a square the last state of each rollout
colors = plt.cm.viridis(np.linspace(0, 1, len(trajectories)))
for traj, color in zip(trajectories, colors):
    ax2.plot(np.degrees(traj[:, 2]), traj[:, 3], '-', color=color, alpha=0.7, linewidth=1.5)
    ax2.plot(np.degrees(traj[0, 2]), traj[0, 3], 'o', color=color, markersize=8)
    ax2.plot(np.degrees(traj[-1, 2]), traj[-1, 3], 's', color=color, markersize=8)

# Termination boundaries at +/-12 degrees, plus faint zero axes
ax2.axvline(x=12, color='red', linestyle='--', alpha=0.7, label='终止边界 (±12°)')
ax2.axvline(x=-12, color='red', linestyle='--', alpha=0.7)
ax2.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
ax2.axvline(x=0, color='gray', linestyle='-', alpha=0.3)

ax2.set_xlabel('摆杆角度 θ (度)', fontsize=12)
ax2.set_ylabel('摆杆角速度 θ̇ (rad/s)', fontsize=12)
ax2.set_title('相平面轨迹', fontsize=14, fontweight='bold')
ax2.legend(loc='upper right')
ax2.set_xlim(-20, 20)

plt.tight_layout()
plt.show()

In [None]:
# CartPole 控制策略对比

def random_policy(obs):
    """Ignore the observation and pick one of the two actions at random.

    Draws from NumPy's global random state, so results are reproducible only
    if the caller seeds ``np.random`` beforehand.
    """
    n_actions = 2
    return np.random.randint(n_actions)

def angle_policy(obs):
    """Push the cart towards the side the pole is leaning to.

    Returns 1 (push right) when the pole angle ``obs[2]`` is strictly
    positive, otherwise 0 (push left).
    """
    pole_angle = obs[2]
    if pole_angle > 0:
        return 1
    return 0

def pd_policy(obs):
    """Bang-bang PD policy for CartPole.

    Sums a PD term on the pole angle (gains 50 and 10) and a much weaker PD
    term on the cart position (gains 0.5 and 1.0). Returns 1 (push right)
    when the sum is strictly positive, otherwise 0 (push left).
    """
    cart_pos, cart_vel, pole_angle, pole_rate = obs
    # Dominant term: keep the pole upright.
    angle_term = 50 * pole_angle + 10 * pole_rate
    # Secondary term on the cart state, with small weights.
    cart_term = 0.5 * cart_pos + 1.0 * cart_vel
    if angle_term + cart_term > 0:
        return 1
    return 0

def evaluate_policy(env_id, policy, n_episodes=50, seed=42):
    """Run ``policy`` for several episodes and collect the episode returns.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        policy: Callable mapping an observation to an action.
        n_episodes: Number of episodes to run.
        seed: Base seed; episode ``i`` resets the environment with
            ``seed + i``. Only the environment reset is seeded here, so a
            policy that draws its own random numbers is not made
            deterministic by this argument.

    Returns:
        A list with one total (undiscounted) reward per episode.
    """
    env = gym.make(env_id)
    rewards = []
    try:
        for i in range(n_episodes):
            obs, _ = env.reset(seed=seed + i)
            total_reward = 0
            done = False
            while not done:
                action = policy(obs)
                obs, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                # An episode ends on a terminal state or on time-limit truncation.
                done = terminated or truncated
            rewards.append(total_reward)
    finally:
        # Release the environment even if the policy or a step raises.
        env.close()
    return rewards

# Evaluate each hand-written CartPole policy and print summary statistics.
print("CartPole 策略评估 (50 回合)")
print("=" * 60)

policies = {
    "随机策略": random_policy,
    "角度策略": angle_policy,
    "PD控制": pd_policy,
}

results = {}
for name, policy in policies.items():
    rewards = evaluate_policy("CartPole-v1", policy)
    results[name] = rewards
    mean_r, std_r = np.mean(rewards), np.std(rewards)
    min_r, max_r = np.min(rewards), np.max(rewards)
    print(f"{name:12s}: {mean_r:6.1f} ± {std_r:5.1f} (min: {min_r:3.0f}, max: {max_r:3.0f})")

---

## 2. MountainCar - 稀疏奖励问题

### 问题描述

小车位于山谷底部，引擎不够强大，无法直接爬上山顶。必须**利用动量**——先向左爬坡积累势能，再向右冲刺。

### 动力学方程

$$v_{t+1} = v_t + 0.001 \cdot a - 0.0025 \cdot \cos(3x_t)$$
$$x_{t+1} = x_t + v_{t+1}$$

其中 $a \in \{-1, 0, 1\}$ 表示加速方向（对应离散动作 0、1、2）。每步更新后，速度被截断到 $[-0.07, 0.07]$，位置被截断到 $[-1.2, 0.6]$。

### 为什么这是一个难题？

- **稀疏奖励**: 除非到达目标，每步都是 -1
- **随机策略失败**: 随机动作几乎无法到达目标
- **需要探索**: 必须学会"先后退再前进"的策略

In [None]:
# MountainCar-v0 at a glance: observation bounds, actions, goal, reward.
env = gym.make("MountainCar-v0")
obs_low, obs_high = env.observation_space.low, env.observation_space.high

banner = "=" * 70
print(banner)
print("MountainCar-v0 环境详解")
print(banner)

# Observation bounds are read from the environment itself.
print("\n【状态空间】")
print(f"  位置 x: [{obs_low[0]:.2f}, {obs_high[0]:.2f}]")
print(f"  速度 v: [{obs_low[1]:.3f}, {obs_high[1]:.3f}]")

# Everything below is fixed descriptive text.
for line in (
    "\n【动作空间】 Discrete(3)",
    "  0: 向左加速 (a = -1)",
    "  1: 不加速   (a =  0)",
    "  2: 向右加速 (a = +1)",
    "\n【目标】",
    "  到达 x ≥ 0.5 (山顶)",
    "\n【奖励】",
    "  每步: -1",
    "  到达目标: 回合结束 (累积奖励越接近 0 越好)",
    "\n【难点】",
    "  - 稀疏奖励: 只有到达目标才有正面信号",
    "  - 最大步数: 200 (随机策略几乎无法成功)",
):
    print(line)

env.close()

In [None]:
# Visualize the MountainCar terrain (left panel) and energy landscape (right panel)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: terrain and one trajectory
ax1 = axes[0]

# Terrain height as a function of position
x = np.linspace(-1.2, 0.6, 300)
y = np.sin(3 * x) * 0.45 + 0.55

ax1.plot(x, y, 'b-', linewidth=3, label='地形')
ax1.fill_between(x, 0, y, alpha=0.2, color='green')

# Key positions: start region and goal line
ax1.axvline(x=-0.5, color='blue', linestyle='--', alpha=0.7, label='起点区域')
ax1.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='目标线')

# Roll out one episode with the momentum heuristic
env = gym.make("MountainCar-v0")
obs, _ = env.reset(seed=42)
positions = [obs[0]]

for _ in range(200):
    # Momentum policy: accelerate in the direction the car is already moving
    action = 2 if obs[1] > 0 else 0
    obs, _, terminated, truncated, _ = env.step(action)
    positions.append(obs[0])
    if terminated:
        break
env.close()

# `positions` holds the initial state plus one entry per step taken, so the
# number of steps is one less than its length.
n_steps = len(positions) - 1

# Draw the trajectory on the terrain, one segment per step, colored by time
traj_y = np.sin(3 * np.array(positions)) * 0.45 + 0.55
colors = np.linspace(0, 1, len(positions))
for i in range(len(positions) - 1):
    ax1.plot(positions[i:i+2], traj_y[i:i+2], '-',
             color=plt.cm.plasma(colors[i]), linewidth=2, alpha=0.7)

# Start and end markers
ax1.plot(positions[0], traj_y[0], 'go', markersize=15, label='起点')
ax1.plot(positions[-1], traj_y[-1], 'r*', markersize=20, label='终点')

ax1.set_xlabel('位置 x', fontsize=12)
ax1.set_ylabel('高度', fontsize=12)
ax1.set_title(f'MountainCar 轨迹 ({n_steps} 步)', fontsize=14, fontweight='bold')
ax1.legend(loc='upper left')
ax1.set_xlim(-1.3, 0.7)
ax1.set_ylim(0, 1.3)

# Right panel: position-velocity phase portrait over an energy contour map
ax2 = axes[1]

# Grid for the iso-energy contours
pos = np.linspace(-1.2, 0.6, 100)
vel = np.linspace(-0.07, 0.07, 100)
P, V = np.meshgrid(pos, vel)

# Potential energy (proportional to terrain height)
potential = np.sin(3 * P) * 0.45 + 0.55
# Kinetic energy; the factor 500 only rescales it for plotting
kinetic = 0.5 * V**2 * 500  # scaled
# Total energy, in arbitrary units
total_energy = potential + kinetic

contour = ax2.contourf(P, V, total_energy, levels=20, cmap='coolwarm', alpha=0.7)
plt.colorbar(contour, ax=ax2, label='总能量 (a.u.)')

# Overlay phase trajectories from three seeded rollouts of the momentum policy
env = gym.make("MountainCar-v0")
for seed in range(3):
    obs, _ = env.reset(seed=seed * 10)
    traj = [obs]
    for _ in range(200):
        # Accelerate in the direction the car is already moving
        action = 2 if obs[1] > 0 else 0
        obs, _, terminated, truncated, _ = env.step(action)
        traj.append(obs)
        if terminated:
            break
    traj = np.array(traj)
    ax2.plot(traj[:, 0], traj[:, 1], 'k-', linewidth=1.5, alpha=0.8)
    # Green circle: first state; red star: last state
    ax2.plot(traj[0, 0], traj[0, 1], 'go', markersize=8)
    ax2.plot(traj[-1, 0], traj[-1, 1], 'r*', markersize=12)
env.close()

ax2.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='目标')
ax2.set_xlabel('位置 x', fontsize=12)
ax2.set_ylabel('速度 v', fontsize=12)
ax2.set_title('相空间与能量等高线', fontsize=14, fontweight='bold')
ax2.legend(loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
# MountainCar 策略对比

def random_policy_mc(obs):
    """Ignore the observation and pick one of the three actions at random.

    Draws from NumPy's global random state.
    """
    n_actions = 3
    return np.random.randint(n_actions)

def momentum_policy(obs):
    """Accelerate in the direction the car is already moving.

    Returns 2 (push right) when the velocity ``obs[1]`` is strictly
    positive, otherwise 0 (push left) - including at zero velocity.
    """
    velocity = obs[1]
    if velocity > 0:
        return 2
    return 0

def energy_policy(obs):
    """Momentum heuristic with a full-throttle override near the goal.

    Right of ``position > 0.4`` the car always pushes right (action 2).
    Everywhere else it follows its velocity: action 2 when the velocity is
    strictly positive, otherwise action 0.
    """
    position, velocity = obs
    # Close to the goal: commit to the right regardless of velocity.
    # Otherwise: swing with the current motion to build up energy.
    push_right = position > 0.4 or velocity > 0
    return 2 if push_right else 0

# Compare the three MountainCar policies over 20 seeded episodes each.
print("MountainCar 策略评估 (20 回合)")
print("=" * 60)
print(f"{'策略':<15} {'平均奖励':>12} {'成功率':>10} {'平均步数':>10}")
print("-" * 50)

mc_policies = [("随机", random_policy_mc), ("动量", momentum_policy), ("能量", energy_policy)]
for name, policy in mc_policies:
    env = gym.make("MountainCar-v0")
    rewards, steps_list = [], []
    successes = 0

    for i in range(20):
        obs, _ = env.reset(seed=i)
        total_reward = 0
        steps = 0
        terminated = truncated = False
        # Step until the goal is reached or the time limit truncates the episode.
        while not (terminated or truncated):
            action = policy(obs)
            obs, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            steps += 1
        # Only reaching the goal (termination) counts as a success.
        if terminated:
            successes += 1
        rewards.append(total_reward)
        steps_list.append(steps)
    env.close()

    success_pct = successes / 20 * 100
    print(f"{name:<15} {np.mean(rewards):>12.1f} {success_pct:>9.0f}% {np.mean(steps_list):>10.1f}")

---

## 3. Pendulum - 连续控制

### 物理系统

单摆从任意位置启动，目标是通过施加扭矩将其稳定在直立位置。

### 动力学方程

$$\ddot{\theta} = -\frac{3g}{2l}\sin(\theta + \pi) + \frac{3}{ml^2}u$$

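其中 $u \in [-2, 2]$ 是施加的扭矩，$\theta = 0$ 对应直立位置（因此 $-\sin(\theta + \pi) = \sin\theta$，重力项会使摆杆偏离直立点）；环境默认取 $g = 10$、$m = 1$、$l = 1$。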

### 奖励函数

$$r = -(\theta^2 + 0.1\dot{\theta}^2 + 0.001u^2)$$

这是一个**二次型奖励**（其中 $\theta$ 已归一化到 $[-\pi, \pi]$），鼓励：
1. 角度接近 0（直立）
2. 角速度接近 0（静止）
3. 控制输入小（节能）

In [None]:
# Pendulum-v1 at a glance: observation bounds, action bounds, reward, episode.
env = gym.make("Pendulum-v1")
obs_low, obs_high = env.observation_space.low, env.observation_space.high
act_low, act_high = env.action_space.low, env.action_space.high

banner = "=" * 70
print(banner)
print("Pendulum-v1 环境详解")
print(banner)

# Bounds are read from the environment's spaces.
print("\n【状态空间】 Box(3,)")
print(f"  cos(θ): [{obs_low[0]:.1f}, {obs_high[0]:.1f}]")
print(f"  sin(θ): [{obs_low[1]:.1f}, {obs_high[1]:.1f}]")
print(f"  θ̇:     [{obs_low[2]:.1f}, {obs_high[2]:.1f}] rad/s")

print("\n【动作空间】 Box(1,) - 连续!")
print(f"  扭矩 u: [{act_low[0]:.1f}, {act_high[0]:.1f}] N·m")

# Fixed descriptive text: reward function and episode settings.
for line in (
    "\n【奖励函数】",
    "  r = -(θ² + 0.1·θ̇² + 0.001·u²)",
    "  范围: 约 [-16.27, 0]",
    "  最优 (直立静止): 0",
    "\n【回合设置】",
    "  最大步数: 200",
    "  无提前终止",
):
    print(line)

env.close()

In [None]:
# Pendulum 控制演示

def pd_controller(obs, Kp=10.0, Kd=2.0):
    """PD controller that drives the pendulum angle and angular rate to zero.

    Args:
        obs: Observation ``(cos(theta), sin(theta), theta_dot)``.
        Kp: Proportional gain on the angle.
        Kd: Derivative gain on the angular velocity.

    Returns:
        A length-1 array holding the torque, clipped to ``[-2, 2]``.
    """
    cos_th, sin_th, omega = obs
    # Recover the angle from its cosine/sine encoding.
    angle = np.arctan2(sin_th, cos_th)
    raw_torque = -Kp * angle - Kd * omega
    return np.clip([raw_torque], -2.0, 2.0)

def energy_controller(obs):
    """Energy-shaping swing-up controller with a PD catch near upright.

    Angle convention: ``theta = 0`` is upright, so ``cos(theta) = 1`` is the
    highest point. The pendulum is modelled as a uniform rod, matching the
    dynamics ``theta_ddot = 3g/(2l) * sin(theta) + 3/(m l^2) * u``:

        E = 1/2 * (m l^2 / 3) * theta_dot^2 + (m g l / 2) * cos(theta)

    The target is the energy of the upright rest state, ``m g l / 2``.

    Args:
        obs: Observation ``(cos(theta), sin(theta), theta_dot)``.

    Returns:
        A length-1 array holding the torque, clipped to ``[-2, 2]``.
    """
    cos_theta, sin_theta, theta_dot = obs
    theta = np.arctan2(sin_theta, cos_theta)

    # Physical parameters
    g, l, m = 10.0, 1.0, 1.0

    # Current mechanical energy. The potential term is +cos(theta) because
    # theta = 0 is the top; a minus sign would make the hanging rest state
    # look like the target and turn the pumping law into damping.
    inertia = m * l**2 / 3.0
    E = 0.5 * inertia * theta_dot**2 + 0.5 * m * g * l * cos_theta
    E_target = 0.5 * m * g * l  # energy of the upright rest state

    if np.abs(theta) < 0.3:
        # Close to upright: stabilize with PD feedback.
        torque = -10.0 * theta - 2.0 * theta_dot
    else:
        # Energy pumping: since dE/dt = u * theta_dot, this law gives
        # dE/dt = -3 * (E - E_target) * theta_dot**2, moving E towards the
        # target. It produces no torque while theta_dot is exactly zero.
        torque = -3.0 * (E - E_target) * theta_dot

    return np.clip([torque], -2.0, 2.0)

# Run each controller for one 200-step episode from the same seeded start and
# plot angle, angular velocity, torque and cumulative reward side by side.
env = gym.make("Pendulum-v1")

controllers = {
    "PD控制": pd_controller,
    "能量控制": energy_controller
}

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
colors = ['#1f77b4', '#ff7f0e']

for idx, (name, controller) in enumerate(controllers.items()):
    # Same seed for every controller so they start from the same state
    obs, _ = env.reset(seed=42)
    thetas, theta_dots, torques, rewards_list = [], [], [], []

    # Fixed-length rollout; the termination/truncation flags are ignored
    for _ in range(200):
        cos_theta, sin_theta, theta_dot = obs
        theta = np.arctan2(sin_theta, cos_theta)

        action = controller(obs)
        obs, reward, _, _, _ = env.step(action)

        # Log the state the action was computed from, together with the
        # action itself and the reward returned for it
        thetas.append(np.degrees(theta))
        theta_dots.append(theta_dot)
        torques.append(action[0])
        rewards_list.append(reward)

    # Plot this controller's traces
    axes[0, 0].plot(thetas, color=colors[idx], label=name, linewidth=2)
    axes[0, 1].plot(theta_dots, color=colors[idx], label=name, linewidth=2)
    axes[1, 0].plot(torques, color=colors[idx], label=name, linewidth=2)
    axes[1, 1].plot(np.cumsum(rewards_list), color=colors[idx], label=name, linewidth=2)

env.close()

# Configure the four panels
axes[0, 0].axhline(y=0, color='r', linestyle='--', alpha=0.5)
axes[0, 0].set_ylabel('角度 (度)')
axes[0, 0].set_title('摆角变化')
axes[0, 0].legend()

axes[0, 1].axhline(y=0, color='r', linestyle='--', alpha=0.5)
axes[0, 1].set_ylabel('角速度 (rad/s)')
axes[0, 1].set_title('角速度变化')
axes[0, 1].legend()

axes[1, 0].axhline(y=0, color='gray', linestyle='--', alpha=0.5)
axes[1, 0].set_xlabel('步数')
axes[1, 0].set_ylabel('扭矩')
axes[1, 0].set_title('控制输入')
axes[1, 0].legend()

axes[1, 1].set_xlabel('步数')
axes[1, 1].set_ylabel('累积奖励')
axes[1, 1].set_title('累积奖励')
axes[1, 1].legend()

for ax in axes.flat:
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---

## 4. 环境对比总结

In [None]:
# Side-by-side comparison table of the classic-control environments
from gymnasium import spaces

# (environment id, short description, difficulty, recommended algorithms)
envs_info = [
    ("CartPole-v1", "倒立摆", "简单", "DQN, A2C"),
    ("MountainCar-v0", "爬山车", "中等(探索)", "需要好的探索"),
    ("Acrobot-v1", "双摆", "中等", "DQN, PPO"),
    ("Pendulum-v1", "单摆", "中等(连续)", "DDPG, SAC, TD3"),
]

print("\n" + "=" * 90)
print("经典控制环境对比")
print("=" * 90)
print(f"{'环境ID':<25} {'描述':<10} {'状态维度':<10} {'动作空间':<15} {'难度':<15}")
print("-" * 90)

for env_id, desc, difficulty, _ in envs_info:
    # Only a failure to construct the environment means "not installed".
    # Catching everything would also hide KeyboardInterrupt and real bugs.
    try:
        env = gym.make(env_id)
    except (gym.error.Error, ImportError):
        print(f"{env_id:<25} 未安装")
        continue

    try:
        obs_dim = env.observation_space.shape[0]
        if isinstance(env.action_space, spaces.Discrete):
            act_info = f"Discrete({env.action_space.n})"
        else:
            act_info = f"Box({env.action_space.shape[0]})"
    finally:
        # Always release the environment, even if inspecting its spaces fails.
        env.close()
    print(f"{env_id:<25} {desc:<10} {obs_dim:<10} {act_info:<15} {difficulty:<15}")

print("\n推荐算法:")
for env_id, _, _, algos in envs_info:
    print(f"  {env_id}: {algos}")

---

## 5. 练习

### 练习 1: 实现更好的 CartPole PD 控制器

In [None]:
def optimized_pd_policy(obs):
    """
    TODO: tune the PD gains so that the mean reward exceeds 450.

    Hints:
    - Angle control is the primary objective.
    - Position control is secondary; it keeps the cart inside the track bounds.
    - Consider adding an angular-velocity damping term.

    Returns 1 (push right) when the control signal is strictly positive,
    otherwise 0 (push left).
    """
    cart_pos, cart_vel, pole_angle, pole_rate = obs

    # TODO: tune these gains
    Kp_theta, Kd_theta = 50.0, 10.0
    Kp_x, Kd_x = 0.5, 1.0

    u = Kp_theta * pole_angle + Kd_theta * pole_rate + Kp_x * cart_pos + Kd_x * cart_vel
    return int(u > 0)

# Score the exercise controller over 50 episodes and show the target.
rewards = evaluate_policy("CartPole-v1", optimized_pd_policy, n_episodes=50)
mean_reward, std_reward = np.mean(rewards), np.std(rewards)
print(f"你的 PD 控制器: {mean_reward:.1f} ± {std_reward:.1f}")
print("目标: > 450")

### 练习 2: 分析 Pendulum 的最优奖励

In [None]:
# Exercise 2: bounds on the Pendulum reward

"""
Pendulum 奖励函数: r = -(θ² + 0.1·θ̇² + 0.001·u²)

问题:
1. 单步最优奖励是多少? (θ=0, θ̇=0, u=0)
2. 单步最差奖励是多少? (θ=π, θ̇=8, u=2)
3. 200 步回合的理论最优累积奖励是多少?
"""

# TODO: compute
# Best case: upright, at rest, zero torque.
best_single_reward = 0
# Worst case: every term of the reward at its largest magnitude.
theta_max, theta_dot_max, u_max = np.pi, 8, 2
worst_single_reward = -(theta_max**2 + 0.1 * theta_dot_max**2 + 0.001 * u_max**2)
# An episode has 200 steps, each worth at most the best single-step reward.
theoretical_best_episode = 200 * best_single_reward

for report_line in (
    f"单步最优奖励: {best_single_reward}",
    f"单步最差奖励: {worst_single_reward:.2f}",
    f"理论最优累积奖励: {theoretical_best_episode}",
):
    print(report_line)

---

## 总结

本节我们分析了四个经典控制环境（其中 Acrobot 仅在对比表中简要提及）:

1. **CartPole**: 学习了欠驱动系统控制和 PD 控制器设计
2. **MountainCar**: 理解了稀疏奖励问题和能量利用策略
3. **Acrobot**: 认识了更复杂的欠驱动系统（仅列出状态维度与动作空间，未展开分析）
4. **Pendulum**: 入门连续控制和能量成形控制

### 下一步

- 使用 Q-Learning 解决离散动作环境
- 使用 DQN 提升性能
- 使用 DDPG/SAC 解决连续控制问题