# Gymnasium (原 OpenAI Gym) 强化学习环境快速入门

---

## 核心思想 (Core Idea)

Gymnasium (原 OpenAI Gym) 提供了强化学习研究的**标准化接口**，定义了智能体与环境交互的统一 API。通过标准化的 observation-action-reward 循环，研究者可以在不同环境间无缝切换算法。

## 数学原理 (Mathematical Theory)

强化学习建模为**马尔可夫决策过程 (MDP)**:

$$MDP = (\mathcal{S}, \mathcal{A}, P, R, \gamma)$$

其中:
- $\mathcal{S}$: 状态空间 (State Space)
- $\mathcal{A}$: 动作空间 (Action Space)  
- $P(s'|s,a)$: 状态转移概率
- $R(s,a,s')$: 奖励函数
- $\gamma \in [0,1]$: 折扣因子

智能体目标是最大化**期望累积折扣奖励** $\mathbb{E}[G_t]$，其中回报 (Return) $G_t$ 定义为:

$$G_t = \sum_{k=0}^{\infty} \gamma^k R_{t+k+1}$$

---

## 1. 环境安装与导入

In [None]:
# 安装依赖 (如果尚未安装)
# !pip install gymnasium[classic-control] matplotlib numpy -q

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
from collections import deque
import time

# Configure the global Matplotlib style used by every figure in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12

# Report the installed Gymnasium version
print(f"Gymnasium 版本: {gym.__version__}")

## 2. 创建第一个环境: CartPole

CartPole 是经典的控制问题：通过左右移动小车来平衡竖立的杆子。

### 环境说明

| 属性 | 描述 |
|------|------|
| **状态** | [小车位置, 小车速度, 杆角度, 杆角速度] |
| **动作** | 0 (向左推) 或 1 (向右推) |
| **奖励** | 每步 +1 |
| **终止** | \|杆角度\| > 12° 或 \|小车位置\| > 2.4 |
| **成功** | 连续 100 回合的平均奖励 ≥ 475 |

In [None]:
# Create the CartPole environment
env = gym.make("CartPole-v1")

# Print the basic environment information
print("=" * 50)
print("环境信息")
print("=" * 50)
print(f"环境 ID: {env.spec.id}")
print(f"观测空间: {env.observation_space}")
print(f"动作空间: {env.action_space}")
print(f"最大步数: {env.spec.max_episode_steps}")

# Print the shape and the lower/upper bounds of the observation space
print("\n观测空间详情:")
print(f"  形状: {env.observation_space.shape}")
print(f"  下界: {env.observation_space.low}")
print(f"  上界: {env.observation_space.high}")

env.close()

## 3. 环境交互基础

### 核心 API

```python
observation, info = env.reset()           # 重置环境
observation, reward, terminated, truncated, info = env.step(action)  # 执行动作
```

- `terminated`: 任务完成（成功或失败）
- `truncated`: 回合因时间限制等原因被截断

In [None]:
# Demonstrate a single reset/step round trip on CartPole
env = gym.make("CartPole-v1")

# Reset the environment with a fixed seed
observation, info = env.reset(seed=42)
print("初始观测:")
print(f"  小车位置: {observation[0]:.4f}")
print(f"  小车速度: {observation[1]:.4f}")
# The pole angle is shown both in radians and converted to degrees
print(f"  杆角度:   {observation[2]:.4f} rad ({np.degrees(observation[2]):.2f}°)")
print(f"  杆角速度: {observation[3]:.4f}")

# Take a single action
action = 1  # push right
next_obs, reward, terminated, truncated, info = env.step(action)

print(f"\n执行动作: {action} (向右推)")
print(f"获得奖励: {reward}")
print(f"终止: {terminated}, 截断: {truncated}")
print(f"新观测: {next_obs}")

env.close()

## 4. 运行完整回合

让我们运行一个完整的回合，使用随机策略和简单的规则策略进行对比。

In [None]:
def run_episode(env, policy_fn, seed=None, verbose=False):
    """
    Roll out one full episode with the given policy.

    Parameters
    ----------
    env : gym.Env
        Environment instance; it is reset at the start of the rollout.
    policy_fn : callable
        Maps the current observation to an action.
    seed : int, optional
        Seed forwarded to ``env.reset``.
    verbose : bool
        When true, print the running totals every 100 steps.

    Returns
    -------
    total_reward : float
        Sum of the rewards collected during the episode.
    steps : int
        Number of environment steps taken.
    """
    current_obs, _ = env.reset(seed=seed)
    total_reward, steps = 0, 0
    finished = False

    while not finished:
        # Query the policy and advance the environment by one step
        current_obs, reward, terminated, truncated, _ = env.step(policy_fn(current_obs))
        total_reward += reward
        steps += 1

        if verbose and steps % 100 == 0:
            print(f"步数: {steps}, 累积奖励: {total_reward}")

        # The episode ends on either a terminal state or a truncation
        finished = terminated or truncated

    return total_reward, steps


# 定义策略
def random_policy(obs):
    """Random policy: draw action 0 or 1 from NumPy's global RNG; ``obs`` is ignored."""
    action = np.random.randint(low=0, high=2)
    return action

def angle_policy(obs):
    """Simple angle-based policy: push toward the side the pole is falling to."""
    # obs[2] is the pole angle; a positive angle selects action 1, otherwise action 0
    angle_is_positive = obs[2] > 0
    if angle_is_positive:
        return 1
    return 0

def pid_policy(obs, kp_theta=10.0, kd_theta=1.0, kp_x=0.1, kd_x=0.5):
    """
    PD control policy for CartPole.

    The name is kept for backward compatibility, but there is no integral
    term: the control signal is a weighted sum of the pole angle, the pole
    angular velocity, the cart position and the cart velocity.

    Parameters
    ----------
    obs : sequence of 4 floats
        Observation unpacked as ``(x, x_dot, theta, theta_dot)``.
    kp_theta : float
        Proportional gain on the pole angle.
    kd_theta : float
        Derivative gain on the pole angle (applied to the angular velocity).
    kp_x : float
        Proportional gain on the cart position.
    kd_x : float
        Derivative gain on the cart position (applied to the cart velocity).

    Returns
    -------
    int
        1 when the control signal is positive, otherwise 0.
    """
    x, x_dot, theta, theta_dot = obs
    # PD law: the default gains reproduce the original hard-coded controller
    u = kp_theta * theta + kd_theta * theta_dot + kp_x * x + kd_x * x_dot
    return 1 if u > 0 else 0

In [None]:
# Evaluate the different policies on CartPole
env = gym.make("CartPole-v1")

# Display name -> policy function
policies = {
    "随机策略": random_policy,
    "角度策略": angle_policy,
    "PID策略": pid_policy
}

n_episodes = 20
# Display name -> list of per-episode total rewards
results = {}

print("=" * 60)
print(f"策略评估 ({n_episodes} 回合)")
print("=" * 60)

for name, policy in policies.items():
    rewards = []
    for i in range(n_episodes):
        # The episode index is used as the seed that run_episode forwards to env.reset
        reward, steps = run_episode(env, policy, seed=i)
        rewards.append(reward)
    
    results[name] = rewards
    print(f"\n{name}:")
    print(f"  平均奖励: {np.mean(rewards):.1f} ± {np.std(rewards):.1f}")
    print(f"  最小/最大: {np.min(rewards):.0f} / {np.max(rewards):.0f}")

env.close()

In [None]:
# Visualize the policy comparison: reward distribution (left) and mean reward (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
policy_names = list(results.keys())

# Box plot of the per-episode rewards
ax1 = axes[0]
data = [results[name] for name in policy_names]
bp = ax1.boxplot(data, patch_artist=True)
# Tick labels are set separately because the boxplot `labels` keyword is
# deprecated in Matplotlib 3.9 (renamed to `tick_labels`); this form works
# on both older and newer releases.
ax1.set_xticklabels(policy_names)
colors = ['#ff7f0e', '#2ca02c', '#1f77b4']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
ax1.set_ylabel('回合奖励')
ax1.set_title('策略性能分布')
ax1.axhline(y=475, color='r', linestyle='--', label='成功阈值 (475)')
ax1.legend()

# Bar chart of the mean reward with standard-deviation error bars
ax2 = axes[1]
means = [np.mean(results[name]) for name in policy_names]
stds = [np.std(results[name]) for name in policy_names]
x = np.arange(len(results))
bars = ax2.bar(x, means, yerr=stds, capsize=5, color=colors, alpha=0.7)
ax2.set_xticks(x)
ax2.set_xticklabels(policy_names)
ax2.set_ylabel('平均奖励')
ax2.set_title('策略平均性能')
ax2.axhline(y=475, color='r', linestyle='--', label='成功阈值')
# Without this call the threshold line's label is never displayed
ax2.legend()

plt.tight_layout()
plt.show()

## 5. 探索更多环境

### 5.1 MountainCar - 爬山车

**挑战**: 小车引擎不够强，无法直接爬上山顶，需要利用来回摆动积累动量。

$$\text{速度更新}: v_{t+1} = v_t + 0.001 \cdot (a - 1) - 0.0025 \cdot \cos(3x_t)$$
$$\text{位置更新}: x_{t+1} = x_t + v_{t+1}$$

其中 $a \in \{0, 1, 2\}$ 为离散动作编号，$(a - 1)$ 将其映射为推力方向 $\{-1, 0, +1\}$。

In [None]:
# Print the observation/action spaces and the task description of MountainCar
env = gym.make("MountainCar-v0")

print("=" * 50)
print("MountainCar-v0 环境信息")
print("=" * 50)
print(f"观测空间: {env.observation_space}")
# Index 0 of the bounds is printed as the position range, index 1 as the velocity range
print(f"  位置范围: [{env.observation_space.low[0]:.2f}, {env.observation_space.high[0]:.2f}]")
print(f"  速度范围: [{env.observation_space.low[1]:.3f}, {env.observation_space.high[1]:.3f}]")
print(f"动作空间: {env.action_space}")
print(f"  0: 向左加速, 1: 不加速, 2: 向右加速")
print(f"目标: 到达 x >= 0.5")
print(f"奖励: 每步 -1，到达目标 0")

env.close()

In [None]:
# Visualize the MountainCar terrain
fig, ax = plt.subplots(figsize=(12, 5))

# Draw the terrain profile: height as a function of position
x = np.linspace(-1.2, 0.6, 200)
y = np.sin(3 * x) * 0.45 + 0.55

ax.plot(x, y, 'b-', linewidth=3, label='地形')
ax.fill_between(x, 0, y, alpha=0.3, color='green')

# Mark the key positions (start and goal)
ax.axvline(x=-0.5, color='red', linestyle='--', alpha=0.7, label='起点 (x=-0.5)')
ax.axvline(x=0.5, color='gold', linestyle='--', linewidth=2, label='目标 (x=0.5)')

# Draw the car at the start position, using the same height formula as the terrain
car_x = -0.5
car_y = np.sin(3 * car_x) * 0.45 + 0.55
# The marker is raised by 0.05 above the terrain curve
ax.plot(car_x, car_y + 0.05, 'ro', markersize=15, label='小车')

ax.set_xlabel('位置', fontsize=12)
ax.set_ylabel('高度', fontsize=12)
ax.set_title('MountainCar 环境地形', fontsize=14)
ax.legend(loc='upper left')
ax.set_xlim(-1.3, 0.7)
ax.set_ylim(0, 1.2)

plt.tight_layout()
plt.show()

In [None]:
# MountainCar policy test: create the environment used by the evaluation below
env = gym.make("MountainCar-v0")

def momentum_policy(obs):
    """Momentum policy: accelerate in the direction of the current velocity."""
    _, velocity = obs
    # 2 = accelerate right when moving right; 0 = accelerate left otherwise
    return 2 if velocity > 0 else 0

def random_policy_mc(obs):
    """Random policy: draw action 0, 1 or 2 from NumPy's global RNG; ``obs`` is ignored."""
    n_actions = 3
    return np.random.randint(n_actions)

# Run the momentum policy for 10 episodes and report per-episode results
print("测试动量策略:")
rewards = []
for i in range(10):
    # The episode index is used as the seed that run_episode forwards to env.reset
    reward, steps = run_episode(env, momentum_policy, seed=i)
    rewards.append(reward)
    print(f"  回合 {i+1}: 奖励={reward:.0f}, 步数={steps}")

print(f"\n平均奖励: {np.mean(rewards):.1f} ± {np.std(rewards):.1f}")

env.close()

### 5.2 Pendulum - 连续动作空间

Pendulum 是**连续控制**任务，动作是连续的扭矩值。

$$\text{动力学}: \ddot{\theta} = -\frac{3g}{2l}\sin(\theta + \pi) + \frac{3}{ml^2}u$$

$$\text{奖励}: r = -(\theta^2 + 0.1\dot{\theta}^2 + 0.001u^2)$$

In [None]:
# Print the observation/action spaces of Pendulum and sample one action
env = gym.make("Pendulum-v1")

print("=" * 50)
print("Pendulum-v1 环境信息")
print("=" * 50)
print(f"观测空间: {env.observation_space}")
print(f"  观测: [cos(θ), sin(θ), θ̇]")
print(f"动作空间: {env.action_space}")
print(f"  扭矩范围: [{env.action_space.low[0]:.1f}, {env.action_space.high[0]:.1f}]")
print(f"\n这是一个连续动作空间环境!")

# Example of sampling an action from the action space
print(f"\n随机采样的动作: {env.action_space.sample()}")

env.close()

In [None]:
# Pendulum PD control: create the environment used by the rollout below
env = gym.make("Pendulum-v1")

def pd_controller(obs):
    """PD controller policy: map a (cos θ, sin θ, θ̇) observation to a clipped torque."""
    cos_theta, sin_theta, theta_dot = obs

    # Recover the angle from its cosine/sine components
    angle = np.arctan2(sin_theta, cos_theta)

    # PD law with proportional gain 10 and derivative gain 2
    p_gain = 10.0
    d_gain = 2.0
    command = -p_gain * angle - d_gain * theta_dot

    # Return a one-element array limited to [-2, 2]
    return np.clip([command], -2.0, 2.0)

# Run one episode and record the observations, actions and rewards
obs, _ = env.reset(seed=42)
observations = [obs]  # includes the initial observation, so it ends up one longer than actions
actions = []
rewards_list = []

# NOTE(review): terminated/truncated are not checked; the loop always runs
# exactly 200 steps — presumably chosen to match the environment's time
# limit; confirm.
for _ in range(200):
    action = pd_controller(obs)
    obs, reward, terminated, truncated, _ = env.step(action)
    observations.append(obs)
    # pd_controller returns a one-element array; store the scalar torque
    actions.append(action[0])
    rewards_list.append(reward)

print(f"回合总奖励: {sum(rewards_list):.1f}")

env.close()

In [None]:
# Visualize the Pendulum control run: angle, angular velocity, torque and reward
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Stack the recorded observations into an array with columns (cos θ, sin θ, θ̇)
observations = np.array(observations)
steps = np.arange(len(observations))

# Angle, recovered from the sine/cosine columns and shown in degrees
ax1 = axes[0, 0]
theta = np.arctan2(observations[:, 1], observations[:, 0])
ax1.plot(steps, np.degrees(theta), 'b-', linewidth=2)
ax1.axhline(y=0, color='r', linestyle='--', alpha=0.5)
ax1.set_xlabel('步数')
ax1.set_ylabel('角度 (度)')
ax1.set_title('摆角变化')

# Angular velocity
ax2 = axes[0, 1]
ax2.plot(steps, observations[:, 2], 'g-', linewidth=2)
ax2.axhline(y=0, color='r', linestyle='--', alpha=0.5)
ax2.set_xlabel('步数')
ax2.set_ylabel('角速度 (rad/s)')
ax2.set_title('角速度变化')

# Action (torque)
ax3 = axes[1, 0]
ax3.plot(range(len(actions)), actions, 'r-', linewidth=2)
ax3.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
ax3.set_xlabel('步数')
ax3.set_ylabel('扭矩')
ax3.set_title('控制输入')

# Reward: instantaneous value plus its cumulative mean
ax4 = axes[1, 1]
ax4.plot(range(len(rewards_list)), rewards_list, 'purple', linewidth=2)
# cumsum / (index + 1) is the mean of all rewards so far, i.e. a cumulative
# average rather than a windowed moving average, so it is labelled as such.
ax4.plot(range(len(rewards_list)), np.cumsum(rewards_list) / (np.arange(len(rewards_list)) + 1), 
         'orange', linewidth=2, linestyle='--', label='累积平均')
ax4.set_xlabel('步数')
ax4.set_ylabel('奖励')
ax4.set_title('即时奖励')
ax4.legend()

plt.tight_layout()
plt.show()

## 6. 空间类型详解

Gymnasium 定义了多种空间类型来表示观测和动作空间。

In [None]:
from gymnasium import spaces

# Tour of the Gymnasium space types: build one of each and print samples
print("=" * 60)
print("Gymnasium 空间类型")
print("=" * 60)

# 1. Discrete - discrete space
discrete = spaces.Discrete(5)
print(f"\n1. Discrete(5): 离散空间 {{0, 1, 2, 3, 4}}")
print(f"   采样: {[discrete.sample() for _ in range(5)]}")

# 2. Box - continuous space
box = spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)
print(f"\n2. Box([-1,1]^3): 连续空间")
print(f"   形状: {box.shape}")
print(f"   采样: {box.sample()}")

# 3. MultiDiscrete - multi-dimensional discrete space
multi_discrete = spaces.MultiDiscrete([3, 2, 4])
print(f"\n3. MultiDiscrete([3,2,4]): 多维离散空间")
print(f"   每维范围: [0,3), [0,2), [0,4)")
print(f"   采样: {multi_discrete.sample()}")

# 4. MultiBinary - multi-binary space
multi_binary = spaces.MultiBinary(4)
print(f"\n4. MultiBinary(4): 多二值空间")
print(f"   采样: {multi_binary.sample()}")

# 5. Dict - dictionary space combining named sub-spaces
dict_space = spaces.Dict({
    "position": spaces.Box(-10, 10, shape=(2,)),
    "velocity": spaces.Box(-1, 1, shape=(2,)),
    "flag": spaces.Discrete(2)
})
print(f"\n5. Dict 空间:")
sample = dict_space.sample()
# Print each sampled entry by key
for key, value in sample.items():
    print(f"   {key}: {value}")

## 7. 环境包装器 (Wrappers)

包装器允许我们在不修改原始环境的情况下，对观测、动作和奖励进行预处理。

In [None]:
from gymnasium.wrappers import (
    RecordEpisodeStatistics,
    TimeLimit,
    ClipAction
)

# Create an environment and wrap it
env = gym.make("CartPole-v1")
env = RecordEpisodeStatistics(env)  # record episode statistics automatically

print("使用 RecordEpisodeStatistics 包装器:")
print("="*50)

obs, _ = env.reset()
done = False

# Play one episode with randomly sampled actions
while not done:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

# `info` is from the final step; the statistics are printed only if it has an 'episode' entry
if 'episode' in info:
    print(f"回合奖励: {info['episode']['r']}")
    print(f"回合长度: {info['episode']['l']}")
    print(f"回合时间: {info['episode']['t']:.3f}s")

env.close()

In [None]:
# Custom wrapper example: observation normalization
class NormalizeObservation(gym.ObservationWrapper):
    """
    Online observation-normalization wrapper.

    Maintains a running mean and a running sum of squared deviations using
    Welford's algorithm, and returns each observation centred by the current
    mean and divided by the current standard-deviation estimate.

    Parameters
    ----------
    env : gym.Env
        Environment to wrap; its ``observation_space.shape`` sizes the statistics.
    epsilon : float
        Small constant added to the variance before the square root so the
        division never hits zero.
    """
    def __init__(self, env, epsilon=1e-8):
        super().__init__(env)
        self.epsilon = epsilon
        # Running mean of all observations seen so far
        self.mean = np.zeros(env.observation_space.shape)
        # Despite its name, `var` holds Welford's M2 accumulator (the sum of
        # squared deviations from the mean); the variance is `var / count`.
        # It must start at zero: starting at one would add a spurious
        # 1 / count to every variance estimate.
        self.var = np.zeros(env.observation_space.shape)
        # Number of observations seen so far
        self.count = 0

    def observation(self, obs):
        """Update the running statistics with ``obs`` and return it normalized."""
        # Welford update: `delta` uses the old mean, the second factor the new mean
        self.count += 1
        delta = obs - self.mean
        self.mean += delta / self.count
        self.var += delta * (obs - self.mean)

        # Normalize; count >= 1 here because it was just incremented
        std = np.sqrt(self.var / self.count + self.epsilon)
        return (obs - self.mean) / std

# Try out the normalization wrapper on CartPole
env = gym.make("CartPole-v1")
env = NormalizeObservation(env)

print("归一化前后观测对比:")
print("="*50)

obs, _ = env.reset()
print(f"步骤 0 - 归一化观测: {obs}")

# Take 50 random steps, resetting whenever an episode ends
for i in range(50):
    obs, _, terminated, truncated, _ = env.step(env.action_space.sample())
    if terminated or truncated:
        obs, _ = env.reset()

print(f"步骤 50 - 归一化观测: {obs}")
print(f"\n估计的均值: {env.mean}")
# The wrapper's `var` accumulator is divided by the sample count before the square root
print(f"估计的标准差: {np.sqrt(env.var / env.count)}")

env.close()

## 8. 向量化环境

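向量化环境把多个环境实例打包成一个批量接口，一次 `step` 即可推进所有实例，从而简化批量采样并提高采样效率。注意: `SyncVectorEnv` 在同一进程内依次执行各个环境；若需要真正的并行执行，可使用基于多进程的 `AsyncVectorEnv`。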

In [None]:
from gymnasium.vector import SyncVectorEnv

# Create 4 environments managed as one vectorized environment
n_envs = 4

def make_env():
    """Factory that builds one CartPole environment."""
    return gym.make("CartPole-v1")

# SyncVectorEnv receives a list of factories, one per sub-environment
vec_env = SyncVectorEnv([make_env for _ in range(n_envs)])

print(f"向量化环境信息:")
print(f"  环境数量: {vec_env.num_envs}")
print(f"  单环境观测空间: {vec_env.single_observation_space}")
print(f"  批量观测空间: {vec_env.observation_space}")

# Reset all sub-environments at once
obs, info = vec_env.reset()
print(f"\n批量观测形状: {obs.shape}")

# Step all sub-environments at once
actions = vec_env.action_space.sample()  # sample n_envs actions
print(f"批量动作: {actions}")

obs, rewards, terminateds, truncateds, infos = vec_env.step(actions)
print(f"批量奖励: {rewards}")
print(f"批量终止: {terminateds}")

vec_env.close()

In [None]:
# 向量化环境采样效率对比
import time

# Total number of environment steps to collect in each setup
n_steps = 1000

# Single environment
# time.perf_counter() is used for the measurements because it is a monotonic,
# high-resolution clock intended for timing short intervals, unlike time.time().
env = gym.make("CartPole-v1")
start = time.perf_counter()
obs, _ = env.reset()
for _ in range(n_steps):
    obs, _, terminated, truncated, _ = env.step(env.action_space.sample())
    if terminated or truncated:
        obs, _ = env.reset()
single_time = time.perf_counter() - start
env.close()

# Vectorized environment
n_envs = 4
vec_env = SyncVectorEnv([lambda: gym.make("CartPole-v1") for _ in range(n_envs)])
start = time.perf_counter()
obs, _ = vec_env.reset()
# Each vectorized step advances n_envs environments, so fewer iterations are needed
for _ in range(n_steps // n_envs):
    obs, _, _, _, _ = vec_env.step(vec_env.action_space.sample())
vec_time = time.perf_counter() - start
vec_env.close()

print(f"采样 {n_steps} 步:")
print(f"  单环境: {single_time:.3f}s")
print(f"  向量化 ({n_envs} 并行): {vec_time:.3f}s")
print(f"  加速比: {single_time / vec_time:.2f}x")

## 9. 练习题

### 练习 1: 实现更好的 CartPole 策略

尝试改进 PID 控制器的参数，使平均奖励达到 400 以上。

In [None]:
# Exercise 1: improve the PID controller
def improved_pid_policy(obs):
    """
    TODO: tune the PID parameters to get better performance.

    Hints:
    - Kp_theta: proportional gain on the angle
    - Kd_theta: derivative gain on the angle
    - Kp_x: proportional gain on the position
    - Kd_x: derivative gain on the position

    Returns 1 when the control signal is positive, otherwise 0.
    """
    x, x_dot, theta, theta_dot = obs
    
    # TODO: tune these parameters
    Kp_theta = 10.0
    Kd_theta = 1.0
    Kp_x = 0.1
    Kd_x = 0.5
    
    # Weighted sum of angle, angular velocity, position and velocity
    u = Kp_theta * theta + Kd_theta * theta_dot + Kp_x * x + Kd_x * x_dot
    return 1 if u > 0 else 0

# Test your improvement: 20 episodes, using the episode index as the seed
env = gym.make("CartPole-v1")
# run_episode returns (total_reward, steps); only the total reward is kept
rewards = [run_episode(env, improved_pid_policy, seed=i)[0] for i in range(20)]
print(f"平均奖励: {np.mean(rewards):.1f} ± {np.std(rewards):.1f}")
env.close()

### 练习 2: 实现 MountainCar 能量策略

小车的总能量 = 动能 + 势能。设计一个策略，通过调节能量来到达山顶。

In [None]:
# Exercise 2: energy-based policy
def energy_policy(obs):
    """
    TODO: implement an energy-based policy.

    Hints:
    - Potential energy is proportional to height: PE ∝ sin(3*x)
    - Kinetic energy: KE = 0.5 * m * v^2
    - Goal: accumulate enough energy to reach x = 0.5

    As provided, this is only the baseline that follows the velocity direction.
    """
    position, velocity = obs
    
    # TODO: implement your policy
    # Baseline version: follow the direction of the velocity
    if velocity > 0:
        return 2  # right
    else:
        return 0  # left

# Test: 5 episodes, using the episode index as the seed
env = gym.make("MountainCar-v0")
# run_episode returns (total_reward, steps); only the total reward is kept
rewards = [run_episode(env, energy_policy, seed=i)[0] for i in range(5)]
print(f"平均奖励: {np.mean(rewards):.1f} (越接近 0 越好)")
env.close()

### 练习 3: 自定义奖励包装器

实现一个奖励缩放包装器，将奖励除以一个常数。

In [None]:
# Exercise 3: reward-scaling wrapper
class RewardScaler(gym.RewardWrapper):
    """
    Reward-scaling wrapper (exercise).

    scaled reward = original reward / scale

    Parameters
    ----------
    env : gym.Env
        Environment to wrap.
    scale : float
        Non-zero divisor applied to every reward.

    Raises
    ------
    ValueError
        If ``scale`` is zero, which would otherwise fail on every step with
        a division by zero.
    """
    def __init__(self, env, scale=1.0):
        # Reject an unusable divisor up front instead of failing inside step()
        if scale == 0:
            raise ValueError("scale must be non-zero")
        super().__init__(env)
        self.scale = scale

    def reward(self, reward):
        """Return ``reward`` divided by the configured scale."""
        return reward / self.scale

# Test: wrap CartPole with a scale of 10 and inspect the reward of one step
env = gym.make("CartPole-v1")
env = RewardScaler(env, scale=10.0)

obs, _ = env.reset()
obs, reward, _, _, _ = env.step(0)
print(f"缩放后的奖励: {reward} (原始应为 1.0)")
env.close()

## 10. 总结

本教程涵盖了 Gymnasium 的核心概念:

1. **环境创建**: `gym.make(env_id)`
2. **基本交互**: `reset()`, `step()`, `render()`, `close()`
3. **空间类型**: Discrete, Box, MultiDiscrete, Dict 等
4. **包装器**: 观测/动作/奖励预处理
5. **向量化环境**: 并行采样提高效率

### 下一步

- 学习 Q-Learning 和 SARSA 算法
- 探索深度强化学习 (DQN, PPO)
- 尝试更复杂的环境 (Atari, MuJoCo)

In [None]:
# List the classic-control environments with their observation shape,
# action type and description
print("经典控制环境列表:")
print("="*50)

# (environment id, human-readable description)
classic_envs = [
    ("CartPole-v1", "倒立摆平衡"),
    ("MountainCar-v0", "爬山车 (离散)"),
    ("MountainCarContinuous-v0", "爬山车 (连续)"),
    ("Acrobot-v1", "双摆控制"),
    ("Pendulum-v1", "单摆控制 (连续)"),
]

for env_id, desc in classic_envs:
    # Only environment creation is guarded, and only against Gymnasium's own
    # errors and missing optional dependencies, so unrelated bugs and
    # KeyboardInterrupt are no longer silently reported as "not installed".
    try:
        env = gym.make(env_id)
    except (gym.error.Error, ImportError):
        print(f"  {env_id:30s} | 未安装")
        continue

    try:
        obs_dim = env.observation_space.shape
        act_type = "离散" if isinstance(env.action_space, spaces.Discrete) else "连续"
        # The description was previously defined but never shown
        print(f"  {env_id:30s} | 观测: {str(obs_dim):10s} | 动作: {act_type} | {desc}")
    finally:
        # Always release the environment, even if inspecting it fails
        env.close()