## 倒立摆实验

#### 1. 只玩一局游戏

In [4]:
import gym

In [5]:
# Play one CartPole episode with uniformly random actions and print every
# transition so the (observation, reward, done, info) tuple can be inspected.
env = gym.make('CartPole-v0')
try:
    observation = env.reset()  # initial state of the episode
    print("初始观测：{}".format(observation))
    for i in range(200):
        env.render()  # render one frame; this call is what shows the game window
        action = env.action_space.sample()  # pick a random action
        print("{}: 动作 = {}".format(i, action))
        obs, reward, done, info = env.step(action)
        print("{}: 观测 = {}, 奖励 = {}, 结束标识 = {}, 其它信息 = {}".format(i, obs, reward, done, info))

        if done:
            break
finally:
    # Always release the render window, even if a step raises or the cell
    # is interrupted part-way through the episode.
    env.close()

初始观测：[-0.02574804 -0.0175332  -0.02261496  0.01928176]
0: 动作 = 0
0: 观测 = [-0.0260987  -0.21232365 -0.02222933  0.30474449], 奖励 = 1.0, 结束标识 = False, 其它信息 = {}
1: 动作 = 1
1: 观测 = [-0.03034517 -0.01689208 -0.01613444  0.00513468], 奖励 = 1.0, 结束标识 = False, 其它信息 = {}
2: 动作 = 0
2: 观测 = [-0.03068302 -0.21177897 -0.01603174  0.29268365], 奖励 = 1.0, 结束标识 = False, 其它信息 = {}
3: 动作 = 1
3: 观测 = [-0.0349186  -0.01643215 -0.01017807 -0.00501206], 奖励 = 1.0, 结束标识 = False, 其它信息 = {}
4: 动作 = 1
4: 观测 = [-0.03524724  0.17883428 -0.01027831 -0.30088883], 奖励 = 1.0, 结束标识 = False, 其它信息 = {}
5: 动作 = 0
5: 观测 = [-0.03167055 -0.01613968 -0.01629609 -0.01146511], 奖励 = 1.0, 结束标识 = False, 其它信息 = {}
6: 动作 = 0
6: 观测 = [-0.03199335 -0.21102418 -0.01652539  0.27603198], 奖励 = 1.0, 结束标识 = False, 其它信息 = {}
7: 动作 = 1
7: 观测 = [-0.03621383 -0.0156704  -0.01100475 -0.02181691], 奖励 = 1.0, 结束标识 = False, 其它信息 = {}
8: 动作 = 1
8: 观测 = [-0.03652724  0.17960762 -0.01144109 -0.31795155], 奖励 = 1.0, 结束标识 = False, 其它信息 = {}
9: 动作 = 0
9: 观测 = 

#### 2. 玩多局游戏

In [6]:
# Play several CartPole episodes with a uniformly random policy and print the
# undiscounted return of each one as a baseline for the trained agent.
episode_num = 20
env = gym.make("CartPole-v0")

try:
    for e in range(episode_num):
        obs = env.reset()
        # Scalar return of the current episode. Named in the singular so it
        # does not clash with the `episode_rewards` list used during training.
        episode_reward = 0
        done = False
        while not done:
            env.render()
            action = env.action_space.sample()  # random policy ignores `obs`
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
        print("第{}局得分: {}".format(e + 1, episode_reward))
finally:
    # Close the render window even if an episode is interrupted.
    env.close()

第1局得分: 11.0
第2局得分: 12.0
第3局得分: 12.0
第4局得分: 12.0
第5局得分: 24.0
第6局得分: 16.0
第7局得分: 25.0
第8局得分: 21.0
第9局得分: 18.0
第10局得分: 16.0
第11局得分: 16.0
第12局得分: 11.0
第13局得分: 32.0
第14局得分: 13.0
第15局得分: 15.0
第16局得分: 16.0
第17局得分: 15.0
第18局得分: 11.0
第19局得分: 24.0
第20局得分: 11.0


#### 3. 使用 DQN (FNN+Adam) 训练游戏

步骤 1：构建 FNN

In [63]:
import torch
import torch.nn as nn

# Fully connected Q-network: takes an observation vector of the environment
# and outputs one value per discrete action, through two 128-unit ReLU layers.
fnn = nn.Sequential(
    nn.Linear(in_features=env.observation_space.shape[0], out_features=128),
    nn.ReLU(),
    nn.Linear(in_features=128, out_features=128),
    nn.ReLU(),
    nn.Linear(in_features=128, out_features=env.action_space.n),
)

步骤 2：决策方法 ($\epsilon$-greedy 策略)

In [64]:
import random

def act(net, state, epsilon):
    """Select an action with an epsilon-greedy policy.

    Args:
        net: Q-network. It is called with a float32 tensor of shape
            (1, state_dim) and must return a (1, n_actions) tensor.
        state: current observation, array-like of numbers.
        epsilon: exploration threshold; a uniform draw in [0, 1) that is not
            greater than ``epsilon`` triggers a random action.

    Returns:
        int: index of the chosen action.
    """
    if random.random() > epsilon:  # exploit: take the action with the largest Q-value
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        # Action selection is pure inference: disable autograd so no graph is
        # built on every environment step. Call the module itself rather than
        # `.forward` so that any registered hooks still run.
        with torch.no_grad():
            q_value = net(state)  # shape (1, n_actions)
        action = q_value.max(1)[1].item()
    else:  # explore: pick a uniformly random action
        action = random.randrange(env.action_space.n)

    return action

步骤 3：$\epsilon$ 随时间衰减

In [65]:
import math

def calc_epsilon(t, epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=500):
    """Return the exploration rate after ``t`` steps of exponential decay.

    The value starts at ``epsilon_start`` for ``t == 0`` and approaches
    ``epsilon_final`` as ``t`` grows; ``epsilon_decay`` is the time constant
    of the exponential (larger means slower decay).
    """
    span = epsilon_start - epsilon_final
    decay_factor = math.exp(-1. * t / epsilon_decay)
    return epsilon_final + span * decay_factor

步骤 4：定义经验回放集

In [66]:
import numpy as np
from collections import deque

class ReplayBuffer(object):
    """Fixed-capacity FIFO store of transitions used for experience replay."""

    def __init__(self, capacity):
        # A bounded deque drops its oldest entry once `capacity` is reached.
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.buffer)

    def store(self, state, action, reward, next_state, done):
        """Append one transition.

        Both states get a leading axis of size 1 so that `sample` can join
        them into a batch with a plain concatenate.
        """
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            (state, action, reward, next_state, done) where the two state
            entries are arrays stacked along axis 0 and the remaining three
            are tuples of length `batch_size`.
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            np.concatenate(states),
            actions,
            rewards,
            np.concatenate(next_states),
            dones,
        )


replay_buffer = ReplayBuffer(capacity=1000)

步骤 5：训练网络

In [69]:
import torch.optim
# DQN training loop: interact with the environment using an epsilon-greedy
# policy, store every transition in the replay buffer, and fit the Q-network
# to one-step TD targets on random mini-batches.
optimizer = torch.optim.Adam(fnn.parameters())

t = 0  # number of gradient updates so far; drives the epsilon schedule
gamma = 0.99  # discount factor
batch_size = 32
episode_rewards = []  # return of every finished episode; used as the stop criterion


while True:

    # start a new episode
    state = env.reset()
    episode_reward = 0

    while True:
        epsilon = calc_epsilon(t)
        action = act(fnn, state, epsilon)
        next_state, reward, done, _ = env.step(action)
        # NOTE(review): `done` is stored as a terminal flag as-is. If the
        # environment also sets it when an episode is cut off by a step limit,
        # those transitions are treated as terminal too — confirm intended.
        replay_buffer.store(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward

        if len(replay_buffer) > batch_size:
            sample_state, sample_action, sample_reward, sample_next_state, \
                    sample_done = replay_buffer.sample(batch_size)
            sample_state = torch.tensor(sample_state, dtype=torch.float32)
            sample_action = torch.tensor(sample_action, dtype=torch.int64)
            sample_reward = torch.tensor(sample_reward, dtype=torch.float32)
            sample_next_state = torch.tensor(sample_next_state,
                    dtype=torch.float32)
            sample_done = torch.tensor(sample_done, dtype=torch.float32)

            # The bootstrapped target must be treated as a constant: computing
            # it under no_grad stops the loss from back-propagating through
            # the next-state Q-values as well as the prediction.
            with torch.no_grad():
                next_qs = fnn(sample_next_state)
                next_q = next_qs.max(1)[0]
                # (1 - done) removes the bootstrap term for terminal transitions
                expected_q = sample_reward + gamma * next_q * (1 - sample_done)

            # Q-value predicted for the action that was actually taken
            qs = fnn(sample_state)
            q = qs.gather(1, sample_action.unsqueeze(1)).squeeze(1)

            td_error = expected_q - q
            # MSE loss on the TD error
            loss = td_error.pow(2).mean()

            # improve the network from the loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            t += 1

        if done:  # episode finished
            episode_rewards.append(episode_reward)
            i_episode = len(episode_rewards)
            print ('第{}局收益 = {}'.format(i_episode, episode_reward))
            break

    # Stop once the mean return over the last 20 episodes exceeds 195.
    if len(episode_rewards) > 20 and np.mean(episode_rewards[-20:]) > 195:
        break  # training finished

第1局收益 = 12.0
第2局收益 = 33.0
第3局收益 = 18.0
第4局收益 = 33.0
第5局收益 = 60.0
第6局收益 = 22.0
第7局收益 = 15.0
第8局收益 = 26.0
第9局收益 = 37.0
第10局收益 = 66.0
第11局收益 = 38.0
第12局收益 = 127.0
第13局收益 = 74.0
第14局收益 = 200.0
第15局收益 = 66.0
第16局收益 = 60.0
第17局收益 = 95.0
第18局收益 = 200.0
第19局收益 = 200.0
第20局收益 = 200.0
第21局收益 = 131.0
第22局收益 = 200.0
第23局收益 = 200.0
第24局收益 = 200.0
第25局收益 = 200.0
第26局收益 = 200.0
第27局收益 = 200.0
第28局收益 = 200.0
第29局收益 = 200.0
第30局收益 = 200.0
第31局收益 = 200.0
第32局收益 = 200.0
第33局收益 = 200.0
第34局收益 = 200.0
第35局收益 = 200.0
第36局收益 = 200.0
第37局收益 = 200.0


步骤 6：利用训练好的游戏 AI 玩游戏 (贪婪策略，$\epsilon = 0$)

In [73]:
# Evaluate the trained Q-network: play episodes with epsilon = 0, so the
# action with the largest Q-value is taken, and print each episode's return.
n_episode = 20
try:
    for e in range(n_episode):
        observation = env.reset()
        episode_reward = 0
        done = False
        while not done:
            env.render()
            action = act(fnn, observation, 0)  # greedy policy
            observation, reward, done, _ = env.step(action)
            episode_reward += reward
        print ('第{}局收益 = {}'.format(e+1, episode_reward))
finally:
    # Close the render window even if evaluation is interrupted.
    env.close()

第1局收益 = 200.0
第2局收益 = 200.0
第3局收益 = 200.0
第4局收益 = 200.0
第5局收益 = 200.0
第6局收益 = 200.0
第7局收益 = 200.0
第8局收益 = 200.0
第9局收益 = 200.0
第10局收益 = 200.0
第11局收益 = 200.0
第12局收益 = 200.0
第13局收益 = 200.0
第14局收益 = 200.0
第15局收益 = 200.0
第16局收益 = 200.0
第17局收益 = 200.0
第18局收益 = 200.0
第19局收益 = 200.0
第20局收益 = 200.0
