# 木棒台车(CartPole)

## 载入相关套件

In [2]:
import gym
from gym import envs




## 随机行动

In [3]:
# Configuration
no = 50        # number of episodes to play

# Create the CartPole environment
env = gym.make("CartPole-v1")

all_rewards = []  # total reward collected in each episode
all_steps = []    # number of steps taken in each episode

try:
    # `no` is only read here, so it still holds the configured episode
    # count after the loop instead of being counted down to zero.
    for _ in range(no):
        # Start a fresh episode
        observation = env.reset()
        total_rewards = 0
        total_steps = 0
        done = False

        while not done:
            # Pick a random action from the environment's action space
            action = env.action_space.sample()
            total_steps += 1

            # Advance the environment by one step and accumulate the reward
            observation, reward, done, info = env.step(action)
            total_rewards += reward

        # Episode finished: record its totals
        all_rewards.append(total_rewards)
        all_steps.append(total_steps)
finally:
    # Release the environment even if an episode raised part-way through
    env.close()

In [4]:
# Print one tab-separated row per episode: index, total reward, outcome
print('回合\t报酬\t结果')
for i, (rewards, steps) in enumerate(zip(all_rewards, all_steps)):
    # An episode is reported as a win when it lasted at least 200 steps.
    # NOTE(review): later cells in this notebook reach scores of 500.0 on the
    # same environment id, so confirm that 200 is the intended threshold.
    if steps >= 200:
        result = 'Win'
    else:
        result = 'Loss'
    print(f'{i}\t{rewards}\t{result}')

回合	报酬	结果
0	26.0	Loss
1	71.0	Loss
2	13.0	Loss
3	11.0	Loss
4	13.0	Loss
5	19.0	Loss
6	12.0	Loss
7	11.0	Loss
8	31.0	Loss
9	19.0	Loss
10	44.0	Loss
11	13.0	Loss
12	17.0	Loss
13	29.0	Loss
14	29.0	Loss
15	40.0	Loss
16	27.0	Loss
17	25.0	Loss
18	12.0	Loss
19	13.0	Loss
20	20.0	Loss
21	14.0	Loss
22	43.0	Loss
23	14.0	Loss
24	15.0	Loss
25	23.0	Loss
26	20.0	Loss
27	12.0	Loss
28	12.0	Loss
29	46.0	Loss
30	13.0	Loss
31	18.0	Loss
32	19.0	Loss
33	15.0	Loss
34	27.0	Loss
35	8.0	Loss
36	47.0	Loss
37	20.0	Loss
38	42.0	Loss
39	11.0	Loss
40	47.0	Loss
41	14.0	Loss
42	29.0	Loss
43	23.0	Loss
44	27.0	Loss
45	10.0	Loss
46	28.0	Loss
47	20.0	Loss
48	10.0	Loss
49	12.0	Loss


## 传统解法

In [5]:
import math 

# Configuration
left = 0        # action value: move the cart to the left
right = 1       # action value: move the cart to the right
max_angle = 8   # tilt threshold in degrees; at or beyond it the cart follows the lean

In [6]:
class Agent:
    """Hand-written CartPole controller that decides from the pole angle only."""

    def __init__(self):
        # last_direction starts as `right`, so the first alternating move
        # computed in act() comes out as `left`.
        self.direction = left
        self.last_direction = right

    def act(self, observation):
        """Return the direction to push for the given observation.

        The observation is unpacked as (cart position, cart velocity,
        pole angle, pole velocity); only the pole angle is used.

        Strategy:
        1. While the pole stays strictly within +/- max_angle degrees,
           alternate between the two directions to stay near the center.
        2. When the pole leans right by max_angle degrees or more, push right.
        3. Otherwise (leaning left by max_angle degrees or more), push left.
        """
        _, _, pole_angle, _ = observation
        threshold = math.radians(max_angle)

        if pole_angle >= threshold:
            # Leaning right past the threshold
            chosen = right
        elif pole_angle > -threshold:
            # Inside the tolerated band: flip the previous direction
            chosen = (self.last_direction + 1) % 2
        else:
            # Leaning left past the threshold
            chosen = left

        self.direction = chosen
        self.last_direction = chosen
        return chosen

In [8]:
no = 50        # number of episodes to play

# Create the CartPole environment
env = gym.make("CartPole-v1")

all_rewards = []  # total reward collected in each episode
all_steps = []    # number of steps taken in each episode

# A single agent instance is used for every episode, so its remembered
# last direction carries over from one episode to the next.
agent = Agent()

try:
    # `no` is only read here, so it still holds the configured episode
    # count after the loop instead of being counted down to zero.
    for _ in range(no):
        # Start a fresh episode
        observation = env.reset()
        total_rewards = 0
        total_steps = 0
        done = False

        while not done:
            # Let the rule-based agent choose the action
            action = agent.act(observation)
            total_steps += 1

            # Advance the environment by one step and accumulate the reward
            observation, reward, done, info = env.step(action)
            total_rewards += reward

        # Episode finished: record its totals
        all_rewards.append(total_rewards)
        all_steps.append(total_steps)
finally:
    # Release the environment even if an episode raised part-way through
    env.close()

In [9]:
# Print one tab-separated row per episode: index, total reward, outcome
print('回合\t报酬\t结果')
outcome_labels = ('Loss', 'Win')  # indexed by "lasted at least 200 steps"
for i, (rewards, steps) in enumerate(zip(all_rewards, all_steps)):
    result = outcome_labels[steps >= 200]
    print(f'{i}\t{rewards}\t{result}')

回合	报酬	结果
0	70.0	Loss
1	66.0	Loss
2	78.0	Loss
3	136.0	Loss
4	130.0	Loss
5	130.0	Loss
6	91.0	Loss
7	111.0	Loss
8	99.0	Loss
9	132.0	Loss
10	103.0	Loss
11	102.0	Loss
12	54.0	Loss
13	130.0	Loss
14	138.0	Loss
15	69.0	Loss
16	65.0	Loss
17	130.0	Loss
18	100.0	Loss
19	106.0	Loss
20	132.0	Loss
21	62.0	Loss
22	70.0	Loss
23	92.0	Loss
24	123.0	Loss
25	71.0	Loss
26	70.0	Loss
27	101.0	Loss
28	57.0	Loss
29	101.0	Loss
30	97.0	Loss
31	66.0	Loss
32	88.0	Loss
33	88.0	Loss
34	142.0	Loss
35	136.0	Loss
36	124.0	Loss
37	43.0	Loss
38	83.0	Loss
39	108.0	Loss
40	86.0	Loss
41	102.0	Loss
42	97.0	Loss
43	83.0	Loss
44	41.0	Loss
45	185.0	Loss
46	149.0	Loss
47	75.0	Loss
48	72.0	Loss
49	114.0	Loss
50	121.0	Loss
51	103.0	Loss
52	76.0	Loss
53	72.0	Loss
54	107.0	Loss
55	45.0	Loss
56	127.0	Loss
57	89.0	Loss
58	81.0	Loss
59	98.0	Loss
60	159.0	Loss
61	104.0	Loss
62	125.0	Loss
63	94.0	Loss
64	100.0	Loss
65	128.0	Loss
66	137.0	Loss
67	144.0	Loss
68	108.0	Loss
69	89.0	Loss
70	113.0	Loss
71	140.0	Loss
72	83.0	Loss
73	70.0	Loss
7

## 以下程式来自：[‘From Scratch: AI Balancing Act in 50 Lines of Python’](https://towardsdatascience.com/from-scratch-ai-balancing-act-in-50-lines-of-python-7ea67ef717)

In [3]:
import numpy as np

# Create the CartPole environment that the play() calls in the following cells run against
env = gym.make('CartPole-v1')

def play(env, policy, max_steps=5000):
    """Run one episode with a linear policy and return its score and trajectory.

    Args:
        env: Environment object. ``reset()`` must return an observation that
            supports ``.tolist()``, and ``step(action)`` must return a
            4-tuple ``(observation, reward, done, info)``.
        policy: Weights combined with the observation via ``np.dot``; the
            action is 1 when the result is positive and 0 otherwise.
        max_steps: Upper bound on loop iterations for the episode.
            Defaults to 5000, the previously hard-coded value.

    Returns:
        A ``(score, observations)`` tuple: the sum of the rewards received
        and the list of observations (as plain lists) seen during the episode.
    """
    observation = env.reset()

    done = False
    score = 0
    observations = []

    for _ in range(max_steps):
        # Record before the done check so the terminal observation is kept.
        # If the step budget runs out first, the last observation returned
        # by step() is not recorded.
        observations.append(observation.tolist())

        if done:  # episode already decided
            break

        # Linear policy: the sign of the weighted sum selects the action
        outcome = np.dot(policy, observation)
        action = 1 if outcome > 0 else 0

        # Advance the environment; the info element is not used
        observation, reward, done, _info = env.step(action)
        score += reward

    return score, observations

In [4]:
# Show what one randomly generated candidate looks like: a 1x4 array of random values
np.random.rand(1,4)

array([[0.4980173 , 0.67918521, 0.10102454, 0.29095686]])

In [5]:
# 训练 10 回合
max = (0, [], [])
for _ in range(10):
    policy = np.random.rand(1,4) # 产生4个随机变数 [0, 1)
    score, observations = play(env, policy) # 开始玩
    
    if score > max[0]: # 取最大分数
        max = (score, observations, policy)

print('Max Score', max[0])

Max Score 500.0


In [6]:
# 最终版本
max = (0, [], [])

for _ in range(100): # 训练 100 回合    
    policy = np.random.rand(1,4) - 0.5  # 改为 [-0.5, 0.5]
    score, observations = play(env, policy)
    
    if score > max[0]:  # 取最大分数
        max = (score, observations, policy)
        
print('Max Score', max[0])

Max Score 500.0


## 以最大分数的policy进行实验，验证最佳策略是否有效

In [11]:
# 取得最佳策略
policy = max[2]
policy

array([[ 0.05171018, -0.0624274 ,  0.23256838,  0.22154222]])

## 以最佳策略取代随机policy，进行 10 回合验证    

In [12]:
# Replay the selected policy for 10 episodes and print each episode's score
for _episode in range(10):
    episode_result = play(env, policy)
    score, observations = episode_result
    print('Score: ', score)

Score:  500.0
Score:  210.0
Score:  500.0
Score:  205.0
Score:  143.0
Score:  500.0
Score:  500.0
Score:  215.0
Score:  500.0
Score:  500.0
