# 木棒台車(CartPole)

In [1]:
import gymnasium as gym

## 隨機行動

In [2]:
# Create the CartPole environment.
env = gym.make("CartPole-v1")

# Bookkeeping for the experiment.
no = 50                          # number of episodes still to be played
all_rewards, all_steps = [], []  # per-episode total reward / step count
total_rewards = 0                # reward accumulated in the current episode
total_steps = 0                  # steps taken in the current episode

# Start the first episode.
observation, info = env.reset()
while no > 0:   # keep stepping until all 50 episodes are finished
    # Pick a uniformly random action and apply it.
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    total_steps += 1
    total_rewards += reward

    # An episode ends on failure (terminated) or on the step cap (truncated).
    done = terminated or truncated
    if not done:
        continue

    # Episode finished: record its totals, then start a new one.
    all_rewards.append(total_rewards)
    all_steps.append(total_steps)
    total_rewards = 0
    total_steps = 0
    no -= 1
    observation, info = env.reset()

env.close()

In [3]:
# Print one row per episode: index, total reward, and win/loss verdict.
# NOTE(review): the 200-step win threshold looks like the CartPole-v0 cap;
# the environment used here is CartPole-v1 -- confirm the intended criterion.
print('回合\t報酬\t結果')
for i, (rewards, steps) in enumerate(zip(all_rewards, all_steps)):
    if steps < 200:
        result = 'Loss'
    else:
        result = 'Win'
    print(f'{i}\t{rewards}\t{result}')

回合	報酬	結果
0	16.0	Loss
1	61.0	Loss
2	13.0	Loss
3	19.0	Loss
4	14.0	Loss
5	21.0	Loss
6	32.0	Loss
7	9.0	Loss
8	18.0	Loss
9	18.0	Loss
10	21.0	Loss
11	36.0	Loss
12	19.0	Loss
13	25.0	Loss
14	40.0	Loss
15	18.0	Loss
16	25.0	Loss
17	10.0	Loss
18	36.0	Loss
19	19.0	Loss
20	17.0	Loss
21	52.0	Loss
22	51.0	Loss
23	10.0	Loss
24	11.0	Loss
25	22.0	Loss
26	19.0	Loss
27	27.0	Loss
28	19.0	Loss
29	15.0	Loss
30	41.0	Loss
31	11.0	Loss
32	20.0	Loss
33	24.0	Loss
34	24.0	Loss
35	13.0	Loss
36	14.0	Loss
37	18.0	Loss
38	14.0	Loss
39	39.0	Loss
40	53.0	Loss
41	19.0	Loss
42	19.0	Loss
43	10.0	Loss
44	35.0	Loss
45	33.0	Loss
46	18.0	Loss
47	40.0	Loss
48	13.0	Loss
49	19.0	Loss


## 傳統解法

In [4]:
import math 

# Action codes passed to env.step(): which way the cart is pushed.
left = 0
right = 1

# Tilt threshold in degrees: once the pole leans at least this far to one
# side, the agent keeps pushing the cart toward that side.
max_angle = 8

In [5]:
class Agent:
    """Hand-written CartPole controller that looks only at the pole angle.

    Relies on the module-level constants ``left``, ``right`` and
    ``max_angle`` (degrees).
    """

    def __init__(self):
        """Start with ``left`` as the current action and ``right`` as the previous one."""
        self.direction = left
        self.last_direction = right

    def act(self, observation):
        """Choose the next action for a 4-element observation.

        Strategy:
        1. While the pole stays strictly inside +/- ``max_angle`` degrees,
           alternate between the two actions so the cart stays near the centre.
        2. If the pole leans ``max_angle`` degrees or more to the right,
           push right until it is back inside the band.
        3. Otherwise (leaning to the left), push left.

        Returns the chosen action and remembers it for the next call.
        """
        # Observation layout: cart position, cart velocity, pole angle,
        # pole angular velocity. Only the angle is used.
        _, _, pole_angle, _ = observation

        upper = math.radians(max_angle)
        lower = math.radians(-max_angle)

        if not (lower < pole_angle < upper):
            # Outside the band: chase the side the pole is falling toward.
            chosen = right if pole_angle >= upper else left
        else:
            # Inside the band: flip the previous action.
            chosen = (self.last_direction + 1) % 2

        self.direction = chosen
        self.last_direction = chosen
        return chosen

In [10]:
# Evaluate the rule-based Agent over 50 episodes.
#
# The environment is re-created here because the random-action cell above
# already called env.close(); stepping a closed environment is not something
# the Gymnasium API promises to support.
env = gym.make("CartPole-v1")
observation, info = env.reset()

all_rewards = []   # total reward of each finished episode
all_steps = []     # step count of each finished episode
total_rewards = 0  # reward accumulated in the current episode
total_steps = 0    # steps taken in the current episode
no = 50            # number of episodes still to be played

agent = Agent()
while no > 0:   # keep stepping until all 50 episodes are finished
    # Ask the agent for an action based on the latest observation.
    action = agent.act(observation)
    total_steps += 1

    # Advance the simulation by one step.
    observation, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    total_rewards += reward

    # Episode finished: start a new one and record the totals.
    if done:
        observation, info = env.reset()
        all_rewards.append(total_rewards)
        all_steps.append(total_steps)
        total_rewards = 0
        total_steps = 0
        no -= 1

env.close()

In [11]:
# Tabulate the heuristic agent's episodes: index, total reward, verdict.
# NOTE(review): 200 steps as the win criterion looks inherited from
# CartPole-v0; this notebook runs CartPole-v1 -- confirm it is intended.
print('回合\t報酬\t結果')
for i, (rewards, steps) in enumerate(zip(all_rewards, all_steps)):
    result = 'Loss' if steps < 200 else 'Win'
    print(f'{i}\t{rewards}\t{result}')

回合	報酬	結果
0	116.0	Loss
1	46.0	Loss
2	105.0	Loss
3	90.0	Loss
4	132.0	Loss
5	111.0	Loss
6	118.0	Loss
7	139.0	Loss
8	94.0	Loss
9	108.0	Loss
10	102.0	Loss
11	84.0	Loss
12	108.0	Loss
13	50.0	Loss
14	98.0	Loss
15	66.0	Loss
16	43.0	Loss
17	117.0	Loss
18	94.0	Loss
19	80.0	Loss
20	94.0	Loss
21	86.0	Loss
22	69.0	Loss
23	79.0	Loss
24	104.0	Loss
25	76.0	Loss
26	113.0	Loss
27	121.0	Loss
28	92.0	Loss
29	122.0	Loss
30	49.0	Loss
31	96.0	Loss
32	106.0	Loss
33	63.0	Loss
34	118.0	Loss
35	101.0	Loss
36	189.0	Loss
37	163.0	Loss
38	88.0	Loss
39	121.0	Loss
40	73.0	Loss
41	145.0	Loss
42	94.0	Loss
43	66.0	Loss
44	59.0	Loss
45	83.0	Loss
46	77.0	Loss
47	49.0	Loss
48	80.0	Loss
49	135.0	Loss


## 以下程式來自：[『From Scratch: AI Balancing Act in 50 Lines of Python』](https://towardsdatascience.com/from-scratch-ai-balancing-act-in-50-lines-of-python-7ea67ef717)

In [12]:
def play(env, policy, max_steps=5000):
    """Play a single episode with a fixed linear policy.

    At every step the action is 1 when ``np.dot(policy, observation)`` is
    positive and 0 otherwise. No learning happens here; the policy is only
    evaluated.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()`` -> ``(observation, info)`` and
        ``step(action)`` ->
        ``(observation, reward, terminated, truncated, info)``.
    policy : array-like
        Weights combined with the observation via ``np.dot``.
    max_steps : int, optional
        Upper bound on loop iterations (default 5000, the value that was
        previously hard-coded).

    Returns
    -------
    score : number
        Sum of the rewards collected during the episode.
    observations : list of list
        Every observation seen, starting with the one from ``reset()`` and
        including the final one when the episode ended within ``max_steps``.
    """
    observation, info = env.reset()

    done = False
    score = 0
    observations = []

    for _ in range(max_steps):
        # Record the state first so the terminal observation is kept too.
        # np.asarray lets plain sequences through as well as ndarrays.
        observations.append(np.asarray(observation).tolist())

        if done:  # the previous step ended the episode
            break

        # Linear policy: the sign of the weighted sum picks the action.
        outcome = np.dot(policy, observation)
        action = 1 if outcome > 0 else 0

        # Advance the environment by one step.
        observation, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward

    return score, observations

In [14]:
import numpy as np

# Random search, first attempt: try 10 random policies and keep the best.
#
# A fresh environment is created because the heuristic-agent cell above
# finished with env.close(); play() must not be handed a closed environment.
env = gym.make("CartPole-v1")

# Best result so far as (score, observations, policy).
# NOTE: this name shadows the built-in max() for the rest of the session;
# it is kept because a later cell reads the best policy from max[2].
max = (0, [], [])
for _ in range(10):
    policy = np.random.rand(1, 4)            # 4 random weights in [0, 1)
    score, observations = play(env, policy)  # play one episode

    if score > max[0]:  # keep the highest-scoring policy
        max = (score, observations, policy)

print('Max Score', max[0])

Max Score 236.0


In [16]:
# Final version of the random search: 100 candidate policies whose weights
# are centred on zero, i.e. drawn from [-0.5, 0.5).
# `max` holds the best (score, observations, policy) seen so far; the name
# shadows the built-in but is read by a later cell, so it is left unchanged.
max = (0, [], [])

for _ in range(100):
    policy = np.random.rand(1, 4) - 0.5
    score, observations = play(env=env, policy=policy)

    # Skip candidates that do not beat the current best.
    if score <= max[0]:
        continue
    max = (score, observations, policy)

print('Max Score', max[0])

Max Score 500.0


## 以最大分數的policy進行實驗，驗證最佳策略是否有效

In [17]:
# Pull the winning weights out of the (score, observations, policy) tuple
# and display them.
policy = max[-1]
policy

array([[ 0.06344476, -0.10146005,  0.33464762,  0.2047181 ]])

## 以最佳策略取代隨機policy，進行 10 回合驗證    

In [18]:
# Replay the selected policy for 10 independent episodes and print each score.
for _ in range(10):
    score, observations = play(env=env, policy=policy)
    print('Score: ', score)

Score:  500.0
Score:  112.0
Score:  500.0
Score:  500.0
Score:  154.0
Score:  500.0
Score:  500.0
Score:  500.0
Score:  500.0
Score:  500.0
