In [1]:
import torch
import gym
import numpy as np

In [2]:
from tensorboardX import SummaryWriter

In [3]:
import random

In [4]:
## Distinguish Q-value iteration from Q-learning:
## Q-value iteration: dynamic programming (DP), model-based
## Q-learning: temporal-difference (TD) learning, model-free

In [5]:
import collections

# Gym environment id handed to gym.make() when an Agent is constructed.
ENV_NAME = "FrozenLake-v0"

# Multiplier applied to the best next-state value in Agent.value_update.
GAMMA = 0.9

# Weight of the new target when it is blended into the stored value in
# Agent.value_update (the old value keeps a weight of 1 - ALPHA).
ALPHA = 0.2

# Intended number of evaluation episodes averaged per policy check.
TEST_EPISODES = 20

In [6]:
class Agent:
    """Tabular agent that keeps one value per (state, action) pair.

    Attributes:
        env: environment created with ``gym.make(ENV_NAME)``; used only for
            collecting training transitions.
        state: state the training environment is currently in.
        values: ``defaultdict(float)`` keyed by ``(state, action)``; pairs
            that were never written read as ``0.0``.
    """

    def __init__(self):
        """Create the training environment and an empty value table."""
        self.values = collections.defaultdict(float)
        self.env = gym.make(ENV_NAME)
        self.state = self.env.reset()

    def sample_env(self):
        """Take one randomly sampled action in the training environment.

        Returns:
            Tuple ``(old_state, action, reward, new_state)``. ``new_state``
            is the state reported by ``step`` even when the episode ended;
            in that case ``self.state`` is set to the result of ``reset()``.
        """
        action = self.env.action_space.sample()
        prev_state = self.state
        next_state, reward, finished, _ = self.env.step(action)
        if finished:
            # Episode is over: the next sample starts from a fresh episode.
            self.state = self.env.reset()
        else:
            self.state = next_state
        return (prev_state, action, reward, next_state)

    def best_value_and_action(self, state):
        """Return ``(value, action)`` for the highest-valued action in ``state``.

        Ties go to the lowest action index, because a later action replaces
        the current best only when its value is strictly larger. Returns
        ``(None, None)`` if the action space reports zero actions.
        """
        q_row = [self.values[(state, a)]
                 for a in range(self.env.action_space.n)]
        best_value, best_action = None, None
        for candidate, q_value in enumerate(q_row):
            if best_value is not None and not best_value < q_value:
                continue
            best_value, best_action = q_value, candidate
        return best_value, best_action

    def value_update(self, s, a, r, next_s):
        """Move the stored value of ``(s, a)`` towards a one-step target.

        The target is ``r + GAMMA * (best value available in next_s)``; the
        stored value becomes ``(1 - ALPHA) * old + ALPHA * target``.
        """
        next_best, _ = self.best_value_and_action(next_s)
        target = r + GAMMA * next_best
        key = (s, a)
        current = self.values[key]
        self.values[key] = current * (1-ALPHA) + target * ALPHA

    def play_episode(self, env):
        """Run one episode in ``env`` acting greedily on the value table.

        The value table is not modified by this method.

        Args:
            env: environment to play in; it is reset at the start.

        Returns:
            Sum of the rewards collected until ``step`` reports done.
        """
        episode_return = 0.0
        state = env.reset()
        finished = False
        while not finished:
            action = self.best_value_and_action(state)[1]
            state, reward, finished, _ = env.step(action)
            episode_return += reward
        return episode_return


In [11]:
# Experiment index; it selects the run sub-directory used just below.
num_exp = 1

# Summary writer whose log directory is ./runs/q_learning/exp<num_exp>.
writer = SummaryWriter('./runs/q_learning/exp%d' % num_exp)

In [12]:
# Training loop with a greedy evaluation after every single update.
#
# Each iteration performs one randomly sampled environment step and one
# value update, then scores the current greedy policy by averaging the
# return of `test_eposides` episodes played in a separate environment.

SEED = 12345

# Use the shared configuration constant rather than repeating the env id.
test_env = gym.make(ENV_NAME)
agent = Agent()

# random.seed() only affects Python's `random` module. Gym environments and
# action spaces draw from their own generators, so they have to be seeded
# explicitly for the run to be repeatable.
random.seed(SEED)
agent.env.seed(SEED)
agent.env.action_space.seed(SEED)
test_env.seed(SEED + 1)  # different stream for the evaluation environment

# NOTE: the misspelled names below are kept as-is in case other cells of the
# notebook read them.
num_iteraton = 6000
test_eposides = TEST_EPISODES
best_reward = 0.0

try:
    for iter_no in range(num_iteraton):
        # Policy improvement: one sampled transition, one value update.
        s, a, r, ns = agent.sample_env()
        agent.value_update(s, a, r, ns)

        # Policy evaluation: mean return of the greedy policy.
        total_reward = 0.0
        for i in range(test_eposides):
            gain = agent.play_episode(test_env)
            total_reward += gain
        expect_gain = total_reward / float(test_eposides)
        writer.add_scalar('reward', expect_gain, global_step=iter_no)

        if (1 + iter_no) % 100 == 0:
            print('%d, gain:%f' % (iter_no, expect_gain))

        if expect_gain > best_reward:
            print("iter:%d Best reward updated %.3f -> %.3f" % (iter_no, best_reward, expect_gain,))
            best_reward = expect_gain
finally:
    # Flush and close the event file even if the loop is interrupted.
    writer.close()

99, gain:0.000000
199, gain:0.000000
299, gain:0.000000
399, gain:0.000000
499, gain:0.000000
599, gain:0.000000
iter:625 Best reward updated 0.000 -> 0.050
iter:633 Best reward updated 0.050 -> 0.100
699, gain:0.000000
iter:743 Best reward updated 0.100 -> 0.150
iter:771 Best reward updated 0.150 -> 0.250
799, gain:0.150000
iter:827 Best reward updated 0.250 -> 0.300
899, gain:0.000000
999, gain:0.000000
1099, gain:0.050000
1199, gain:0.000000
1299, gain:0.050000
1399, gain:0.050000
1499, gain:0.150000
1599, gain:0.050000
1699, gain:0.150000
iter:1713 Best reward updated 0.300 -> 0.350
iter:1730 Best reward updated 0.350 -> 0.400
1799, gain:0.050000
1899, gain:0.250000
iter:1938 Best reward updated 0.400 -> 0.500
1999, gain:0.300000
2099, gain:0.250000
2199, gain:0.150000
2299, gain:0.050000
2399, gain:0.100000
2499, gain:0.150000
2599, gain:0.100000
2699, gain:0.300000
iter:2777 Best reward updated 0.500 -> 0.550
iter:2778 Best reward updated 0.550 -> 0.650
2799, gain:0.350000
2899, 