In [1]:
import gym
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Build the FrozenLake environment; swap in the commented line for the 8x8 variant.
env = gym.make('FrozenLake-v0')
# env = gym.make('FrozenLake8x8-v0')

# Human-readable names for the discrete action ids: 0-left 1-down 2-right 3-up
actions_dict = dict(enumerate(('left', 'down', 'right', 'up')))

# Keep handles on both spaces; `actions.n` is what the Q-value helpers iterate over.
observations, actions = env.observation_space, env.action_space
print('possible observations: ', observations)
print('possible actions: ', actions)

[2018-08-27 20:55:11,641] Making new env: FrozenLake-v0


possible observations:  Discrete(16)
possible actions:  Discrete(4)


In [3]:
# Tabular action-value function keyed by (state, action); unseen pairs read as 0.0.
Q = defaultdict(float)

# Fix the random streams so a fresh "Restart & Run All" reproduces the same run.
# np.random drives the exploration noise and tie-breaking in act()/optimal_act();
# env.seed() seeds the environment's own generator (classic gym API, matching the
# 4-tuple env.step() used by the loops in this notebook).
SEED = 0
np.random.seed(SEED)
env.seed(SEED)

# q-learning hyper parameter
gamma = 0.95  # Discounting factor
alpha = 0.8  # learning rate: how far Q[s, a] moves toward the TD target per update
epsilon = 0.1  # epsilon-greedy rate; only referenced by the disabled branch in act()

n_episode = 10000  # number of training episodes
MAX_STEP = 1000  # hard cap on environment steps per episode

In [4]:
def update_Q(s, r, a, s_next, done):
    """Apply one tabular Q-learning update to the global table ``Q`` in place.

    Args:
        s: state in which the action was taken.
        r: reward received for the transition.
        a: action that was taken in ``s``.
        s_next: state reached after the action.
        done: truthy when the transition ended the episode.

    Reads the module-level ``Q``, ``actions``, ``alpha`` and ``gamma``.
    """
    best_next = max(Q[s_next, next_a] for next_a in range(actions.n))
    # (1.0 - done) zeroes the bootstrap term when the episode has ended.
    td_target = r + gamma * best_next * (1.0 - done)
    Q[s, a] += alpha * (td_target - Q[s, a])

def act(state, episode):
    """Choose a training action for ``state`` using noise-perturbed Q-values.

    Every action's Q-value gets an independent standard-normal draw scaled by
    ``1 / (episode + 1)``, so exploration fades as training progresses. The
    action with the largest perturbed value is returned; exact ties are broken
    uniformly at random. (The epsilon-greedy alternative based on
    ``env.action_space.sample()`` is intentionally not used here.)

    Args:
        state: current environment state.
        episode: zero-based index of the current training episode.

    Returns:
        The selected action id, as returned by ``np.random.choice``.
    """
    noise_scale = 1. / (episode + 1)
    noisy_q = []
    for candidate in range(actions.n):
        noisy_q.append(Q[state, candidate] + np.random.randn() * noise_scale)
    best = max(noisy_q)
    # Several actions may share the same maximum perturbed value.
    winners = [candidate for candidate, value in enumerate(noisy_q) if value == best]
    return np.random.choice(winners)

def optimal_act(state):
    """Return the greedy action for ``state`` under the current global ``Q``.

    No exploration noise is added. When several actions share the maximum
    Q-value, one of them is drawn uniformly with ``np.random.choice``.
    """
    q_row = [Q[state, candidate] for candidate in range(actions.n)]
    top = max(q_row)
    # Collect every action whose value equals the maximum, then break the tie.
    greedy = [candidate for candidate, value in enumerate(q_row) if value == top]
    return np.random.choice(greedy)


In [5]:
# Number of episodes per averaging window when summarising rewards
# (referenced by the commented-out progress report in the training cell).
n_avg = 100

In [6]:
# Train with tabular Q-learning. The table is recreated here so that re-running
# this cell starts from scratch instead of continuing earlier training.
Q = defaultdict(float)
reward_list = []  # total reward collected in each training episode
for episode in range(n_episode):
    observation = env.reset()
    total_reward = 0
    # Cap each episode at MAX_STEP interactions in case `done` is never signalled.
    for step in range(MAX_STEP):
#         env.render()
        # Noisy-greedy action; the noise scale shrinks as `episode` grows.
        action = act(observation, episode)
        observation_next, reward, done, info = env.step(action)
        # print(observation_next, reward, done, info)
        total_reward += reward
        # Learn from this transition immediately, before `observation` is overwritten.
        update_Q(observation, reward, action, observation_next, done)
        observation = observation_next
        if done:
            # print('-'*10+'end'+'-'*10)
            break
    reward_list.append(total_reward)
    # Optional progress report (disabled): average reward over the last n_avg episodes.
#     if episode%n_avg == 0:
#         print('Avg reward for {:d}-{:d} episode: {:f}'.format(episode-n_avg, episode, np.average(reward_list[episode-n_avg:episode])))
            

In [None]:
# Show the per-episode training reward and overlay an n_avg-episode moving
# average so the learning trend is readable at a glance.
fig, ax = plt.subplots()
ax.plot(reward_list, alpha=0.3, label='episode reward')
if len(reward_list) >= n_avg:
    # mode='valid' only emits windows fully inside the data, so the first
    # averaged point corresponds to episode index n_avg - 1.
    smoothed = np.convolve(reward_list, np.ones(n_avg) / n_avg, mode='valid')
    ax.plot(np.arange(n_avg - 1, len(reward_list)), smoothed,
            label='{:d}-episode moving average'.format(n_avg))
ax.set(title='Q-learning training reward', xlabel='episode', ylabel='total reward')
ax.legend();

In [10]:
# Evaluate the greedy policy read off the learned Q-table: actions come from
# optimal_act() (no exploration noise) and Q is not updated.
test_episode = 1000
test_reward_list = []  # total reward collected in each evaluation episode
for episode in range(test_episode):
    # One reset per episode is enough; no extra reset is needed before the
    # loop or after `done`, because the next iteration resets again.
    observation = env.reset()
    test_reward = 0
    for step in range(MAX_STEP):
        action = optimal_act(observation)
        observation, reward, done, info = env.step(action)
        test_reward += reward
        if done:
            break
    test_reward_list.append(test_reward)

print('Avg test reward for {:d} episode: {:f}'.format(test_episode, np.average(test_reward_list)))

Avg test reward for 1000 episode: 0.752000
