In [8]:
import gym
import collections


## Gym environment id; passed to gym.make for both the exploration and the test environment.
ENV_NAME = "FrozenLake-v0"
## Discount factor applied to the value of the next state in q_iteration.
GAMMA = 0.9
## Number of greedy evaluation episodes averaged after every training iteration.
TEST_EPISODES = 20

In [9]:
## Tabular model built from experience and shared by the functions in this notebook:
##   values   : (state, action)            -> current Q-value estimate
##   transits : (state, action)            -> Counter of observed new_state occurrences
##   rewards  : (state, action, new_state) -> most recently observed reward
## All three default to zero / an empty Counter for keys that were never seen.
values = collections.defaultdict(float)
transits = collections.defaultdict(collections.Counter)
rewards = collections.defaultdict(float)

## Environment used for random exploration; `state` is whatever its first reset returns.
env = gym.make(ENV_NAME)
state = env.reset()

In [10]:
def play_n_random_steps(count):
    """Collect experience by taking ``count`` randomly sampled actions.

    The module-level ``env`` is reset first, then stepped ``count`` times with
    actions drawn from ``env.action_space.sample()``. For every step:

    * ``rewards[(state, action, new_state)]`` is overwritten with the reward
      just observed, and
    * ``transits[(state, action)][new_state]`` is incremented by one.

    Whenever a step reports the episode as done, ``env`` is reset and
    exploration continues from the state returned by that reset.

    Args:
        count: number of environment steps to take.
    """
    current = env.reset()
    for _step in range(count):
        chosen = env.action_space.sample()
        landed, gained, finished, _info = env.step(chosen)

        # Update the sampled model: last reward seen and transition frequency.
        rewards[(current, chosen, landed)] = gained
        transits[(current, chosen)][landed] += 1

        if finished:
            current = env.reset()
        else:
            current = landed

In [11]:
def select_action(state):
    """Return the greedy action for ``state`` under the current Q-table.

    Looks up ``values[(state, action)]`` for every action index in
    ``range(env.action_space.n)`` and returns the index with the largest
    value. Ties go to the lowest action index. Because ``values`` is a
    defaultdict, unseen (state, action) pairs are read as 0.0 and get
    inserted by the lookup.

    Args:
        state: the state whose best action is wanted.

    Returns:
        The best action index, or None when the action space is empty.
    """
    def q_of(action):
        return values[(state, action)]

    # max() keeps the first maximal element, matching a strict "<" scan.
    return max(range(env.action_space.n), key=q_of, default=None)

In [12]:
def play_episode(env):
    """Run one episode with the greedy policy and return its total reward.

    The given ``env`` is reset, then stepped with ``select_action`` on the
    latest state until a step reports the episode as done. Transitions seen
    here are not written to ``rewards`` or ``transits``.

    Args:
        env: environment to play in; it needs ``reset()`` and a ``step(action)``
            returning a 4-tuple ``(new_state, reward, is_done, info)``.

    Returns:
        Sum of the rewards received during the episode, as a float.
    """
    episode_return = 0.0
    observation = env.reset()
    finished = False
    while not finished:
        observation, step_reward, finished, _info = env.step(select_action(observation))
        episode_return += step_reward
    return episode_return

In [13]:
## This is the learning algorithm:
## the Q-learning counterpart of the value_iteration function.
def q_iteration():
    """Run one sweep of Q-value iteration over every (state, action) pair.

    For each pair the new value is the expectation, over the target states
    recorded in ``transits``, of the Bellman backup:

        Q(s, a) = sum_{s'} p(s' | s, a) * (r(s, a, s') + GAMMA * max_{a'} Q(s', a'))

    where ``p(s' | s, a)`` is estimated as ``count / total`` from the observed
    transition counts and ``r`` comes from ``rewards``. Both the reward and
    the discounted next-state value are weighted by the transition
    probability; adding the reward unweighted would count it once per
    observed target state and overestimate Q under stochastic transitions.

    ``values`` is updated in place, so pairs later in the sweep already see
    the values written earlier in the same sweep. Pairs with no recorded
    transitions are set to 0.0.
    """
    ## for all the possible states
    for state in range(env.observation_space.n):
        ## for all the possible actions
        for action in range(env.action_space.n):
            ## observed target-state counts for this (state, action) pair
            target_counts = transits[(state, action)]
            ## total number of observed transitions for the pair
            total = sum(target_counts.values())
            action_value = 0.0
            ## an empty Counter skips the loop, so total == 0 never divides
            for tgt_state, count in target_counts.items():
                probability = count / total
                reward = rewards[(state, action, tgt_state)]
                ## values[(tgt_state, best_action)] is max_{a'} Q(s', a')
                best_action = select_action(tgt_state)
                action_value += probability * (
                    reward + GAMMA * values[(tgt_state, best_action)]
                )
            values[(state, action)] = action_value

In [14]:
## Separate environment for evaluation, so greedy test episodes do not touch
## the exploration environment `env` stepped by play_n_random_steps.
test_env = gym.make(ENV_NAME)
# writer = SummaryWriter(comment="-q-iteration")

iter_no = 0
best_reward = 0.0
## Training loop. Each iteration gathers 100 random transitions, runs one
## q_iteration sweep, then evaluates the greedy policy. There is no iteration
## cap: the loop only ends once the average test reward exceeds 0.80.
while True:
    iter_no += 1
    play_n_random_steps(100)
    q_iteration()

    ## Average total reward over TEST_EPISODES greedy episodes.
    reward = 0.0
    for _ in range(TEST_EPISODES):
        reward += play_episode(test_env)
    reward /= TEST_EPISODES
    # writer.add_scalar("reward", reward, iter_no)
    ## Report only when the average beats the best seen so far.
    if reward > best_reward:
        print("Best reward updated %.3f -> %.3f" % (best_reward, reward))
        best_reward = reward
    ## Stop condition: strictly more than 80% average reward.
    if reward > 0.80:
        print("Solved in %d iterations!" % iter_no)
        break
# writer.close()

Best reward updated 0.000 -> 0.150
Best reward updated 0.150 -> 0.200
Best reward updated 0.200 -> 0.400
Best reward updated 0.400 -> 0.450
Best reward updated 0.450 -> 0.600
Best reward updated 0.600 -> 0.650
Best reward updated 0.650 -> 0.800
Best reward updated 0.800 -> 0.900
Solved in 143 iterations!
