In [1]:
import random; random.seed(90)
import operator
import itertools
import numpy as np
import matplotlib.pyplot as plt

from time import sleep
from tqdm.notebook import tqdm
from IPython.display import clear_output

import gym
from frozenlake import FrozenLakeEnv

## Explore The Environment

In [2]:
# Build the environment used by every later cell: the "8x8" map with
# is_slippery switched on.
env = FrozenLakeEnv(
    map_name="8x8",
    is_slippery=True,
)

In [3]:
# Print a caption, then draw the environment's current state.
print('Environment Display:')
env.render()

Environment Display:

[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


## Define Helper Functions

In [4]:
def create_random_policy(env):
    """Return a uniform random policy for ``env``.

    The result maps every state index in ``range(env.observation_space.n)``
    to its own dict, which assigns each action index in
    ``range(env.action_space.n)`` the probability ``1 / env.action_space.n``.
    """
    return {
        state: {
            action: 1 / env.action_space.n
            for action in range(env.action_space.n)
        }
        for state in range(env.observation_space.n)
    }

In [5]:
def create_Q_dictionary(env, policy):
    """Build a zero-initialised action-value table.

    One entry is created per key of ``policy``; each entry is a fresh dict that
    maps every action index in ``range(env.action_space.n)`` to ``0.0``.
    """
    return {
        state: dict.fromkeys(range(env.action_space.n), 0.0)
        for state in policy
    }

In [6]:
def run_game(env, policy, display=True, delay=1):
    """Play one episode in ``env`` following the stochastic ``policy``.

    Parameters
    ----------
    env : environment exposing ``reset()``, ``render()``, the current state as
        ``env.s`` and ``step(action)`` returning
        ``(state, reward, finished, info)``.
    policy : dict mapping each state to a dict ``{action: weight}``. The
        weights do not need to sum to 1; they are normalised by their sum when
        sampling.
    display : bool, when true the board is rendered before every step and once
        more after the episode ends.
    delay : seconds to pause after each rendered frame (only used when
        ``display`` is true).

    Returns
    -------
    list of ``[state, action, reward]`` lists, one per timestep, in the order
    they were played.

    Raises
    ------
    ValueError
        If the policy gives no action a positive weight in a visited state.
    """
    env.reset()
    episode = []
    finished = False

    while not finished:
        state = env.s
        if display:
            clear_output(True)
            env.render()
            sleep(delay)

        action = _sample_action(policy[state])
        # The state returned by step() is not needed: env.s is re-read at the
        # top of the next iteration.
        _, reward, finished, _ = env.step(action)
        episode.append([state, action, reward])

    if display:
        clear_output(True)
        env.render()
        sleep(delay)

    return episode


def _sample_action(action_weights):
    """Draw one action from ``action_weights`` (``{action: weight}``) by inverse-CDF sampling."""
    threshold = random.uniform(0, sum(action_weights.values()))
    cumulative = 0
    fallback = None
    for action, weight in action_weights.items():
        cumulative += weight
        if threshold < cumulative:
            return action
        if weight > 0:
            fallback = action
    # random.uniform may return its upper end-point, and the running sum can
    # differ from sum() by a rounding error, so the loop can finish without a
    # hit. Fall back to the last action that carries positive weight instead of
    # leaving the action undefined (or silently reusing the previous step's).
    if fallback is None:
        raise ValueError('policy assigns no positive probability to any action')
    return fallback

In [7]:
def evaluate_policy(env, policy, num_episodes=100, display=True):
    """Estimate the win rate of ``policy`` by playing ``num_episodes`` episodes.

    An episode counts as a win when the reward of its final timestep equals 1.

    Parameters
    ----------
    env, policy : passed through unchanged to ``run_game``.
    num_episodes : positive number of episodes to play.
    display : forwarded to ``run_game``; pass ``False`` to evaluate without
        rendering (and without the per-frame pause).

    Returns
    -------
    float, the fraction of episodes won (``wins / num_episodes``).

    Raises
    ------
    ValueError
        If ``num_episodes`` is not positive (the win rate would be undefined).
    """
    if num_episodes <= 0:
        raise ValueError('num_episodes must be a positive integer')

    print('Now, the agent is playing...')
    wins = 0
    for _ in range(num_episodes):
        # Last timestep is [state, action, reward]; its reward decides the game.
        final_reward = run_game(env, policy, display=display)[-1][-1]
        if final_reward == 1:
            wins += 1

    return wins / num_episodes

## Apply Monte Carlo Method

In [8]:
def monte_carlo_epsilon_soft_first_visit(env, policy=None, num_episodes=100, epsilon=0.01, gamma=1.0):
    """On-policy first-visit Monte Carlo control with an epsilon-soft policy.

    Parameters
    ----------
    env : environment passed to ``run_game`` (and to the helper constructors).
    policy : optional starting policy ``{state: {action: probability}}``. When
        missing or empty, a uniform random policy is created. A supplied policy
        is updated in place and is also the object returned.
    num_episodes : number of episodes to sample.
    epsilon : exploration mass; after an update the greedy action has
        probability ``1 - epsilon + epsilon / n_actions`` and every other
        action ``epsilon / n_actions``.
    gamma : discount factor applied to future rewards (``1.0`` = undiscounted).

    Returns
    -------
    dict, the improved epsilon-soft policy.
    """
    if not policy:
        policy = create_random_policy(env)  # start from the uniform random policy
    Q = create_Q_dictionary(env, policy)  # action-value estimates, all 0.0 initially
    # (state, action) -> [sum of first-visit returns, number of returns]; a
    # running total gives the same average as storing every return.
    returns = {}

    print('The agent is learning...')
    for _ in tqdm(range(num_episodes)):
        # Each timestep is [state, action, reward].
        episode = run_game(env=env, policy=policy, display=False)

        # Earliest timestep at which each (state, action) pair occurs, so the
        # first-visit test below is a dictionary lookup instead of a rescan.
        first_visit = {}
        for t, (state_t, action_t, _reward) in enumerate(episode):
            first_visit.setdefault((state_t, action_t), t)

        # Walk the episode backwards so the return G accumulates from the
        # terminal reward towards the first timestep.
        G = 0
        for t in reversed(range(len(episode))):
            state_t, action_t, reward_t = episode[t]
            state_action = (state_t, action_t)
            G = gamma * G + reward_t

            if first_visit[state_action] != t:
                continue  # the pair occurred earlier in this episode

            stats = returns.setdefault(state_action, [0.0, 0])
            stats[0] += G
            stats[1] += 1
            Q[state_t][action_t] = stats[0] / stats[1]

            # Greedy action for state_t, ties broken uniformly at random.
            best_value = max(Q[state_t].values())
            greedy_actions = [a for a, value in Q[state_t].items() if value == best_value]
            A_star = random.choice(greedy_actions)

            # Epsilon-soft improvement: spread epsilon evenly over all |A(s)|
            # actions and give the remaining 1 - epsilon to the greedy one.
            exploration = epsilon / len(policy[state_t])
            for action in policy[state_t]:
                if action == A_star:
                    policy[state_t][action] = 1 - epsilon + exploration
                else:
                    policy[state_t][action] = exploration

    return policy

In [9]:
# Training budget: number of episodes sampled by the Monte Carlo control loop.
num_episodes = 100_000
optimal_policy = monte_carlo_epsilon_soft_first_visit(
    env,
    num_episodes=num_episodes,
)

The agent is learning...


A Jupyter Widget




## Time To Play!

In [10]:
# Play a few rendered episodes with the learned policy and print the fraction
# of them that were won. Note that num_episodes is rebound here: it now holds
# the evaluation count, not the training budget.
num_episodes = 10
print('Policy Score: ', evaluate_policy(env, optimal_policy, num_episodes=num_episodes))

  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m
Policy Score:  1.0


---