In [1]:
import gym
import numpy as np

### understand the environment

In [2]:
env= gym.make('FrozenLake-v1')
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [3]:
print(f'total states {env.nS}')
print(f'total actions {env.nA}')

total states 16
total actions 4


In [4]:
print('win threshold:', env.spec.reward_threshold)  

win threshold: 0.7


In [5]:
print(env.action_space) 
print(env.observation_space)

Discrete(4)
Discrete(16)


In [6]:
#transition model
#probability, next_state, reward, is_terminated
state=5
action=1
env.P[state][action]

[(1.0, 5, 0, True)]

In [7]:
V=np.zeros(env.nS)
state=env.reset()
print(f'state {state}')
V.shape

state 0


(16,)

In [8]:
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [9]:
    # - 0: LEFT
    # - 1: DOWN
    # - 2: RIGHT
    # - 3: UP
    
    #     `is_slippery`: True/False. If True will move in intended direction with
    # probability of 1/3 else will move in either perpendicular direction with
    # equal probability of 1/3 in both directions.
    #     For example, if action is left and is_slippery is True, then:
    #     - P(move left)=1/3
    #     - P(move up)=1/3
    #     - P(move down)=1/3

In [10]:
action_mapping = {
    3: '^',  # UP
    2: '>',  # RIGHT
    1: 'v',  # DOWN
    0: '<',  # LEFT
}
action_mapping = {
    3: '\u2191',  # UP
    2: '\u2192',  # RIGHT
    1: '\u2193',  # DOWN
    0: '\u2190',  # LEFT
} 

In [11]:
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


### Let's play an episode from keyboard input

In [12]:
state=env.reset() 
rewards=0
step=0
while True:
    step+=1
    print('step=',step)
    env.render()
    try:
        action=int(input('give action (0=L, 1=D, 2=R, 3=U)=') )
    except:
        continue
    next_state,reward,done,info=env.step(action)
    print(f's={state} a={action} s\'={next_state}')
    rewards+=reward 
    if done:
        print('--------ended-------------')
        break
        
env.render()
print('rewards=',rewards)

step= 1

[41mS[0mFFF
FHFH
FFFH
HFFG


give action (0=L, 1=D, 2=R, 3=U)= 1


s=0 a=1 s'=0
step= 2
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG


give action (0=L, 1=D, 2=R, 3=U)= 1


s=0 a=1 s'=1
step= 3
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG


give action (0=L, 1=D, 2=R, 3=U)= 1


s=0 a=1 s'=5
--------ended-------------
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
rewards= 0.0


In [18]:
best_policy=[
    [0,0,0,0],
    [1,1,1,1],
    [2,2,2,2],
    [3,3,3,3]
]

In [22]:
best_policy=[
    [0,3,3,3],
    [0,0,0,0],
    [3,1,0,0],
    [0,2,1,0]
]

In [23]:
def play_episodes(policy, episodes=1000):
    rewards=0
    wins=0
    for i in range(episodes):
        state=env.reset()
        done=False
        while not done:
            r=state//4
            c=state%4
            action=policy[r][c]
            next_state,reward,done,info=env.step(action)
            rewards+=reward
            state=next_state
            if done and reward==1.0:
                wins+=1
    return wins,rewards

In [28]:
episodes=1000
wins, rewards=play_episodes(best_policy, episodes=episodes)
print(f'total play {episodes} total wins {wins} total rewards {rewards}')
print(f'success rate {(wins/episodes)*100}%')

total play 1000 total wins 771 total rewards 771.0
success rate 77.10000000000001%
