In [186]:
import numpy as np
import gymnasium as gym
import random
import time
from IPython.display import clear_output  #You can use IPython.display.clear_output to clear the output of a cell

In [187]:
# render_mode="ansi": render() returns a string (str) or StringIO.StringIO
# containing a terminal-style text representation for each time step. The text
# can include newlines and ANSI escape sequences (e.g. for colors).
env = gym.make(
    "FrozenLake-v1",
    render_mode="ansi",
)

# The Frozen Lake
<div style="text-align: center;">
    <img src="./frozenLake.png" alt="Frozen Lake" style="max-width: 35%;" />
</div>


Creating our Q-table: 

In [188]:
# Table dimensions come straight from the environment's discrete spaces.
state_size = env.observation_space.n
action_size = env.action_space.n

# One row per state, one column per action; every Q-value starts at zero.
q_table = np.zeros(shape=(state_size, action_size))
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [189]:
# --- Training schedule ---
num_episodes = 10_000          # total number of episodes the agent plays during training
max_steps_per_episode = 100    # step cap within one episode; an episode ends on HOLE, GOAL, or steps = 100

# --- Q-learning update parameters ---
learning_rate = 0.15
discount_rate = 0.99

# --- Epsilon-greedy policy ---
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_rate = max_exploration_rate   # epsilon starts at its maximum
exploration_decay_rate = 0.001

# exploitation <-   0 < EPSILON < 1    -> exploration

## Training : 


In [190]:
# Train the Q-table with epsilon-greedy Q-learning.
# rewards_all_episodes collects the total reward of each episode, in order.
rewards_all_episodes = []
for episode in range(num_episodes):
    # reset() returns (observation, info); only the starting state is needed.
    state, _ = env.reset()
    done = False
    rewards_current_episode = 0  # reward accumulated during this episode

    for step in range(max_steps_per_episode):
        # Pick an action: exploration or exploitation?
        # A uniform draw in [0, 1] is compared with the current epsilon.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            # Exploitation: take the action with the highest Q-value for this state.
            action = np.argmax(q_table[state, :])
        else:
            # Exploration: sample one of the available actions at random.
            action = env.action_space.sample()

        # Apply the action and observe the outcome.
        new_state, reward, terminated, truncated, info = env.step(action)
        # The episode is over when the environment reports termination OR
        # truncation; stepping further without reset() is not allowed.
        done = terminated or truncated

        # Update Q(s, a): blend the old estimate with the new target
        # reward + discount_rate * max_a' Q(s', a').
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Reduce epsilon with exponential decay over the episode index, so the
    # agent explores less and less as training progresses.
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate)*np.exp(-exploration_decay_rate*episode)
    rewards_all_episodes.append(rewards_current_episode)


In [191]:
# Calculate and print the average reward per thousand episodes.
# The rewards are sliced into consecutive chunks of 1000 episodes; a final
# shorter chunk (when the episode count is not a multiple of 1000) is averaged
# over its own length instead of being divided by a hard-coded 1000.
episodes_per_chunk = 1000
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_chunk]
    for start in range(0, len(rewards_array), episodes_per_chunk)
]
count = 0

print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    count += len(r)  # index of the last episode covered by this chunk
    print(count, ": ", str(r.mean()))

********Average reward per thousand episodes********

1000 :  0.036000000000000025
2000 :  0.20200000000000015
3000 :  0.3930000000000003
4000 :  0.5510000000000004
5000 :  0.6300000000000004
6000 :  0.6610000000000005
7000 :  0.6720000000000005
8000 :  0.6690000000000005
9000 :  0.6770000000000005
10000 :  0.7070000000000005


In [192]:
# Show the learned Q-table under a banner (banner line, then the table).
print("\n\n********Q-table********\n", q_table, sep="\n")



********Q-table********

[[0.52449898 0.4897745  0.49169598 0.48700615]
 [0.25963281 0.38987051 0.31448314 0.47556911]
 [0.37706937 0.41045103 0.41021191 0.44208053]
 [0.26070333 0.28273258 0.25408185 0.43023636]
 [0.5475035  0.45283134 0.40483908 0.47418671]
 [0.         0.         0.         0.        ]
 [0.14390421 0.12891361 0.23353356 0.12831921]
 [0.         0.         0.         0.        ]
 [0.46092634 0.38903496 0.50168196 0.58510379]
 [0.49757424 0.62860682 0.50329825 0.41879714]
 [0.55226451 0.34915357 0.25115489 0.29077742]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.41382297 0.57192419 0.7587762  0.48156323]
 [0.75520883 0.86257971 0.77209303 0.76351852]
 [0.         0.         0.         0.        ]]


#### Now we can watch the agent play by taking the best action:
Recall from the introduction to Frozen Lake that the agent does not always move in the direction it chooses: because the ice is slippery, choosing to go right, for example, may result in slipping up instead. Keep this in mind as you watch the agent play, because you may see the chosen action shown as right while the agent actually takes a step up. The slippery ice is the reason for this.

In [194]:
# Watch the agent play three episodes, always taking the action with the
# highest Q-value for the current state according to the Q-table.

for episode in range(3):
    # Initialize new episode params; reset() returns (observation, info).
    state, _ = env.reset()
    done = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(5)
    for step in range(max_steps_per_episode):
        # Show current state of environment on screen
        clear_output(wait=True)
        print(env.render())
        time.sleep(0.3)
        # Choose action with highest Q-value for current state
        action = np.argmax(q_table[state,:])
        new_state, reward, terminated, truncated, info = env.step(action)
        # The episode ends on termination (hole or goal) or on truncation
        # (time limit reported by the environment).
        done = terminated or truncated
        if done:
            clear_output(wait=True)
            print(env.render())
            if reward == 1:
                print("****You reached the goal!****")
                time.sleep(3)
            elif terminated:
                print("****You fell through a hole!****")
                time.sleep(3)
                clear_output(wait=True)
            else:
                # Truncated without reaching a hole or the goal.
                print("****You ran out of steps!****")
                time.sleep(3)
                clear_output(wait=True)
            break

        # Set new state
        state = new_state
    else:
        # The step budget was used up without the environment ending the episode.
        print("****You ran out of steps!****")
        time.sleep(3)
        clear_output(wait=True)
env.close()

  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG

