In [None]:
!pip install gymnasium



In [None]:
import numpy as np
import gymnasium as gym

In [None]:
# Create the 4x4 slippery FrozenLake environment.
# render_mode='rgb_array' is chosen so that video can be recorded.
env = gym.make(
    'FrozenLake-v1',
    desc=None,
    map_name="4x4",
    is_slippery=True,
    render_mode='rgb_array',
)
# Sizes of the discrete observation (state) and action spaces
n_observations, n_actions = env.observation_space.n, env.action_space.n

In [None]:
# Report how many states and actions the environment exposes
for label, size in (('Number of States', n_observations),
                    ('Number of possible actions', n_actions)):
    print(label, size)

Number of States 16
Number of possible actions 4


In [None]:
# The Q-table starts at zero everywhere: one row per state, one column per action
Q_table = np.zeros(shape=(n_observations, n_actions), dtype=float)
print(Q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [None]:
np.shape(Q_table)  # (states, actions): 16 states with 4 actions each

(16, 4)

In [None]:
Q_table[9]  # Q-values of every action in state 9

array([0., 0., 0., 0.])

In [None]:
# --- Training schedule ---
n_episodes = 10_000      # number of episodes to run
steps_allowed = 100      # maximum number of steps (iterations) per episode

# --- Epsilon-greedy exploration ---
max_epsilon = 1          # maximum exploration rate
min_epsilon = 0.01       # minimum exploration probability
epsilon = 1              # exploration probability, initialised to 1
decay_rate = 0.001       # decay rate of the exponential decrease of exploration

# --- Q-learning update ---
gamma = 0.99             # discount factor
lr = 0.1                 # learning rate

In [None]:
# The total reward of each episode is collected in this list
rewards_per_episode = []

In [None]:
# Q-learning training loop: one pass of the outer loop per episode
for e in range(n_episodes):
    # Start the episode from the state returned by a fresh reset
    state = env.reset()[0]
    done = False

    # Sum of the rewards the agent gets from the environment this episode
    total_reward = 0

    for i in range(steps_allowed):
        # Epsilon-greedy strategy: draw a random number between 0 and 1;
        # below epsilon the agent explores with a random action,
        # otherwise it exploits the best action currently in the Q-table.
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_table[state, :])

        # The environment runs the chosen action and returns the next state,
        # a reward, and two separate end-of-episode flags.
        next_state, reward, terminated, truncated, info = env.step(action)

        # Q-learning update. After termination there is no future return,
        # so the bootstrap term is dropped explicitly instead of relying on
        # the terminal rows of the table staying at zero.
        future_value = 0.0 if terminated else np.max(Q_table[next_state, :])
        Q_table[state, action] = (1 - lr) * Q_table[state, action] \
            + lr * (reward + gamma * future_value)
        total_reward += reward

        state = next_state
        # Leave the step loop when the episode ended for either reason
        # (terminated or truncated); the next episode starts with a reset.
        done = terminated or truncated
        if done:
            break

    # Update the exploration probability with the exponential decay formula.
    # exploration_rate is kept as an alias of epsilon, as in the original cell.
    epsilon = exploration_rate = min_epsilon + \
        (max_epsilon - min_epsilon) * np.exp(-decay_rate * e)
    rewards_per_episode.append(total_reward)

In [None]:
# Sanity check: the training loop appends one total reward per episode
len(rewards_per_episode)

10000

In [None]:
# Split the per-episode rewards into consecutive chunks of 1000 episodes.
# Explicit integer split points replace `n_episodes/1000`, which handed
# np.split a float section count. Every chunk is now 1000 episodes long,
# except possibly a shorter final one when the number of recorded episodes
# is not a multiple of 1000.
rewards_per_thousand_episodes = np.split(
    np.array(rewards_per_episode),
    np.arange(1000, len(rewards_per_episode), 1000),
)

In [None]:
# Print the mean reward of each chunk, labelled with the number of
# episodes completed at the end of that chunk (1000, 2000, ...).
print('-----Average reward per thousand episodes-------')
for count, r in enumerate(rewards_per_thousand_episodes, start=1):
    # np.mean divides by the actual chunk length and avoids the float
    # noise of adding up pre-divided elements with the builtin sum().
    print(count * 1000, ':', str(np.mean(r)))

-----Average reward per thousand episodes-------
1000 : 0.046000000000000034
2000 : 0.20400000000000015
3000 : 0.4290000000000003
4000 : 0.5400000000000004
5000 : 0.6470000000000005
6000 : 0.6730000000000005
7000 : 0.6810000000000005
8000 : 0.6920000000000005
9000 : 0.6910000000000005
10000 : 0.6780000000000005


In [None]:
# Show the Q-values as they stand after training
print('-----------Updated Q Table-----------', Q_table, sep='\n')

-----------Updated Q Table-----------
[[0.49953982 0.49558774 0.4951926  0.49566962]
 [0.37893735 0.33953432 0.35553364 0.48787633]
 [0.39085249 0.38823234 0.39099387 0.47883159]
 [0.27311253 0.29462774 0.25684449 0.4679252 ]
 [0.51304915 0.36780719 0.25261645 0.30170197]
 [0.         0.         0.         0.        ]
 [0.15113458 0.1439372  0.38399485 0.10415677]
 [0.         0.         0.         0.        ]
 [0.47210501 0.41703959 0.46241314 0.53316662]
 [0.52751484 0.56425739 0.40998929 0.45201216]
 [0.53405459 0.37298406 0.46256222 0.29382857]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.57660249 0.49244143 0.65675257 0.54553046]
 [0.74272296 0.82822389 0.78475456 0.74705631]
 [0.         0.         0.         0.        ]]


In [None]:
import time
from IPython.display import clear_output

In [None]:
# Set-up for recording video of the gameplay.
# Installs Gymnasium with its classic_control extra plus the comet_ml package.
%pip install gymnasium[classic_control] comet_ml
import comet_ml
# Initialise Comet for the "frozen_lake" project.
comet_ml.init(project_name="frozen_lake")
# Wrap the environment in RecordVideo, passing 'gameplay video' as the
# output location for the recordings.
# NOTE(review): a trailing comment on the next line held what looked like an
# API key; it has been removed. If it was a real key it should be revoked and
# supplied from outside the notebook (e.g. an environment variable).
env = gym.wrappers.RecordVideo(env, 'gameplay video')



In [None]:
# Visualising the game: replay three episodes with the learned greedy policy
for episode in range(3):
    state = env.reset()[0]
    done = False
    print('----------EPISODE:', episode + 1, '---------\n\n\n\n')
    time.sleep(1)

    for step in range(steps_allowed):
        # Show the current frame. NOTE: the environment was created with
        # render_mode='rgb_array', so env.render() yields the frame as a
        # pixel array and this prints numbers rather than a picture.
        clear_output(wait=True)
        print(env.render())
        time.sleep(0.4)

        # Pure exploitation: take the action with the highest Q-value
        action = np.argmax(Q_table[state, :])
        new_state, reward, terminated, truncated, info = env.step(action)

        # The episode is over when it terminated or when it was truncated;
        # in both cases the environment must not be stepped further.
        done = terminated or truncated
        if done:
            clear_output(wait=True)
            print(env.render())
            if reward == 1:
                print("****You reached the goal!****")
            elif terminated:
                print("****You fell through a hole!****")
            else:
                # Truncated without reaching the goal or a hole
                print("****Episode was truncated!****")
            time.sleep(3)
            clear_output(wait=True)
            break

        state = new_state

env.close()

----------EPISODE: 1 ---------




[[[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]

 [[180 200 230]
  [204 230 255]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 ...

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [235 245 249]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]]
[[[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]

 [[180 200 230]
  [204 230 255]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 ...

 [[180 200 230]
 