In [11]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output


env = gym.make("FrozenLake-v1", render_mode="ansi",is_slippery=False)

In [12]:
# Actions are going to be left, right, up and down. Movement direction.
actionSpaceSize = env.action_space.n
# States are going to be S,F,H and G. S is the starting point, F is frozen, H is a hole and G is the goal.
stateSpaceSize = env.observation_space.n

qtable = np.zeros((stateSpaceSize, actionSpaceSize))
# As you can see, for a 4x4 grid, we have 16 states and 4 actions for each state.
# So, our Q-table will have 16 rows and 4 columns, filled with zeros.
print(qtable)

# The rewards for each state are as follows:
# S: 0 (starting point)
# F: 0 (frozen surface)
# H: 0 (hole)
# G: 1 (goal)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [13]:
#Create and initialize all the hyperparameters that will be used in the algorithm. These are the parameters that will be used to learn the Q-values for the actions in the environment.
numEpisodes = 10000 # Total number of episodes. Episodes are basically the number of times we train the agent.
maxStepsPerEpisode = 100 # Max steps allowed for each episode. If the agent falls in a hole, then the episode ends and we start a new episode.IF the agent reaches the goal, then the episode ends and we start a new episode. ıF the agent takes 100 steps BUT does not reach the goal, then the episode ends and we start a new episode.
learningRate = 0.1 # Learning rate
discountRate = 0.99 # Discount rate

explorationRate = 1 # Exploration rate. Is used to determine the probability of agent to take a random action. This is important because we want the agent to explore the environment and not just exploit it. exploitation is when the agent takes the action that has the highest Q-value for the current state. Exploration is when the agent takes a random action, so that it can discover new states and rewards.

maxExplorationRate = 1 # Exploration probability at start
minExplorationRate = 0.01 # Minimum exploration probability at the end.
explorationDecayRate = 0.001 # Exponential decay rate for exploration probability. This is used to reduce the exploration probability over time. This is done so that the agent lies more on exploiting the environment rather than exploring it. This is because we want the agent to explore the environment more at the start and then exploit it more towards the end. Greedy epsilon approach.

In [14]:
# All of ther rewards from each episode is stored to keep track of how our game scores change over time.
TotalRewards = []

# Q learning algorithm
for episode in range(numEpisodes):
    # Reset the environment to start a new episode. State is the current state that the agent is in. In form of a number. For example, 0 is the starting point, 1 is the frozen surface, 2 is the hole and 3 is the goal. state is in form of (0, {'prob': 1}) where 0 is the state and {'prob': 1} is the probability of the agent being in that state. The probability is 1 because the agent is definitely in that state.
    state = env.reset()[0]

    done = False  # Boolean variable to check if the episode has ended or not.
    # Variable to store the total rewards for the current episode.
    episodeRewards = 0
    for step in range(maxStepsPerEpisode):
        rand = np.random.uniform(0, 1)  # Get a random number between 0 and 1.
        if rand > explorationRate:
            # Exploit the environment, use the q table to get the action that has the highest Q-value for the current state.
            action = np.argmax(qtable[state, :])
            # This is taking the action that has the highest Q-value for the current state by slicing the Q-table for the current state and then taking the action that has the highest Q-value. The colon means that we are taking all the actions for the current state, and then we are taking the action that has the highest Q-value by calling the argmax function.
        else:
            # Explore the environment, take a random action. A random action is taken because the random number is less than the exploration rate. This is done so that the agent can explore the environment.
            action = env.action_space.sample()
            # env.action_space.sample() returns a random action from the action space. The action space is the set of all the actions that the agent can take in the environment.In this case, the actions can be left, right, up and down.

        # We have defined the action. Now, with this action, we can get the new state, reward, done and info from the environment by playing the action in the environment.
        observation, reward, terminated, truncated, info = env.step(action)

        # We need to update the Q table with the new knowledge that we have gained from the environment. The formula for updating the Q table is,
        # Q(s,a) = Q(s,a) + learningRate * (reward + discountRate * maxQ(s',a') - Q(s,a))
        # where, s is the current state, a is the action that we took, s' is the new state that we reached after taking the action, a' is the action that we will take in the new state, learningRate is the learning rate, discountRate is the discount rate, reward is the reward that we got from the environment for taking the action and info is the information that we got from the environment for taking the action, Q(s,a) is the Q-value for the current state and action, Q(s',a') is the Q-value for the new state and action, maxQ(s',a') is the maximum Q-value for the new state and for all the actions that we can take in the new state.

        qtable[state, action] = qtable[state, action] + learningRate * \
            (reward + discountRate *
             np.max(qtable[observation, :]) - qtable[state, action])

        state = observation  # Update the current state to the new state.
        # Update the reward for the current episode. Episode reward is the total reward that we get from the environment for the current episode.
        episodeRewards += reward

        if terminated == True or truncated == True:
            break

    # We have finished playing the episode. Now, we need to update the exploration rate. We need to do this because we want the agent to explore the environment more at the start and then exploit it more towards the end. Greedy epsilon approach.
    explorationRate = minExplorationRate + \
        (maxExplorationRate - minExplorationRate) * \
        np.exp(-explorationDecayRate*episode)
    # Updating the exploration rate using the formula for exponential decay formula. The formula, generally is y = a + (b - a) * e^(-cx), where y is the value that we want to calculate, a is the minimum value that y can take, b is the maximum value that y can take, c is the decay rate and x is the current iteration(episode). In our case, y is the exploration rate, a is the minimum exploration rate, b is the maximum exploration rate, c is the decay rate and x is the current episode number. The exploration rate value is decreased proportionally to the episode number.

    # Append the episode reward to the list of rewards.
    TotalRewards.append(episodeRewards)

# Calculate and print the average reward per thousand episodes.
# This will get the rewards for every thousand episodes and put them in a list. TotalRewards is a list that contains total reward for each episode. We are converting it to a numpy array so that it contains the reards for each episode as a list. Then, we are splitting the numpy array into a list of numpy arrays, where each numpy array contains the rewards for every thousand episodes. We are doing this so that we can calculate the average reward for every thousand episodes. Finally, rewardsPerThousandEpisodes will be a list of numpy arrays, where each numpy array contains the rewards for every thousand episodes.
rewardsPerThousandEpisodes = np.split(np.array(TotalRewards), numEpisodes/1000)
# Structure of rewardsPerThousandEpisodes: [[1000 rewards], [1000 rewards], [1000 rewards], ...] with a total of 10 lists.
count = 1000
print("*********Average reward per thousand episodes*********\n")
# In a episode, a reward can be 0 or 1.
# Rewards
# Reward schedule:
# Reach goal(G): +1
# Reach hole(H): 0
# Reach frozen(F): 0
# So, the average reward for every thousand episodes will be the sum of rewards for every thousand episodes divided by 1000.
# This value can vary from 0 to 1. It can not exceed 1 because the maximum reward that we can get from the environment is 1.

for r in rewardsPerThousandEpisodes:
    print(count, ": ", str(sum(r/1000)))
    # Calculate the average reward for every thousand episodes by dividing the sum of rewards for every thousand episodes by 1000.

print("\n\n*********Q-table*********\n")
print(qtable)

  if not isinstance(terminated, (bool, np.bool8)):


*********Average reward per thousand episodes*********

1000 :  0.2610000000000002
1000 :  0.7390000000000005
1000 :  0.9040000000000007
1000 :  0.9670000000000007
1000 :  0.9780000000000008
1000 :  0.9810000000000008
1000 :  0.9920000000000008
1000 :  0.9900000000000008
1000 :  0.9900000000000008
1000 :  0.9900000000000008


*********Q-table*********

[[0.94148015 0.93206534 0.95099005 0.94148015]
 [0.94148015 0.         0.96059601 0.95099005]
 [0.95099005 0.970299   0.95099005 0.96059601]
 [0.96059601 0.         0.81245738 0.89409313]
 [0.81083179 0.87845793 0.         0.94148015]
 [0.         0.         0.         0.        ]
 [0.         0.9801     0.         0.960596  ]
 [0.         0.         0.         0.        ]
 [0.44696128 0.         0.956501   0.33205914]
 [0.70350887 0.8493151  0.98009999 0.        ]
 [0.97029879 0.99       0.         0.970299  ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.82266508 0.98999987

In [15]:
# Visualizing the envbironment and the agent playing the game.
# We are going to render the environment and the agent playing and training in the environment.

for episode in range(3):
    state = env.reset()[0]
    print("*********Episode ", episode+1, "*********\n\n\n\n")
    time.sleep(1)

    for step in range(maxStepsPerEpisode):
        clear_output(wait=True)
        print(env.render())
        time.sleep(0.3)
        action = np.argmax(qtable[state, :])
        observation, reward, terminated, truncated, info = env.step(action)

        if terminated == True or truncated == True:
            clear_output(wait=True)
            print(env.render())
            if reward == 1:
                print("*********You reached the goal!*********\n")
                time.sleep(3)
            else:
                print("*********You fell in a hole!*********\n")
                time.sleep(3)
            clear_output(wait=True)
            break
        state = observation
env.close()

# We can see that the agent is able to reach the goal in all the episodes. This is because the agent has learned the optimal policy for the environment. The optimal policy is the policy that will give the maximum reward for the ag

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m

*********You reached the goal!*********

