# Q Learning

## Aim
To find the maximum reward obtained by the agent as the number of training episodes increases.

## Environment
Frozen Lake environment V1


In [None]:
# Importing the libraries
import numpy as np
import gym
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Build the FrozenLake-v1 environment and read the sizes of its
# observation and action spaces (used below as the Q-table dimensions)
environment = gym.make("FrozenLake-v1")
n_observations, n_actions = (
    environment.observation_space.n,
    environment.action_space.n,
)

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


In [None]:
# Q-table with n_observations rows and n_actions columns;
# every entry starts at 0
Q_table = np.zeros(shape=(n_observations, n_actions))
print(Q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [None]:
# Number of training episodes to run
n_episodes = 10000


In [None]:
# Maximum number of steps (iterations) allowed in a single episode
max_iter_episode = 100

In [None]:
# Initial exploration probability: starting at 1 means the agent
# always picks a random action at the beginning of training
exploration_proba = 1

In [None]:
# Decay rate of the exploration probability; the training loop computes
# exp(-exploration_decreasing_decay * episode_index)
exploration_decreasing_decay = 0.001

In [None]:
# Lower bound on the exploration probability (the decay never goes below it)
min_exploration_proba = 0.01

In [None]:
# Discount factor (gamma) weighting the estimated value of the next state
gamma = 0.99

# Learning rate: step size of each Q-table update
lr = 0.1

In [None]:
# Per-episode total reward, appended once at the end of every episode
total_rewards_episode = []

In [None]:
# Train the agent with tabular Q-learning and an epsilon-greedy policy,
# one episode per iteration of the outer loop
for e in range(n_episodes):
    current_state = environment.reset()  # initialize the first state of the episode
    done = False

    # Sum of the rewards the agent collects during this episode
    total_episode_reward = 0

    for i in range(max_iter_episode):
        # Epsilon-greedy action selection: take a random action with
        # probability exploration_proba, otherwise the action with the
        # highest current Q-value for this state
        if np.random.uniform(0, 1) < exploration_proba:
            action = environment.action_space.sample()
        else:
            action = np.argmax(Q_table[current_state, :])
        next_state, reward, done, _ = environment.step(action)

        # Q-learning update:
        #   Q(s, a) <- (1 - lr) * Q(s, a) + lr * (r + gamma * max_a' Q(s', a'))
        # The right-hand side is wrapped in parentheses so that both terms
        # are part of the assignment. Previously the "+ lr * (...)" term sat
        # on its own line as a separate, discarded expression statement, so
        # the table was never moved toward the reward and stayed at zero.
        Q_table[current_state, action] = (
            (1 - lr) * Q_table[current_state, action]
            + lr * (reward + gamma * np.max(Q_table[next_state, :]))
        )
        total_episode_reward = total_episode_reward + reward
        if done:
            break
        current_state = next_state
    # Exponentially decay the exploration probability with the episode index,
    # never letting it drop below min_exploration_proba
    exploration_proba = max(min_exploration_proba, np.exp(
        -exploration_decreasing_decay*e))
    total_rewards_episode.append(total_episode_reward)

In [None]:
# Evaluate the agent: print the mean episode reward for each of the
# ten consecutive windows of 1000 episodes
print("Mean reward per thousand episodes")
window = 1000
for window_index in range(10):
    start = window * window_index
    stop = window * (window_index + 1)
    window_mean = np.mean(total_rewards_episode[start:stop])
    print(stop, "mean_espiode_reward: ", window_mean)


Mean reward per thousand episodes
1000 mean_espiode_reward:  0.027
2000 mean_espiode_reward:  0.21
3000 mean_espiode_reward:  0.417
4000 mean_espiode_reward:  0.613
5000 mean_espiode_reward:  0.66
6000 mean_espiode_reward:  0.665
7000 mean_espiode_reward:  0.674
8000 mean_espiode_reward:  0.698
9000 mean_espiode_reward:  0.678
10000 mean_espiode_reward:  0.682
