# Q Learning 

In [3]:
!pip install gym

Collecting gym
  Using cached gym-0.18.3.tar.gz (1.6 MB)
Collecting pyglet<=1.5.15,>=1.4.0
  Using cached pyglet-1.5.15-py3-none-any.whl (1.1 MB)
Building wheels for collected packages: gym
  Building wheel for gym (setup.py): started
  Building wheel for gym (setup.py): finished with status 'done'
  Created wheel for gym: filename=gym-0.18.3-py3-none-any.whl size=1657515 sha256=0821acbfe935540260c41e6636ed266c57f9411692aab95dcc9b4bfa8541cb0d
  Stored in directory: c:\users\nguye\appdata\local\pip\cache\wheels\b3\03\54\9141c232861b89be935b37bdde0ea5ab472f5e18fc20623aed
Successfully built gym
Installing collected packages: pyglet, gym
Successfully installed gym-0.18.3 pyglet-1.5.15


In [5]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [6]:
# Look the FrozenLake environment up in gym's registry by its id.
env_id = "FrozenLake-v0"
env = gym.make(env_id)

In [7]:
# Sizes of the discrete state and action spaces; they become the row and column
# counts of the Q-table built in the next cell.
state_space_size, action_space_size = env.observation_space.n, env.action_space.n

In [8]:
# Q-table: one row per state, one column per action, every estimate starting at 0.
q_table_shape = (state_space_size, action_space_size)
q_table = np.zeros(q_table_shape)
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [11]:
# Show the number of actions, then the number of states, one per line.
print(action_space_size, state_space_size, sep="\n")

4
16


## Parameters for the Q-Learning Algorithm

In [16]:
# --- Training schedule ---
num_episodes = 10000         # how many episodes the agent plays during training
max_steps_per_episode = 100  # upper bound on the steps taken within one episode

# --- Q-update coefficients ---
learning_rate, discount_rate = 0.1, 0.99  # alpha, gamma (discount on future rewards)

# --- Epsilon-greedy exploration schedule ---
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate  # training starts at the maximum rate

## Q-Learning Algorithm

In [24]:
rewards_all_episode = []

# Start every run of this cell from the same state: fixed RNG seeds, a zeroed
# Q-table and the initial exploration rate. Without this, re-running the cell
# keeps training the already-learned table with an already-decayed exploration
# rate, and no two runs are comparable.
seed = 42
random.seed(seed)            # seeds the epsilon-greedy draw (random.uniform)
env.seed(seed)               # seeds the environment's own randomness
env.action_space.seed(seed)  # seeds env.action_space.sample()
q_table = np.zeros((state_space_size, action_space_size))
exploration_rate = max_exploration_rate

# Q Learning Algorithms
for episode in range(num_episodes):
    # for each episode reset the state, done state, and episode rewards
    state = env.reset()

    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # for each step, do epsilon greedy to pick the action, update new state, reward, update the Q table

        # Exploration-exploitation trade off: exploit (greedy action) when the
        # uniform draw exceeds the current exploration rate, otherwise explore.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Update Q-table for Q(s,a): blend the old estimate with the new target
        # reward + gamma * max_a' Q(s', a'), weighted by the learning rate.
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) \
            + learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exploration rate decay after an episode: exponential decay from the
    # maximum towards the minimum rate, driven by the episode index.
    exploration_rate = min_exploration_rate \
        + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episode.append(rewards_current_episode)  # append reward for an episode

# Calculate and print the average reward per thousand episodes.
# Slicing (instead of np.split with a float section count) also works when
# num_episodes is not a multiple of the report size: the last, shorter chunk is
# averaged over its real length and labelled with the real episode count.
report_every = 1000
episode_rewards = np.asarray(rewards_all_episode, dtype=float)
print("*****Average reward per thousand episodes*****\n")

for start in range(0, len(episode_rewards), report_every):
    chunk = episode_rewards[start:start + report_every]
    print(f"{start + len(chunk)}: {chunk.mean()}")

# Print updated Q-table
print("\n\n ****Q-table*****\n")
print(q_table)

*****Average reward per thousand episodes*****

1000: 0.057000000000000044
2000: 0.18100000000000013
3000: 0.3820000000000003
4000: 0.5410000000000004
5000: 0.6270000000000004
6000: 0.6560000000000005
7000: 0.6960000000000005
8000: 0.6780000000000005
9000: 0.7000000000000005
10000: 0.6590000000000005


 ****Q-table*****

[[0.52610779 0.49602763 0.50746303 0.47741521]
 [0.38059297 0.32099116 0.30966587 0.48727241]
 [0.42736786 0.41088971 0.4244271  0.47376316]
 [0.37845886 0.2577131  0.30450433 0.4657776 ]
 [0.54009735 0.25020471 0.3877593  0.30307579]
 [0.         0.         0.         0.        ]
 [0.18557865 0.14195544 0.25390525 0.14275628]
 [0.         0.         0.         0.        ]
 [0.35427192 0.37163518 0.33213888 0.59533001]
 [0.41659376 0.61633842 0.53229361 0.31722433]
 [0.64244671 0.38474965 0.31293694 0.33149706]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.28250728 0.34921215 0.74174958 0.51625664]
 [0.75460163 0.8810

## Watch the Q-Learning Agent Play the Game

In [27]:
# After training, watch the agent play FrozenLake by always taking the action
# with the highest Q-value for the current state.

for episode in range(3):
    # reset env state and done state
    state = env.reset()
    done = False
    print(f"Episode {episode+1} \n\n")
    time.sleep(1)  # keep the episode banner visible before the board replaces it

    for step in range(max_steps_per_episode):
        # for each step: render the env, take an action, update the state and check done
        clear_output(wait=True)  # clear the Jupyter cell output so each frame replaces the last
        env.render()  # render the current state so we can see it visually
        time.sleep(1)

        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("You reached the goal")
            elif info.get("TimeLimit.truncated", False):
                # gym's TimeLimit wrapper sets this flag when it ended the episode
                # itself, so this is a timeout rather than a fall.
                print("You ran out of steps")
            else:
                print("You fell through the hole")
            time.sleep(3)  # leave the final board and message on screen
            clear_output(wait=True)
            break
        state = new_state

# Close the environment once, after all episodes; closing inside the loop would
# mean resetting an already-closed environment on the next episode.
env.close()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
You reached the goal


---
# Deep Q-Learning: Neural Network + Reinforcement Learning
- Policy Network - First pass
- Target Network - Second pass