# Lab 4: Q-table based reinforcement learning



Solve [`FrozenLake8x8-v1`](https://www.gymlibrary.dev/environments/toy_text/frozen_lake/) using a Q-table.


1. Import Necessary Packages (e.g. `gym`, `numpy`):

In [35]:
import gym
import sys
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import random
from gym.utils.play import play


2. Instantiate the Environment and Agent

In [61]:
# Alternative configurations kept for quick switching: the same maps with
# render_mode='human', and the smaller 4x4 map.
#env = gym.make('FrozenLake8x8-v1', desc=None, map_name="8x8", is_slippery=False, render_mode='human')
#env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False, render_mode='human')
# Active configuration: 8x8 map, is_slippery=False, no render_mode given.
env = gym.make('FrozenLake8x8-v1', desc=None, map_name="8x8", is_slippery=False)
#env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False)
# Sizes of the discrete action and observation spaces of `env`
# (a Q-table for it has shape state_size x action_size).
action_size = env.action_space.n
state_size = env.observation_space.n

3. Set up the Q-table:

In [4]:
# Step size of each Q-value update.
learning_rate = 0.1
# Weight given to the best Q-value of the successor state.
discount_rate = 0.9


def update_q_table(qt, reward, prev_state, action, state):
    """Apply one Q-learning update to ``qt`` in place.

    Moves ``qt[prev_state, action]`` towards
    ``reward + discount_rate * max(qt[state])`` by a fraction
    ``learning_rate`` of the difference.

    Args:
        qt: Q-table indexed as ``qt[state, action]``; modified in place.
        reward: Reward credited to the transition.
        prev_state: State the action was taken in.
        action: Action that was taken.
        state: State reached after the action.
    """
    old_value = qt[prev_state, action]
    target = reward + discount_rate * np.max(qt[state])
    td_error = target - old_value
    qt[prev_state, action] = old_value + learning_rate * td_error

def get_best_action(qt, state):
    """Return the index of the highest Q-value stored for ``state``.

    Args:
        qt: Q-table indexed as ``qt[state]`` -> per-action values.
        state: State whose greedy action is wanted.

    Returns:
        The result of ``np.argmax`` over the row for ``state`` (the first
        index wins when several actions share the maximum).
    """
    action_values = qt[state]
    return np.argmax(action_values)

#q_table = np.zeros((state_size, action_size))

4. Train the Q-table with the Q-learning algorithm

In [69]:
def train_q_table_penalty(env, max_steps=40, episodes_count=10000, exploration=0.9, explore_reduction=0.99, explore_min=0.01):
    """Train a Q-table with epsilon-greedy Q-learning, penalising failed episodes.

    An episode ends when the environment reports ``terminated`` or
    ``truncated``, or once ``max_steps`` steps have been taken. If the final
    step of an episode has reward 0, that transition is learned with a reward
    of -1 instead; every other transition (including the one that reaches the
    goal) is learned with the reward returned by the environment.

    Uses the module-level helpers ``get_best_action`` and ``update_q_table``.

    Args:
        env: Environment whose ``observation_space.n`` and ``action_space.n``
            give the Q-table dimensions.
        max_steps: Maximum number of steps per episode.
        episodes_count: Number of training episodes.
        exploration: Initial probability of taking a random action.
        explore_reduction: Factor applied to ``exploration`` after each
            episode while it is above ``explore_min``.
        explore_min: Threshold below which ``exploration`` is no longer
            reduced.

    Returns:
        ``(q_table, (fcs, bsf, 'Lake penalty'))`` where

        * ``fcs`` is the index of the first goal-reaching episode seen after
          ten goal-reaching episodes have already been counted (0 if that
          never happens). Successes are counted over the whole run, not
          consecutively.
        * ``bsf`` is the step count of the tenth goal-reaching episode that
          took fewer than ``max_steps`` steps, or ``max_steps`` if there
          were fewer than ten such episodes.
    """
    # Size the table from the environment actually passed in, so the function
    # also works for maps other than the module-level one.
    q_table = np.zeros((env.observation_space.n, env.action_space.n))

    fcs = 0 # first consistent solution
    csr = 0 # consistent solution runs

    # Start at max_steps (as train_q_table_cost does); starting at 0 would
    # make the `steps < bsf` test below impossible to satisfy.
    bsf = max_steps # best solution found
    bsr = 0 # best solution runs

    for i in range(episodes_count):
        # Begin from the environment's real start state. Sampling a random
        # observation here would attribute updates to a state the agent is
        # not actually in.
        last_observation, _ = env.reset()
        steps = 0

        while True:
            # Epsilon-greedy action selection.
            if exploration < random.uniform(0, 1):
                action = get_best_action(q_table, last_observation)
            else:
                action = env.action_space.sample()

            observation, reward, terminated, truncated, _ = env.step(action)
            steps += 1
            episode_over = terminated or truncated or steps >= max_steps

            if reward == 1:
                if fcs == 0:
                    if csr == 10:
                        fcs = i
                        #print("First solution found at episode: ", fcs)
                    else:
                        csr += 1

                if steps < bsf:
                    bsr += 1
                    if bsr == 10:
                        bsf = steps

            # Penalise an episode that ends without any reward; otherwise
            # learn the environment's reward, so the goal transition is
            # recorded too.
            learned_reward = -1 if (episode_over and reward == 0) else reward
            update_q_table(q_table, learned_reward, last_observation, action, observation)

            if episode_over:
                break

            last_observation = observation

        # Reduce the exploration rate after every episode.
        if exploration > explore_min:
            exploration *= explore_reduction

    return q_table, (fcs, bsf, 'Lake penalty')

def train_q_table_naive(env, max_steps=40, episodes_count=10000, exploration=0.9, explore_reduction=0.95, explore_min=0.01):
    """Train a Q-table with plain epsilon-greedy Q-learning.

    Every transition is learned with exactly the reward returned by the
    environment. An episode ends when the environment reports ``terminated``
    or ``truncated``, or once ``max_steps`` steps have been taken.

    Uses the module-level helpers ``get_best_action`` and ``update_q_table``.
    Prints a message when ``fcs`` is determined.

    Args:
        env: Environment whose ``observation_space.n`` and ``action_space.n``
            give the Q-table dimensions.
        max_steps: Maximum number of steps per episode.
        episodes_count: Number of training episodes.
        exploration: Initial probability of taking a random action.
        explore_reduction: Factor applied to ``exploration`` after each
            episode while it is above ``explore_min``.
        explore_min: Threshold below which ``exploration`` is no longer
            reduced.

    Returns:
        ``(q_table, (fcs, bsf, 'Naive'))`` where

        * ``fcs`` is the index of the first goal-reaching episode seen after
          ten goal-reaching episodes have already been counted (0 if that
          never happens). Successes are counted over the whole run, not
          consecutively.
        * ``bsf`` is the step count of the tenth goal-reaching episode that
          took fewer than ``max_steps`` steps, or ``max_steps`` if there
          were fewer than ten such episodes.
    """
    # Size the table from the environment actually passed in rather than
    # from module-level globals.
    q_table = np.zeros((env.observation_space.n, env.action_space.n))

    fcs = 0 # first consistent solution
    csr = 0 # consistent solution runs

    # Start at max_steps (as train_q_table_cost does); starting at 0 would
    # make the `steps < bsf` test below impossible to satisfy.
    bsf = max_steps # best solution found
    bsr = 0 # best solution runs

    for i in range(episodes_count):
        last_observation, _ = env.reset()
        steps = 0

        while True:
            # Get action either randomly or from the q table:
            if exploration < random.uniform(0, 1):
                action = get_best_action(q_table, last_observation)
            else:
                action = env.action_space.sample()

            # Take action in the environment
            observation, reward, terminated, truncated, _ = env.step(action)
            steps += 1

            if reward == 1:
                if fcs == 0:
                    if csr == 10:
                        fcs = i
                        print("First solution found at episode: ", fcs)
                    else:
                        csr += 1

                if steps < bsf:
                    bsr += 1
                    if bsr == 10:
                        bsf = steps

            update_q_table(q_table, reward, last_observation, action, observation)

            # Also stop on truncation so a finished environment is never
            # stepped again.
            if terminated or truncated or steps >= max_steps:
                break

            last_observation = observation

        # Reduce exploration rate after a run
        if exploration > explore_min:
            exploration *= explore_reduction

    return q_table, (fcs, bsf, 'Naive')


def train_q_table_cost(env, max_steps=40, episodes_count=10000, exploration=0.9, explore_reduction=0.95, explore_min=0.01):
    """Train a Q-table with epsilon-greedy Q-learning using a running episode cost.

    Within an episode a ``cost`` value starts at 0, drops by 1 on every step,
    drops by a further 100 when the episode terminates without a reward of 1,
    and has the environment's reward added to it. The current value of
    ``cost`` is what each transition is learned with. An episode ends when
    the environment reports ``terminated`` or ``truncated``, or once
    ``max_steps`` steps have been taken.

    Uses the module-level helpers ``get_best_action`` and ``update_q_table``.

    Args:
        env: Environment whose ``observation_space.n`` and ``action_space.n``
            give the Q-table dimensions.
        max_steps: Maximum number of steps per episode.
        episodes_count: Number of training episodes.
        exploration: Initial probability of taking a random action.
        explore_reduction: Factor applied to ``exploration`` after each
            episode while it is above ``explore_min``.
        explore_min: Threshold below which ``exploration`` is no longer
            reduced.

    Returns:
        ``(q_table, (fcs, bsf, 'Step cost'))`` where

        * ``fcs`` is the index of the first goal-reaching episode seen after
          ten goal-reaching episodes have been counted with no terminated,
          non-goal episode in between (0 if that never happens).
        * ``bsf`` starts at ``max_steps`` and is set to the current
          episode's step count each time the tenth goal-reaching episode
          shorter than ``bsf`` is counted; that count restarts whenever an
          episode terminates without reaching the goal.
    """
    # Size the table from the environment actually passed in rather than
    # from module-level globals.
    q_table = np.zeros((env.observation_space.n, env.action_space.n))

    fcs = 0 # first consistent solution
    csr = 0 # consistent solution runs

    bsf = max_steps # best solution found
    bsr = 0 # best solution runs

    for i in range(episodes_count):
        last_observation, _ = env.reset()
        steps = 0
        cost = 0

        while True:
            # Epsilon-greedy action selection.
            if exploration < random.uniform(0, 1):
                action = get_best_action(q_table, last_observation)
            else:
                action = env.action_space.sample()

            # Take action in the environment
            observation, reward, terminated, truncated, _ = env.step(action)
            steps += 1
            cost -= 1

            if terminated and reward != 1:
                cost -= 100

            cost += reward

            # NOTE(review): `cost` accumulates over the episode, so the value
            # learned at step k is about -k rather than a constant per-step
            # cost. Confirm this is intended.
            update_q_table(q_table, cost, last_observation, action, observation)

            if reward == 1:
                if fcs == 0:
                    if csr == 10:
                        fcs = i
                        #print("First solution found at episode: ", fcs)
                    else:
                        csr += 1
                        #print("csr: ", csr)

                if steps < bsf:
                    bsr += 1
                    if bsr == 10:
                        bsf = steps
            elif terminated:
                # A failed termination breaks the run of successes.
                csr = 0
                bsr = 0

            # Also stop on truncation so a finished environment is never
            # stepped again.
            if terminated or truncated or steps >= max_steps:
                break

            last_observation = observation

        # Reduce the exploration rate after every episode.
        if exploration > explore_min:
            exploration *= explore_reduction

    return q_table, (fcs, bsf, 'Step cost')


# Number of training episodes given to each variant.
episode_count = 10000

# Train the three variants in turn on the shared module-level `env`; each call
# returns its Q-table and a (fcs, bsf, name) statistics tuple.
cost_q_table, cost_stats = train_q_table_cost(env, episodes_count=episode_count)
naive_q_table, naive_stats = train_q_table_naive(env, episodes_count=episode_count)
penalty_q_table, penalty_stats = train_q_table_penalty(env, episodes_count=episode_count)

def train_statistics(solutions):
    """Show a bar chart of the first-consistent-solution episode per variant.

    Args:
        solutions: Iterable of ``(first, best, name)`` tuples, as returned in
            the second element of the ``train_q_table_*`` results. Only
            ``first`` and ``name`` are plotted.
    """
    labels = []
    first_episodes = []
    for first_episode, _best, label in solutions:
        labels.append(label)
        first_episodes.append(first_episode)

    # One bar per variant, labelled with the variant's name.
    positions = np.arange(len(labels))
    plt.bar(positions, first_episodes, align='center', alpha=0.5)
    plt.xticks(positions, labels)

    plt.ylabel('Episode of first consistent solution')
    plt.title('First consistent solutions')
    plt.show()
    

# Statistics tuples of the three trained variants, in plotting order.
solutions = [cost_stats, naive_stats, penalty_stats]

# Show the bar chart comparing the variants.
train_statistics(solutions)

5. Evaluate how well your agent performs
* Render output of one episode
* Give an average episode return

In [None]:
def run(env, q_table=None):
    """Play one greedy episode and return the reward of its last step.

    The environment is reset, then the action with the highest Q-value is
    taken at every step until the environment reports ``terminated`` or
    ``truncated``.

    Args:
        env: Environment to play in.
        q_table: Q-table to act on. When omitted, the module-level
            ``cost_q_table`` is used so that existing ``run(env)`` calls
            keep working.

    Returns:
        The reward returned by the final ``env.step`` call.
    """
    if q_table is None:
        q_table = cost_q_table

    state, _ = env.reset()
    terminated = False
    truncated = False
    while not terminated and not truncated:
        # Always act greedily: get_best_action needs the table as well as
        # the state.
        action = get_best_action(q_table, state)

        # Take action in the environment
        state, reward, terminated, truncated, _ = env.step(action)
    return reward


# Run one episode
def q_table_statistics(episode_count=100):
    """Render one episode, then print the average score over several episodes.

    A separate environment with ``render_mode='human'`` is created for the
    single rendered episode and closed afterwards. The averaged episodes are
    played with ``run`` on the module-level ``env``.

    Args:
        episode_count: Number of episodes to average over.
    """
    env2 = gym.make('FrozenLake8x8-v1', desc=None, map_name="8x8", is_slippery=False, render_mode='human')
    try:
        run(env2)
    finally:
        # Close the rendering environment even if the episode raises.
        env2.close()

    score_sum = sum(run(env) for _ in range(episode_count))

    print("Average episode score: ", float(score_sum)/ float(episode_count))

# Render one episode and print the average score over the default 100 episodes.
q_table_statistics()

Average episode score:  1.0


6. (<i>Optional</i>) Adapt the code for one of the continuous [Classic Control](https://www.gymlibrary.dev/environments/classic_control/) problems. Think/talk about how you could use our `Model` class from last Thursday to decide actions.