In [60]:
import gym
from IPython.display import clear_output
from time import sleep
import numpy as np
import random
import itertools
#from numba import njit, cuda


# Functions

In [61]:
def env_render(env_name,env_version):
    """Create an environment, render it around a reset, and describe its spaces.

    Args:
        env_name: Object exposing ``make(env_version)``; the notebook passes the
            ``gym`` module itself here.
        env_version: Environment id forwarded to ``make`` (e.g. ``"Taxi-v3"``).

    Returns:
        The ``.env`` attribute of whatever ``make`` returned.
    """
    environment = env_name.make(env_version).env

    # Show the environment as constructed, then again after the reset.
    environment.render()
    environment.reset()  # reset environment to a new, random state
    environment.render()

    action_summary = "Action Space {}".format(environment.action_space)
    print(action_summary)
    state_summary = "State Space {}".format(environment.observation_space)
    print(state_summary)

    return environment


In [62]:
def Solve_without_RL(env):
    """Play one episode using actions from ``env.action_space.sample()``.

    The environment's ``s`` attribute is forced to 328 first, then actions are
    sampled until ``env.step`` reports ``done``. Every step is recorded so the
    episode can be replayed later.

    Args:
        env: Environment providing ``action_space.sample()``, ``step(action)``
            returning ``(state, reward, done, info)`` and ``render(mode='ansi')``.

    Returns:
        A list with one dict per step, holding the keys ``'frame'``,
        ``'state'``, ``'action'`` and ``'reward'``.
    """
    env.s = 328  # set environment to illustration's state

    frames = []  # for animation
    timesteps = 0
    penalty_count = 0
    finished = False

    while not finished:
        action = env.action_space.sample()
        state, reward, finished, _info = env.step(action)

        # A reward of exactly -10 is what this notebook counts as a penalty.
        if reward == -10:
            penalty_count += 1

        # Put each rendered frame into dict for animation
        frames.append(dict(
            frame=env.render(mode='ansi'),
            state=state,
            action=action,
            reward=reward,
        ))
        timesteps += 1

    print("Timesteps taken: {}".format(timesteps))
    print("Penalties incurred: {}".format(penalty_count))
    return frames


In [63]:
def print_frames(frames):
    """Replay recorded frames one after another in the cell output.

    Args:
        frames: Iterable of dicts with the keys ``'frame'``, ``'state'``,
            ``'action'`` and ``'reward'`` (the structure returned by
            ``Solve_without_RL``).

    The previous output is cleared before each frame and the loop pauses for
    a tenth of a second between frames.
    """
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        # One print call, newline-separated: same text as five separate prints.
        print(
            frame['frame'],
            f"Timestep: {i + 1}",
            f"State: {frame['state']}",
            f"Action: {frame['action']}",
            f"Reward: {frame['reward']}",
            sep="\n",
        )
        sleep(0.1)

In [None]:
def q_learning(env,alpha,gamma,epsilon,episodes):
    """Train a tabular Q-learning agent with an epsilon-greedy policy.

    Args:
        env: Environment with discrete spaces. It must expose
            ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` returning the start state
            and ``step(action)`` returning ``(next_state, reward, done, info)``.
        alpha: Learning rate; weight given to the new estimate in each update.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Probability of taking a sampled action instead of the
            currently best-valued one.
        episodes: Exclusive upper bound of the episode counter. The counter
            starts at 1, so ``episodes - 1`` episodes are played (the notebook
            passes 100001 to get 100000).

    Returns:
        ``numpy.ndarray`` of shape ``(observation_space.n, action_space.n)``
        holding the learned action values.
    """
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    for _ in range(1, episodes):
        state = env.reset()
        done = False

        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # Explore action space
            else:
                action = np.argmax(q_table[state])  # Exploit learned values

            next_state, reward, done, _info = env.step(action)

            # Blend the old estimate with the bootstrapped one-step target.
            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])
            q_table[state, action] = (1 - alpha) * old_value + alpha * \
                (reward + gamma * next_max)

            state = next_state

    return q_table


In [65]:
#@njit
def test_q_learning(env, q_table,episodes):
    total_epochs, total_penalties, total_rewards = 0, 0, 0

    for _ in range(episodes):
        state = env.reset()
        epochs, penalties, reward = 0, 0, 0

        done = False

        while not done:
            action = np.argmax(q_table[state])
            state, reward, done, info = env.step(action)

            if reward == -10:
                penalties += 1

            epochs += 1
        total_rewards += reward
        total_penalties += penalties
        total_epochs += epochs

    #print(f"Results after {episodes} episodes:")
    #print(f"Average timesteps per episode: {total_epochs / episodes}")
    #print(f"Average penalties per episode: {total_penalties / episodes}")
    return (total_rewards/total_epochs)


In [145]:
#@njit
def decay_q_learning(env,alpha,gamma,epsilon,episodes,
                     decay=0.5, min_epsilon=0.1, min_gamma=0.4, min_alpha=0.0,
                     decay_every=1000, verbose=True):
    """Tabular Q-learning whose hyperparameters shrink on a fixed schedule.

    Training is epsilon-greedy Q-learning. Every ``decay_every`` episodes
    ``alpha``, ``gamma`` and ``epsilon`` are each multiplied by ``decay`` and
    clamped to their respective minimum.

    NOTE(review): the discount factor ``gamma`` is decayed together with the
    learning and exploration rates, as in the original code. Confirm this is
    intended; pass ``min_gamma`` equal to ``gamma`` to keep it fixed.

    Args:
        env: Environment with discrete spaces. It must expose
            ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` returning the start state
            and ``step(action)`` returning ``(next_state, reward, done, info)``.
        alpha: Initial learning rate.
        gamma: Initial discount factor.
        epsilon: Initial probability of taking a sampled action.
        episodes: Exclusive upper bound of the episode counter. The counter
            starts at 1, so ``episodes - 1`` episodes are played.
        decay: Multiplier applied to all three hyperparameters at each decay.
        min_epsilon: Lower bound for ``epsilon``.
        min_gamma: Lower bound for ``gamma``.
        min_alpha: Lower bound for ``alpha``. The default of 0.0 leaves a
            non-negative ``alpha`` unbounded below, as before.
        decay_every: Number of episodes between two decay steps.
        verbose: When true, clear the cell output and print the episode number
            at every decay step.

    Returns:
        ``numpy.ndarray`` of shape ``(observation_space.n, action_space.n)``
        holding the learned action values.
    """
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    for i in range(1, episodes):
        state = env.reset()
        done = False

        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # Explore action space
            else:
                action = np.argmax(q_table[state])  # Exploit learned values

            next_state, reward, done, _info = env.step(action)

            # Blend the old estimate with the bootstrapped one-step target.
            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])
            q_table[state, action] = (1 - alpha) * old_value + alpha * \
                (reward + gamma * next_max)

            state = next_state

        if i % decay_every == 0:
            if verbose:
                clear_output(wait=True)
                print(f"Episode: {i}")
            # All three hyperparameters share one multiplier.
            alpha = max(min_alpha, alpha * decay)
            gamma = max(min_gamma, gamma * decay)
            epsilon = max(min_epsilon, epsilon * decay)

    return q_table


In [137]:
#@njit
def GridSearch(env,hyperparametersgrid, train_episodes=100001, test_episodes=100):
    """Train and score one Q-learning model per hyperparameter combination.

    Args:
        env: Environment passed unchanged to ``q_learning`` and
            ``test_q_learning``.
        hyperparametersgrid: Iterable of indexable entries whose first three
            items are ``(alpha, gamma, epsilon)``. Each value is rounded to
            one decimal place before use.
        train_episodes: ``episodes`` argument given to ``q_learning`` for
            every combination.
        test_episodes: ``episodes`` argument given to ``test_q_learning`` for
            every combination.

    Returns:
        ``numpy.ndarray`` with one score per combination, in the order the
        grid was iterated.
    """
    values = []
    for count, combination in enumerate(hyperparametersgrid, start=1):
        # Rounding removes floating-point noise such as 0.7000000000000001.
        alpha = round(combination[0], 1)
        gamma = round(combination[1], 1)
        epsilon = round(combination[2], 1)

        print("-"*10 + "training and testing of model {} Running".format(count) + "-"*10)
        q_values = q_learning(env, alpha, gamma, epsilon, train_episodes)
        values.append(test_q_learning(env, q_values, test_episodes))
        print("-"*10 + "training and testing of model {} Finished".format(count) + "-"*10)

    return np.array(values)


# Main

In [138]:
# Build the Taxi-v3 environment, then record one episode of sampled actions
# as a baseline to compare the learned policy against.
env = env_render(env_name=gym, env_version="Taxi-v3")
frames = Solve_without_RL(env=env)
# print_frames(frames)  # uncomment to replay the recorded episode

Action Space Discrete(6)
State Space Discrete(500)
Timesteps taken: 5524
Penalties incurred: 1817


In [146]:
# Train with alpha=0.7, gamma=0.8, epsilon=0.4; episodes=100001 plays 100000
# episodes because the episode counter starts at 1.
q_table = q_learning(env, alpha=0.7, gamma=0.8, epsilon=0.4, episodes=100001)

In [147]:
# Score the trained table over 100 greedy evaluation episodes.
test_q_learning(env, q_table, episodes=100)

1.524390243902439

In [11]:
# Hyperparameter grid. np.linspace with an explicit count is used instead of
# np.arange with a float step, whose number of elements depends on
# floating-point rounding at the end point.
alpha_li = np.linspace(0.1, 0.7, 3)    # 0.1, 0.4, 0.7
gamma_li = np.linspace(0.4, 0.9, 6)    # 0.4, 0.5, ..., 0.9
epsilon_li = np.linspace(0.1, 0.7, 3)  # 0.1, 0.4, 0.7

# Every (alpha, gamma, epsilon) combination: 3 * 6 * 3 = 54 entries.
c = tuple(itertools.product(alpha_li, gamma_li, epsilon_li))
len(c)


54

In [12]:
import warnings

# Silence every warning for the rest of the session so the long grid-search
# log stays readable.
warnings.filterwarnings('ignore')

# Train and score one model per grid entry, then report the best one.
scores = GridSearch(env, c)
ind = scores.argmax()
print("Highest Reward/Timestep ratio: {}".format(scores[ind]))
print("Best HyperParameters:(\n alpha: {}, \n Gamma: {}, \n Epsilon: {})"
      .format(*(round(c[ind][position], 1) for position in range(3))))


----------training and testing of model 1 Running----------
----------training and testing of model 1 Finished----------
----------training and testing of model 2 Running----------
----------training and testing of model 2 Finished----------
----------training and testing of model 3 Running----------
----------training and testing of model 3 Finished----------
----------training and testing of model 4 Running----------
----------training and testing of model 4 Finished----------
----------training and testing of model 5 Running----------
----------training and testing of model 5 Finished----------
----------training and testing of model 6 Running----------
----------training and testing of model 6 Finished----------
----------training and testing of model 7 Running----------
----------training and testing of model 7 Finished----------
----------training and testing of model 8 Running----------
----------training and testing of model 8 Finished----------
----------training and testing o

In [13]:
# Position of the highest score; it indexes both `scores` and the grid `c`.
ind = np.argmax(scores)


In [14]:
# Report the winning combination, rounded to one decimal as in GridSearch.
print("Best HyperParameters:(\n alpha: {}, \n Gamma: {}, \n Epsilon: {})"
      .format(*(round(c[ind][position], 1) for position in range(3))))


Best HyperParameters:(
 alpha: 0.7, 
 Gamma: 0.8, 
 Epsilon: 0.4)


In [18]:
# Display the score of every grid entry, in the same order as `c`.
scores

array([1.53846154, 1.54320988, 1.51745068, 1.57728707, 1.52671756,
       1.53374233, 1.53491942, 1.53609831, 1.47275405, 1.55520995,
       1.51860289, 1.5037594 , 1.55279503, 1.49031297, 1.54679041,
       1.49812734, 1.55884645, 1.49476831, 1.47601476, 1.51630023,
       1.50715901, 1.49588631, 1.46735143, 1.52091255, 1.54083205,
       1.59235669, 1.55400155, 1.53964588, 1.52091255, 1.53256705,
       1.53846154, 1.53256705, 1.5503876 , 1.52788388, 1.54202005,
       1.52788388, 1.53964588, 1.45454545, 1.56739812, 1.53022188,
       1.50715901, 1.54918668, 1.57232704, 1.52788388, 1.52439024,
       1.51745068, 1.53022188, 1.53964588, 1.53491942, 1.60513644,
       1.53609831, 1.52439024, 1.52207002, 1.55520995])