### Reinforcement Learning example for the motion of a driving agent on a straight road.

Discrete State Space
-   see simple_road_env.py

Action Space:
-	“Maintain” current lane and speed,
-	“Accelerate” at rate = a1[m/s2], provided velocity does not exceed vmax[km/h],
-	“Decelerate” at rate = −a1[m/s2], provided velocity is above vmin[km/h],
-	“Hard Accelerate” at rate = a2[m/s2], provided velocity does not exceed vmax[km/h],
-	“Hard Decelerate” at rate = −a2[m/s2], provided velocity is above vmin[km/h],
(each acceleration is applied as a constant amount during the time step)

In [None]:
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time  # to time the learning process
import json  # to get the configuration of the environment
from env.simple_road_env import Road

In [None]:
from models.training_agent import train_agent
from models.simple_brains import QLearningTable
from models.simple_brains import DP
from models.simple_DQN_tensorflow import DeepQNetwork
from utils.visualization import display_results
from collections import deque
import math
from utils.logger import Logger

In [None]:
# Notebook-wide display settings.
# Optional (disabled): fix numpy's global RNG seed with np.random.seed(0).

# Default size applied to every matplotlib figure.
plt.rcParams['figure.figsize'] = [20, 10]

# Print numpy floats with two decimals.
np.set_printoptions(formatter={'float': "{:0.2f}".format})

In [24]:
# --- Problem definition ---
# Names of the discrete actions and of the features that describe a state.
actions_list = ["no_change", "speed_up", "speed_up_up", "slow_down", "slow_down_down"]
state_features_list = ["position", "velocity"]  # , "obstacle_position"]

# --- Environment ---
flag_tkinter = False  # forwarded to Road(...) and to the training/testing calls
initial_state = [0, 3, 12]
goal_velocity = 3
env = Road(flag_tkinter, actions_list, state_features_list, initial_state, goal_velocity)

# --- Snapshot of the environment configuration ---
# Work on a shallow copy of the instance attributes so `env` itself is not modified.
env_configuration = vars(env)
dict_configuration = dict(env_configuration)

# Attributes of special types that are left out of the JSON dump.
not_to_consider = ["tk", "children", "canvas", "_tclCommands", "master", "_tkloaded", "colour_action_code",
                   "colour_velocity_code", "origin_coord", "display_canvas", "origin", "_last_child_ids", "rect",
                   "logger"]
for elem in not_to_consider:
    # pop with a default: attributes the environment does not have are skipped
    dict_configuration.pop(elem, None)

# Save the remaining configuration as JSON.
with open('env/simple_road_env_configuration.json', 'w') as outfile:
    json.dump(dict_configuration, outfile)

# ---------------------------------------------------------------------------
# Algorithm used to update the state-action table
# ---------------------------------------------------------------------------
# -1- Temporal-Difference: all variants are working, "q" performs the best.
method_used = "q"

# Instantiate the agent.
brain_agent = QLearningTable(actions=actions_list, state=state_features_list, load_q_table=False)

# ---------------------------------------------------------------------------
# What to run: training and/or testing
# ---------------------------------------------------------------------------
flag_training_once = True
flag_testing = False
flag_training_hyper_parameter_tuning = False  # Tkinter is not used when tuning hyper-parameters
display_learning_results = False  # only used for training_once

# ---------------------------------------------------------------------------
# Testing settings
# ---------------------------------------------------------------------------
max_nb_steps_testing = 50
nb_tests = 10
sleep_time_between_steps_testing = 0.5  # slow, so that the individual steps can be seen

# ---------------------------------------------------------------------------
# Learning hyper-parameters
# ---------------------------------------------------------------------------
gamma_learning = 0.99
learning_rate_learning = 0.02
eps_start_learning = 1.0
eps_end_training = 0.01

# Epsilon decay. The two relations between the decay rate and the episode at which
# eps_end is reached:
#   episode_id = log10(eps_end / eps_start) / log10(eps_decay)
#   eps_decay  = (eps_end / eps_start) ** (1 / episode_id)
# With eps going from 1.0 to 0.01: 0.998466 gives ~3000 episodes, 0.99907 gives ~5000.
eps_decay_training = 0.998466

max_nb_episodes_training = 7000
max_nb_steps_training = 25
sleep_time_between_steps_learning = 0.0005

# ---------------------------------------------------------------------------
# Success conditions
# ---------------------------------------------------------------------------
window_success_res = 100
threshold_success_training = 17
dict_info_training = {}
# Reference values noted during earlier experiments:
#   22.97 for self.reward = 1 + self.reward / max(self.rewards_dict.values())
#   q_max = 9.23562904132267 for expected_sarsa

if flag_training_hyper_parameter_tuning:
    # Hyper-parameter sweep (no Tkinter used). One full train + test cycle is run per
    # epsilon-decay rate; results of run k are written under the folder "k/".

    # NOTE(review): these two candidate lists are defined but never swept below -
    # every run uses learning_rate_learning and gamma_learning.
    learning_rate_list = [0.003, 0.01, 0.03, 0.1, 0.3, 1]
    gamma_learning_list = [0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99, 1]

    # One decay rate per target number of episodes needed to reach eps_end:
    # eps_decay = (eps_end / eps_start) ** (1 / nb_episodes)
    nb_episodes_to_plateau_list = [300, 500, 800, 1000, 3000, 5000]
    eps_ratio = eps_end_training / eps_start_learning
    eps_decay_list = [eps_ratio ** (1/nb_episodes) for nb_episodes in nb_episodes_to_plateau_list]

    for run_id, eps_decay in enumerate(eps_decay_list):
        # re-initialize the model before each run
        brain_agent.reset_q_table()

        folder_name_training = f"{run_id}/"
        logger_name = f"{run_id}.log"
        logger = Logger(folder_name_training, logger_name, 0)

        hyper_parameters = (
            method_used,
            gamma_learning,
            learning_rate_learning,
            eps_start_learning,
            eps_end_training,
            eps_decay,
        )
        logger.log(str(hyper_parameters), 1)

        # Result containers are handed to train_agent by reference.
        returns_list_res = []
        steps_counter_list_res = []
        dict_info_training = {}

        train_agent(flag_tkinter, brain_agent, *hyper_parameters,
                    window_success_res, threshold_success_training, returns_list_res,
                    steps_counter_list_res, dict_info_training,
                    max_nb_episodes_training, max_nb_steps_training, sleep_time_between_steps_learning,
                    folder_name_training)
        logger.log(dict_info_training, 1)

        # Best effort: a failure while producing the results must not abort the sweep.
        try:
            display_results(brain_agent, method_used, returns_list_res, window_success_res,
                            threshold_success_training, steps_counter_list_res,
                            display_flag=False, folder_name=folder_name_training)
        except Exception as e:
            print('Exception = {}'.format(e))

        # Testing. The list is passed by reference.
        # NOTE(review): test_agent is not imported in any visible cell; unless it is defined
        # elsewhere this call raises NameError - confirm where it lives and import it.
        returns_list_testing = []
        test_agent(flag_tkinter, brain_agent, returns_list_testing, nb_tests, max_nb_steps_testing,
                   sleep_time_between_steps_learning, folder_name_training + "q_table.pkl")
        logger.log(returns_list_testing, 1)

if flag_training_once:
    # Single training run with the hyper-parameters configured earlier in the notebook.
    hyper_parameters = (
        method_used,
        gamma_learning,
        learning_rate_learning,
        eps_start_learning,
        eps_end_training,
        eps_decay_training,
    )
    print("hyper_parameters = {}".format(hyper_parameters))

    # Result containers are handed to train_agent by reference, since the return value
    # of a Tkinter callback cannot be read.
    returns_list_res = []
    steps_counter_list_res = []

    # Positional arguments shared by both ways of launching the training.
    train_args = (
        flag_tkinter, brain_agent, *hyper_parameters,
        window_success_res, threshold_success_training, returns_list_res,
        steps_counter_list_res, dict_info_training,
        max_nb_episodes_training, max_nb_steps_training, sleep_time_between_steps_learning,
    )

    if not flag_tkinter:
        train_agent(*train_args)
    else:
        # after(delay_ms, func, *args): schedule train_agent as a callback, then hand
        # control to the Tkinter main loop.
        env.after(100, train_agent, *train_args)
        env.mainloop()
        print("returns_list_res = {}, window_success_res = {}, steps_counter_list_res = {}".format(
            returns_list_res, window_success_res, steps_counter_list_res))

    # Best effort: a failure while producing the results is reported, not raised.
    try:
        display_results(brain_agent, method_used, returns_list_res, window_success_res,
                        threshold_success_training, steps_counter_list_res,
                        display_flag=display_learning_results)
    except Exception as e:
        print('Exception = {}'.format(e))
    print("hyper_parameters = {}".format(hyper_parameters))

    # print(brain_agent.reference_list)

if flag_testing:
    # Run the test episodes; the list of returns is passed by reference.
    # NOTE(review): test_agent is not imported in any visible cell; unless it is defined
    # elsewhere this branch raises NameError - confirm where it lives and import it.
    returns_list_testing = []

    # Positional arguments shared by both ways of launching the tests.
    test_args = (
        flag_tkinter, brain_agent, returns_list_testing, nb_tests, max_nb_steps_testing,
        sleep_time_between_steps_testing,
    )

    if not flag_tkinter:
        test_agent(*test_args)
    else:
        # Schedule the tests as a Tkinter callback (after 100 ms) and start the main loop.
        env.after(100, test_agent, *test_args)
        env.mainloop()

reset_q_table - self.q_table has shape = (0, 7)
hyper_parameters = ('q', 0.99, 0.02, 1.0, 0.01, 0.998466)

 --- Episode=0 ---
 eps=0.998466
 Average Score in returns_window = -77.00 
 duration=0.06
Episode 1 / 7000. Eps = 0.998466. Total_steps = 7. Return = -77. Max return = -inf, Top 10 = [-77]
Episode 21 / 7000. Eps = 0.9682753947729185. Total_steps = 8. Return = -44. Max return = 14, Top 10 = [14, -24, -44, -44, -66, -76, -77, -77, -86, -90]
Episode 41 / 7000. Eps = 0.9389976625369826. Total_steps = 10. Return = -84. Max return = 14, Top 10 = [14, -24, -44, -44, -66, -70, -76, -77, -77, -82]
Episode 61 / 7000. Eps = 0.9106052007618131. Total_steps = 15. Return = -111. Max return = 14, Top 10 = [14, -24, -30, -43, -44, -44, -66, -67, -70, -72]
Episode 81 / 7000. Eps = 0.8830712415344311. Total_steps = 8. Return = 8. Max return = 14, Top 10 = [14, 8, -14, -18, -19, -24, -30, -43, -44, -44]

 --- Episode=100 ---
 eps=0.8563698263229421
 Average Score in returns_window = -110.22 
 durat

KeyboardInterrupt: 