# Training the models
In this notebook we train several increasingly complex Deep Q-Learning models, saving the best weights of each run alongside its training logs. We experiment with different exploration policies and with improvements upon the basic deep Q-learning model, such as dueling networks and double deep Q-learning.

**NOTE ON RUNNING WITH TENSORFLOW** The infrequent updates actually make the GPU backend slower than CPU due to data transfer overhead. If using tensorflow as a backend for keras, it is best to use the CPU backend with optimizations.

In [1]:
import sys
import os

from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent, Sequential
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy, BoltzmannQPolicy
from rl.callbacks import FileLogger

# add local path
sys.path.append(os.path.join(os.getcwd(), "./src"))

from rl_model import TorcsKerasTrainer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Experiment-wide settings shared by the training cells of this notebook.
NB_STEPS = 10 ** 5  # training steps per run (handed to fit() as nb_steps)
NB_OF_RUNS = 10  # how many independent runs each experiment gets

## Simple deep q-learning
We begin by defining the model for our simplest method: a deep Q-learning model with an epsilon-greedy policy, where epsilon is fixed at 5%. The deep model is a 3-layer 35-25-18 network with ReLU activations on the hidden layers. 10 training runs of $10^5$ steps each are performed.

In [3]:
def build_model(obs_shape, nb_actions):
    """Build and compile the baseline DQN agent (fixed 5% epsilon-greedy policy).

    Args:
        obs_shape: tuple describing a single observation; it is prefixed with the
            window length (1) to form the input shape of the network.
        nb_actions: number of discrete actions, i.e. the width of the output layer.

    Returns:
        The compiled ``DQNAgent``.
    """
    # Small fully connected Q-network:
    # Flatten -> Dense(35) + relu -> Dense(25) + relu -> Dense(nb_actions) + linear.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + obs_shape))
    model.add(Dense(35))
    model.add(Activation('relu'))
    model.add(Dense(25))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    # summary() prints the table itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the cell output.
    model.summary()

    # Replay memory whose capacity is the notebook-level NB_STEPS constant.
    memory = SequentialMemory(limit=NB_STEPS, window_length=1)

    # Deep Q-learning agent with an epsilon-greedy policy (eps = 0.05) and
    # 10000 warm-up steps. target_model_update=1e-2 is below 1, which keras-rl
    # interprets as a soft target-network update rate, not a step interval.
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10000,
                   target_model_update=1e-2, policy=EpsGreedyQPolicy(0.05))
    # Any built-in Keras optimizer and metrics can be used here.
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn

Run the simulation `NB_OF_RUNS` (10) times.

In [None]:
# NB_OF_RUNS and NB_STEPS come from the configuration cell at the top of the
# notebook; NB_OF_RUNS is deliberately not redefined here so that changing the
# configuration affects every experiment consistently.
for run_nb in range(NB_OF_RUNS):
    # A new trainer is constructed for every run and is handed the build_model
    # function currently defined in the notebook.
    dqn = TorcsKerasTrainer(throttle=True, discrete_actions=9, model_function=build_model,
                            obs_fields=['speedX', 'speedY', 'speedZ', 'track', 'focus', 'trackPos', 'angle'])
    # Save the best model for each one of the runs and
    # also log every episode result in a separate file for later use.
    dqn.fit(nb_steps=NB_STEPS, best_filename='saved_weights/simple_dqn_best_{}.h5f'.format(run_nb), verbose=1,
            callbacks=[FileLogger('logs/simple_dqn_{}.json'.format(run_nb))])

## Linearly annealed eps
This one is the same as the previous model, but epsilon starts large and is slowly decreased over time (here linearly from 1.0 to 0.05 over the first 40,000 steps). In theory this should increase exploration early in training.

In [5]:
def build_model(obs_shape, nb_actions):
    """Build and compile a DQN agent with a linearly annealed epsilon-greedy policy.

    Args:
        obs_shape: tuple describing a single observation; it is prefixed with the
            window length (1) to form the input shape of the network.
        nb_actions: number of discrete actions, i.e. the width of the output layer.

    Returns:
        The compiled ``DQNAgent``.
    """
    # Small fully connected Q-network:
    # Flatten -> Dense(35) + relu -> Dense(25) + relu -> Dense(nb_actions) + linear.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + obs_shape))
    model.add(Dense(35))
    model.add(Activation('relu'))
    model.add(Dense(25))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    # summary() prints the table itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the cell output.
    model.summary()

    # Replay memory whose capacity is the notebook-level NB_STEPS constant.
    memory = SequentialMemory(limit=NB_STEPS, window_length=1)
    # Epsilon is annealed from 1.0 down to 0.05 over 40000 steps; the test-time
    # value is 0.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.05,
                                  value_test=.0, nb_steps=40000)

    # Deep Q-learning agent with 10000 warm-up steps. target_model_update=1e-2
    # is below 1, which keras-rl interprets as a soft target-network update
    # rate, not a step interval.
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10000,
                   target_model_update=1e-2, policy=policy)
    # Any built-in Keras optimizer and metrics can be used here.
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    return dqn

In [None]:
for run_nb in range(NB_OF_RUNS):
    # Each run gets its own trainer, wired to the build_model currently defined
    # in the notebook.
    dqn = TorcsKerasTrainer(
        model_function=build_model,
        discrete_actions=9,
        throttle=True,
        obs_fields=['speedX', 'speedY', 'speedZ', 'track', 'focus', 'trackPos', 'angle'],
    )
    # Keep the best weights of every run, and write the per-episode results of
    # the run to their own JSON log for later use.
    dqn.fit(
        nb_steps=NB_STEPS,
        verbose=1,
        best_filename='saved_weights/simple_dqn_linear_eps_best_%d.h5f' % run_nb,
        callbacks=[FileLogger('logs/simple_dqn_linear_eps_%d.json' % run_nb)],
        min_step_save=40000,
    )

## Boltzmann Exploration

In [7]:
def build_model(obs_shape, nb_actions):
    """Build and compile a DQN agent that explores with a Boltzmann Q-policy.

    Args:
        obs_shape: tuple describing a single observation; it is prefixed with the
            window length (1) to form the input shape of the network.
        nb_actions: number of discrete actions, i.e. the width of the output layer.

    Returns:
        The compiled ``DQNAgent``.
    """
    # Small fully connected Q-network:
    # Flatten -> Dense(35) + relu -> Dense(25) + relu -> Dense(nb_actions) + linear.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + obs_shape))
    model.add(Dense(35))
    model.add(Activation('relu'))
    model.add(Dense(25))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    # summary() prints the table itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the cell output.
    model.summary()

    # Replay memory whose capacity is the notebook-level NB_STEPS constant.
    memory = SequentialMemory(limit=NB_STEPS, window_length=1)
    # Boltzmann exploration with the library's default parameters.
    policy = BoltzmannQPolicy()

    # Deep Q-learning agent with 10000 warm-up steps. target_model_update=1e-2
    # is below 1, which keras-rl interprets as a soft target-network update
    # rate, not a step interval.
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10000,
                   target_model_update=1e-2, policy=policy)
    # Any built-in Keras optimizer and metrics can be used here.
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    return dqn

In [None]:
for run_nb in range(NB_OF_RUNS):
    # The trainer must be bound to `dqn`, the name fit() is called on below.
    # It was previously assigned to `yqn`, so fit() ran on a stale trainer left
    # over from an earlier cell (or raised NameError on a fresh kernel).
    dqn = TorcsKerasTrainer(throttle=True, discrete_actions=9, model_function=build_model,
                            obs_fields=['speedX', 'speedY', 'speedZ', 'track', 'focus', 'trackPos', 'angle'])
    # Save the best model for each one of the runs and
    # also log every episode result in a separate file for later use.
    dqn.fit(nb_steps=NB_STEPS, best_filename='saved_weights/simple_dqn_boltzmann_best_{}.h5f'.format(run_nb), verbose=1,
            callbacks=[FileLogger('logs/simple_dqn_boltzmann_{}.json'.format(run_nb))])

## Dueling DQN
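Splits the network into two streams: one estimating the value of the state, and one estimating the advantage of each action in that state. The two are then combined into Q-values; with the `avg` dueling type used here, the mean advantage is subtracted before the state value is added.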

In [9]:
def build_model(obs_shape, nb_actions):
    """Build and compile a dueling DQN agent (fixed 5% epsilon-greedy policy).

    Args:
        obs_shape: tuple describing a single observation; it is prefixed with the
            window length (1) to form the input shape of the network.
        nb_actions: number of discrete actions, i.e. the width of the output layer.

    Returns:
        The compiled ``DQNAgent`` with ``enable_dueling_network=True``.
    """
    # Small fully connected Q-network:
    # Flatten -> Dense(35) + relu -> Dense(25) + relu -> Dense(nb_actions) + linear.
    # NOTE(review): keras-rl appears to attach the dueling head to
    # model.layers[-2]; with a separate Activation layer at the end that is the
    # Dense(nb_actions) layer rather than the last hidden layer. Confirm this
    # is intended.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + obs_shape))
    model.add(Dense(35))
    model.add(Activation('relu'))
    model.add(Dense(25))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    # summary() prints the table itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the cell output.
    model.summary()

    # Replay memory whose capacity is the notebook-level NB_STEPS constant.
    memory = SequentialMemory(limit=NB_STEPS, window_length=1)

    # Dueling deep Q-learning agent ('avg' dueling type) with an epsilon-greedy
    # policy (eps = 0.05) and 10000 warm-up steps. target_model_update=1e-2 is
    # below 1, which keras-rl interprets as a soft target-network update rate,
    # not a step interval.
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10000,
                   target_model_update=1e-2, policy=EpsGreedyQPolicy(0.05), enable_dueling_network=True,
                   dueling_type='avg')
    # Any built-in Keras optimizer and metrics can be used here.
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn

In [None]:
for run_nb in range(NB_OF_RUNS):
    # Each run gets its own trainer, wired to the build_model currently defined
    # in the notebook.
    dqn = TorcsKerasTrainer(
        model_function=build_model,
        discrete_actions=9,
        throttle=True,
        obs_fields=['speedX', 'speedY', 'speedZ', 'track', 'focus', 'trackPos', 'angle'],
    )
    # Keep the best weights of every run, and write the per-episode results of
    # the run to their own JSON log for later use.
    dqn.fit(
        nb_steps=NB_STEPS,
        verbose=1,
        best_filename='saved_weights/dueling_dqn_best-%d.h5f' % run_nb,
        callbacks=[FileLogger('logs/dueling_dqn_%d.json' % run_nb)],
    )

Waiting for server on 3101............
Count Down : 5
Waiting for server on 3101............
Count Down : 4
Waiting for server on 3101............
Count Down : 3
Waiting for server on 3101............
Count Down : 2
Waiting for server on 3101............
Count Down : 1
Waiting for server on 3101............
Count Down : 0
Waiting for server on 3101............
Count Down : -1
relaunch torcs
Waiting for server on 3101............
Count Down : 4
Waiting for server on 3101............
Count Down : 3
Waiting for server on 3101............
Count Down : 2
Waiting for server on 3101............
Count Down : 1
Waiting for server on 3101............
Count Down : 0
Waiting for server on 3101............
Count Down : -1
relaunch torcs
Client connected on 3101..............
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_30 (Flatten)         (None, 29)                0         
_____________________________

Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connec

Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Reward of 3826.583588605449 for episode 90 better than last -- SAVING

Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client conn

 1095/10000 [==>...........................] - ETA: 3:37 - reward: 20.2193Waiting for server on 3101............
Count Down : 5
Client connected on 3101..............
 1130/10000 [==>...........................] - ETA: 3:41 - reward: 20.1406Waiting for server on 3101............
Count Down : 5
Client connected on 3101..............
 1169/10000 [==>...........................] - ETA: 3:43 - reward: 19.6916Waiting for server on 3101............
Count Down : 5
Client connected on 3101..............
 1204/10000 [==>...........................] - ETA: 3:47 - reward: 19.7717Waiting for server on 3101............
Count Down : 5
Client connected on 3101..............
 1246/10000 [==>...........................] - ETA: 3:50 - reward: 20.4211Waiting for server on 3101............
Count Down : 5
Client connected on 3101..............
 1281/10000 [==>...........................] - ETA: 3:52 - reward: 20.4068Waiting for server on 3101............
Count Down : 5
Client connected on 3101.............

Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connec

Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Reward of 8486.261653403255 for episode 232 better than last -- SAVING

Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
117 episodes - episode_reward: 2958.371 [-662.201, 8486.262] - loss: 14137.000 - mean_absolute_error: 1741.018 - mean_q: 1936.478

Interval 3 (20000 steps performed)
  134/10000 [..............................] - ETA: 1:46 - reward: -6.9545Waiting for server on 3101............
Count Down : 5
Client connected on 3101..............
 1003/10000 [==>...........................] - ETA: 1:49 - reward: 4.7287Waiting for server on 3101............
Count Down :

Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Reward of 21886.25521378849 for episode 282 better than last -- SAVING

Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
16 episodes - episode_reward: 6580.671 [-432.916, 21886.255] - loss: 72338.578 - mean_absolute_error: 5458.431 - mean_q: 5900.391

Interval 6 (50000 steps performed)
  164/10000 [..............................] - ETA: 1:45 - reward: -4.2109Waiting for server on 3101............
Count Down : 5
Client connected on 3101..............
  666/10000 [>..

Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
17 episodes - episode_reward: 56978.624 [14591.941, 146135.039] - loss: 47156.078 - mean_absolute_error: 5094.264 - mean_q: 5509.278

Interval 8 (70000 steps performed)
   42/10000 [..............................] - ETA: 2:26 - reward: 88.3885 Waiting for server on 3101............
Count Down : 5
Client connected on 3101..............
  152/10000 [..............................] - ETA: 3:17 - reward: 80.3295Waiting for server on 3101............
Count Down : 5
Client connected on 3101..............
  467/10000 [>.............................] - ETA: 2:33 - reward: 88.4015Waiting for server on 3101............
Count Down : 5
Client connected on 3101..............
  648/10000 [>.............................] - ETA: 2:38 - reward: 85.7958Waiting for server on 3101............
Count Down : 5
Client connected on 3101..............


Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
36 episodes - episode_reward: 26720.379 [4307.690, 87088.150] - loss: 104891.164 -

Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
Count Down : 5
Client connected on 3101..............
done, took 1469.173 seconds
Waiting for server on 3101............
Count Down : 5
Waiting for server on 3101............
Count Down : 4
Waiting for server on 3101............
Count Down : 3
Waiting for server on 3101............
Count Down : 2
Waiting for server on 3101............
Count Down : 1
Waiting for server on 3101............
Count Down : 0
Waiting for server on 3101............
Count Down : -1
relaunch torcs
Client connected on 3101..............
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_31 (Flatten)         (None, 29)                0         
_________________________________________________________________
dense_92 (Dense)             (None, 35)                1050

## Double DQN
This approach decouples action selection from action evaluation. One network (the online model) picks the best next action, much like a policy function, while the other (the target model) estimates the Q-value of that chosen action.

In [None]:
def build_model(obs_shape, nb_actions):
    """Build and compile a double DQN agent (fixed 5% epsilon-greedy policy).

    Args:
        obs_shape: tuple describing a single observation; it is prefixed with the
            window length (1) to form the input shape of the network.
        nb_actions: number of discrete actions, i.e. the width of the output layer.

    Returns:
        The compiled ``DQNAgent`` with ``enable_double_dqn=True``.
    """
    # Small fully connected Q-network:
    # Flatten -> Dense(35) + relu -> Dense(25) + relu -> Dense(nb_actions) + linear.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + obs_shape))
    model.add(Dense(35))
    model.add(Activation('relu'))
    model.add(Dense(25))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    # summary() prints the table itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the cell output.
    model.summary()

    # Replay memory whose capacity is the notebook-level NB_STEPS constant.
    memory = SequentialMemory(limit=NB_STEPS, window_length=1)

    # Double deep Q-learning agent with an epsilon-greedy policy (eps = 0.05)
    # and 10000 warm-up steps. target_model_update=1e-2 is below 1, which
    # keras-rl interprets as a soft target-network update rate, not a step
    # interval.
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10000,
                   target_model_update=1e-2, policy=EpsGreedyQPolicy(0.05), enable_double_dqn=True)
    # Any built-in Keras optimizer and metrics can be used here.
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn

In [None]:
for run_nb in range(NB_OF_RUNS):
    # Each run gets its own trainer, wired to the build_model currently defined
    # in the notebook.
    dqn = TorcsKerasTrainer(
        model_function=build_model,
        discrete_actions=9,
        throttle=True,
        obs_fields=['speedX', 'speedY', 'speedZ', 'track', 'focus', 'trackPos', 'angle'],
    )
    # Keep the best weights of every run, and write the per-episode results of
    # the run to their own JSON log for later use.
    dqn.fit(
        nb_steps=NB_STEPS,
        verbose=1,
        best_filename='saved_weights/double_dqn_best-%d.h5f' % run_nb,
        callbacks=[FileLogger('logs/double_dqn_%d.json' % run_nb)],
    )