In [1]:
from __future__ import with_statement
import matplotlib.pyplot as plt
import gym
import random
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Convolution2D
from tensorflow.keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
import numba

In [2]:
def build_model(height, width, channels, actions):
    model = Sequential()
    model.add(Convolution2D(32, (8, 8), strides=(4, 4),
              activation='relu', input_shape=(3, height, width, channels)))
    model.add(Convolution2D(64, (4, 4), strides=(2, 2), activation='relu'))
    model.add(Convolution2D(64, (3, 3), activation='relu'))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [3]:
def build_agent(model, actions):
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(
    ), attr='eps', value_max=1, value_min=.1, value_test=.2, nb_steps=100000)
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=actions, nb_steps_warmup = 100000)
    return dqn

In [4]:
env = gym.make("Freeway-v0")
height, width, channels = env.observation_space.shape
actions = env.action_space.n
model = build_model(height, width, channels, actions)

In [5]:
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=0.01))
dqn.fit(env, nb_steps=100000, visualize=True, verbose=2)
print(model.summary())

Training for 100000 steps ...
  2725/100000: episode: 1, duration: 118.381s, episode steps: 2725, steps per second:  23, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.016 [0.000, 2.000],  loss: --, mean_q: --, mean_eps: --
  5470/100000: episode: 2, duration: 116.325s, episode steps: 2745, steps per second:  24, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.995 [0.000, 2.000],  loss: --, mean_q: --, mean_eps: --
  8194/100000: episode: 3, duration: 114.674s, episode steps: 2724, steps per second:  24, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.008 [0.000, 2.000],  loss: --, mean_q: --, mean_eps: --
 10923/100000: episode: 4, duration: 115.105s, episode steps: 2729, steps per second:  24, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.992 [0.000, 2.000],  loss: --, mean_q: --, mean_eps: --
 13669/100000: episode: 5, duration: 115.822s, episode steps: 2746, steps 

In [6]:
scores = dqn.test(env, nb_episodes = 10, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 10 episodes ...
Episode 1: reward: 24.000, steps: 2727
Episode 2: reward: 21.000, steps: 2743
Episode 3: reward: 23.000, steps: 2739
Episode 4: reward: 24.000, steps: 2739
Episode 5: reward: 21.000, steps: 2739
Episode 6: reward: 21.000, steps: 2722
Episode 7: reward: 25.000, steps: 2737
Episode 8: reward: 22.000, steps: 2737
Episode 9: reward: 22.000, steps: 2727
Episode 10: reward: 25.000, steps: 2739
22.8
