# Environment Setup

In [1]:
import gym
import random
from statistics import mean, median
import numpy as np

from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy

In [2]:
# Create the CartPole environment and read off the two sizes the rest of the
# notebook needs: how many discrete actions exist and how many features make
# up a single observation.
env = gym.make('CartPole-v1')
action_space = env.action_space.n
state_space = env.observation_space.shape[0]

In [3]:
# Observation features: cart position, cart velocity, pole angle, pole rotation rate
state_space

4

In [4]:
# Available actions: move left, move right
action_space

2

# Random Agent Performance

In [5]:
# Baseline: play `n` episodes choosing every action uniformly at random and
# record the total reward collected in each episode.
n = 20
scores = []

try:
    for episode in range(n):
        env.reset()  # the initial observation is not needed by a random policy
        score = 0
        done = False

        while not done:
            env.render()
            # Draw from every valid action index instead of a hard-coded [0, 1],
            # so the baseline stays correct if the environment is swapped.
            action = random.randrange(action_space)
            _obs, reward, done, _info = env.step(action)
            score += reward

        scores.append(score)
        print('Episode', episode+1, 'score:', score)
finally:
    # Release the window opened by env.render(), even when the cell is
    # interrupted; otherwise it lingers for the rest of the kernel session.
    env.close()

Episode 1 score: 40.0
Episode 2 score: 10.0
Episode 3 score: 19.0
Episode 4 score: 28.0
Episode 5 score: 13.0
Episode 6 score: 8.0
Episode 7 score: 28.0
Episode 8 score: 24.0
Episode 9 score: 13.0
Episode 10 score: 25.0
Episode 11 score: 18.0
Episode 12 score: 18.0
Episode 13 score: 15.0
Episode 14 score: 15.0
Episode 15 score: 26.0
Episode 16 score: 13.0
Episode 17 score: 9.0
Episode 18 score: 21.0
Episode 19 score: 37.0
Episode 20 score: 21.0


In [6]:
# Summarise the random agent's episode scores: one (label, statistic) pair per line.
for label, stat in (
    ('Mean reward random agent:\t', mean),
    ('Median reward random agent:\t', median),
    ('Max reward random agent:\t', max),
    ('Total reward random agent:\t', sum),
):
    print(label, stat(scores))

Mean reward random agent:	 20.05
Median reward random agent:	 18.5
Max reward random agent:	 40.0
Total reward random agent:	 401.0


# Deep Q Agent Performance

## Build Neural Network Architecture

In [7]:
def build_nn(state_space, action_space, window_length=2):
    """Build the feed-forward network that estimates one Q-value per action.

    Args:
        state_space: Number of features in a single observation.
        action_space: Number of discrete actions; sets the width of the
            output layer.
        window_length: Number of stacked observations that make up one
            network input. It should match the ``window_length`` of the
            replay memory the agent is built with. Defaults to 2, the value
            this notebook uses for its memory.

    Returns:
        An uncompiled ``Sequential`` model taking inputs of shape
        ``(window_length, state_space)`` and producing ``action_space``
        linear outputs.
    """
    model = Sequential()
    # The leading input axis is the observation window, not the number of
    # actions. Using action_space here only worked while it happened to equal
    # the memory's window length.
    model.add(Flatten(input_shape=(window_length, state_space)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(action_space, activation='linear'))  # Output: Q value estimate
    return model

In [8]:
# Instantiate the Q-network for this environment and print its layer summary.
model = build_nn(state_space=state_space, action_space=action_space)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 32)                288       
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
Total params: 850
Trainable params: 850
Non-trainable params: 0
_________________________________________________________________


## Create RL Agent

In [14]:
def build_network(model, action_space, eps=0.1, memory_limit=5000,
                  window_length=2, nb_steps_warmup=10,
                  target_model_update=1e-2):
    """Wrap a Q-network in a DQN agent with an epsilon-greedy policy.

    Args:
        model: The Q-network handed to the agent.
        action_space: Number of discrete actions (passed as ``nb_actions``).
        eps: Exploration rate of the epsilon-greedy policy.
        memory_limit: ``limit`` passed to the replay memory.
        window_length: ``window_length`` passed to the replay memory. It
            should match the leading input dimension the model was built with.
        nb_steps_warmup: Forwarded unchanged to ``DQNAgent``.
        target_model_update: Forwarded unchanged to ``DQNAgent``.

    Returns:
        The constructed, not yet compiled, ``DQNAgent``.
    """
    # Epsilon-greedy policy for the exploration vs exploitation trade-off.
    policy = EpsGreedyQPolicy(eps=eps)
    memory = SequentialMemory(limit=memory_limit, window_length=window_length)
    return DQNAgent(
        model=model,
        memory=memory,
        policy=policy,
        nb_actions=action_space,
        nb_steps_warmup=nb_steps_warmup,
        target_model_update=target_model_update,
    )

In [15]:
# Wrap the Q-network in a DQN agent, compile it and train on the environment.
network = build_network(model, action_space)
# `lr` is a deprecated alias in tf.keras optimizers; `learning_rate` is the
# supported keyword.
network.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# Bind the returned History object so the cell does not end on a bare repr
# and the training log stays available for later inspection.
training_history = network.fit(env, nb_steps=30000, visualize=False, verbose=1)

Training for 30000 steps ...
Interval 1 (0 steps performed)
56 episodes - episode_reward: 177.107 [16.000, 500.000] - loss: 6.852 - mae: 50.975 - mean_q: 102.040

Interval 2 (10000 steps performed)
27 episodes - episode_reward: 358.852 [180.000, 500.000] - loss: 6.182 - mae: 51.221 - mean_q: 102.826

Interval 3 (20000 steps performed)
done, took 158.758 seconds


<tensorflow.python.keras.callbacks.History at 0x22fbad752b0>

In [16]:
# Evaluate the trained agent without rendering and keep the per-episode rewards.
# In the recorded run the DQN agent reaches the maximum possible score every time.
dqn_perf = network.test(
    env,
    nb_episodes=20,
    visualize=False,
)
dqn_scores = dqn_perf.history['episode_reward']

Testing for 20 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
Episode 3: reward: 500.000, steps: 500
Episode 4: reward: 500.000, steps: 500
Episode 5: reward: 500.000, steps: 500
Episode 6: reward: 500.000, steps: 500
Episode 7: reward: 500.000, steps: 500
Episode 8: reward: 500.000, steps: 500
Episode 9: reward: 500.000, steps: 500
Episode 10: reward: 500.000, steps: 500
Episode 11: reward: 500.000, steps: 500
Episode 12: reward: 500.000, steps: 500
Episode 13: reward: 500.000, steps: 500
Episode 14: reward: 500.000, steps: 500
Episode 15: reward: 500.000, steps: 500
Episode 16: reward: 500.000, steps: 500
Episode 17: reward: 500.000, steps: 500
Episode 18: reward: 500.000, steps: 500
Episode 19: reward: 500.000, steps: 500
Episode 20: reward: 500.000, steps: 500


In [17]:
# Random Agent performance for 20 episodes
for label, stat in (
    ('Mean reward random agent:\t', mean),
    ('Median reward random agent:\t', median),
    ('Max reward random agent:\t', max),
    ('Total reward random agent:\t', sum),
):
    print(label, stat(scores))

print('----------------------------------------')

# DQN Agent performance for 20 episodes
for label, stat in (
    ('Mean reward DQN agent:\t\t', np.mean),
    ('Median reward DQN agent:\t', np.median),
    ('Max reward DQN agent:\t\t', max),
    ('Total reward DQN agent:\t\t', sum),
):
    print(label, stat(dqn_scores))

Mean reward random agent:	 20.05
Median reward random agent:	 18.5
Max reward random agent:	 40.0
Total reward random agent:	 401.0
----------------------------------------
Mean reward DQN agent:		 500.0
Median reward DQN agent:	 500.0
Max reward DQN agent:		 500.0
Total reward DQN agent:		 10000.0


In [18]:
# Visual example: replay a few test episodes with rendering switched on.
try:
    _ = network.test(env, nb_episodes=5, visualize=True)
finally:
    # Close the render window once the replay ends or is interrupted, so it
    # does not stay open for the rest of the kernel session.
    env.close()

Testing for 5 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
Episode 3: reward: 500.000, steps: 500
Episode 4: reward: 500.000, steps: 500
Episode 5: reward: 500.000, steps: 500


# Save Trained DQN Agent Weights

In [19]:
# Write the trained agent's weights to 'dqn_weights.h5f' (relative to the
# current working directory); overwrite=True is passed so an existing file
# from an earlier run does not block the save.
network.save_weights('dqn_weights.h5f', overwrite=True)