In [1]:
import os
import random
import numpy as np
from time import sleep
from IPython.display import clear_output
from collections import deque

import gym
import warnings
warnings.filterwarnings('ignore')

In [2]:
import keras
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.optimizers import Adam, RMSprop

Using TensorFlow backend.


## Set Environment

In [3]:
# Gym environment id; also used further down to name the saved agent file.
ENV_NAME = 'CartPole-v0'

# Build the environment, keep its `.unwrapped` form and seed it.
# The trailing `;` keeps the return value of `seed` out of the cell output.
env = gym.make(ENV_NAME).unwrapped
env.seed(90);

In [4]:
print('Environment Display:')
env.reset()  # start from a fresh initial state before drawing
env.render()

# Report the observation and action spaces, one line each.
for template, space in (('State space {}', env.observation_space),
                        ('Action space {}', env.action_space)):
    print(template.format(space))

Environment Display:
State space Box(4,)
Action space Discrete(2)


## Build DQN Architecture

In [5]:
def DQN(state_size, action_size):
    """Build and compile the fully connected Q-network.

    Args:
        state_size (tuple): input shape handed to `Input`, e.g. `(4,)`.
        action_size (int): number of units in the linear output layer
            (one Q-value per action).

    Returns:
        The compiled Keras `Model`. Its summary is printed as a side effect.
    """
    x_input = Input(state_size)
    # `Input` already fixes the input shape, so the first Dense layer does
    # not need an `input_shape` argument.
    x = Dense(512, activation='relu', kernel_initializer='he_uniform')(x_input)
    x = Dense(256, activation='relu', kernel_initializer='he_uniform')(x)
    x = Dense(64, activation='relu', kernel_initializer='he_uniform')(x)
    # Linear head: raw Q-value estimates, no squashing.
    x = Dense(action_size, activation="linear", kernel_initializer='he_uniform')(x)

    model = Model(inputs=x_input, outputs=x, name='CartPole-DQN model')
    # The network regresses Q-values with an MSE loss; a classification
    # "accuracy" metric carries no meaning here, so none is tracked.
    model.compile(loss="mse", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01))

    model.summary()
    return model

## Define DQN Agent

In [12]:
class DQNAgent():
    """The agent interacting with and learning from the environment.

    A single Q-network (built by `DQN`) is used both to choose actions and
    to compute the bootstrap targets; there is no separate target network.
    `train` and `test` read the module-level `ENV_NAME` to build the path of
    the saved model file.
    """

    def __init__(self, env_name, state_size, action_size, seed):
        """Initialize an agent object.

        Args:
            env_name (str): environment id passed to `gym.make`.
            state_size (int): number of features in one observation.
            action_size (int): number of discrete actions.
            seed (int): seed applied to Python's and NumPy's global RNGs.
        """

        self.env = gym.make(env_name).unwrapped
        # Plain attribute read by `train` when shaping the terminal reward.
        # NOTE(review): the unwrapped env presumably does not enforce this
        # limit itself (recorded training scores exceed 500) - confirm.
        self.env.max_episode_steps = 500

        self.state_size = state_size
        self.action_size = action_size

        # `random.seed()` returns None, so keep the seed value itself and
        # seed both generators the agent draws from (`random` for sampling
        # and random actions, `np.random` for the epsilon test in `act`).
        self.seed = seed
        random.seed(seed)
        np.random.seed(seed)

        self.num_episodes = 1000
        self.memory = deque(maxlen=2000)

        self.gamma = 0.95            # discount factor
        self.epsilon = 1.0           # current exploration rate
        self.epsilon_min = 0.001     # floor below which epsilon stops decaying
        self.epsilon_decay = 0.999   # multiplicative decay applied in `memorize`
        self.batch_size = 64
        self.train_start = 1000      # minimum memory size before learning starts

        # create main model
        self.model = DQN(state_size=(self.state_size,), action_size=self.action_size)

    def memorize(self, state, action, reward, next_state, done):
        """Save experience in replay memory and decay epsilon once learning has started."""

        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) > self.train_start:
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay

    def replay(self):
        """Fit the model on one mini-batch of one-step Q-learning targets.

        Does nothing until the memory holds at least `train_start` experiences.
        """

        if len(self.memory) < self.train_start:
            return

        # Size every buffer from the number of experiences actually sampled,
        # so a memory smaller than `batch_size` cannot cause an IndexError.
        n_samples = min(len(self.memory), self.batch_size)
        if n_samples == 0:
            return

        # randomly sample mini-batch from the memory
        batch = random.sample(self.memory, n_samples)

        state = np.zeros((n_samples, self.state_size))
        next_state = np.zeros((n_samples, self.state_size))
        action, reward, done = [], [], []

        for i, (s, a, r, s_next, d) in enumerate(batch):
            state[i] = s
            next_state[i] = s_next
            action.append(a)
            reward.append(r)
            done.append(d)

        # do batch prediction to save speed
        target = self.model.predict(state)
        target_next = self.model.predict(next_state)

        for i in range(n_samples):
            # correction on the Q value for the action used
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                # The same network both selects and evaluates the next action:
                # Q_max = max_a' Q(s', a')
                target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))

        # train the neural network with batches
        self.model.fit(state, target, batch_size=n_samples, verbose=0)

    def act(self, state):
        """Returns actions for given state as per current policy.

        With probability `epsilon` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.
        """

        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)

        # Accept a flat observation as well as an already batched one; the
        # model is always fed a (1, state_size) array.
        state = np.reshape(state, [1, self.state_size])
        return np.argmax(self.model.predict(state))

    def save(self, agent_location):
        """Save the model to `agent_location`, creating its directory if needed."""
        directory = os.path.dirname(agent_location)
        if directory:
            os.makedirs(directory, exist_ok=True)
        self.model.save(agent_location)

    def load(self, agent_location):
        """Replace the current model with the one stored at `agent_location`."""
        self.model = load_model(agent_location)

    def train(self):
        """Run up to `num_episodes` training episodes.

        Training stops early, after saving the model, as soon as an episode
        finishes with a score above 500.
        """

        for i_episode in range(1, self.num_episodes+1):

            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False

            score = 0
            while not done:
                self.env.render()
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                # Penalize the step that ends the episode, except when it ends
                # exactly on step `max_episode_steps`.
                if done and score != self.env.max_episode_steps - 1:
                    reward = -100

                self.memorize(state, action, reward, next_state, done)
                state = next_state

                score += 1
                if done:
                    clear_output(wait=True)
                    print(f'Episode: {i_episode}/{self.num_episodes}, Score: {score}, Epsilon: {self.epsilon:.2}')
                    if score > 500:
                        print(f'Saving trained agent as DQN_{ENV_NAME}.h5')
                        self.save(f'./agents/DQN_{ENV_NAME}.h5')
                        return

                self.replay()

    def test(self, num_episodes):
        """Load the saved model and run `num_episodes` greedy episodes.

        Prints the running best score after every episode.
        """

        self.load(f'./agents/DQN_{ENV_NAME}.h5')
        best_score = -np.inf
        for i_episode in range(1, num_episodes+1):

            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False

            score = 0
            while not done:

                self.env.render()
                action = np.argmax(self.model.predict(state))
                next_state, _, done, _ = self.env.step(action)

                score += 1
                if score > best_score:
                    best_score = score
                state = np.reshape(next_state, [1, self.state_size])
                if done:
                    clear_output(wait=True)
                    # Report progress against the requested episode count,
                    # not the training-episode budget.
                    print(f'Episode: {i_episode}/{num_episodes}, Best Score: {best_score}', end='')
                    break

In [13]:
# Network dimensions are taken from the demo environment created above.
n_state_features = env.observation_space.shape[0]
n_actions = env.action_space.n

# NOTE(review): the agent is built on 'CartPole-v1' while ENV_NAME is
# 'CartPole-v0' - confirm this difference is intended.
agent = DQNAgent(
    env_name='CartPole-v1',
    state_size=n_state_features,
    action_size=n_actions,
    seed=90,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 4)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 512)               2560      
_________________________________________________________________
dense_6 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_7 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 130       
Total params: 150,466
Trainable params: 150,466
Non-trainable params: 0
_________________________________________________________________


In [8]:
# watch an untrained agent
state = env.reset()
try:
    for time_step in range(200):

        # select an action; the observation is reshaped to the
        # (1, state_size) batch form that DQNAgent.train also feeds to act,
        # so the call works even when act takes the model-prediction branch
        action = agent.act(np.reshape(state, [1, agent.state_size]))
        env.render()

        next_state, reward, done, _ = env.step(action)
        state = next_state
        if done:
            break
finally:
    # always release the render window, even if a step raises
    env.close()

## Train The Agent

In [9]:
# Run the training loop: up to `agent.num_episodes` episodes, stopping early
# (after saving the model under ./agents/) once an episode scores above 500.
agent.train()

Episode: 93/1000, Score: 553, Epsilon: 0.001
Saving trained agent as DQN_CartPole-v0.h5


## Watch The Smart Agent

In [14]:
# Replace the agent's model with the one saved by training.
# Note: `agent.test` loads this same file again on its own.
agent.load(f'./agents/DQN_{ENV_NAME}.h5')

In [15]:
# Run 100 greedy (argmax) episodes with rendering and print the best score seen.
agent.test(num_episodes=100)

Episode: 100/1000, Best Score: 916

---