# Developing a DQN Algorithm from Scratch for CartPole

In [1]:
# Confirm that the notebook is running on the intended Python interpreter
# (the path displayed below should point into the expected environment).
import sys
sys.executable

'/Users/sandeep/miniconda3/envs/rlenv/bin/python'

- Code modified from: https://github.com/keon/deep-q-learning

# Imports

In [3]:
import numpy as np
import gym
import random
import keras.backend as k
from collections import deque

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

In [6]:
ENV_NAME = 'CartPole-v0'

# Create the environment and read the number of discrete actions it exposes.
env = gym.make(ENV_NAME)

# Seed every random source this notebook draws from so runs are more
# repeatable: NumPy drives the agent's epsilon test, the stdlib `random`
# module drives random action choice and replay sampling, and the environment
# keeps its own generator.
np.random.seed(1)
random.seed(1)
env.seed(1)
nb_actions = env.action_space.n

In [7]:
# Length of the first axis of the observation space; used as the network's
# input dimension and when reshaping states to (1, input_shape).
input_shape = env.observation_space.shape[0]
print(input_shape)

4


# Simple NN Model

In [9]:
# Small fully connected network: one hidden ReLU layer followed by one linear
# output per action (the agent reads these outputs as per-action values).
model = Sequential()
model.add(Dense(16, input_dim=input_shape, activation='relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
model.compile(loss='mse', optimizer=Adam(lr=0.001))

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 16)                80        
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


# Developing a DQN Agent

In [10]:
# Deep Q-learning Agent
class SimpleAgent:
    """Minimal DQN-style agent: epsilon-greedy policy plus experience replay.

    The agent stores transitions in a bounded replay memory and trains the
    supplied model on randomly sampled transitions.

    Args:
        action_size: Number of discrete actions to choose from.
        model: Object exposing ``predict(state)`` and
            ``fit(state, target, epochs=..., verbose=...)``. Row 0 of the
            prediction is read as one value estimate per action.
        exp: Initial exploration rate (epsilon). ``0.0`` gives a purely
            greedy agent, ``1.0`` a purely random one.
    """

    def __init__(self, action_size, model, exp=1.0):
        self.action_size = action_size
        self.memory = deque(maxlen=10000)  # bounded replay memory
        self.gamma = 0.99            # discount factor
        self.epsilon = exp           # exploration rate
        self.epsilon_min = 0.01      # floor: we always keep some exploration
        self.epsilon_decay = 0.999   # multiplicative decay per replay() call
        self.model = model

    def buffer(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def get_act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Input passed unchanged to ``model.predict``; the visible
                callers pass an array reshaped to ``(1, n_features)``.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest value in the first row of the prediction.
        """
        # Strict comparison: rand() draws from [0, 1), so with epsilon == 0.0
        # the random branch must be unreachable.
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)

        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the model on a random minibatch, then decay epsilon.

        Each sampled transition is fitted individually. The regression target
        equals the model's current prediction for the state, except for the
        entry of the action actually taken, which is replaced by the
        one-step target.

        Args:
            batch_size: Number of transitions to sample from memory.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bootstrap from the best predicted value of the next state.
                best_next = np.amax(self.model.predict(next_state)[0])
                target = reward + self.gamma * best_next

            # Start from the current predictions so that only the value of
            # the action taken contributes to the loss.
            target_f = self.model.predict(state)
            target_f[0][action] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)

        # Decay exploration, but never step below the configured floor. An
        # agent created with epsilon already at or below the floor (e.g.
        # exp=0.0 for evaluation) is left untouched.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

In [11]:
# Re-create the gym environment for training and build the agent.
# This is a new environment object, so the seed applied to the earlier
# instance is applied again here to keep episodes repeatable.
env = gym.make(ENV_NAME)
env.seed(1)
nb_actions = env.action_space.n
agent = SimpleAgent(nb_actions, model)

# Load the Model

In [13]:
# Warm-start from previously saved weights so training continues from them
# instead of from the freshly initialized network.
# NOTE(review): presumably fails if the weights file has not been written by
# an earlier run of the save cell — confirm before running on a fresh setup.
model.load_weights('Trained_Scratch_Cartpole.h5f')

# Training of the Model: Let us Play!!

In [14]:
# Same expression as `input_shape` defined earlier; used in the training loop
# when reshaping next_state to (1, state_size).
state_size = env.observation_space.shape[0]

In [15]:
done = False
batch_size = 64
num_episodes = 20
max_steps = 200


for e in range(num_episodes):
    state = env.reset()
    # Reshape to a batch of one so the state can be fed to the network
    state = np.reshape(state, [1, state_size])
    for steps in range(max_steps):
        # Visualization of the env
        # env.render()

        # Ask the agent for an action and apply it to the environment
        action = agent.get_act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])

        # An episode that only ends on the very last allowed step ran out of
        # step budget rather than failing, so it must not be penalized or
        # stored as terminal (that would teach the agent that surviving is
        # bad). Only episodes that end early count as failures.
        # NOTE(review): assumes the environment's own step limit equals
        # max_steps (the recorded test runs end at exactly 200 steps) —
        # confirm if either value is changed.
        failed = done and steps < max_steps - 1
        if failed:
            reward = -10
        agent.buffer(state, action, reward, next_state, failed)
        state = next_state

        if done:
            # steps is a zero-based index, so the episode length is steps + 1
            # (this matches how the test loop reports it).
            print("episode: {}/{}, steps: {}, exploration: {:.2}"
                  .format(e+1, num_episodes, steps + 1, agent.epsilon))
            break

        # Train only once the memory holds more than one batch of samples
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

episode: 0/20, steps: 19, exploration: 1.0
episode: 1/20, steps: 15, exploration: 1.0
episode: 2/20, steps: 17, exploration: 1.0
episode: 3/20, steps: 25, exploration: 0.99
episode: 4/20, steps: 18, exploration: 0.97
episode: 5/20, steps: 17, exploration: 0.95
episode: 6/20, steps: 33, exploration: 0.92
episode: 7/20, steps: 12, exploration: 0.91
episode: 8/20, steps: 15, exploration: 0.9
episode: 9/20, steps: 43, exploration: 0.86
episode: 10/20, steps: 34, exploration: 0.83
episode: 11/20, steps: 13, exploration: 0.82
episode: 12/20, steps: 79, exploration: 0.76
episode: 13/20, steps: 147, exploration: 0.65
episode: 14/20, steps: 25, exploration: 0.64
episode: 15/20, steps: 89, exploration: 0.58
episode: 16/20, steps: 65, exploration: 0.55
episode: 17/20, steps: 80, exploration: 0.5
episode: 18/20, steps: 22, exploration: 0.49
episode: 19/20, steps: 174, exploration: 0.41


# Saving the model

In [11]:
# Persist the trained weights to the same file name the load cells read.
# NOTE(review): with overwrite=False Keras is documented to ask for
# confirmation instead of silently replacing an existing file — confirm this
# is the intended behavior when re-running the notebook.
model.save_weights("Trained_Scratch_Cartpole.h5f", overwrite=False)

In [18]:
# Close the environment once it is no longer needed.
env.close()

# Test the agent

In [17]:
env = gym.make(ENV_NAME)
test_episode = 5
model.load_weights('Trained_Scratch_Cartpole.h5f')

# exp=0.0 turns exploration off so the learned policy is evaluated greedily
agent = SimpleAgent(nb_actions, model, exp=0.0)

try:
    for e in range(test_episode):
        state = env.reset()

        for steps in range(200):
            # Reshape to a batch of one for the network input
            state = np.reshape(state, [1, input_shape])

            # Visualization of the env
            env.render()

            # Get the action from the agent and take it
            action = agent.get_act(state)
            state, reward, done, _ = env.step(action)

            if done:
                print("test_episode: {}/{}, steps: {}, exploration: {:.2}"
                      .format(e+1, test_episode, steps+1, agent.epsilon))
                break
finally:
    # Always close the environment (and its render window), even when an
    # episode raises or the run is interrupted part-way through.
    env.close()

test_episode: 1/5, steps: 200, exploration: 0.0
test_episode: 2/5, steps: 200, exploration: 0.0
test_episode: 3/5, steps: 200, exploration: 0.0
test_episode: 4/5, steps: 200, exploration: 0.0
test_episode: 5/5, steps: 200, exploration: 0.0
