In [1]:
# %load cartpole_dqn.py
import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

EPISODES = 300


# DQN Agent for the Cartpole
# it uses Neural Network to approximate q function
# and replay memory & target q network
class DQNAgent:
    def __init__(self, state_size, action_size, load_model=False):
        # if you want to see Cartpole learning, then change to True
        self.render = False
        #self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 1000
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # initialize target model
        self.update_target_model()

        if load_model:
            self.model.load_weights("./save_model/cartpole_dqn.h5")

    # approximate Q function using Neural Network
    # state is input and Q Value of each action is output of network
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    # save sample <s,a,r,s'> to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    # pick samples randomly from replay memory (with batch_size)
    def train_model(self):
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(self.batch_size):
            update_input[i] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            update_target[i] = mini_batch[i][3]
            done.append(mini_batch[i][4])

        target = self.model.predict(update_input)
        target_val = self.target_model.predict(update_target)

        for i in range(self.batch_size):
            # Q Learning: get maximum Q value at s' from target model
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    np.amax(target_val[i]))

        # and do the model fit!
        self.model.fit(update_input, target, batch_size=self.batch_size,
                       epochs=1, verbose=0)


if __name__ == "__main__":
    # In case of CartPole-v1, maximum length of episode is 500
    env = gym.make('CartPole-v1')
    # get size of state and action from environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size, action_size)

    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if agent.render:
                env.render()

            # get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            # if an action make the episode end, then gives penalty of -100
            reward = reward if not done or score == 499 else -100
            
            # more reward for staying at the center...
            #reward = reward - abs(next_state[0][0]-0)/5
            #reward = reward - abs(next_state[0][0]-0)/2 makes the performance fluctuate...???

            # save the sample <s, a, r, s'> to the replay memory
            agent.append_sample(state, action, reward, next_state, done)
            # every time step do the training
            agent.train_model()
            score += reward
            state = next_state

            if done:
                # every episode update the target model to be same with model
                agent.update_target_model()

                # every episode, plot the play time
                score = score if score == 500 else score + 100
                scores.append(score)
                episodes.append(e)
                pylab.plot(episodes, scores, 'b')
                pylab.savefig("./save_graph/cartpole_dqn.png")
                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon)

                # if the mean of scores of last 10 episode is bigger than 490
                # stop training
                if np.mean(scores[-min(10, len(scores)):]) > 490:
                    agent.model.save_weights("./save_model/cartpole_dqn.h5")  
                    print("Model saved.")
                    sys.exit()

          


Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_5 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_6 (Den

episode: 88   score: 82.0   memory length: 2000   epsilon: 0.05205377368451819
episode: 89   score: 60.0   memory length: 2000   epsilon: 0.04897190533800461
episode: 90   score: 109.0   memory length: 2000   epsilon: 0.043868290040190716
episode: 91   score: 380.0   memory length: 2000   epsilon: 0.029964133290734697
episode: 92   score: 293.0   memory length: 2000   epsilon: 0.022328279439586606
episode: 93   score: 436.0   memory length: 2000   epsilon: 0.014420277353081263
episode: 94   score: 188.0   memory length: 2000   epsilon: 0.01193578227836474
episode: 95   score: 449.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 96   score: 500.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 97   score: 193.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 98   score: 114.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 99   score: 500.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 100   score: 311.0   mem

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [2]:
env = gym.make('CartPole-v1')
    # get size of state and action from environment
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size, load_model=True)
agent.render = True
agent.epsilon=0

scores, episodes = [], []

done = False
score = 0
state = env.reset()
state = np.reshape(state, [1, state_size])
  
while not done:
    if agent.render:
        env.render()

            # get action for the current state and go one step in environment
    action = agent.get_action(state)
    next_state, reward, done, info = env.step(action)
    next_state = np.reshape(next_state, [1, state_size])
    print(next_state)
            # if an action make the episode end, then gives penalty of -100
    reward = reward if not done or score == 499 else -100

            # save the sample <s, a, r, s'> to the replay memory
    #agent.append_sample(state, action, reward, next_state, done)
            # every time step do the training
    #agent.train_model()
    score += reward
    state = next_state

if (score>490):
    print("Mission Complete!")
else:
    print(":(")


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_8 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 24)                120       
_________________________________________________________________
dense_11 (Dense)             (None, 24)                600       
_________________________________________________________________
dense_12 (De

[[ 0.33221006  0.13953122  0.00862026 -0.0140053 ]]
[[ 0.33500068  0.33452849  0.00834015 -0.30395601]]
[[ 0.34169125  0.13928868  0.00226103 -0.00865451]]
[[ 0.34447703  0.33437813  0.00208794 -0.30062319]]
[[ 0.35116459  0.13922648 -0.00392452 -0.0072825 ]]
[[ 0.35394912 -0.05583897 -0.00407017  0.28415961]]
[[ 0.35283234  0.1393408   0.00161302 -0.00980424]]
[[ 0.35561916 -0.05580425  0.00141694  0.28338718]]
[[ 0.35450307  0.13929746  0.00708468 -0.00884852]]
[[ 0.35728902  0.3343171   0.00690771 -0.29928775]]
[[ 0.36397536  0.13909737  0.00092196 -0.00443429]]
[[ 0.36675731  0.33420608  0.00083327 -0.29682618]]
[[ 0.37344143  0.13907226 -0.00510325 -0.00388057]]
[[ 0.37622288 -0.05597613 -0.00518086  0.28718786]]
[[ 0.37510335  0.13921932  0.00056289 -0.00712456]]
[[ 0.37788774 -0.0559107   0.0004204   0.28573591]]
[[ 0.37676953  0.13920526  0.00613512 -0.00681439]]
[[ 0.37955363  0.33423869  0.00599883 -0.29755532]]
[[  3.86238405e-01   1.39031738e-01   4.77254420e-05  -2.9865152

[[ 0.78142823 -0.05696203 -0.00139771  0.30893028]]
[[ 0.78028899  0.13817981  0.00478089  0.01580688]]
[[ 0.78305259  0.33323287  0.00509703 -0.27536379]]
[[  7.89717246e-01   1.38038573e-01  -4.10247246e-04   1.89223768e-02]]
[[  7.92478018e-01   3.33166405e-01  -3.17997101e-05  -2.73889960e-01]]
[[ 0.79914135  0.13804491 -0.0055096   0.01878294]]
[[ 0.80190224 -0.0569976  -0.00513394  0.30972244]]
[[ 0.80076229  0.13819712  0.00106051  0.01542485]]
[[ 0.80352623  0.33330385  0.00136901 -0.27692329]]
[[ 0.81019231  0.13816239 -0.00416946  0.01619111]]
[[ 0.81295556 -0.05689951 -0.00384564  0.30755561]]
[[ 0.81181757  0.13827702  0.00230547  0.01366236]]
[[ 0.81458311  0.33336583  0.00257872 -0.27829228]]
[[ 0.82125043  0.13820719 -0.00298712  0.01520286]]
[[ 0.82401457 -0.0568718  -0.00268307  0.30694182]]
[[ 0.82287713  0.13828829  0.00345577  0.01341393]]
[[ 0.8256429   0.33336051  0.00372405 -0.27817666]]
[[ 0.83231011  0.13818563 -0.00183949  0.0156785 ]]
[[ 0.83507382 -0.0569099

[[  1.23941689e+00   1.38933172e-01  -6.51479894e-03  -8.11784537e-04]]
[[ 1.24219556 -0.05609474 -0.00653103  0.28980854]]
[[  1.24107366e+00   1.39119725e-01  -7.34863740e-04  -4.92702388e-03]]
[[  1.24385606e+00   3.34252207e-01  -8.33404217e-04  -2.97841719e-01]]
[[ 1.2505411   0.13914215 -0.00679024 -0.00542175]]
[[ 1.25332394 -0.05588177 -0.00689867  0.28511106]]
[[  1.25220631e+00   1.39337890e-01  -1.19645230e-03  -9.73964944e-03]]
[[ 1.25499307 -0.05576688 -0.00139125  0.28256554]]
[[ 1.25387773  0.13937489  0.00426007 -0.01055585]]
[[ 1.25666523  0.33443548  0.00404895 -0.30189164]]
[[ 1.26335394  0.13925606 -0.00198888 -0.00793451]]
[[ 1.26613906 -0.05583731 -0.00214757  0.28412023]]
[[ 1.26502231  0.1393152   0.00353483 -0.00923925]]
[[ 1.26780861  0.33438628  0.00335005 -0.3008048 ]]
[[ 1.27449634  0.13921674 -0.00266605 -0.00706723]]
[[ 1.27728068 -0.05586687 -0.0028074   0.28477334]]
[[ 1.27616334  0.13929501  0.00288807 -0.0087937 ]]
[[ 1.27894924  0.33437542  0.0027122

In [20]:
next_state

array([[-0.16527311, -1.14460448,  0.23848534,  2.00763415]])