# Cartpole Game
![](https://keon.io/images/deep-q-learning/animation.gif)

CartPole is one of the simplest environments in **OpenAI gym** ( a game simulator). The goal of CartPole is to balance a pole connected with one joint on top of a moving cart. Instead of pixel information, there are 4 kinds of information given by the state, such as angle of the pole and position of the cart. An agent can move the cart by performing a series of actions of 0 or 1 to the cart, pushinh it left or right.

The **goal** is to keep the pole stay as long as possible. So agent will learn to take maximum reward at each step, avoid letting pole fall down.

##  Import libraries

In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Initialize environment

In [18]:
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
print("State size:", state_size, ", Action sizea:", action_size)

State size: 4 , Action sizea: 2


There are 4 states () and 2 actions (move left, right)

## Set up our hyperparameters

In [28]:
EPISODES = 1000 # a number of games we want the agent to play.
GAMMA =  0.95   # aka decay or discount rate, to calculate the 
                # future discounted reward.
EPSILON_DECAY = 0.995 # Decay rate for epsilon, we do less 
                      # exploration over time

EPSILON_MIN = 0.01 # we want the agent to explore 
                   # at least this amount.
LEARNING_RATE = 0.001 # Determines how much neural net 
                      # learns in each iteration.

## Create our Deep Q-learning Neural Network model

**Multi-layer perceptron**:
- Input: 4 units (space size)
- Dense 1: 24 units, ReLU activation
- Dense 2: 24 units, ReLU activation
- Dense 3: 2 units (action size), Linear activation

In [20]:
# Neural Net for Deep Q Learning
# Sequential() creates the foundation of the layers.
model = Sequential()

# 'Dense' is the basic form of a neural network layer
# Input Layer of state size(4) and Hidden Layer with 24 nodes
model.add(Dense(24, input_dim=state_size, activation='relu'))

# Hidden layer with 24 nodes
model.add(Dense(24, activation='relu'))

# Output Layer with # of actions: 2 nodes (left, right)
model.add(Dense(action_size, activation='linear'))

# Create the model based on the information above
model.compile(
    loss='mse',
    optimizer=Adam(lr=LEARNING_RATE)
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


## Replay & Act

In [54]:
def act(state, epsilon):
    """
    return action from state
    :param state: array shape (4,)
    """
    # get a random number
    # if the number no larger than EPSILON
    # take exploration
    if np.random.rand() <= epsilon:
        return random.randrange(action_size)
    
    # else take exploitation
    # Keras model predicts with data size (bath_size, ...)
    # then we have to reshape state, get from environment
    act_values = model.predict(
        np.reshape(state, [1, state_size])
    )
    # Take action with max Q-value
    return np.argmax(act_values[0])  

**Remember that**:
$$
loss = \left[\underbrace{r\quad+\quad\gamma\max_{a'}\hat{\mathcal{Q}}(s,a')}_{\text{Target}} \quad-\quad \underbrace{\mathcal{Q}(s,a)}_{\text{Prediction}}\right]^2
$$

In [60]:
def replay(memory, batch_size, model, epsilon):
    """
    replay when number of steps in memory exeeds batch_size
    replay => retrain model
    
    """
    # get batch_size past actions including
    # state -> action -> reward -> next_state -> done if pole falls down
    minibatch = random.sample(memory, batch_size)
    
    for state, action, reward, next_state, done in minibatch:
        target = reward
        if not done:
            # update reward follow the above formula
            target = (reward + GAMMA * np.amax(model.predict(next_state)[0]))
            
        target_f = model.predict(state)
        target_f[0][action] = target
        # retrian model
        model.fit(state, target_f, epochs=1, verbose=0)
    
    # reset epsilon 
    epsilon1 = epsilon    
    if epsilon > EPSILON_MIN:
        epsilon1 = epsilon*EPSILON_DECAY

    return model, epsilon1

## Train our Agent

In [61]:
done = False
batch_size = 32
epsilon = 1

# save maximally only 2000 actions
memory = deque(maxlen=2000)

# In each episode
for e in range(EPISODES):
    # Reset state to initial
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    
    # reward will be cummulate in episode
    total_reward = 0
    
    # In each time
    for time in range(500):
        
        # render is error in Jupyter
        # env.render()
        
        # take action
        action = act(state, epsilon)
        
        # do the action, change to next sate, get reward and check if fail (state = done)
        next_state, reward, done, _ = env.step(action)
        
        reward = reward if not done else -10
        # Cumulate reward
        total_reward += reward
        
        # reshape and add next state to memory
        next_state = np.reshape(next_state, [1, state_size])
        memory.append((state, action, reward, next_state, done))
        
        # reset surrent state
        state = next_state
        
        if done: # the Pole falls down
            if e % 10 == 0:
                print("episode: %d/%d, total steps: %d, score: %.6f, e: %.6f" 
                   %(e, EPISODES, time+1, total_reward, epsilon))
            break
            
        # retrain model & reset epsilon when there's more than batch_size steps
        if len(memory) > batch_size:
            model, epsilon = replay(memory, batch_size, model, epsilon)

episode: 0/1000, total steps: 28, score: 17.000000, e: 1.000000
episode: 10/1000, total steps: 249, score: 238.000000, e: 0.023065
episode: 20/1000, total steps: 186, score: 175.000000, e: 0.009986
episode: 30/1000, total steps: 10, score: -1.000000, e: 0.009986
episode: 40/1000, total steps: 10, score: -1.000000, e: 0.009986
episode: 50/1000, total steps: 211, score: 200.000000, e: 0.009986
episode: 60/1000, total steps: 202, score: 191.000000, e: 0.009986
episode: 70/1000, total steps: 14, score: 3.000000, e: 0.009986
episode: 80/1000, total steps: 234, score: 223.000000, e: 0.009986
episode: 90/1000, total steps: 28, score: 17.000000, e: 0.009986
episode: 100/1000, total steps: 151, score: 140.000000, e: 0.009986
episode: 110/1000, total steps: 22, score: 11.000000, e: 0.009986
episode: 120/1000, total steps: 247, score: 236.000000, e: 0.009986
episode: 130/1000, total steps: 142, score: 131.000000, e: 0.009986
episode: 140/1000, total steps: 195, score: 184.000000, e: 0.009986
epis

KeyboardInterrupt: 

## NOTE:

Over time the total steps and score increase, that mean agent is improved after each episode.

## Put everything together

In [None]:
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

EPISODES = 1000

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        # if e % 10 == 0:
#     agent.save("./save/cartpole-dqn.h5")