# Deep Q-Learning for CartPole

In [1]:
# Import Packages
import random
import gym
import numpy as np
from collections import deque

import tensorflow as tf
from tensorflow.keras.optimizers import Adam

In [2]:
# Create the CartPole-v1 environment
env = gym.make("CartPole-v1")

In [5]:
# Get the size of the observation vector and the number of discrete actions
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n

print('The observation space is: ', observation_space)
print('The action space: ', action_space)

The observation space is:  4
The action space:  2


In [6]:
# Parameters
GAMMA = 0.95                       # discount factor
LEARNING_RATE = 0.001              # learning rate for the optimiser

MEMORY_SIZE = 1000000              # maximum length of the replay-memory deque
BATCH_SIZE = 20                    # number of transitions sampled per replay step

EXPLORATION_MAX = 1.0              # initial exploration rate (epsilon)
EXPLORATION_MIN = 0.01             # lower bound the exploration rate is clipped to
EXPLORATION_DECAY = 0.995          # multiplicative decay applied by experience_replay()

exploration_rate = EXPLORATION_MAX # current epsilon; starts at its maximum

In [8]:
# Create the Q-network: it maps a state vector of length `observation_space`
# to one estimated Q-value per action.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(24, input_shape=(observation_space,), activation="relu"))
model.add(tf.keras.layers.Dense(24, activation="relu"))
# Linear output layer: Q-values are unbounded regression targets.
model.add(tf.keras.layers.Dense(action_space, activation="linear"))
# Use the `learning_rate` keyword: the old `lr` alias is deprecated in
# tf.keras and is rejected by newer Keras releases.
model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_4 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Store a single transition in the replay memory
def remember(state, action, reward, next_state, done):
    """Append the transition (state, action, reward, next_state, done) to the global `memory`."""
    transition = (state, action, reward, next_state, done)
    memory.append(transition)

In [10]:
# Epsilon-greedy action selection for a given state
def act(state):
    """Return an action index for `state`.

    With probability `exploration_rate` a uniformly random action is drawn;
    otherwise the action with the largest Q-value predicted by `model` is
    returned. `state` is passed to `model.predict` unchanged.
    """
    explore = np.random.rand() < exploration_rate
    if explore:
        # Exploration: pick any of the available actions at random
        return random.randrange(action_space)

    # Exploitation: greedy action with respect to the predicted Q-values
    predicted = model.predict(state)
    return np.argmax(predicted[0])

In [11]:
### Learning function ###

def experience_replay():
    """Fit the network on a random minibatch from `memory`, then decay exploration.

    Returns immediately while `memory` holds fewer than BATCH_SIZE transitions.
    Otherwise BATCH_SIZE transitions are sampled and the model is fitted on
    each one individually. Afterwards the global `exploration_rate` is
    multiplied by EXPLORATION_DECAY and clipped from below at EXPLORATION_MIN.
    """
    global exploration_rate

    # Not enough experience collected yet
    if len(memory) < BATCH_SIZE:
        return

    for state, action, reward, state_next, terminal in random.sample(memory, BATCH_SIZE):

        if terminal:
            # No future return after a terminal transition
            q_update = reward
        else:
            # Bootstrapped target: reward plus discounted best next Q-value
            best_next_q = np.amax(model.predict(state_next)[0])
            q_update = reward + GAMMA * best_next_q

        # Keep the current predictions and overwrite only the entry of the
        # action that was actually taken
        q_values = model.predict(state)
        q_values[0][action] = q_update

        # One gradient update on this single sample
        model.fit(state, q_values, verbose=0)

    # Decay epsilon, never going below the configured minimum
    exploration_rate = max(EXPLORATION_MIN, exploration_rate * EXPLORATION_DECAY)


In [13]:
### Learning in the environment ###

# Fresh replay memory and episode bookkeeping
memory = deque(maxlen=MEMORY_SIZE)
run = 0
N = 20

# Play N episodes, training after every environment step
for i in range(N):

    # 1-based episode number used in the progress message
    run = i + 1

    # Start a new episode; states are kept with shape (1, observation_space)
    # because that is the form handed to the model
    step = 0
    terminal = False
    state = np.reshape(env.reset(), [1, observation_space])

    while not terminal:

        step += 1

        # Choose an action and apply it to the environment
        action = act(state)
        state_next, reward, terminal, info = env.step(action)

        # The step that ends the episode gets its reward negated
        if terminal:
            reward = -reward

        state_next = np.reshape(state_next, [1, observation_space])

        # Store the transition, then move on to the next state
        remember(state, action, reward, state_next, terminal)
        state = state_next

        # Report the episode length once the episode is over
        if terminal:
            message = "Run: " + str(run) + ", exploration: " + str(exploration_rate) + ", score: " + str(step)
            print(message)

        # Train on a sampled minibatch (no-op until enough transitions exist)
        experience_replay()

print('Done Training!')

Run: 1, exploration: 0.9703725093562657, score: 26
Run: 2, exploration: 0.7628626641409962, score: 48
Run: 3, exploration: 0.7183288830986236, score: 12
Run: 4, exploration: 0.6763948591909945, score: 12
Run: 5, exploration: 0.6242658676435396, score: 16
Run: 6, exploration: 0.5425201222922789, score: 28
Run: 7, exploration: 0.510849320360386, score: 12
Run: 8, exploration: 0.47862223409330756, score: 13
Run: 9, exploration: 0.4529463432347434, score: 11
Run: 10, exploration: 0.43080185560799106, score: 10
Run: 11, exploration: 0.40769130904675194, score: 11
Run: 12, exploration: 0.3897078735047413, score: 9
Run: 13, exploration: 0.36512303261753626, score: 13
Run: 14, exploration: 0.3438081748424137, score: 12
Run: 15, exploration: 0.3253644408394192, score: 11
Run: 16, exploration: 0.3017979588795719, score: 15
Run: 17, exploration: 0.2799384215094006, score: 15
Run: 18, exploration: 0.2649210072611673, score: 11
Run: 19, exploration: 0.24328132378095624, score: 17
Run: 20, explorati

In [14]:
def test(model, n_episodes):
    ''' 
    function to test the result of the model for n_episodes and return average
    rewards
    '''
    
    # store average rewards
    avg_rewards = 0
    
    for i in range(1, n_episodes+1):

        state = env.reset()
        done = False 
        total_rewards = 0
        
        # until done 
        while not done:
            
            # take an action in the max q_table
            action = np.argmax(model.predict(np.array([state]))[0])
            state, reward, done, info = env.step(action)
            
            # acculmulate rewards
            total_rewards += reward
        
        avg_rewards = avg_rewards + 1/(i) * (total_rewards - avg_rewards)
          
    return avg_rewards

In [15]:
# Evaluate the trained model with the greedy policy and report the average score
n_episodes = 50
avg_rewards = test(model, n_episodes)
print("After " + str(n_episodes) + " episodes, the average score is " + str(avg_rewards))

After 50 episodes, the average score is 9.320000000000002


In [17]:
# Number of transitions currently stored in the replay memory
len(memory)

313