In [None]:
#pip install gym

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import RMSprop
import gym
from gym import wrappers
import random
from datetime import datetime
import time

In [None]:
# Shared CartPole environment used by every cell below.
env = gym.make('CartPole-v0')
# Read the observation dimension from the space's declared shape (the same
# expression the training cells use) instead of drawing a random sample.
state_size = env.observation_space.shape[0]
# Number of discrete actions.
K = env.action_space.n

In [None]:
# Neural Network model for Deep Q Learning
def NNModel(input_shape, K, hidden_layer_sizes,act):
    """Build and compile a fully connected Q-network.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single observation, passed to ``Input``.
    K : int
        Number of output units (one linear Q-value per action).
    hidden_layer_sizes : sequence of int
        Width of each hidden ``Dense`` layer, in order. May be empty, in
        which case the network is a single linear layer.
    act : str
        Activation used by every hidden layer.

    Returns
    -------
    The compiled Keras model (MSE loss, RMSprop optimizer). The model
    summary is printed as a side effect.
    """
    X_input = Input(input_shape)
    # The Input layer already fixes the input shape, so the hidden layers do
    # not need (and must not read) the notebook-level `state_size` global.
    X = X_input
    for M in hidden_layer_sizes:
        X = Dense(M, activation=act, kernel_initializer='he_uniform')(X)
    # Output layer: one linear unit per action.
    X = Dense(K, activation="linear", kernel_initializer='he_uniform')(X)

    # Use the fully-qualified class so this function keeps working even if the
    # bare name `Model` is rebound elsewhere in the notebook.
    model = tf.keras.Model(inputs=X_input, outputs=X, name='CartPole_DQN_model')
    # `learning_rate` replaces the deprecated `lr` keyword.
    # NOTE(review): "accuracy" is not a meaningful metric for Q-value
    # regression; kept only so the compiled model's metrics are unchanged.
    model.compile(
        loss="mse",
        optimizer=RMSprop(learning_rate=0.00025, rho=0.95, epsilon=0.01),
        metrics=["accuracy"],
    )
    model.summary()
    return model

In [None]:
# Build a sample network (hidden layers 512-256-64, ReLU) to inspect its summary.
NNModel(
    input_shape=(state_size,),
    K=env.action_space.n,
    hidden_layer_sizes=[512, 256, 64],
    act="relu",
)

In [None]:
def epsilon_greedy(model,s, eps):
    """Choose an action with an epsilon-greedy policy.

    The exploration probability is ``max(eps, 0.05)``, so at least 5% of the
    calls return a random action from the global ``env`` action space even
    when ``eps`` is 0. Otherwise the index of the largest value in
    ``model.predict(s)`` is returned.
    """
    explore_prob = max(eps, 0.05)
    draw = np.random.random()
    # Guard clause: explore when the draw falls in the top `explore_prob` mass.
    if draw >= 1 - explore_prob:
        return env.action_space.sample()
    q_values = model.predict(s)
    return np.argmax(q_values)

In [None]:
# Online DQN baseline: one gradient step per transition, with no replay
# buffer and no target network.
np.random.seed(1)
state_size = env.observation_space.shape[0]
DQNModel = NNModel(input_shape=(state_size,), K=env.action_space.n, act="relu", hidden_layer_sizes=[200,200,200])
GAMMA = 0.95
n_episodes = 500
scores = []
for it in range(n_episodes):
    # Exploration rate decays geometrically with the episode index
    # (epsilon_greedy floors it at 0.05). Defining it here avoids relying on
    # an `epsilon` left over from another cell.
    epsilon = 0.97**(it)
    # begin a new episode
    state = np.reshape(env.reset(), [1, state_size])
    done = False
    score = 0
    while not done:
        env.render()
        Q_values = DQNModel.predict(state)
        action = epsilon_greedy(DQNModel, state, eps=epsilon)
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # An episode that ends before the step limit is a failure: penalise it.
        if done and score != env._max_episode_steps-1:
            reward = -100
        # Only the entry of the action actually taken is changed; the other
        # outputs keep their current predictions and contribute zero error.
        target = Q_values
        if done:
            # No future return after the last transition of an episode.
            target[0][action] = reward
        else:
            nextQ_values = DQNModel.predict(next_state)
            target[0][action] = reward + GAMMA * np.max(nextQ_values)
        # update the weights
        DQNModel.fit(state, target, batch_size=1, verbose=0)
        # update state
        state = next_state
        score = score+1
    print("episode: {}, score: {}".format(it, score))
    scores.append(score)
DQNModel.save('DQNPoleNoReplayNoTarget')
plt.plot(scores)

In [None]:
# Learning curve of the most recent training run (score = steps per episode).
plt.plot(scores)
plt.xlabel("episode")
plt.ylabel("score (steps per episode)")
plt.title("Training score per episode")
plt.show()

In [None]:
def watch_agent(model):
    """Play one rendered episode with ``model`` and print the total reward.

    Actions come from ``epsilon_greedy`` with ``eps=0``; that helper still
    explores with probability 0.05. Uses the notebook-level ``env`` and
    ``state_size``.
    """
    observation = np.reshape(env.reset(), [1, state_size])
    total_reward = 0
    finished = False
    while not finished:
        chosen = epsilon_greedy(model, observation, eps=0)
        raw_observation, step_reward, finished, info = env.step(chosen)
        observation = np.reshape(raw_observation, [1, state_size])
        # Slow the loop down so the rendered animation is watchable.
        time.sleep(0.05)
        env.render()
        total_reward += step_reward
    print("Episode reward:", total_reward)

In [None]:
# Load the saved network under its own name: rebinding `Model` would shadow
# the Keras `Model` class imported at the top of the notebook.
trained_model = tf.keras.models.load_model('DQNPoleNoReplayNoTarget')
for i in range(10):
    watch_agent(trained_model)
env.close()

Deep Q-learning is inherently unstable: your Q-values may improve at first and then abruptly get worse. This is because the target values are non-stationary, and neural networks are prone to overfitting and will try to chase these non-stationary, trajectory-dependent targets. To deal with this issue, we implement two main approaches: <br> 
- Experience replay: For instance, we store the last 10 thousand transitions in a buffer and sample a mini-batch of 64 transitions from this buffer to train the deep network. This forms an input dataset that is stable enough for training. Because we sample randomly from the replay buffer, the samples are less correlated with each other and closer to i.i.d. <br>
- Target network: We create two deep networks. The first (online) network receives all the updates during training, while the second (target) network is used to compute the Q-value targets. After, say, 20 episodes, we synchronize the weights of the two networks. The purpose is to fix the Q-value targets temporarily so we don’t have a moving target to chase.

In [None]:
from collections import deque
# A bounded deque drops its oldest element once `maxlen` is reached; the
# replay buffers below rely on this to keep only the most recent transitions.
window = deque(maxlen=3)
for value in range(1, 7):
    window.append(value)
    print(window)

In [None]:
# Experience replay + mini-batch updates, no target network
# (replay buffer holds the 2000 most recent transitions).
memory = deque(maxlen=2000)
min_experiences=100
batch_sz=64
np.random.seed(1)
random.seed(1)  # replay sampling uses the stdlib RNG, so seed it as well
state_size = env.observation_space.shape[0]
DQNModel= NNModel(input_shape=(state_size,),K=env.action_space.n, act="relu",hidden_layer_sizes=[200,200,200])
GAMMA=0.95
n_episodes = 500
scores=[]
for it in range(n_episodes):
    # begin a new episode
    state = np.reshape(env.reset(), [1, state_size])
    done=False
    score = 0
    # Exploration rate decays per episode (epsilon_greedy floors it at 0.05).
    epsilon=0.97**(it)
    while not done:
        action = epsilon_greedy(DQNModel,state, eps=epsilon)
        # NOTE(review): rendering plus a 50 ms sleep on every step makes
        # training much slower; the later training cells have both disabled.
        time.sleep(0.05)
        env.render()
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # An episode that ends before the step limit is a failure: penalise it.
        if done and score != env._max_episode_steps-1:
            reward = -100
        # Single-sample SGD is generally unstable for DQN; store the transition
        # and train on random mini-batches from the replay buffer instead,
        # which also reduces the correlation between training samples.
        memory.append((state, action, reward, next_state, done))

        if len(memory) > min_experiences:
            # Sample without replacement; never ask for more than is stored.
            minibatch = random.sample(memory, min(len(memory), batch_sz))
            states = np.vstack([transition[0] for transition in minibatch])
            actions = np.array([transition[1] for transition in minibatch])
            rewards = np.array([transition[2] for transition in minibatch], dtype=np.float32)
            next_states = np.vstack([transition[3] for transition in minibatch])
            dones = np.array([transition[4] for transition in minibatch], dtype=bool)

            targets = DQNModel.predict(states)
            # No target network in this variant: the online network also
            # provides the bootstrap values.
            next_q_max = np.max(DQNModel.predict(next_states), axis=1)

            # Overwrite only the Q-value of the action taken in each row:
            # the reward alone for terminal transitions, otherwise the
            # one-step bootstrapped return.
            rows = np.arange(len(minibatch))
            targets[rows, actions] = np.where(dones, rewards, rewards + GAMMA * next_q_max)

            # update the weights of the current network
            DQNModel.fit(states, targets, batch_size=batch_sz, verbose=0)
        # update state
        state = next_state
        score=score+1
    print("episode: {}, score: {}".format(it, score))
    scores.append(score)
DQNModel.save('DQNPoleBatchNoTarget')
plt.plot(scores)

In [None]:
# Load the saved network under its own name: rebinding `Model` would shadow
# the Keras `Model` class imported at the top of the notebook.
trained_model = tf.keras.models.load_model('DQNPoleBatchNoTarget')
for i in range(10):
    watch_agent(trained_model)
env.close()

In [None]:
# Experience replay + mini-batch updates + target network
# (buffer of 10000 transitions, target network hard-synced every 20 episodes).
memory = deque(maxlen=10000)
min_experiences=1000
batch_sz=64
target_sync_every = 20  # episodes between hard copies of the online weights
np.random.seed(1)
random.seed(1)  # replay sampling uses the stdlib RNG, so seed it as well
# Define state_size before the networks that are built from it.
state_size = env.observation_space.shape[0]
DQNModel= NNModel(input_shape=(state_size,),K=env.action_space.n, act="relu",hidden_layer_sizes=[200,200,200])
# A second copy of the network that is updated only occasionally; computing
# the targets from it reduces the instability observed in DQNs.
TargetNetwork= NNModel(input_shape=(state_size,),K=env.action_space.n, act="relu", hidden_layer_sizes=[200,200,200])
TargetNetwork.set_weights(DQNModel.get_weights())
GAMMA=0.95
n_episodes = 500
scores=[]
for it in range(n_episodes):
    # begin a new episode
    state = np.reshape(env.reset(), [1, state_size])
    done=False
    score = 0
    # Exploration rate decays per episode (epsilon_greedy floors it at 0.05).
    epsilon=0.97**(it)
    while not done:
        action = epsilon_greedy(DQNModel,state, eps=epsilon)
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # An episode that ends before the step limit is a failure: penalise it.
        if done and score != env._max_episode_steps-1:
            reward = -100
        # Single-sample SGD is generally unstable for DQN; store the transition
        # and train on random mini-batches from the replay buffer instead,
        # which also reduces the correlation between training samples.
        memory.append((state, action, reward, next_state, done))

        if len(memory) > min_experiences:
            # Sample without replacement; never ask for more than is stored.
            minibatch = random.sample(memory, min(len(memory), batch_sz))
            states = np.vstack([transition[0] for transition in minibatch])
            actions = np.array([transition[1] for transition in minibatch])
            rewards = np.array([transition[2] for transition in minibatch], dtype=np.float32)
            next_states = np.vstack([transition[3] for transition in minibatch])
            dones = np.array([transition[4] for transition in minibatch], dtype=bool)

            targets = DQNModel.predict(states)
            # Bootstrap values come from the infrequently updated target network.
            next_q_max = np.max(TargetNetwork.predict(next_states), axis=1)

            # Overwrite only the Q-value of the action taken in each row:
            # the reward alone for terminal transitions, otherwise the
            # one-step bootstrapped return.
            rows = np.arange(len(minibatch))
            targets[rows, actions] = np.where(dones, rewards, rewards + GAMMA * next_q_max)

            # update the weights of the current network
            DQNModel.fit(states, targets, batch_size=batch_sz, verbose=0)
        # update state
        state = next_state
        score=score+1
    print("episode: {}, score: {}".format(it, score))
    scores.append(score)
    if (it+1) % target_sync_every == 0:
        TargetNetwork.set_weights(DQNModel.get_weights())
TargetNetwork.save('DQNPoleBatch')

In [None]:
# Learning curve of the most recent training run (score = steps per episode).
plt.plot(scores)
plt.xlabel("episode")
plt.ylabel("score (steps per episode)")
plt.title("Training score per episode")
plt.show()

In [None]:
# Load the saved network under its own name: rebinding `Model` would shadow
# the Keras `Model` class imported at the top of the notebook.
trained_model = tf.keras.models.load_model('DQNPoleBatch')
for i in range(20):
    watch_agent(trained_model)
env.close()

In [None]:
# Experience replay + mini-batch updates + target network
# (buffer of 10000 transitions, target network hard-synced every 50 episodes).
memory = deque(maxlen=10000)
min_experiences=1000
batch_sz=64
target_sync_every = 50  # episodes between hard copies of the online weights
np.random.seed(1)
random.seed(1)  # replay sampling uses the stdlib RNG, so seed it as well
# Define state_size before the networks that are built from it.
state_size = env.observation_space.shape[0]
DQNModel= NNModel(input_shape=(state_size,),K=env.action_space.n, act="relu", hidden_layer_sizes=[200,200,200])
# A second copy of the network that is updated only occasionally; computing
# the targets from it reduces the instability observed in DQNs.
TargetNetwork= NNModel(input_shape=(state_size,),K=env.action_space.n, act="relu", hidden_layer_sizes=[200,200,200])
TargetNetwork.set_weights(DQNModel.get_weights())
GAMMA=0.95
n_episodes = 500
scores=[]
for it in range(n_episodes):
    # begin a new episode
    state = np.reshape(env.reset(), [1, state_size])
    done=False
    score = 0
    # Exploration rate decays per episode (epsilon_greedy floors it at 0.05).
    epsilon=0.97**(it)
    while not done:
        action = epsilon_greedy(DQNModel,state, eps=epsilon)
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # An episode that ends before the step limit is a failure: penalise it.
        if done and score != env._max_episode_steps-1:
            reward = -100
        # Single-sample SGD is generally unstable for DQN; store the transition
        # and train on random mini-batches from the replay buffer instead,
        # which also reduces the correlation between training samples.
        memory.append((state, action, reward, next_state, done))

        if len(memory) > min_experiences:
            # Sample without replacement; never ask for more than is stored.
            minibatch = random.sample(memory, min(len(memory), batch_sz))
            states = np.vstack([transition[0] for transition in minibatch])
            actions = np.array([transition[1] for transition in minibatch])
            rewards = np.array([transition[2] for transition in minibatch], dtype=np.float32)
            next_states = np.vstack([transition[3] for transition in minibatch])
            dones = np.array([transition[4] for transition in minibatch], dtype=bool)

            targets = DQNModel.predict(states)
            # Bootstrap values come from the infrequently updated target network.
            next_q_max = np.max(TargetNetwork.predict(next_states), axis=1)

            # Overwrite only the Q-value of the action taken in each row:
            # the reward alone for terminal transitions, otherwise the
            # one-step bootstrapped return.
            rows = np.arange(len(minibatch))
            targets[rows, actions] = np.where(dones, rewards, rewards + GAMMA * next_q_max)

            # update the weights of the current network
            DQNModel.fit(states, targets, batch_size=batch_sz, verbose=0)
        # update state
        state = next_state
        score=score+1
    print("episode: {}, score: {}".format(it, score))
    scores.append(score)
    if (it+1) % target_sync_every == 0:
        TargetNetwork.set_weights(DQNModel.get_weights())
TargetNetwork.save('DQNPoleBatchTargetUpdate50')
plt.plot(scores)

In [None]:
# Load the saved network under its own name: rebinding `Model` would shadow
# the Keras `Model` class imported at the top of the notebook.
trained_model = tf.keras.models.load_model('DQNPoleBatchTargetUpdate50')
for i in range(5):
    watch_agent(trained_model)
env.close()

In [None]:
# Experience replay + mini-batch updates + soft-updated target network,
# tanh hidden activations (buffer of 10000 transitions).
memory = deque(maxlen=10000)
min_experiences=1000
batch_sz=64
TAU=0.05  # fraction of the online weights blended into the target per update
np.random.seed(1)
random.seed(1)  # replay sampling uses the stdlib RNG, so seed it as well
# Define state_size before the networks that are built from it.
state_size = env.observation_space.shape[0]
DQNModel= NNModel(input_shape=(state_size,),K=env.action_space.n, act="tanh", hidden_layer_sizes=[200,200,200])
TargetNetwork= NNModel(input_shape=(state_size,),K=env.action_space.n, act="tanh", hidden_layer_sizes=[200,200,200])
# Start the target network from the online weights, as the hard-update cells
# do, instead of from an unrelated random initialisation.
TargetNetwork.set_weights(DQNModel.get_weights())
GAMMA=0.95
n_episodes = 500
scores=[]
for it in range(n_episodes):
    # begin a new episode
    state = np.reshape(env.reset(), [1, state_size])
    done=False
    score = 0
    # Exploration rate decays per episode (epsilon_greedy floors it at 0.05).
    epsilon=0.97**(it)
    while not done:
        action = epsilon_greedy(DQNModel,state, eps=epsilon)
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # An episode that ends before the step limit is a failure: penalise it.
        if done and score != env._max_episode_steps-1:
            reward = -100
        # Single-sample SGD is generally unstable for DQN; store the transition
        # and train on random mini-batches from the replay buffer instead,
        # which also reduces the correlation between training samples.
        memory.append((state, action, reward, next_state, done))

        if len(memory) > min_experiences:
            # Sample without replacement; never ask for more than is stored.
            minibatch = random.sample(memory, min(len(memory), batch_sz))
            states = np.vstack([transition[0] for transition in minibatch])
            actions = np.array([transition[1] for transition in minibatch])
            rewards = np.array([transition[2] for transition in minibatch], dtype=np.float32)
            next_states = np.vstack([transition[3] for transition in minibatch])
            dones = np.array([transition[4] for transition in minibatch], dtype=bool)

            targets = DQNModel.predict(states)
            # Bootstrap values come from the target network, which is only soft updated.
            next_q_max = np.max(TargetNetwork.predict(next_states), axis=1)

            # Overwrite only the Q-value of the action taken in each row:
            # the reward alone for terminal transitions, otherwise the
            # one-step bootstrapped return.
            rows = np.arange(len(minibatch))
            targets[rows, actions] = np.where(dones, rewards, rewards + GAMMA * next_q_max)

            # update the weights of the current network
            DQNModel.fit(states, targets, batch_size=batch_sz, verbose=0)

            # Soft update: move every target tensor a fraction TAU towards
            # the corresponding online tensor.
            TargetNetwork.set_weights([
                target_weight * (1-TAU) + q_weight * TAU
                for q_weight, target_weight in zip(DQNModel.get_weights(), TargetNetwork.get_weights())
            ])

        # update state
        state = next_state
        score=score+1
    print("episode: {}, score: {}".format(it, score))
    scores.append(score)
TargetNetwork.save('DQNPoleBatchTargetSoftUpdatetanh')
plt.plot(scores)

In [None]:
# Load the saved network under its own name: rebinding `Model` would shadow
# the Keras `Model` class imported at the top of the notebook.
trained_model = tf.keras.models.load_model('DQNPoleBatchTargetSoftUpdatetanh')
for i in range(5):
    watch_agent(trained_model)
env.close()