In [1]:
import gym
# Create the CartPole-v0 environment used throughout the notebook
env = gym.make('CartPole-v0')


# Q learning  
### Agent design and Neural Model
#### 1) The agent needs a memory because it starts without any training data
#### 2) It uses the memory to store its experiences (state, action, reward, next state, done)
#### 3) We use these experiences to train the neural network
#### 4) We use a bounded deque, so new experiences are appended and the oldest ones are dropped automatically

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
from tensorflow import keras
from collections import deque
#from tensorflow.keras import models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random
#%matplotlib inline

In [3]:
class Agent:
    """Deep Q-learning agent with an experience replay buffer.

    The agent stores transitions in a bounded memory, fits a small fully
    connected network to Bellman targets computed from sampled transitions,
    and picks actions with an epsilon-greedy policy whose exploration rate
    decays after every training call.
    """

    def __init__(self,state_size,action_size):
        """Set up hyper-parameters, the replay memory and the Q-network.

        Args:
            state_size: number of input features of the network.
            action_size: number of discrete actions (network outputs).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest experiences are dropped once full
        self.gamma = 0.95 #discount factor mentioned in bellman equations
        self.epsilon = 1.0 #100% exploration in the beginning
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the Q-network (two hidden ReLU layers, linear output)."""
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu'))
        model.add(Dense(24,activation='relu'))
        # One linear output per action: the estimated Q-value of that action
        model.add(Dense(self.action_size,activation='linear'))
        model.compile(loss='mse',optimizer = Adam(learning_rate=self.learning_rate))
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one transition to the replay memory."""
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
        """Choose an action with the epsilon-greedy rule.

        With probability ``epsilon`` a random action index is returned;
        otherwise the action with the highest predicted Q-value is returned.

        Returns:
            int: index of the chosen action.
        """
        if np.random.rand()<=self.epsilon:
            return random.randrange(self.action_size)
        # Ask the network for the most valuable action; verbose=0 keeps Keras
        # from printing a progress bar on every single step.
        q_values = self.model.predict(state,verbose=0)[0]
        # Cast so both branches return a plain Python int
        return int(np.argmax(q_values))

    def train(self,batch_size=32):
        """Fit the network on a random mini-batch drawn from the replay memory.

        Does nothing when the memory holds fewer than ``batch_size``
        transitions. After a training pass the exploration rate is decayed,
        but never below ``epsilon_min``.

        Args:
            batch_size: number of transitions sampled from the memory.
        """
        # random.sample raises ValueError on an under-filled buffer, so skip instead
        if len(self.memory) < batch_size:
            return

        mini_batch = random.sample(self.memory,batch_size)
        for state,action,reward,next_state,done in mini_batch:
            if done:
                # Terminal transition: there is no future reward to bootstrap from
                target = reward
            else:
                # Bellman equation: reward plus the discounted best next Q-value
                next_q = self.model.predict(next_state,verbose=0)[0]
                target = reward + self.gamma*np.amax(next_q)
            # Only the taken action's Q-value is changed; the other outputs keep
            # their current predictions and therefore contribute no error.
            target_f = self.model.predict(state,verbose=0)
            target_f[0][action] = target
            self.model.fit(state,target_f,epochs=1,verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot the floor
            self.epsilon = max(self.epsilon_min,self.epsilon*self.epsilon_decay)

    def load(self,name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self,name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

#### 1) We assume Q(s, a) is a non-linear function and approximate it with a neural network that learns by minimizing the MSE loss
#### 2) We approximate the target using the Bellman equation
#### 3) We train with stochastic gradient descent (the code uses the Adam optimizer)
#### 4) We implement training with the replay buffer technique

In [4]:
# model  = Sequential()
# model.add(Dense(24,input_dim=4,activation='relu'))
# model.add(Dense(24,activation='relu'))
# model.add(Dense(2,activation='linear'))
# model.compile(loss='mse',optimizer = Adam(learning_rate=0.001))
# x = np.random.rand(1,4)
# model.predict(x)

## Training DQN Agent (Deep Q learner)

In [5]:
# Training configuration
n_episodes = 1000                # number of episodes the training loop plays
output_dir = "cartpole_model/"   # directory that receives the weight checkpoints

# Create the checkpoint directory up front; saving weights into a missing
# directory would otherwise fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

In [6]:
# Problem dimensions and training batch size, defined once and reused below
state_size = 4
action_size = 2
batch_size = 32

# Episode-termination flag, initialised before the training loop
done = False

# Build the agent from the sizes declared above
agent = Agent(state_size=state_size, action_size=action_size)

In [None]:
# Play n_episodes episodes, training the agent from its replay memory after
# each one. The try/finally guarantees the environment (and its render window)
# is closed even if training is interrupted or raises.
try:
    for e in range (n_episodes):
        # Fresh episode: reshape the observation into a batch of one row
        state = env.reset()
        state = np.reshape(state,[1,state_size])

        for time in range(500):
            env.render()
            action  = agent.act(state)
            next_state,reward,done,other_info = env.step(action)
            # Replace the reward with a penalty on the step that ends the episode
            reward = reward if not done else -10
            next_state = np.reshape(next_state,[1,state_size])
            agent.remember(state,action,reward,next_state,done)
            state = next_state
            if done:
                # `time` is the index of the last step played in this episode
                print("Game Episode :{}/{}, High Score:{},Exploration Rate:{:.2}".format(e,n_episodes,time,agent.epsilon))
                break

        # Train only once the memory holds more than one mini-batch
        if len(agent.memory)>batch_size:
            agent.train(batch_size)

        # Checkpoint the weights every 50 episodes
        if e%50==0:
            agent.save(output_dir+"weights_"+'{:04d}'.format(e)+".hdf5")
finally:
    env.close()

Game Episode :0/1000, High Score:11,Exploration Rate:1.0
Game Episode :1/1000, High Score:8,Exploration Rate:1.0
Game Episode :2/1000, High Score:20,Exploration Rate:1.0
Game Episode :3/1000, High Score:38,Exploration Rate:0.99
Game Episode :4/1000, High Score:12,Exploration Rate:0.99
Game Episode :5/1000, High Score:17,Exploration Rate:0.99
Game Episode :6/1000, High Score:16,Exploration Rate:0.98
Game Episode :7/1000, High Score:27,Exploration Rate:0.98
Game Episode :8/1000, High Score:58,Exploration Rate:0.97
Game Episode :9/1000, High Score:33,Exploration Rate:0.97
Game Episode :10/1000, High Score:17,Exploration Rate:0.96
Game Episode :11/1000, High Score:15,Exploration Rate:0.96
Game Episode :12/1000, High Score:16,Exploration Rate:0.95
Game Episode :13/1000, High Score:11,Exploration Rate:0.95
Game Episode :14/1000, High Score:11,Exploration Rate:0.94
Game Episode :15/1000, High Score:19,Exploration Rate:0.94
Game Episode :16/1000, High Score:17,Exploration Rate:0.93
Game Episod

Game Episode :139/1000, High Score:31,Exploration Rate:0.5
Game Episode :140/1000, High Score:73,Exploration Rate:0.5
Game Episode :141/1000, High Score:34,Exploration Rate:0.5
Game Episode :142/1000, High Score:12,Exploration Rate:0.5
Game Episode :143/1000, High Score:12,Exploration Rate:0.49
Game Episode :144/1000, High Score:40,Exploration Rate:0.49
Game Episode :145/1000, High Score:18,Exploration Rate:0.49
Game Episode :146/1000, High Score:12,Exploration Rate:0.49
Game Episode :147/1000, High Score:18,Exploration Rate:0.48
Game Episode :148/1000, High Score:19,Exploration Rate:0.48
Game Episode :149/1000, High Score:10,Exploration Rate:0.48
Game Episode :150/1000, High Score:15,Exploration Rate:0.48
Game Episode :151/1000, High Score:22,Exploration Rate:0.47
Game Episode :152/1000, High Score:35,Exploration Rate:0.47
Game Episode :153/1000, High Score:21,Exploration Rate:0.47
Game Episode :154/1000, High Score:34,Exploration Rate:0.47
Game Episode :155/1000, High Score:30,Explor