In [1]:
# Q-learning is a way to measure the reward that we would get when taking a particular action 'a' in a state 's'.
# It is not only a measurement of the immediate reward but also a (discounted) summation of the entire future
# reward we would get from subsequent actions as well.
# Q(s,a) = r + gamma * max_a'(Q(s',a')); where r is the immediate reward, gamma is the discount factor,
# and s' is the next state.
# The input is a state matrix; the output matrix from the neural network holds one value per action,
# indicating how good each action is.


import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random

class Agent:
    """Deep Q-learning agent with an epsilon-greedy policy and a replay buffer.

    The Q-function is approximated by a small fully connected Keras network
    that maps a state to one Q-value per action.

    Attributes are plain instance fields set in ``__init__``:
    ``gamma`` (discount factor), ``memory`` (bounded replay buffer),
    ``epsilon`` / ``epsilon_decay`` / ``epsilon_min`` (exploration schedule),
    ``learning_rate`` and ``model`` (the compiled network).
    """

    def __init__(self, state_size, action_size):
        """Set hyper-parameters and build the Q-network.

        Args:
            state_size: Number of input features of the network.
            action_size: Number of discrete actions (network outputs).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.95  # discount factor
        # Bounded replay buffer: the oldest transitions are dropped once
        # more than 2000 have been stored.
        self.memory = deque(maxlen=2000)

        # Exploration vs. exploitation trade-off:
        #   exploration  -> useful in the beginning, tries random actions
        #   exploitation -> useful later, relies on what the network learned
        self.epsilon = 1.0  # 100 % exploration in the beginning
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the Q-network (two hidden ReLU layers of 24 units).

        Returns:
            A compiled ``Sequential`` model with ``state_size`` inputs and
            ``action_size`` linear outputs, trained with MSE loss.
        """
        model = Sequential()
        model.add(Dense(24, activation='relu', input_dim=self.state_size))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): newer Keras releases spell this argument
        # `learning_rate`; confirm against the installed version.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer.

        The buffer supplies the training data: experience is generated on the
        fly while acting and later sampled by ``train``.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with the epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest predicted Q-value.

        Args:
            state: Network input; the first row of the prediction is used.

        Returns:
            An action index in ``range(action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # Exploit: ask the network for the best-looking action.
        return np.argmax(self.model.predict(state)[0])

    def train(self, batch_size=32):
        """Fit the network on a random minibatch from the replay buffer.

        For each sampled transition the target for the taken action is
        ``reward`` if the episode ended, otherwise
        ``reward + gamma * max(Q(next_state))``; the other outputs keep their
        current predictions. After the batch, ``epsilon`` is decayed once and
        never drops below ``epsilon_min``.

        If the buffer holds fewer than ``batch_size`` transitions the call is
        a no-op (no fitting, no epsilon decay) instead of raising from
        ``random.sample``.

        Args:
            batch_size: Number of transitions to sample and fit on.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bootstrap from the best predicted value of the next state.
                target = reward + self.gamma * np.max(self.model.predict(next_state)[0])

            # Only the taken action's output is moved towards the target.
            target_f = self.model.predict(state)
            target_f[0][action] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so that a decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

In [None]:


import gym

import numpy as np

# Train the DQN agent on CartPole: play episodes, store every transition in
# the replay buffer, and fit on one minibatch after each episode.
env = gym.make('CartPole-v0')

n_episodes = 500

output_dir = "cartpole_model/"  # only used by the optional checkpointing below

state_size = 4
action_size = 2
batch_size = 32

agent = Agent(state_size, action_size)

done = False  # initially the game is not completed

# try/finally guarantees the environment (and its render window) is closed
# even if training raises or the cell is interrupted.
try:
    for e in range(n_episodes):
        state = env.reset()
        state = np.reshape(state, [1, state_size])  # reshape the state to shape (1, state_size)

        for time in range(500):
            env.render()

            action = agent.act(state)  # action is 0 or 1

            next_state, reward, done, other_info = env.step(action)

            # Replace the reward of the step that ends the episode with a penalty.
            reward = reward if not done else -10

            next_state = np.reshape(next_state, [1, state_size])

            agent.remember(state, action, reward, next_state, done)

            state = next_state

            if done:
                print("Game Episode :{}/{}, High Score:{},Exploration Rate:{:.2}".format(e, n_episodes, time, agent.epsilon))
                break

        # Train only once the buffer holds more than one batch of transitions.
        if len(agent.memory) > batch_size:
            agent.train(batch_size)

        # Optional checkpointing (disabled):
        # if e % 50 == 0:
        #     agent.save(output_dir + "weights_" + '{:04d}'.format(e) + ".hdf5")

    print("Deep Q-Learner Model Trained!")
finally:
    env.close()

Game Episode :0/500, High Score:16,Exploration Rate:1.0
Game Episode :1/500, High Score:21,Exploration Rate:1.0
Game Episode :2/500, High Score:24,Exploration Rate:0.99
Game Episode :3/500, High Score:22,Exploration Rate:0.99
Game Episode :4/500, High Score:16,Exploration Rate:0.99
Game Episode :5/500, High Score:22,Exploration Rate:0.98
Game Episode :6/500, High Score:16,Exploration Rate:0.98
Game Episode :7/500, High Score:38,Exploration Rate:0.97
Game Episode :8/500, High Score:9,Exploration Rate:0.97
Game Episode :9/500, High Score:34,Exploration Rate:0.96
Game Episode :10/500, High Score:26,Exploration Rate:0.96
Game Episode :11/500, High Score:21,Exploration Rate:0.95
Game Episode :12/500, High Score:18,Exploration Rate:0.95
Game Episode :13/500, High Score:35,Exploration Rate:0.94
Game Episode :14/500, High Score:10,Exploration Rate:0.94
Game Episode :15/500, High Score:15,Exploration Rate:0.93
Game Episode :16/500, High Score:11,Exploration Rate:0.93
Game Episode :17/500, High 

Game Episode :142/500, High Score:24,Exploration Rate:0.49
Game Episode :143/500, High Score:38,Exploration Rate:0.49
Game Episode :144/500, High Score:34,Exploration Rate:0.49
Game Episode :145/500, High Score:29,Exploration Rate:0.49
Game Episode :146/500, High Score:27,Exploration Rate:0.48
Game Episode :147/500, High Score:12,Exploration Rate:0.48
Game Episode :148/500, High Score:12,Exploration Rate:0.48
Game Episode :149/500, High Score:33,Exploration Rate:0.48
Game Episode :150/500, High Score:65,Exploration Rate:0.47
Game Episode :151/500, High Score:69,Exploration Rate:0.47
Game Episode :152/500, High Score:35,Exploration Rate:0.47
Game Episode :153/500, High Score:15,Exploration Rate:0.47
Game Episode :154/500, High Score:27,Exploration Rate:0.46
Game Episode :155/500, High Score:70,Exploration Rate:0.46
Game Episode :156/500, High Score:28,Exploration Rate:0.46
Game Episode :157/500, High Score:76,Exploration Rate:0.46
Game Episode :158/500, High Score:68,Exploration Rate:0.

Game Episode :281/500, High Score:63,Exploration Rate:0.25
Game Episode :282/500, High Score:113,Exploration Rate:0.24
Game Episode :283/500, High Score:132,Exploration Rate:0.24
Game Episode :284/500, High Score:131,Exploration Rate:0.24
Game Episode :285/500, High Score:27,Exploration Rate:0.24
Game Episode :286/500, High Score:53,Exploration Rate:0.24
Game Episode :287/500, High Score:67,Exploration Rate:0.24
Game Episode :288/500, High Score:75,Exploration Rate:0.24
Game Episode :289/500, High Score:70,Exploration Rate:0.24
Game Episode :290/500, High Score:50,Exploration Rate:0.23
Game Episode :291/500, High Score:47,Exploration Rate:0.23
Game Episode :292/500, High Score:139,Exploration Rate:0.23
Game Episode :293/500, High Score:80,Exploration Rate:0.23
Game Episode :294/500, High Score:132,Exploration Rate:0.23
Game Episode :295/500, High Score:75,Exploration Rate:0.23
Game Episode :296/500, High Score:101,Exploration Rate:0.23
Game Episode :297/500, High Score:62,Exploration R