In [1]:
import gym
# Create the CartPole-v0 environment that the rest of the notebook interacts with.
env = gym.make('CartPole-v0')

## The environment comes with certain important methods/attributes
- action_space
- observation_space
- reset()
- step()
- render()

In [2]:
# Number of distinct actions available to the agent.
# For CartPole these are "move left" and "move right".
env.action_space

Discrete(2)

In [3]:
# Draw one action at random from the action space.
env.action_space.sample()

0

In [4]:
# sample() chooses randomly, so repeated calls can return different actions.
env.action_space.sample()

1

In [5]:
env.observation_space
# Box is the OpenAI Gym space class for n-dimensional arrays (tensors);
# the displayed Box(4,) means each observation has 4 components.

Box(4,)

In [6]:
# Takes the game back to an initial state; reset() is the state initialiser.
# The returned observation holds location, velocity, angular velocity, etc.
env.reset()

array([-0.01253915,  0.02703236, -0.01415071, -0.03640447])

In [7]:
env.render() # opens the environment window
env.close() # closes the environment window

In [9]:
# Drive the environment with random actions for 1000 rendered steps.
# step() must not be called on an episode that has already finished, so the
# environment is reset before the first step and again whenever an episode ends.
env.reset()
for _ in range(1000):
    random_action = env.action_space.sample()
    _obs, _reward, episode_over, _info = env.step(random_action)
    env.render()
    if episode_over:
        env.reset()

env.close()



## Playing game with random strategy

In [10]:
# Play a fixed number of episodes with a purely random policy.
N_RANDOM_EPISODES = 20  # how many episodes to play
MAX_RANDOM_STEPS = 50   # step cap per episode

for e in range(N_RANDOM_EPISODES):
    observation = env.reset()
    for t in range(MAX_RANDOM_STEPS):
        env.render()
        action = env.action_space.sample()
        observation, reward, done, other_info = env.step(action)

        if done:
            # Game episode is over.
            break

    # Report every episode, including those that reach the step cap without
    # terminating (such episodes previously printed nothing).
    print("Game Episode :{}/{} High Score :{}".format(e, N_RANDOM_EPISODES, t))
env.close()
        

Game Episode :0/20 High Score :17
Game Episode :1/20 High Score :10
Game Episode :2/20 High Score :43
Game Episode :3/20 High Score :42
Game Episode :4/20 High Score :15
Game Episode :5/20 High Score :25
Game Episode :6/20 High Score :29
Game Episode :7/20 High Score :26
Game Episode :8/20 High Score :17
Game Episode :9/20 High Score :11
Game Episode :10/20 High Score :46
Game Episode :11/20 High Score :24
Game Episode :12/20 High Score :12
Game Episode :13/20 High Score :12
Game Episode :14/20 High Score :27
Game Episode :15/20 High Score :16
Game Episode :16/20 High Score :11
Game Episode :17/20 High Score :37
Game Episode :18/20 High Score :23
Game Episode :19/20 High Score :12


## Q Learning

In [11]:
# Goal is to maximize the total reward

In [13]:
# Q is a function that measures the quality (expected total reward) of taking an action in a state. Its parameters are a state and an action.

In [14]:
# We will discount future rewards by a factor gamma, so that rewards further in the future count for less.

### Agent Design and Neural Model

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random

Using TensorFlow backend.


In [9]:
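# NOTE: earlier draft of the Agent class, kept commented out for reference only;
# the working definition is in the next cell.
# class Agent: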
#     def __init__(self,state_size,action_size):
#         self.state_size= state_size
#         self.action_size=action_size
#         self.memory= deque(maxlen=2000) #agent needs memory as we dont have training data.
#                                         # we will store past experiences of agent to play game. #doubly ended queue.
#         self.gamma= 0.95 #Discount factor
#         #Exploration vs Exploitation trade off
#         # Exploration : (random) is good in the beginning --> helps you to try various random things
#         #Exploitation : Sample Good experience from the past(memory) --> good in the end
        
#         self.epsilon=1.0 # 100% Random exploration in the beginning
#         self.epsilon_decay= 0.995 # as we play more, random exploration decreases and we start trusting our knowledge.
#         self.epsilon_min=0.01
#         self.learning_rate= 0.001
#         self.model= self._create_model()
        
#     def _create_model(self):
#         model= Sequential()
#         model.add(Dense(24,input_dim=self.state_size,activation='relu'))
#         model.add(Dense(24,activation='relu'))
#         model.add(Dense(self.action_size,activation='linear'))
#         model.compile(loss='mse',optimizer=Adam(lr=0.001)) #mse as output is linear 
#         return model
    
#     def remember(self,state,action,reward,next_state,done):
#         self.memory.append(state,action,reward,next_state,done)
        
#     def act(self,state):
#         # Sampling according to the Epsilon Greedy Method
#         if np.random.randn()<= self.epsilon():
            
#             return random.randrange(self.action_size)
        
#         # Ask Neural Network to give most suitable
            
#         return np.argmax(model.predict(state)[0])
        
        
#     def train(self,batch_size):
#         # Training using a replay buffer
        
#         minibatch= random.sample(self.memory,batch_size)
        
#         for experience in minibatch:
#             state,action,reward,next_state,done= experience
            
#             # X,Y : state, expected reward
            
#             if not done:
#                 # game is not yet over, bellman eqn to approx. the target value of reward
                
#                 target= reward + self.gamma*np.amax(self.model.predict(next_state)[0])
#             else:
                
#                 target = reward
            
#             target_f = self.model.predict(state)
#             target[0][action]= target
            
#             #X = state, Y= target_f
            
#             self.model.fit(state,target_f,epochs=1,verbose=0)
        
#         if self.epsilon> self.epsilon_min:
#             self.epsilon*= self.epsilon_decay
    
#     def load(self,name):
        
#         self.model.load_weights(name)

#     def save(self,name):
#         self.model.save_weight(name)
        
        
            
    
            
        
    


        
        

In [10]:
class Agent:
    """Deep Q-learning agent with an epsilon-greedy policy and a replay memory.

    The agent stores (state, action, reward, next_state, done) transitions in a
    bounded deque and fits a small dense network on minibatches sampled from it.
    """

    def __init__(self, state_size, action_size, memory_size=2000, gamma=0.95,
                 epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01,
                 learning_rate=0.001):
        """Set up hyperparameters, the replay memory and the Q-network.

        Args:
            state_size: number of input features of the network.
            action_size: number of actions, i.e. network outputs.
            memory_size: maximum number of transitions kept in the replay memory.
            gamma: discount factor applied to the best next-state Q-value.
            epsilon: initial exploration rate (probability of a random action).
            epsilon_decay: factor epsilon is multiplied by after each train() call.
            epsilon_min: lower bound that epsilon is not decayed below.
            learning_rate: learning rate passed to the Adam optimizer.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.learning_rate = learning_rate
        self.model = self._create_model()

    def _create_model(self):
        """Build the neural network that approximates the Q-value function."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))  # 1st hidden layer
        model.add(Dense(24, activation='relu'))  # 2nd hidden layer
        # Linear output: one unbounded Q-value estimate per action.
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for `state` with the epsilon-greedy rule.

        With probability epsilon a random action index is returned; otherwise
        the index of the largest predicted Q-value is returned.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def train(self, batch_size=32):
        """Fit the network on `batch_size` transitions sampled from memory.

        Does nothing when the memory holds fewer than `batch_size` transitions
        (sampling would otherwise raise ValueError). After a training pass,
        epsilon is decayed once and clamped at epsilon_min.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            if not done:
                # Bellman target: immediate reward plus discounted best next Q-value.
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            else:
                # Terminal transition: there is no future reward to add.
                target = reward
            # Only the taken action's output is changed, so the other outputs
            # contribute zero error to the loss.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so that the decay step never pushes epsilon below its floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file `name`."""
        self.model.save_weights(name)

In [11]:
n_episodes= 600  # number of training episodes to play
output_dir= "cartpole_model/"  # path prefix under which weight checkpoints are saved

In [12]:
# Problem dimensions and replay minibatch size, defined once and reused below.
state_size, action_size, batch_size = 4, 2, 32

# Build the agent from those dimensions; `done` is the episode-finished flag.
agent = Agent(state_size=state_size, action_size=action_size)
done = False


In [13]:

# Train the agent: play n_episodes episodes, store every transition in the
# replay memory, and fit the network on one sampled minibatch per episode.

# Make sure the checkpoint directory exists before the first save.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

try:
    for e in range(n_episodes):
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        # Named `step` rather than `time` so the loop index does not shadow
        # the name of the standard-library module.
        for step in range(5000):
            env.render()
            action = agent.act(state)  # action is 0 or 1
            next_state, reward, done, other_info = env.step(action)
            # Penalise the transition that ends the episode.
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state

            if done:
                print("Game Episode :{}/{}, High Score:{},Exploration Rate:{:.2}".format(e, n_episodes, step, agent.epsilon))
                break

        # Train only once the memory holds more than one minibatch of transitions.
        if len(agent.memory) > batch_size:
            agent.train(batch_size)

        # Checkpoint the weights every 50 episodes.
        if e % 50 == 0:
            agent.save(output_dir + "weights_" + '{:04d}'.format(e) + ".hdf5")
finally:
    # Close the render window even if training is interrupted or fails.
    env.close()

Game Episode :0/600, High Score:29,Exploration Rate:1.0
Game Episode :1/600, High Score:41,Exploration Rate:1.0
Game Episode :2/600, High Score:12,Exploration Rate:0.99
Game Episode :3/600, High Score:11,Exploration Rate:0.99
Game Episode :4/600, High Score:15,Exploration Rate:0.99
Game Episode :5/600, High Score:11,Exploration Rate:0.98
Game Episode :6/600, High Score:14,Exploration Rate:0.98
Game Episode :7/600, High Score:12,Exploration Rate:0.97
Game Episode :8/600, High Score:15,Exploration Rate:0.97
Game Episode :9/600, High Score:23,Exploration Rate:0.96
Game Episode :10/600, High Score:14,Exploration Rate:0.96
Game Episode :11/600, High Score:20,Exploration Rate:0.95
Game Episode :12/600, High Score:23,Exploration Rate:0.95
Game Episode :13/600, High Score:51,Exploration Rate:0.94
Game Episode :14/600, High Score:26,Exploration Rate:0.94
Game Episode :15/600, High Score:11,Exploration Rate:0.93
Game Episode :16/600, High Score:14,Exploration Rate:0.93
Game Episode :17/600, High

In [None]:
`