In [16]:
import gym

In [17]:
import tensorflow as tf

In [3]:
# Smoke test: drive CartPole with random actions for 1000 rendered steps.
env = gym.make('CartPole-v0')
try:
    env.reset()
    for _ in range(1000):
        env.render()
        observation, reward, done, info = env.step(env.action_space.sample()) # take a random action
        if done:
            # The episode is over; start a new one instead of stepping a finished env.
            env.reset()
finally:
    # Always release the render window, even if the cell is interrupted.
    env.close()



In [4]:
# Run 20 random-policy episodes (capped at 100 steps each), printing every
# observation and how long each episode lasted.
env = gym.make('CartPole-v0')
try:
    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Close the render window even when the cell is interrupted or raises.
    env.close()

[-0.00987965 -0.03336668  0.04327066 -0.02374189]
[-0.01054698  0.16110888  0.04279582 -0.30246457]
[-0.0073248   0.35559559  0.03674653 -0.58134941]
[-2.12893034e-04  1.59978549e-01  2.51195415e-02 -2.77320878e-01]
[ 0.00298668 -0.03549259  0.01957312  0.0231777 ]
[ 0.00227683 -0.23088969  0.02003668  0.32197134]
[-0.00234097 -0.42629116  0.0264761   0.6209051 ]
[-0.01086679 -0.62177266  0.03889421  0.92180764]
[-0.02330224 -0.42719723  0.05733036  0.64159725]
[-0.03184619 -0.23291934  0.0701623   0.36750509]
[-0.03650458 -0.42896449  0.07751241  0.68146009]
[-0.04508387 -0.62507251  0.09114161  0.99750471]
[-0.05758532 -0.82128705  0.1110917   1.31736349]
[-0.07401106 -1.01762487  0.13743897  1.6426486 ]
[-0.09436355 -0.82435405  0.17029194  1.39575612]
[-0.11085064 -0.63171013  0.19820707  1.16079424]
Episode finished after 16 timesteps
[ 0.01975264 -0.00757264  0.03260692 -0.00978246]
[ 0.01960119  0.18706688  0.03241127 -0.29200181]
[ 0.02334253  0.38171207  0.02657124 -0.57428901

In [8]:
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

In [9]:
class Agent:
    """Q-learning agent backed by a small dense network and a replay memory.

    Parameters
    ----------
    state_size : int
        Number of features in one observation (input width of the network).
    action_size : int
        Number of discrete actions (output width of the network).
    """

    def __init__(self,state_size,action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000) # replay buffer; oldest experiences fall off first
        self.gamma = 0.95 # Discount factor
        self.epsilon = 1.0 # Exploration rate: probability of acting randomly
        self.epsilon_decay = 0.995 # multiplicative decay applied once per train() call
        self.epsilon_min = 0.01 # decay stops once epsilon is at or below this value
        self.learning_rate = 0.001
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the network that approximates the Q-value function."""
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu')) # 1st hidden layer
        model.add(Dense(24,activation='relu')) # 2nd hidden layer
        model.add(Dense(self.action_size,activation='linear')) # one Q-value per action
        # `learning_rate` is the supported keyword; `lr` is deprecated in tf.keras.
        model.compile(loss='mse',optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one (state, action, reward, next_state, done) transition to memory."""
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
        """Choose an action for `state` using an epsilon-greedy policy.

        With probability `epsilon` a random action index is returned; otherwise
        the index of the largest predicted Q-value. `state` is passed straight
        to `model.predict` (the notebook reshapes it to (1, state_size) first).
        """
        # Exploration vs exploitation
        if np.random.rand()<=self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 keeps per-call progress bars out of the notebook output
        act_values = self.model.predict(state,verbose=0)
        return np.argmax(act_values[0])

    def train(self,batch_size=32):
        """Fit the network on `batch_size` transitions sampled from memory.

        Does nothing (and leaves `epsilon` untouched) while memory holds fewer
        than `batch_size` transitions. After a training pass, `epsilon` is
        decayed once if it is still above `epsilon_min`.
        """
        if len(self.memory) < batch_size:
            # random.sample would raise ValueError on an undersized population
            return

        minibatch = random.sample(self.memory,batch_size)
        for state,action,reward,next_state,done in minibatch:
            if not done:
                # Bellman target: reward plus discounted best predicted next Q-value
                target = reward + self.gamma*np.amax(self.model.predict(next_state,verbose=0)[0])
            else:
                # Terminal transition: there is no future reward to bootstrap from
                target = reward
            # Start from the current predictions so only the taken action's
            # Q-value contributes to the loss.
            target_f = self.model.predict(state,verbose=0)
            target_f[0][action] = target
            # Single epoch with x = state, y = target_f; the mse loss compares
            # target_f against the network's current prediction for state.
            self.model.fit(state,target_f,epochs=1,verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self,name):
        """Load network weights from the file path `name`."""
        self.model.load_weights(name)

    def save(self,name):
        """Save network weights to the file path `name`."""
        self.model.save_weights(name)

In [13]:
# Run configuration
n_episodes = 1_000                # number of training episodes to play
output_dir = "cartpole_model/"    # prefix used for saved weight files

In [14]:
# Problem dimensions and replay batch size
state_size = 4     # values per observation
action_size = 2    # discrete actions available to the agent
batch_size = 32    # transitions sampled per training pass

agent = Agent(state_size=state_size, action_size=action_size)
done = False

In [15]:
# Train the agent: play n_episodes episodes, store every transition, run one
# replay-training pass after each episode, and checkpoint every 50 episodes.
env = gym.make('CartPole-v0') # earlier cells closed their env, so build a fresh one here
os.makedirs(output_dir, exist_ok=True) # make sure the checkpoint directory exists before saving
agent = Agent(state_size, action_size) # initialise agent
done = False
try:
    for e in range(n_episodes):
        state = env.reset()
        state = np.reshape(state,[1,state_size]) # add a leading batch axis for the network

        for time in range(5000):
            env.render()
            action = agent.act(state) # action is 0 or 1
            next_state,reward,done,other_info = env.step(action)
            reward = reward if not done else -10 # penalise the step that ends the episode
            next_state = np.reshape(next_state,[1,state_size])
            agent.remember(state,action,reward,next_state,done)
            state = next_state

            if done:
                print("Game Episode :{}/{}, High Score:{},Exploration Rate:{:.2}".format(e,n_episodes,time,agent.epsilon))
                break

        # Only train once memory holds more than one batch of transitions.
        if len(agent.memory)>batch_size:
            agent.train(batch_size)

        if e%50==0:
            agent.save(os.path.join(output_dir,"weights_"+'{:04d}'.format(e)+".hdf5"))
finally:
    # Release the render window even if training is interrupted or fails.
    env.close()

Game Episode :0/1000, High Score:12,Exploration Rate:1.0
Game Episode :1/1000, High Score:29,Exploration Rate:1.0
Game Episode :2/1000, High Score:15,Exploration Rate:0.99
Game Episode :3/1000, High Score:36,Exploration Rate:0.99
Game Episode :4/1000, High Score:11,Exploration Rate:0.99
Game Episode :5/1000, High Score:18,Exploration Rate:0.98
Game Episode :6/1000, High Score:23,Exploration Rate:0.98
Game Episode :7/1000, High Score:12,Exploration Rate:0.97
Game Episode :8/1000, High Score:41,Exploration Rate:0.97
Game Episode :9/1000, High Score:25,Exploration Rate:0.96
Game Episode :10/1000, High Score:35,Exploration Rate:0.96
Game Episode :11/1000, High Score:21,Exploration Rate:0.95
Game Episode :12/1000, High Score:33,Exploration Rate:0.95
Game Episode :13/1000, High Score:28,Exploration Rate:0.94
Game Episode :14/1000, High Score:18,Exploration Rate:0.94
Game Episode :15/1000, High Score:23,Exploration Rate:0.93
Game Episode :16/1000, High Score:16,Exploration Rate:0.93
Game Epis