In [1]:
import gym
import keras
import random
import math
import numpy as np
from collections import deque

Using TensorFlow backend.


In [2]:
# Training hyperparameters
n_episodes = 1000      # maximum number of training episodes
n_win_ticks = 195      # mean survival time over the last 100 episodes that counts as solved
max_env_steps = None   # optional per-episode step limit; None keeps the environment's default

gamma = 1.0            # discount applied to the bootstrapped next-state value
epsilon = 1.0          # exploration rate: upper bound used by the epsilon schedule
epsilon_decay = 0.995  # factor used by the epsilon schedule and decay step
epsilon_min = 0.01     # lower bound on the exploration rate
alpha = 0.01           # learning rate handed to the Adam optimizer
alpha_decay = 0.01     # learning-rate decay handed to the Adam optimizer
batch_size = 64        # number of transitions sampled per replay step
monitor = False        # NOTE(review): not referenced by the cells visible here
quiet = False          # when True, progress messages are not printed


# Environment and replay memory
memory = deque(maxlen=100000)  # bounded buffer of (state, action, reward, next_state, done)
env = gym.make("CartPole-v0")


# gym's TimeLimit wrapper keeps its limit in `_max_episode_steps`; assigning to
# `max_episode_steps` only creates an unused attribute and leaves the limit as is.
if max_env_steps is not None:
    env._max_episode_steps = max_env_steps

In [3]:
# Q-network: maps a 4-value observation to one Q-value estimate per action (2 actions).
model = keras.models.Sequential()

model.add(keras.layers.Dense(48, input_dim=4, activation='relu'))
model.add(keras.layers.Dense(64, activation='relu'))
# The output layer is linear: Q-values are unbounded regression targets, and a
# ReLU here would clip them at zero and can leave both outputs stuck at 0.
model.add(keras.layers.Dense(2, activation='linear'))

# Mean-squared error between predicted and target Q-values, optimised with Adam.
model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr=alpha, decay=alpha_decay))







In [4]:
def remember(state, action, reward, next_state, done):
    """Append one (state, action, reward, next_state, done) transition to the replay memory."""
    transition = (state, action, reward, next_state, done)
    memory.append(transition)

def choose_action(state, epsilon):
    """Pick an action epsilon-greedily.

    With probability `epsilon` a random action is sampled from the environment's
    action space; otherwise the index of the largest model prediction for
    `state` is returned.
    """
    explore = np.random.random() <= epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(model.predict(state))

def get_epsilon(t):
    """Return the exploration rate for episode index `t`.

    The rate follows 1 - log10((t + 1) * epsilon_decay), capped above by
    `epsilon` and below by `epsilon_min`.
    """
    scheduled = 1.0 - math.log10((t + 1) * epsilon_decay)
    capped = min(epsilon, scheduled)
    return max(epsilon_min, capped)

def preprocess_state(state):
    """Reshape a 4-value observation into a (1, 4) array (a batch of one)."""
    batch_shape = [1, 4]
    return np.reshape(state, batch_shape)

def replay(batch_size, epsilon):
    """Fit the Q-network on a random minibatch drawn from the replay memory.

    For every sampled transition the target for the action that was taken is
    `reward` when the episode ended, otherwise
    `reward + gamma * max(Q(next_state))`; the targets for the other action are
    left at the network's current prediction.

    Args:
        batch_size: Maximum number of transitions to sample.
        epsilon: Current exploration rate.

    Returns:
        `epsilon` multiplied by `epsilon_decay` if it is still above
        `epsilon_min`, otherwise `epsilon` unchanged. The value is only
        returned; a caller has to store it for the decay to take effect.
    """
    # Nothing to learn from yet; fitting on empty arrays would fail.
    if not memory:
        return epsilon

    minibatch = random.sample(memory, min(len(memory), batch_size))

    # Each stored state is a (1, 4) array, so stacking yields (n, 4) batches and
    # the network is queried twice in total instead of twice per transition.
    states = np.vstack([transition[0] for transition in minibatch])
    next_states = np.vstack([transition[3] for transition in minibatch])

    y_batch = model.predict(states)
    next_q = model.predict(next_states)

    for row, (_, action, reward, _, done) in enumerate(minibatch):
        y_batch[row][action] = reward if done else reward + gamma * np.max(next_q[row])

    model.fit(states, y_batch, batch_size=len(states), verbose=0)

    # Decay towards the floor; comparing against epsilon_decay (0.995) would
    # stop the decay after a single step.
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay
    return epsilon

In [5]:
def run(render=True):
    """Train the agent for up to `n_episodes` episodes.

    Each episode is played to completion with epsilon-greedy actions while every
    transition is stored in the replay memory; one replay step is performed at
    the end of each episode that does not end training.

    Args:
        render: When True (the default), the environment is rendered after
            every step. Pass False to train without rendering.

    Returns:
        `e - 100` when the mean survival time over the last 100 episodes
        reaches `n_win_ticks` at episode index `e >= 100`; otherwise the index
        of the last episode played (0 if no episode was played).
    """
    scores = deque(maxlen=100)
    e = 0  # keeps the final report well-defined when n_episodes is 0

    for e in range(n_episodes):
        state = preprocess_state(env.reset())
        done = False
        i = 0  # ticks survived in this episode
        while not done:
            action = choose_action(state, get_epsilon(e))
            next_state, reward, done, info = env.step(action)
            if render:
                env.render()
            next_state = preprocess_state(next_state)
            remember(state, action, reward, next_state, done)
            state = next_state
            i += 1

        scores.append(i)
        mean_score = np.mean(scores)

        if mean_score >= n_win_ticks and e >= 100:
            if not quiet: print ("Run {} episodes. Solved after {} trails".format(e,e-100))
            return e-100
        if e%20 == 0 and not quiet:
            print ("[Episode {}] - Mean survival time over last 100 episodes was {} ticks".format(e, mean_score))
        replay(batch_size, epsilon)
    if not quiet: print ("Did not solve after {} episodes".format(e))
    return e

In [6]:
# Train the agent; the cell's value is run()'s return: e - 100 if solved, else the last episode index.
run()

[Episode 0] - Mean survival time over last 100 episodes was 17.0 ticks








[Episode 20] - Mean survival time over last 100 episodes was 21.761904761904763 ticks
[Episode 40] - Mean survival time over last 100 episodes was 23.121951219512194 ticks
[Episode 60] - Mean survival time over last 100 episodes was 22.442622950819672 ticks
[Episode 80] - Mean survival time over last 100 episodes was 19.296296296296298 ticks
[Episode 100] - Mean survival time over last 100 episodes was 19.85 ticks
[Episode 120] - Mean survival time over last 100 episodes was 20.8 ticks
[Episode 140] - Mean survival time over last 100 episodes was 22.31 ticks
[Episode 160] - Mean survival time over last 100 episodes was 21.47 ticks
[Episode 180] - Mean survival time over last 100 episodes was 24.83 ticks
[Episode 200] - Mean survival time over last 100 episodes was 26.21 ticks
[Episode 220] - Mean survival time over last 100 episodes was 25.87 ticks
[Episode 240] - Mean survival time over last 100 episodes wa

999

In [7]:
# Close the environment once training has finished.
env.close()