In [1]:
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Create the CartPole environment; its observation and action spaces are read
# later to size the Q-network's input and output layers.
env = gym.make('CartPole-v0')

In [3]:
# --- Training schedule -------------------------------------------------------
EPISODES = 300   # total environment episodes to run (replay warm-up episodes count too)
MAX_STEPS = 500  # upper bound on the number of steps taken within one episode
GAMMA = 0.99     # discount factor applied to the bootstrapped next-state value

# --- Epsilon-greedy exploration ----------------------------------------------
# explore_rate = EXPLORE_FINAL + (EXPLORE_INIT - EXPLORE_FINAL) * exp(-EXPLORE_DECAY * ep)
EXPLORE_INIT = 1.0
EXPLORE_FINAL = 0.01
EXPLORE_DECAY = 0.01

# --- Replay memory -----------------------------------------------------------
MEMORY_SIZE = 10000        # maximum number of stored experiences
MEMORY_START_SIZE = 5000   # experiences required before training starts
# Misspelled legacy name, kept as an alias so existing cells that reference it
# keep working. Prefer MEMORY_START_SIZE in new code.
MEMORT_START_SIZE = MEMORY_START_SIZE

# --- Network / batch dimensions ----------------------------------------------
BATCH_SIZE = 32
STATE_SIZE = env.observation_space.shape[0]
ACTION_SIZE = env.action_space.n

In [4]:
def create_Q_model(learning_rate=0.01):
    """Build and compile the Q-value network.

    The network maps a state vector of length STATE_SIZE to one linear output
    per action (ACTION_SIZE outputs), through two hidden ReLU layers of 32
    units each. It is compiled with mean-squared-error loss and the Adam
    optimizer.

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = [
        Dense(32, activation='relu', input_shape=(STATE_SIZE,)),
        Dense(32, activation='relu'),
        # Linear head: raw Q-value estimates, one per action.
        Dense(ACTION_SIZE, activation='linear'),
    ]

    q_network = Sequential(layers)
    q_network.compile(loss='mse', optimizer=Adam(lr=learning_rate))

    return q_network

In [5]:
class Memory():
    """Fixed-capacity experience store backed by a ``deque``.

    Once ``max_size`` items are held, appending a new item discards the
    oldest one (standard ``deque(maxlen=...)`` behaviour).
    """

    def __init__(self, max_size):
        """Create an empty store that keeps at most ``max_size`` items."""
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the store."""
        self.buffer.append(experience)

    def size(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen at random.

        Sampling is without replacement, so NumPy raises ``ValueError`` when
        ``batch_size`` exceeds the number of stored experiences.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)

        picked = []
        for pos in chosen:
            picked.append(self.buffer[pos])
        return picked

In [6]:
class TensorBoard:
    """Minimal scalar logger that writes TensorBoard event files.

    Uses the ``tf.summary.FileWriter`` / ``tf.Summary`` API. The instance can
    also be used as a context manager so the writer is closed even when the
    body raises::

        with TensorBoard() as tb:
            tb.save(step, {'loss': 0.1})
    """

    def __init__(self,
                 log_dir='./logs',
                 write_graph=False):
        """Open an event-file writer.

        Args:
            log_dir: Directory handed to ``tf.summary.FileWriter``.
            write_graph: When true, the graph of the Keras backend session is
                passed to the writer as well.
        """
        self.log_dir = log_dir
        self.write_graph = write_graph

        # Fetched unconditionally (as before), even though only the
        # write_graph branch reads it.
        self.sess = K.get_session()

        if self.write_graph:
            self.writer = tf.summary.FileWriter(self.log_dir,
                                                self.sess.graph)
        else:
            self.writer = tf.summary.FileWriter(self.log_dir)

    def __enter__(self):
        """Return the logger itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the writer on leaving the ``with`` block; exceptions propagate."""
        self.close()

    def save(self, steps, logs):
        """Write each entry of ``logs`` as a scalar summary at step ``steps``.

        Args:
            steps: Global step recorded with every summary.
            logs: Mapping of tag name to scalar value. Entries named
                ``'batch'`` or ``'size'`` are skipped.
        """
        for name, value in logs.items():
            if name in ['batch', 'size']:
                continue
            summary = tf.Summary()
            summary_value = summary.value.add()
            # Coerce NumPy scalars to a plain Python float before handing the
            # value to the protobuf field.
            summary_value.simple_value = float(value)
            summary_value.tag = name
            self.writer.add_summary(
                summary,
                steps
            )

        # Flush after every call so the curves can be followed while training runs.
        self.writer.flush()

    def close(self):
        """Close the underlying event-file writer."""
        self.writer.close()

In [7]:
# Replay buffer holding at most MEMORY_SIZE experiences.
memory = Memory(max_size=MEMORY_SIZE)
# Q-network compiled with create_Q_model's default learning rate.
model = create_Q_model()

In [8]:
# DQN training loop.
#
# Phase 1 (warm-up): act randomly until the replay memory holds
# MEMORT_START_SIZE experiences; nothing is trained or logged.
# Phase 2 (training): act epsilon-greedily and run one gradient update on a
# random minibatch after every environment step.
explore_rate = EXPLORE_INIT
step = 0             # gradient updates performed so far (x-axis of the loss curve)
ep = 0               # episodes started since training began (drives epsilon decay)
begin_train = False  # flips to True once the replay memory is warm

tensorboard = TensorBoard()
print("start")

# try/finally so the event-file writer is closed even if training is
# interrupted or an exception escapes the loop.
try:
    for run_ep in range(0, EPISODES):
        state = env.reset()
        state = np.expand_dims(state, axis=0)  # add a batch axis: (1, STATE_SIZE)

        total_reward = 0

        if begin_train:
            ep += 1

        # Exponential decay from EXPLORE_INIT towards EXPLORE_FINAL.
        explore_rate = EXPLORE_FINAL + (EXPLORE_INIT - EXPLORE_FINAL) * np.exp(-EXPLORE_DECAY * ep)

        for time_t in range(MAX_STEPS):
            # Greedy action only once training has started; otherwise explore.
            if begin_train and np.random.rand() > explore_rate:
                action = np.argmax(model.predict(state)[0])
            else:
                action = env.action_space.sample()

            next_state, reward, done, _ = env.step(action)
            next_state = np.expand_dims(next_state, axis=0)

            # env.render()  # uncomment to watch the agent

            memory.add((state, action, reward, done, next_state))
            state = next_state

            total_reward += reward

            if memory.size() >= MEMORT_START_SIZE:
                begin_train = True
                minibatch = memory.sample(BATCH_SIZE)

                # Unzip the minibatch into per-field sequences and stack them,
                # so the network is evaluated twice per step instead of twice
                # per sampled transition.
                states_b, actions_b, rewards_b, dones_b, next_states_b = zip(*minibatch)
                inputs = np.concatenate(states_b, axis=0)            # (BATCH_SIZE, STATE_SIZE)
                next_inputs = np.concatenate(next_states_b, axis=0)  # (BATCH_SIZE, STATE_SIZE)
                actions_b = np.asarray(actions_b, dtype=np.intp)
                rewards_b = np.asarray(rewards_b, dtype=np.float64)
                dones_b = np.asarray(dones_b, dtype=bool)

                # Start from the current predictions so only the taken
                # action's output contributes to the loss.
                targets = np.array(model.predict(inputs), dtype=np.float64)
                next_q_max = np.amax(model.predict(next_inputs), axis=1)

                # Terminal transitions use the bare reward; all others add
                # the discounted best next-state value.
                q_targets = np.where(dones_b, rewards_b, rewards_b + GAMMA * next_q_max)
                targets[np.arange(BATCH_SIZE), actions_b] = q_targets

                history = model.fit(inputs, targets, epochs=1, verbose=0)
                step += 1
                tensorboard.save(step, {'loss': history.history['loss'][-1]})

            if done:
                if begin_train:
                    tensorboard.save(
                        ep + 1,
                        {'total_reward': total_reward, 'explore_rate': explore_rate}
                    )
                    log_message = "Episode {}, Total Reward {}, Explore Rate {}".format(ep + 1, total_reward, explore_rate)
                    print(log_message)
                break

        # Checkpoint every 20 training episodes. During warm-up `ep` stays 0,
        # so without the begin_train guard an untrained model would be written
        # after every warm-up episode.
        if begin_train and ep % 20 == 0:
            model.save('model.h5')

    # Always persist the final weights; the last episode index is usually not
    # a multiple of 20.
    model.save('model.h5')
finally:
    tensorboard.close()

start
Episode 1, Total Reward 10.0, Explore Rate 1.0
Episode 2, Total Reward 12.0, Explore Rate 0.9901493354116764
Episode 3, Total Reward 26.0, Explore Rate 0.9803966865736877
Episode 4, Total Reward 17.0, Explore Rate 0.970741078213023
Episode 5, Total Reward 16.0, Explore Rate 0.9611815447608
Episode 6, Total Reward 39.0, Explore Rate 0.9517171302557069
Episode 7, Total Reward 18.0, Explore Rate 0.9423468882484062
Episode 8, Total Reward 38.0, Explore Rate 0.9330698817068888
Episode 9, Total Reward 26.0, Explore Rate 0.9238851829227694
Episode 10, Total Reward 22.0, Explore Rate 0.9147918734185159
Episode 11, Total Reward 15.0, Explore Rate 0.9057890438555999
Episode 12, Total Reward 13.0, Explore Rate 0.896875793943563
Episode 13, Total Reward 12.0, Explore Rate 0.888051232349986
Episode 14, Total Reward 37.0, Explore Rate 0.8793144766113556
Episode 15, Total Reward 15.0, Explore Rate 0.8706646530448178
Episode 16, Total Reward 12.0, Explore Rate 0.8621008966608072
Episode 17, Tota

In [9]:
# NOTE(review): prints a single "!"; looks like a leftover scratch/marker cell.
# Confirm whether it can be removed.
print("!")

!
