In [1]:
import gym
import numpy as np
from collections import deque

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from keras import backend as K

import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Shared Gym environment instance. Later cells read its observation/action
# spaces to size the network and step it inside the training loop.
env = gym.make('CartPole-v0')

In [3]:
# --- Training schedule -------------------------------------------------------
EPISODES = 300    # number of episodes the training loop runs (warm-up included)
MAX_STEPS = 200   # hard cap on environment steps within a single episode
ALPHA = 0.99      # step size applied to the clipped TD error when building targets
GAMMA = 0.99      # discount factor applied to the bootstrapped next-state value

# --- Epsilon-greedy exploration ----------------------------------------------
# explore_rate = FINAL + (INIT - FINAL) * exp(-DECAY * training_episode_index)
EXPLORE_INIT = 1.0
EXPLORE_FINAL = 0.01
EXPLORE_DECAY = 0.01

# --- Replay memory -----------------------------------------------------------
MEMORY_SIZE = 5000         # maximum number of transitions kept in the buffer
MEMORY_START_SIZE = 1000   # transitions required before any gradient update runs
# Backward-compatible alias: the original, misspelled name is still referenced
# by the training cell, so it must keep resolving to the same value.
MEMORT_START_SIZE = MEMORY_START_SIZE

# --- Shapes ------------------------------------------------------------------
BATCH_SIZE = 32                                # transitions sampled per update
STATE_SIZE = env.observation_space.shape[0]    # length of one observation vector
ACTION_SIZE = env.action_space.n               # number of discrete actions

In [4]:
def create_Q_model(learning_rate=0.001):
    """Build and compile the fully connected Q-value network.

    The network maps a state vector of length ``STATE_SIZE`` through two
    32-unit ReLU layers to ``ACTION_SIZE`` linear outputs (one per action).
    Every layer uses the ``glorot_normal`` kernel initializer.

    Args:
        learning_rate: Step size handed to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model (MSE loss, Adam optimizer).
    """
    initializer = 'glorot_normal'

    q_network = Sequential([
        Dense(32, activation='relu', kernel_initializer=initializer, input_shape=(STATE_SIZE,)),
        Dense(32, activation='relu', kernel_initializer=initializer),
        # Linear head: raw, unbounded Q-value estimates.
        Dense(ACTION_SIZE, activation='linear', kernel_initializer=initializer),
    ])
    q_network.compile(loss='mse', optimizer=Adam(lr=learning_rate))

    return q_network

In [5]:
class Memory():
    """Bounded FIFO store of experiences with uniform random sampling."""

    def __init__(self, max_size):
        """Create an empty buffer that retains at most ``max_size`` items."""
        # A deque with maxlen drops its oldest entry once it is full.
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the newest end of the buffer."""
        self.buffer.append(experience)

    def size(self):
        """Return how many experiences are currently stored."""
        return len(self.buffer)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences, chosen uniformly.

        Sampling is without replacement, so NumPy raises ``ValueError`` when
        ``batch_size`` exceeds the number of stored experiences.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)

        picked = []
        for position in chosen:
            picked.append(self.buffer[position])
        return picked

In [6]:
class TensorBoard:
    """Minimal scalar logger that writes TensorFlow summary event files."""

    def __init__(self, log_dir='./logs', write_graph=False):
        """Open a summary writer on ``log_dir``.

        Args:
            log_dir: Directory handed to ``tf.summary.FileWriter``.
            write_graph: When true, the Keras session graph is passed to the
                writer as well.
        """
        # TensorFlow is imported on first construction and published as the
        # module-level name ``tf``, which ``save`` relies on.
        global tf
        import tensorflow as tf

        self.log_dir = log_dir
        self.write_graph = write_graph
        self.sess = K.get_session()

        writer_args = [self.log_dir]
        if self.write_graph:
            writer_args.append(self.sess.graph)
        self.writer = tf.summary.FileWriter(*writer_args)

    def save(self, steps, logs):
        """Write every scalar in ``logs`` at global step ``steps`` and flush.

        Args:
            steps: Step index recorded with each summary.
            logs: Mapping of tag name to scalar value. The keys ``'batch'``
                and ``'size'`` are skipped.
        """
        ignored_tags = ('batch', 'size')

        for tag, scalar in logs.items():
            if tag in ignored_tags:
                continue
            # One Summary proto (holding a single value) per tag.
            record = tf.Summary()
            record.value.add(tag=tag, simple_value=scalar)
            self.writer.add_summary(record, steps)

        self.writer.flush()

    def close(self):
        """Close the underlying summary writer."""
        self.writer.close()

In [None]:
# Instantiate the Q-network with its default learning rate and a replay
# buffer capped at MEMORY_SIZE experiences.
model = create_Q_model()
memory = Memory(max_size = MEMORY_SIZE)

In [None]:
def _fit_minibatch(q_model, minibatch):
    """Run one gradient update on a sampled minibatch and return its loss.

    Each transition is ``(state, action, reward, terminal, next_state)`` with
    ``state`` / ``next_state`` shaped ``(1, STATE_SIZE)``.

    For a non-terminal transition the target for the taken action is
    ``Q(s, a) + ALPHA * clip(r + GAMMA * max_a' Q(s', a') - Q(s, a), -1, 1)``;
    for a terminal transition it is the raw reward. All other actions keep
    their current prediction so they contribute no error.

    Args:
        q_model: Compiled Keras model mapping states to per-action Q-values.
        minibatch: Sequence of transitions as described above.

    Returns:
        The loss reported by Keras for this update.
    """
    states, actions, rewards, terminals, next_states = zip(*minibatch)
    states = np.vstack(states)
    next_states = np.vstack(next_states)
    actions = np.asarray(actions)
    rewards = np.asarray(rewards, dtype=np.float64)
    terminals = np.asarray(terminals, dtype=bool)

    # Two batched forward passes instead of two single-sample predict() calls
    # per transition: same values, a fraction of the call overhead.
    targets = q_model.predict(states).astype(np.float64)
    next_q_max = q_model.predict(next_states).astype(np.float64).max(axis=1)

    rows = np.arange(len(minibatch))
    current_q = targets[rows, actions]
    td_error = rewards + GAMMA * next_q_max - current_q
    blended = current_q + ALPHA * np.clip(td_error, -1, 1)
    targets[rows, actions] = np.where(terminals, rewards, blended)

    # batch_size is explicit so this stays a single gradient step even if
    # BATCH_SIZE is changed away from Keras' default of 32.
    history = q_model.fit(states, targets, batch_size=len(minibatch), epochs=1, verbose=0)
    return history.history['loss'][-1]


explore_rate = EXPLORE_INIT
step = 0             # gradient updates performed so far (x-axis of the loss curve)
ep = 0               # index of the current training episode; warm-up episodes are not counted
begin_train = False  # flips to True once the replay memory holds enough transitions

tensorboard = TensorBoard()
print("start")

# try/finally: the event-file writer is closed even if the cell is interrupted
# or an exception escapes the loop.
try:
    for run_ep in range(0, EPISODES):
        state = env.reset()
        state = np.expand_dims(state, axis=0)

        total_reward = 0   # accumulated per episode; not consumed by this cell
        episode_steps = 0

        if begin_train:
            ep += 1

        explore_rate = EXPLORE_FINAL + (EXPLORE_INIT - EXPLORE_FINAL) * np.exp(-EXPLORE_DECAY * ep)

        for step_t in range(MAX_STEPS):
            # Act randomly during warm-up, epsilon-greedily afterwards.
            if begin_train and np.random.rand() > explore_rate:
                action = np.argmax(model.predict(state)[0])
            else:
                action = env.action_space.sample()

            next_state, reward, done, info = env.step(action)
            next_state = np.expand_dims(next_state, axis=0)

            # An episode cut off by Gym's time limit is not a failure state:
            # storing it as terminal would zero the bootstrap term for exactly
            # the states the agent should value most. Gym versions that do not
            # report 'TimeLimit.truncated' fall back to the plain `done` flag.
            terminal = done and not info.get('TimeLimit.truncated', False)

            memory.add((state, action, reward, terminal, next_state))
            state = next_state

            total_reward += reward
            episode_steps = step_t + 1

            if memory.size() >= MEMORT_START_SIZE:
                begin_train = True
                loss = _fit_minibatch(model, memory.sample(BATCH_SIZE))
                step += 1
                tensorboard.save(step, {'loss': loss})

            if done:
                break

        # Logged after the inner loop so that an episode ending on the
        # MAX_STEPS cap (without `done`) is reported as well.
        if begin_train:
            tensorboard.save(
                ep + 1,
                {'total_steps': episode_steps, 'explore_rate': explore_rate}
            )
            log_message = "Episode {}, Total Steps {}, Explore Rate {}".format(ep + 1, episode_steps, explore_rate)
            print(log_message)

            # Checkpoint every 20 training episodes. Guarded by begin_train
            # because `ep` stays at 0 for the whole warm-up, which would
            # otherwise re-save the untrained model after every warm-up episode.
            if ep % 20 == 0:
                model.save('model.h5')

    # Persist the final weights; the periodic checkpoint only fires when the
    # last episode index happens to be a multiple of 20.
    if begin_train:
        model.save('model.h5')
finally:
    tensorboard.close()

start
Episode 1, Total Steps 12, Explore Rate 1.0
Episode 2, Total Steps 18, Explore Rate 0.9901493354116764
Episode 3, Total Steps 12, Explore Rate 0.9803966865736877
Episode 4, Total Steps 30, Explore Rate 0.970741078213023
Episode 5, Total Steps 16, Explore Rate 0.9611815447608
Episode 6, Total Steps 25, Explore Rate 0.9517171302557069
Episode 7, Total Steps 14, Explore Rate 0.9423468882484062
Episode 8, Total Steps 19, Explore Rate 0.9330698817068888
Episode 9, Total Steps 18, Explore Rate 0.9238851829227694
Episode 10, Total Steps 41, Explore Rate 0.9147918734185159
Episode 11, Total Steps 12, Explore Rate 0.9057890438555999
Episode 12, Total Steps 22, Explore Rate 0.896875793943563
Episode 13, Total Steps 11, Explore Rate 0.888051232349986
Episode 14, Total Steps 28, Explore Rate 0.8793144766113556
Episode 15, Total Steps 17, Explore Rate 0.8706646530448178
Episode 16, Total Steps 24, Explore Rate 0.8621008966608072
Episode 17, Total Steps 15, Explore Rate 0.8536223510765493
Epis

Episode 136, Total Steps 200, Explore Rate 0.2666478580394326
Episode 137, Total Steps 200, Explore Rate 0.26409416918402034
Episode 138, Total Steps 200, Explore Rate 0.26156588995727226
Episode 139, Total Steps 47, Explore Rate 0.2590627675291589
Episode 140, Total Steps 9, Explore Rate 0.2565845515853515
Episode 141, Total Steps 11, Explore Rate 0.2541309943021904
Episode 142, Total Steps 10, Explore Rate 0.25170185032190273
Episode 143, Total Steps 12, Explore Rate 0.24929687672806608
Episode 144, Total Steps 9, Explore Rate 0.246915833021317
Episode 145, Total Steps 12, Explore Rate 0.24455848109530054
Episode 146, Total Steps 10, Explore Rate 0.2422245852128597
Episode 147, Total Steps 10, Explore Rate 0.23991391198246126
Episode 148, Total Steps 11, Explore Rate 0.2376262303348566
Episode 149, Total Steps 9, Explore Rate 0.2353613114999746
Episode 150, Total Steps 12, Explore Rate 0.23311892898404435
Episode 151, Total Steps 200, Explore Rate 0.23089885854694553
Episode 152, Tot

In [None]:
# NOTE(review): prints a single "!" and nothing else; this looks like a
# leftover scratch cell -- confirm and consider deleting it.
print("!")