In [1]:
import time
import collections
import random
import numpy as np
import gym
import keras.layers
import keras.models
from keras import backend as K

Using TensorFlow backend.


In [2]:
def env_create():
    """Return a freshly constructed gym 'Boxing-ram-v0' environment."""
    return gym.make('Boxing-ram-v0')
# Score compared against the mean evaluation result by the training cell's
# early-exit check.
env_max_possible_score = -1
# Shared environment instance used by the cells below.
env = env_create()
# Show one initial observation followed by the observation and action spaces.
print(env.reset(), env.observation_space, env.action_space, sep='\n')

[ 63  12   0  40 214   3  14 244  16 244  13 244  15 244 255 160  27  89
   0   0  59   0   0   0   8   0   8   0   8   0   8   0  30 109   4  87
   0 245   0 247   0 245   0 246   0 255  72   0  18  32  20   2   0   0
   0   0   0   0   0   0   0   0   0   4  87   4  21  38   4  87 104 121
  87  95 159 191   0  95 159 191   0   0   0   0   0 208   0  43   0   1
  62   0   0 136   3   2   0  30   4 128   0 128   0 128   0 167   0 128
   0 128   0 254   0 167   0   0   0 120 243 109  86 243 104 120  63 246
 238 240]
Box(128,)
Discrete(18)


In [3]:
# One replay transition: the state before the action, the state after it, the
# action taken, the reward received and the discount applied to the bootstrap.
model_sample = collections.namedtuple(
    'model_sample', 'state state1 action reward gamma')

In [4]:
class model:
    """Pair of action-value networks (trained + target) scoring (states, action).

    Each network maps a variable-length sequence of `input_n`-dimensional
    observations plus a one-hot action of width `output_n` to a single scalar.
    Only the trained network is compiled and fitted; the target network is
    blended toward it after every `train` call.
    """

    def __init__(self, input_n, output_n, tau=0.9):
        """Build both networks and start with the target equal to the trained one.

        Args:
            input_n: size of one observation vector.
            output_n: number of discrete actions (width of the one-hot input).
            tau: weight given to the old target weights at each sync performed
                by `train`; the trained weights receive `1 - tau`.
        """
        self.__input_n = input_n
        self.__output_n = output_n
        self.__tau = tau
        self.__m_train = self.__build_model()
        self.__m_target = self.__build_model()
        self.__m_train.compile('nadam', 'mse')
        self.__m_train.summary()
        # tau=0 copies the trained weights into the target network verbatim.
        self.__sync_target(0.)

    def __build_model(self):
        """Create one (state sequence, one-hot action) -> scalar Keras model."""
        # Per-timestep dense encoder followed by an LSTM that keeps only its
        # final output (return_sequences=False).
        l_state = [
            keras.layers.BatchNormalization(),
            keras.layers.TimeDistributed(
                keras.layers.Dense(128, kernel_initializer='he_uniform')),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.TimeDistributed(
                keras.layers.Dense(128, kernel_initializer='he_uniform')),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.LSTM(128, return_sequences=False)]
        # Small embedding of the one-hot action.
        l_action = [
            keras.layers.BatchNormalization(),
            keras.layers.Dense(32, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu')]
        # Head applied to the concatenated state/action features.
        l_value = [
            keras.layers.Dense(128, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(64, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(1)]

        def apply_layers(x, layers):
            """Feed `x` through `layers` in order and return the final tensor."""
            last_layer = x
            for l in layers:
                last_layer = l(last_layer)
            return last_layer

        # The time dimension is left as None so any sequence length is accepted.
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = apply_layers(m, l_state)
        m_a = m_input_a = keras.layers.Input((self.__output_n,))
        m_a = apply_layers(m_a, l_action)
        m = keras.layers.Concatenate()([m, m_a])
        m = apply_layers(m, l_value)
        return keras.models.Model([m_input, m_input_a], [m])

    def __sync_target(self, tau):
        """Set target weights to `tau * target + (1 - tau) * trained`."""
        w_train = self.__m_train.get_weights()
        w_target = self.__m_target.get_weights()
        w = [tau*x + (1-tau)*y for x, y in zip(w_target, w_train)]
        self.__m_target.set_weights(w)

    def __one_hot(self, indices):
        """Return a float32 one-hot matrix of shape (len(indices), output_n).

        Implemented with NumPy so the class does not rely on `keras.utils`
        being loaded as a side effect of importing other keras submodules.
        """
        return np.identity(self.__output_n, dtype=np.float32)[
            np.asarray(indices, dtype=int)]

    def __get_sample_q(self, samples):
        """Compute bootstrapped regression targets, shape (len(samples), 1).

        The trained network scores every action in each sample's `state1` to
        pick the greedy action; the target network then evaluates that action
        in the same `state1`. The result is `reward + gamma * Q_target`.
        """
        next_states = np.array(
            [s.state1 for s in samples], dtype=np.float32)
        # Score every (next state, action) pair: each next state is repeated
        # output_n times and paired with the rows of an identity matrix.
        q_all = self.__m_train.predict([
            np.repeat(next_states, self.__output_n, axis=0),
            np.tile(np.identity(self.__output_n, dtype=np.float32),
                    (len(samples), 1))], batch_size=64)
        q_all = np.reshape(q_all, (len(samples), self.__output_n))
        best_action = self.__one_hot(np.argmax(q_all, axis=-1))
        # Fix: the bootstrap value must be evaluated on the NEXT state (the
        # one the greedy action was chosen for), not on the current state.
        q = self.__m_target.predict(
            [next_states, best_action], batch_size=64)
        rewards = np.array([s.reward for s in samples])[..., np.newaxis]
        gammas = np.array([s.gamma for s in samples])[..., np.newaxis]
        return rewards + q * gammas

    def train(self, samples, epochs=1, verbose=False):
        """Fit the trained network on `samples`, then blend the target toward it.

        Args:
            samples: sequence of objects with `state`, `state1`, `action`,
                `reward` and `gamma` attributes.
            epochs: number of passes handed to Keras `fit`.
            verbose: forwarded to Keras `fit`.
        """
        a = self.__one_hot([s.action for s in samples])
        q = self.__get_sample_q(samples)
        self.__m_train.fit(
            x=[np.array([s.state for s in samples], dtype=np.float32), a],
            y=q,
            batch_size=64,
            epochs=epochs,
            verbose=verbose)
        self.__sync_target(self.__tau)

    def get_sample_priority(self, samples):
        """Return sampling probabilities proportional to the absolute TD error.

        Returns:
            1-D array with one entry per sample that sums to 1. When every
            error is zero (or the errors are not finite) a uniform
            distribution is returned instead of NaNs. An empty `samples`
            yields an empty array.
        """
        if len(samples) == 0:
            return np.zeros((0,), dtype=np.float64)
        q = self.__get_sample_q(samples)[:, 0]
        t = self.__m_train.predict([
            np.array([s.state for s in samples], dtype=np.float32),
            self.__one_hot([s.action for s in samples])],
            batch_size=64)[:, 0]
        p = np.abs(q - t)
        peak = np.max(p)
        # Guard the normalisation: 0/0 would produce NaN probabilities that
        # np.random.choice rejects.
        if not np.isfinite(peak) or peak <= 0:
            return np.full(p.shape, 1.0 / p.size)
        p = p / peak
        p = p / np.sum(p)
        return p

    def get_action_prob(self, state, verbose=False):
        """Return a one-hot vector marking the action the trained network rates best.

        Args:
            state: sequence of observation vectors (one state history).
            verbose: when true, print the resulting vector.
        """
        action_prob = self.__m_train.predict([
            np.array([state]*self.__output_n, dtype=np.float32),
            np.identity(self.__output_n, dtype=np.float32)])
        best = np.argmax(action_prob[:, 0])
        action_prob = self.__one_hot([best])[0]
        if verbose:
            print(action_prob)
        return action_prob

In [5]:
def play(env, m, max_steps=1000, n_prev_states=8,
         reward_steps=1, gamma=0.98, epsilon=0., verbose=False):
    """Run `m`'s policy in `env` for `max_steps` steps and return replay samples.

    Args:
        env: environment exposing `reset()` and `step(action)`.
        m: object exposing `get_action_prob(states)` returning a probability
            vector over actions.
        max_steps: total number of environment steps (episodes are restarted
            whenever one finishes).
        n_prev_states: length of the state history fed to the model; shorter
            histories are left-padded with all-zero states.
        reward_steps: number of consecutive steps whose discounted rewards
            are summed into each sample.
        gamma: per-step discount factor.
        epsilon: probability mass spread uniformly over all actions.
        verbose: print action-probability spread and the longest episode.

    Returns:
        List of `model_sample` tuples. `state` is the history ending at the
        sampled step, `reward` the accumulated discounted reward, `gamma` the
        accumulated discount (0 once a terminal step is included), and
        `state1` the history ending at the state reached after the last
        accumulated step.
    """
    state_0 = env.reset()
    state_null = np.zeros_like(state_0)
    state_queue = []
    episode = []
    samples = []
    action_probs = []
    gamelen = 0
    gamelen_max = 0

    def get_prev_states(episode, idx, get_state):
        """Return the `n_prev_states` states ending at `idx`, zero-padded on the left."""
        states = [get_state(e)
                  for e in episode[max(0, (idx-n_prev_states)+1):idx+1]]
        states = [state_null]*(max(0, n_prev_states-len(states))) + states
        return states

    def add_to_samples(episode):
        """Convert raw one-step transitions into multi-step samples."""
        for i in range(len(episode)):
            episode_reward = 0
            episode_gamma = 1.
            # Index of the last transition folded into this sample.
            last = i
            for k in range(i, min(i + reward_steps, len(episode))):
                episode_reward += episode[k].reward * episode_gamma
                episode_gamma *= episode[k].gamma * gamma
                last = k
            samples.append(model_sample(
                get_prev_states(episode, i, lambda e: e.state),
                # Fix: the accumulated discount applies to the state reached
                # after the LAST accumulated step, not after step i. With
                # reward_steps=1 this is identical to the previous behaviour.
                get_prev_states(episode, last, lambda e: e.state1),
                episode[i].action,
                episode_reward,
                episode_gamma))

    for _ in range(max_steps):
        state_queue.append(state_0)
        if len(state_queue) > n_prev_states:
            state_queue.pop(0)
        state_queue_padded = \
            [state_null]*(max(0, n_prev_states-len(state_queue))) + state_queue
        action_prob = m.get_action_prob(state_queue_padded)
        action_probs.append(action_prob)
        # Epsilon-greedy: mix the model's distribution with a uniform one.
        action_prob = epsilon/action_prob.shape[-1] + (1-epsilon)*action_prob
        action = int(np.random.choice(
            list(range(action_prob.shape[-1])),
            p=action_prob))
        state_1, reward, done, _ = env.step(action)
        # A stored gamma of 0 on terminal steps cuts the bootstrap off there.
        episode.append(model_sample(
            state_0, state_1, action, reward, 0. if done else 1.))
        state_0 = state_1
        gamelen += 1
        if done:
            add_to_samples(episode)
            episode = []
            state_0 = env.reset()
            state_null = np.zeros_like(state_0)
            state_queue = []
            gamelen_max = max(gamelen_max, gamelen)
            gamelen = 0
    # Flush the unfinished episode left over when max_steps ran out.
    if episode:
        add_to_samples(episode)
        gamelen_max = max(gamelen_max, gamelen)
    if verbose:
        print('std[action_prob]', np.mean(np.std(action_probs, ddof=1, axis=0)))
        print('max game len', gamelen_max)
    return samples

In [6]:
# Reset the Keras backend session before building a fresh pair of networks.
K.clear_session()
m = model(
    input_n=env.observation_space.shape[0],
    output_n=env.action_space.n,
    tau=0.95)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 128)    0                                            
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, None, 128)    512         input_1[0][0]                    
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 128)    16512       batch_normalization_1[0][0]      
__________________________________________________________________________________________________
batch_normalization_2 (BatchNor (None, None, 128)    512         time_distributed_1[0][0]         
__________________________________________________________________________________________________
activation

In [7]:
def test(m, max_steps=10000, render=False, n_prev_states=8):
    """Play one episode with `m` on the global `env` and return the total reward.

    Args:
        m: object exposing `get_action_prob(states)`.
        max_steps: upper bound on the number of environment steps.
        render: when true, render every frame at roughly 60 fps; afterwards
            the global `env` is closed and replaced by a fresh one.
        n_prev_states: length of the state history fed to the model; shorter
            histories are left-padded with all-zero states.

    Returns:
        Sum of the rewards collected until the episode ended or `max_steps`
        was reached.
    """
    global env
    state = env.reset()
    state_null = np.zeros_like(state)
    state_queue = []
    rewards = 0.
    try:
        for _ in range(max_steps):
            state_queue.append(state)
            if len(state_queue) > n_prev_states:
                state_queue.pop(0)
            state_queue_padded = \
                [state_null]*(max(0, n_prev_states-len(state_queue))) + state_queue
            # Fix: feed the padded history (as `play` does); previously the
            # padded list was built but the unpadded queue was passed on.
            action_prob = m.get_action_prob(state_queue_padded)
            action = int(np.random.choice(
                list(range(action_prob.shape[-1])),
                p=action_prob))
            state, reward, done, _ = env.step(action)
            rewards += reward
            if render:
                env.render()
            if done:
                break
            if render:
                time.sleep(1/60)
    finally:
        # Close the rendered environment and recreate it even if the episode
        # is interrupted part-way through.
        if render:
            env.close()
            env = env_create()
    return rewards

In [8]:
# Seed the replay buffer with a long initial rollout.
replays = play(
    env, m, max_steps=30000, n_prev_states=4,
    reward_steps=5, epsilon=0.05, verbose=False)
for i in range(200):
    # Collect fresh experience and keep only the newest 100000 samples.
    samples = play(
        env, m, max_steps=3000, n_prev_states=4,
        reward_steps=5, epsilon=0.05, verbose=True)
    replays.extend(samples)
    if len(replays) > 100000:
        replays = list(replays[-100000:])
    # Draw a prioritized training batch. Indices are sampled and used on the
    # list directly: converting the whole buffer with np.array(replays) builds
    # a large ragged object array and raises ValueError on NumPy >= 1.24.
    sample_idx = np.random.choice(
        len(replays), p=m.get_sample_priority(replays), size=30000)
    samples = [replays[j] for j in sample_idx]
    m.train(samples, epochs=1, verbose=True)
    print('epoch {} completed'.format(i))
    # Evaluate only on every fifth iteration.
    if i % 5 != 4:
        continue
    # `k` (not `i`) so the epoch counter is not shadowed; as before, the first
    # of the three evaluation games is rendered.
    test_result = [test(m, max_steps=3000,
        n_prev_states=4, render=(k == 0)) for k in range(3)]
    test_mean = np.mean(test_result)
    test_std = np.std(test_result, ddof=1)
    print('test result',
          'mean', test_mean,
          'std', test_std)
    if test_mean == env_max_possible_score and test_std == 0:
        print('the network always gets full score, early exit')
        break

std[action_prob] 0.16865271
max game len 2401
Epoch 1/1
epoch 0 completed
std[action_prob] 0.10125823
max game len 2391
Epoch 1/1
epoch 1 completed
std[action_prob] 0.16699557
max game len 2387
Epoch 1/1
epoch 2 completed
std[action_prob] 0.17667902
max game len 2290
Epoch 1/1
epoch 3 completed
std[action_prob] 0.16961138
max game len 2379
Epoch 1/1
epoch 4 completed
test result mean 5.666666666666667 std 8.32666399786453
std[action_prob] 0.18276687
max game len 2383
Epoch 1/1
epoch 5 completed
std[action_prob] 0.20950584
max game len 2396
Epoch 1/1
epoch 6 completed
std[action_prob] 0.18701033
max game len 2393
Epoch 1/1
epoch 7 completed
std[action_prob] 0.21008737
max game len 2380
Epoch 1/1
epoch 8 completed
std[action_prob] 0.22011615
max game len 2354
Epoch 1/1
epoch 9 completed
test result mean 4.0 std 5.5677643628300215
std[action_prob] 0.19591929
max game len 2380
Epoch 1/1
epoch 10 completed


KeyboardInterrupt: 

In [None]:
# Watch one rendered game played by the current network; the cell displays the
# total reward returned by `test`.
test(m, render=True, n_prev_states=4)