In [1]:
import time
import collections
import random
import numpy as np
import gym
import keras.layers
import keras.models
from keras import backend as K

Using TensorFlow backend.


In [2]:
def env_create():
    """Build and return a fresh 'MsPacman-ram-v0' gym environment."""
    return gym.make('MsPacman-ram-v0')
# Shared environment instance used by the training and evaluation cells.
env = env_create()
# Sentinel compared against the mean test score by the training loop.
env_max_possible_score = -1
# Show one initial observation plus the observation/action space descriptions,
# one per line.
print(env.reset(), env.observation_space, env.action_space, sep='\n')

[  0 112 114 115  48   3  88  88  88  88  88   0  80  80  80  50  98   0
   0   3   0   0   1   0   0   1   6   6 198   4  63   0  45   1   0 198
 198   0   0   0   0  32  52   0   0 120   0 100 130   0   0 134   1 222
   0   1   3   0   6  80 255 255   0 255 255  80 255 255  80 255 255  80
 255 255  80 191 191  80 191 191  80 191 191  80 255 255  80 255 255  80
 255 255  80 255 255   0 255 255  80 255 255  20 223  43 217 123 217 123
 217 123 217 123 217 123 217 221   0  63   0   0   0   0   0   2  66 240
 146 215]
Box(128,)
Discrete(9)


In [3]:
# One training record: `state` is the model input, `action` the index of the
# action taken, `reward` the per-step reward (later replaced by a return).
model_sample = collections.namedtuple('model_sample', 'state action reward')

In [4]:
class model:
    """Actor-critic network with a shared trunk and separate policy/value heads.

    The network consumes a sequence of observations shaped
    (batch, time, input_n). The shared trunk is a stack of Conv1D blocks
    followed by an LSTM and a dense block; the policy head ends in a softmax
    over `output_n` actions and the value head ends in a single linear unit.

    Four Keras models are built on top of the same layer instances, so they
    share weights:
      * policy model        -> action probabilities
      * value model         -> state value
      * value+policy model  -> both outputs
      * training model      -> concat([value, policy]), compiled with the
                               combined loss below
    """

    def __init__(self, input_n, output_n, beta=0.0001):
        """Build all layers and models and print their summaries.

        Args:
            input_n: size of a single observation vector.
            output_n: number of discrete actions.
            beta: weight of the entropy bonus in the training loss.
        """
        self.__input_n = input_n
        self.__output_n = output_n
        # Layers shared by both heads (sequence in -> 128-d feature vector out).
        self.__l_shared = [
            keras.layers.BatchNormalization(),
            keras.layers.Conv1D(128, 3,
                padding='same', kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Conv1D(128, 3,
                padding='same', kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Conv1D(128, 3,
                padding='same', kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.LSTM(128, return_sequences=False),
            keras.layers.Dense(128, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu')]
        # Policy head: softmax over the action set.
        self.__l_policy = [
            keras.layers.Dense(128, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(128, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(self.__output_n, activation='softmax')]
        # Value head: single linear output.
        self.__l_value = [
            keras.layers.Dense(128, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(128, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(1)]

        def apply_layers(x, layers):
            """Chain `layers` onto tensor `x` and return the final tensor."""
            last_layer = x
            for l in layers:
                last_layer = l(last_layer)
            return last_layer

        # Policy-only model (used for acting).
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = apply_layers(m, self.__l_shared)
        m = apply_layers(m, self.__l_policy)
        self.__m_policy = keras.models.Model([m_input], [m])

        # Value-only model (used for bootstrapping returns).
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = apply_layers(m, self.__l_shared)
        m = apply_layers(m, self.__l_value)
        self.__m_value = keras.models.Model([m_input], [m])

        # Two-output model sharing one pass through the trunk.
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m_shared = apply_layers(m, self.__l_shared)
        m_policy = apply_layers(m_shared, self.__l_policy)
        m_value = apply_layers(m_shared, self.__l_value)
        self.__m_value_policy = keras.models.Model([m_input], [m_value, m_policy])

        # Training model: outputs are concatenated as [value | policy] so a
        # single loss function can see both (the targets use the same layout:
        # [return | one-hot action]).
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m_value, m_policy = self.__m_value_policy(m)
        m = keras.layers.Concatenate()([m_value, m_policy])
        self.__m_train = keras.models.Model([m_input], [m])
        self.__m_train.compile('nadam',
            lambda y_true, y_pred: model.__loss(y_true, y_pred, beta))

        self.__m_policy.summary()
        self.__m_value.summary()
        self.__m_train.summary()

    @staticmethod
    def __loss(y_true, y_pred, beta):
        """Combined policy-gradient, value and entropy loss.

        Args:
            y_true: tensor laid out as [return | one-hot action] per row.
            y_pred: tensor laid out as [value | action probabilities] per row.
            beta: weight of the entropy bonus.

        Returns:
            Scalar: action_loss + value_loss - beta * entropy.
        """
        r, action_onehot = y_true[:, :1], y_true[:, 1:]
        value, policy = y_pred[:, :1], y_pred[:, 1:]
        advantage = r - value
        log_policy = K.log(policy + K.epsilon())
        log_choosen_action_prob = K.sum(
            action_onehot * log_policy, axis=-1, keepdims=True)
        # The advantage is only a weight for the policy gradient; block the
        # gradient so the actor term does not also train the value estimate.
        action_loss = -K.mean(
            log_choosen_action_prob * K.stop_gradient(advantage))
        value_loss = 0.5 * K.mean(K.square(advantage))
        entropy = K.mean(-K.sum(policy * log_policy, axis=-1, keepdims=True))
        return action_loss + value_loss - beta * entropy

    def train(self, samples, epochs=1, verbose=False):
        """Fit the training model on a list of samples.

        Args:
            samples: iterable of records with `.state` (sequence of
                observations), `.action` (int) and `.reward` (target return).
            epochs: number of passes over `samples`.
            verbose: forwarded to Keras `fit`.
        """
        samples = list(samples)
        if not samples:
            # Nothing to fit; avoid handing empty/malformed arrays to Keras.
            return
        self.__m_train.fit(
            x=np.array([s.state for s in samples], dtype=np.float32),
            y=np.hstack([
                np.reshape(np.array([s.reward for s in samples], dtype=np.float32), (-1, 1)),
                keras.utils.to_categorical([s.action for s in samples], num_classes=self.__output_n)]),
            batch_size=64,
            epochs=epochs,
            verbose=verbose)

    def evalute_value(self, state, verbose=False):
        """Return the scalar value estimate for one observation sequence.

        Args:
            state: a single sequence of observations (no batch dimension).
            verbose: when true, print the value before returning it.
        """
        v = self.__m_value.predict(
            np.array([state], dtype=np.float32))[0, 0]
        if verbose:
            print(v)
        return v

    # Correctly spelled alias; the misspelled name is kept for existing callers.
    evaluate_value = evalute_value

    def get_action_prob(self, state, verbose=False):
        """Return the action-probability vector for one observation sequence.

        Args:
            state: a single sequence of observations (no batch dimension).
            verbose: when true, print the probabilities before returning them.
        """
        action_prob = self.__m_policy.predict(
            np.array([state], dtype=np.float32))[0]
        if verbose:
            print(action_prob)
        return action_prob

In [5]:
def play(env, m, gamma=0.98, max_steps=1000, n_prev_states=8, verbose=False):
    """Run the policy for `max_steps` environment steps and build training samples.

    Actions are sampled from `m.get_action_prob`, which is fed the last
    `n_prev_states` observations (left-padded with all-zero observations at the
    start of an episode). Episodes that finish are restarted with `env.reset()`.

    Each transition becomes a `model_sample` whose `state` is the list of
    `n_prev_states` observations ending at the step's observation, and whose
    `reward` is the discounted return
        R_t = r_t + gamma * R_{t+1}
    with R after the last transition equal to 0 when the episode terminated,
    or to `m.evalute_value` of the history ending at the successor observation
    when the rollout was cut off by `max_steps`.

    Args:
        env: gym-style environment (`reset()`, `step(action)`).
        m: object providing `get_action_prob(states)` and
            `evalute_value(states)`.
        gamma: discount factor.
        max_steps: total number of environment steps to take.
        n_prev_states: length of the observation history fed to the model.
        verbose: when true, print the mean per-action std of the sampled
            policies and the longest episode length seen.

    Returns:
        List of `model_sample` records in chronological order.
    """
    state_0 = env.reset()
    state_null = np.zeros_like(state_0)
    state_queue = []
    episode = []
    samples = []
    action_probs = []
    gamelen = 0
    gamelen_max = 0

    def pad_history(states):
        """Keep the last `n_prev_states` entries, left-padded with zeros."""
        recent = states[max(0, len(states) - n_prev_states):]
        return [state_null] * max(0, n_prev_states - len(recent)) + recent

    def add_to_samples(episode, bootstrap_value):
        """Turn raw transitions into samples with discounted returns."""
        # Backward pass: every transition, including the last one,
        # contributes its own reward to the return.
        discounted_reward = bootstrap_value
        returns = [0.] * len(episode)
        for i in reversed(range(len(episode))):
            discounted_reward = episode[i].reward + gamma * discounted_reward
            returns[i] = discounted_reward
        raw_states = [e.state for e in episode]
        for i, e in enumerate(episode):
            window = raw_states[max(0, i + 1 - n_prev_states):i + 1]
            samples.append(model_sample(pad_history(window), e.action, returns[i]))

    for _step in range(max_steps):
        state_queue.append(state_0)
        if len(state_queue) > n_prev_states:
            state_queue.pop(0)
        action_prob = m.get_action_prob(pad_history(state_queue))
        action_probs.append(action_prob)
        action = int(np.random.choice(action_prob.shape[-1], p=action_prob))
        state_1, reward, done, _info = env.step(action)
        episode.append(model_sample(state_0, action, reward))
        state_0 = state_1
        gamelen += 1
        if done:
            # Terminal state: nothing to bootstrap from.
            add_to_samples(episode, 0.)
            episode = []
            state_0 = env.reset()
            state_null = np.zeros_like(state_0)
            state_queue = []
            gamelen_max = max(gamelen_max, gamelen)
            gamelen = 0
    if episode:
        # Rollout was cut off mid-episode: bootstrap from the value of the
        # history that ends at the successor observation (now in state_0).
        bootstrap_value = m.evalute_value(pad_history(state_queue + [state_0]))
        add_to_samples(episode, bootstrap_value)
        gamelen_max = max(gamelen_max, gamelen)
        gamelen = 0
    if verbose:
        print('std[action_prob]', np.mean(np.std(action_probs, ddof=1, axis=0)))
        print('max game len', gamelen_max)
    return samples

In [6]:
# Start from a clean backend graph so re-running this cell does not pile up
# layers from a previous model.
K.clear_session()
# NOTE(review): beta=20. is far above the constructor default of 0.0001 -
# confirm this entropy weight is intended.
m = model(
    input_n=env.observation_space.shape[0],
    output_n=env.action_space.n,
    beta=20.)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, 128)         0         
_________________________________________________________________
batch_normalization_1 (Batch (None, None, 128)         512       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         49280     
_________________________________________________________________
batch_normalization_2 (Batch (None, None, 128)         512       
_________________________________________________________________
activation_1 (Activation)    (None, None, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         49280     
_________________________________________________________________
batch_normalization_3 (Batch (None, None, 128)         512       
__________

In [7]:
def test(m, render=False, n_prev_states=8):
    global env
    state = env.reset()
    state_null = np.zeros_like(state)
    state_queue = []
    rewards = 0.
    while True:
        state_queue.append(state)
        if len(state_queue) > n_prev_states:
            state_queue.pop(0)
        state_queue_padded = \
            [state_null]*(max(0,n_prev_states-len(state_queue))) + state_queue 
        action_prob = m.get_action_prob(state_queue)
        action = int(np.random.choice(
            list(range(action_prob.shape[-1])),
            p=action_prob))
        state, reward, done, _ = env.step(action)
        rewards += reward
        if render:
            env.render()
        if done:
            break
        if render:
            time.sleep(1/60)
    if render:
        env.close()
        env = env_create()
    return rewards

In [None]:
replays = []
for i in range(100):
    # Collect fresh experience with the current model and add it to the buffer.
    samples = play(env, m, verbose=True, n_prev_states=8, max_steps=1000)
    replays += samples
    # One pass over the whole buffer.
    m.train(replays, verbose=True, epochs=1)
    # Cap the buffer at 100000 samples by keeping a random subset.
    if len(replays) > 100000:
        random.shuffle(replays)
        replays = replays[:100000]
    print('epoch {} completed'.format(i))
    # Evaluate only on every fifth epoch (i = 4, 9, 14, ...).
    if i % 5 == 4:
        # Three evaluation episodes; only the first one is rendered.
        test_result = [
            test(m, render=(trial == 0), n_prev_states=8)
            for trial in range(3)]
        print('test result',
              'mean', np.mean(test_result),
              'std', np.std(test_result, ddof=1))
        if np.mean(test_result) == env_max_possible_score and \
                np.std(test_result, ddof=1) == 0:
            print('the network always gets full score, early exit')
            break

std[action_prob] 0.01314588
max game len 703
Epoch 1/1
epoch 0 completed
std[action_prob] 0.012818041
max game len 799
Epoch 1/1
epoch 1 completed
std[action_prob] 0.012864577
max game len 642
Epoch 1/1
epoch 2 completed
std[action_prob] 0.012848404
max game len 661
Epoch 1/1
epoch 3 completed
std[action_prob] 0.016418105
max game len 547
Epoch 1/1
epoch 4 completed
test result mean 366.6666666666667 std 305.9956426705017
std[action_prob] 0.02443367
max game len 700
Epoch 1/1
epoch 5 completed
std[action_prob] 0.019466408
max game len 512
Epoch 1/1
epoch 6 completed
std[action_prob] 0.022860568
max game len 1000
Epoch 1/1
epoch 7 completed
std[action_prob] 0.021522466
max game len 793
Epoch 1/1
epoch 8 completed
std[action_prob] 0.028374549
max game len 651
Epoch 1/1
epoch 9 completed
test result mean 550.0 std 294.6183972531247
std[action_prob] 0.0945079
max game len 531
Epoch 1/1
epoch 10 completed
std[action_prob] 0.09540895
max game len 757
Epoch 1/1
epoch 11 completed
std[action_p

In [None]:
# Watch one rendered evaluation episode; the cell displays its total reward.
test(m, render=True, n_prev_states=8)