In [1]:
import time
import collections
import random
import numpy as np
import gym
import keras.layers
import keras.models
from keras import backend as K

Using TensorFlow backend.


In [2]:
def env_create():
    """Return a freshly constructed Pendulum-v0 gym environment."""
    return gym.make('Pendulum-v0')


# Shared environment used by the cells below, plus the score that the
# training loop treats as "perfect" for its early-exit check.
env = env_create()
env_max_possible_score = -1

# Show an initial observation and the observation / action spaces.
for shown in (env.reset(), env.observation_space, env.action_space):
    print(shown)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[ 0.44073046 -0.89763949  0.42941648]
Box(3,)
Box(1,)


In [3]:
# Record type shared by the rollout code and the model:
#   state / state1 - observation(s) before / after the action
#   action         - action stored for the step
#   reward, gamma  - reward and the discount applied to the bootstrapped value
#   priority       - replay-sampling priority
model_sample = collections.namedtuple(
    'model_sample',
    'state state1 action reward gamma priority')

In [4]:
class model:
    """DDPG-style actor/critic pair with slowly tracking target copies.

    * value network ("train"): maps (state history, action) to a scalar
      value and is fitted with MSE against bootstrapped targets.
    * policy network: maps a state history to an action and is fitted by
      maximising the output of a frozen view of the value network.
    * target value / policy networks: copies that are blended towards the
      trained networks after every `train` call.

    Parameters
    ----------
    input_n : int
        Size of one observation vector.
    output_n : int
        Size of the action vector.
    tau : float
        Fraction of the *target* weights kept on every sync
        (new_target = tau * target + (1 - tau) * train).
    """

    def __init__(self, input_n, output_n, tau=0.9):
        self.__input_n = input_n
        self.__output_n = output_n
        self.__tau = tau
        self.__m_train, m_train_freeze, \
            self.__m_train_policy = self.__build_model()
        self.__m_target, _, \
            self.__m_target_policy = self.__build_model()
        self.__m_train.compile('nadam', 'mse')
        # Chain policy -> frozen value network so that fitting the chain
        # only moves the policy weights, in the direction of higher value.
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = self.__m_train_policy(m)
        m = m_train_freeze([m_input, m])
        self.__m_train_policy_chain = keras.models.Model([m_input], [m])
        self.__m_train_policy_chain.compile(
            'nadam', __class__.__gradient_ascent)
        self.__m_train_policy_chain.summary()
        # tau == 0 copies the freshly initialised train weights verbatim.
        self.__sync_target(0.)

    @staticmethod
    def __gradient_ascent(y_true, y_pred):
        """Loss whose minimisation maximises the mean predicted value."""
        # y_true carries no information; it is multiplied by zero and added
        # so that it still takes part in the loss expression.
        y_pred = y_pred + 0.*y_true
        y_pred = K.sum(y_pred, axis=-1)
        return -K.mean(y_pred)

    @staticmethod
    def __field(samples, name):
        """Stack one field of every sample into a float32 array."""
        return np.array([getattr(s, name) for s in samples], dtype=np.float32)

    def __build_model(self):
        """Create one (value, frozen value, policy) network triple.

        The frozen value model shares its layers with the value model and
        only differs in having `trainable = False`.
        """
        l_state = [
            keras.layers.BatchNormalization(),
            keras.layers.TimeDistributed(
                keras.layers.Dense(128, kernel_initializer='he_uniform')),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.TimeDistributed(
                keras.layers.Dense(128, kernel_initializer='he_uniform')),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.LSTM(128, return_sequences=False)]
        l_action = [
            keras.layers.BatchNormalization(),
            keras.layers.Dense(32, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu')]
        l_value = [
            keras.layers.Dense(128, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(64, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(1)]
        l_policy = [
            keras.layers.BatchNormalization(),
            keras.layers.TimeDistributed(
                keras.layers.Dense(128, kernel_initializer='he_uniform')),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.TimeDistributed(
                keras.layers.Dense(128, kernel_initializer='he_uniform')),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.LSTM(128, return_sequences=False),
            keras.layers.Dense(128, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(64, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(self.__output_n)]

        def apply_layers(x, layers):
            """Feed `x` through `layers` in order and return the result."""
            last_layer = x
            for l in layers:
                last_layer = l(last_layer)
            return last_layer

        # Value network: state-history branch + action branch -> scalar.
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = apply_layers(m, l_state)
        m_a = m_input_a = keras.layers.Input((self.__output_n,))
        m_a = apply_layers(m_a, l_action)
        m = keras.layers.Concatenate()([m, m_a])
        m = apply_layers(m, l_value)
        m_train = keras.models.Model([m_input, m_input_a], [m])
        m_train_freeze = keras.models.Model([m_input, m_input_a], [m])
        m_train_freeze.trainable = False

        # Policy network: state history -> action vector.
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = apply_layers(m, l_policy)
        m_train_policy = keras.models.Model([m_input], [m])

        return m_train, m_train_freeze, m_train_policy

    def __sync_target(self, tau):
        """Blend target weights towards the trained weights.

        new_target = tau * target + (1 - tau) * train, for both the value
        and the policy network.
        """
        for m_train, m_target in [
                (self.__m_train, self.__m_target),
                (self.__m_train_policy, self.__m_target_policy)]:
            w_train = m_train.get_weights()
            w_target = m_target.get_weights()
            w = [tau*x + (1-tau)*y for x, y in zip(w_target, w_train)]
            m_target.set_weights(w)

    def __get_sample_q(self, samples):
        """Return the bootstrapped value target for every sample.

        target = reward + gamma * Q_target(state1, policy_target(state1))

        The successor state (`state1`) must be used here; evaluating the
        target networks on `state` would regress the value of a state onto
        itself instead of onto what follows it.
        """
        next_states = self.__field(samples, 'state1')
        next_actions = self.__m_target_policy.predict(
            [next_states], batch_size=64)
        next_q = self.__m_target.predict(
            [next_states, next_actions], batch_size=64)
        rewards = np.array([s.reward for s in samples])[..., np.newaxis]
        gammas = np.array([s.gamma for s in samples])[..., np.newaxis]
        return rewards + next_q * gammas

    def train(self, samples, epochs=1, verbose=False):
        """Fit value and policy networks on `samples`, then sync targets.

        Targets are computed once, before either network is updated.
        """
        states = self.__field(samples, 'state')
        a = self.__field(samples, 'action')
        q = self.__get_sample_q(samples)
        self.__m_train.fit(
            x=[states, a],
            y=q,
            batch_size=64,
            epochs=epochs,
            verbose=verbose)
        # The labels are ignored by the ascent loss; zeros of the right
        # shape are passed only because `fit` needs a `y`.
        self.__m_train_policy_chain.fit(
            x=[states],
            y=np.zeros_like(q),
            batch_size=64,
            epochs=epochs,
            verbose=verbose)
        self.__sync_target(self.__tau)

    def get_sample_priority(self, samples):
        """Return |target - current value estimate| for every sample."""
        q = self.__get_sample_q(samples)[:, 0]
        t = self.__m_train.predict([
            self.__field(samples, 'state'),
            self.__field(samples, 'action')],
            batch_size=64)[:, 0]
        p = np.abs(q - t)
        return p

    def get_action(self, state, verbose=False):
        """Return the trained policy's action for one state history."""
        action = self.__m_train_policy.predict([
            np.array([state], dtype=np.float32)])[0]
        if verbose:
            print(action)
        return action

In [5]:
class ou_noise:
    """Ornstein-Uhlenbeck noise process.

    Each `sample()` advances the internal state by one Euler-Maruyama step

        x += theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)

    Parameters
    ----------
    shape : tuple of int
        Shape of the noise vector (unpacked into `np.random.randn`).
    sigma : float
        Scale of the random term.
    mu : float
        Value the process reverts to; also the initial state.
    theta : float
        Mean-reversion rate.
    dt : float
        Time step.
    """

    def __init__(self, shape, sigma, mu=0., theta=0.15, dt=1.):
        self.__shape = shape
        self.__mu = np.ones(shape) * mu
        self.__state = np.copy(self.__mu)
        self.__theta = theta
        self.__sigma = sigma
        self.__dt = dt

    def sample(self):
        """Advance the process one step and return a copy of its state."""
        drift = self.__theta * (self.__mu - self.__state) * self.__dt
        # The random increment scales with sqrt(dt), not dt**2; the two
        # only coincide for dt == 1.
        diffusion = self.__sigma * np.sqrt(self.__dt) * \
            np.random.randn(*self.__shape)
        self.__state += drift + diffusion
        return np.copy(self.__state)

In [6]:
def play(env, m, max_steps=1000, n_prev_states=8,
         reward_steps=1, gamma=0.98, epsilon=0., verbose=False):
    """Run the policy in `env` for `max_steps` steps and collect samples.

    Parameters
    ----------
    env : environment exposing `reset`, `step` and `action_space`.
    m : object exposing `get_action(states)` and
        `get_sample_priority(samples)`.
    max_steps : int
        Total number of environment steps, across as many episodes as fit.
    n_prev_states : int
        Length of the state history fed to the policy and stored in each
        sample; shorter histories are left-padded with zero states.
    reward_steps : int
        Number of consecutive rewards folded into one n-step return.
    gamma : float
        Per-step discount factor.
    epsilon : float
        Sigma of the exploration noise added to the policy's action.
    verbose : bool
        Print action spread and the longest episode length when done.

    Returns
    -------
    list of model_sample
        One n-step sample per environment step, with `priority` filled in
        from `m.get_sample_priority`.
    """
    noise = ou_noise(env.action_space.shape, epsilon)
    state_0 = env.reset()
    state_null = np.zeros_like(state_0)
    state_queue = []
    episode = []
    samples = []
    actions = []
    gamelen = 0
    gamelen_max = 0

    def get_prev_states(episode, idx, get_state):
        """History of up to `n_prev_states` states ending at step `idx`."""
        first = max(0, (idx - n_prev_states) + 1)
        states = [get_state(e) for e in episode[first:idx + 1]]
        padding = [state_null] * max(0, n_prev_states - len(states))
        return padding + states

    def add_to_samples(episode):
        """Turn raw per-step records into prioritised n-step samples."""
        processed_episode = []
        for i in range(len(episode)):
            episode_reward = 0
            episode_gamma = 1.
            last = i  # index of the last step folded into the return
            for j in range(reward_steps):
                if i + j >= len(episode):
                    break
                last = i + j
                episode_reward += episode[last].reward * episode_gamma
                episode_gamma *= episode[last].gamma * gamma
            # The bootstrap state must be the one reached after the last
            # accumulated reward; using step i's successor would pair an
            # n-step return and gamma**n with a one-step-ahead state.
            processed_episode.append(model_sample(
                get_prev_states(episode, i, lambda e: e.state),
                get_prev_states(episode, last, lambda e: e.state1),
                episode[i].action,
                episode_reward,
                episode_gamma, 0.))
        priority = m.get_sample_priority(processed_episode)
        for i in range(len(processed_episode)):
            processed_episode[i] = \
                processed_episode[i]._replace(priority=priority[i])
        samples.extend(processed_episode)

    for i in range(max_steps):
        state_queue.append(state_0)
        if len(state_queue) > n_prev_states:
            state_queue.pop(0)
        state_queue_padded = \
            [state_null]*(max(0, n_prev_states-len(state_queue))) + state_queue
        action = m.get_action(state_queue_padded)
        actions.append(action)  # noise-free action, used for the stats only
        action = action + noise.sample()
        action_clipped = np.clip(
            action, env.action_space.low, env.action_space.high)
        state_1, reward, done, _ = env.step(action_clipped)
        # The stored action is the noisy, unclipped one; a per-step gamma
        # of 0 marks a terminal transition so nothing is bootstrapped.
        episode.append(model_sample(
            state_0, state_1, action, reward, 0. if done else 1., 0.))
        state_0 = state_1
        gamelen += 1
        if done:
            add_to_samples(episode)
            episode = []
            state_0 = env.reset()
            state_null = np.zeros_like(state_0)
            state_queue = []
            noise = ou_noise(env.action_space.shape, epsilon)
            gamelen_max = max(gamelen_max, gamelen)
            gamelen = 0
    # Flush the episode that was still running when max_steps ran out.
    if episode:
        add_to_samples(episode)
        gamelen_max = max(gamelen_max, gamelen)
        gamelen = 0
    if verbose:
        print('std[action]', np.mean(np.std(actions, ddof=1, axis=0)))
        print('max game len', gamelen_max)
    return samples

In [7]:
# Reset the Keras backend session, then size the networks from the
# environment's observation and action dimensions.
K.clear_session()
m = model(
    input_n=env.observation_space.shape[0],
    output_n=env.action_space.shape[0],
    tau=0.998)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, None, 3)      0                                            
__________________________________________________________________________________________________
model_3 (Model)                 (None, 1)            175245      input_7[0][0]                    
__________________________________________________________________________________________________
model_2 (Model)                 (None, 1)            179537      input_7[0][0]                    
                                                                 model_3[1][0]                    
Total params: 354,782
Trainable params: 174,343
Non-trainable params: 180,439
__________________________________________________________________________________________________


In [8]:
def test(m, max_steps=10000, render=False, n_prev_states=8):
    global env
    state = env.reset()
    state_null = np.zeros_like(state)
    state_queue = []
    rewards = 0.
    for _ in range(max_steps):
        state_queue.append(state)
        if len(state_queue) > n_prev_states:
            state_queue.pop(0)
        state_queue_padded = \
            [state_null]*(max(0,n_prev_states-len(state_queue))) + state_queue 
        action = m.get_action(state_queue)
        action_clipped = np.clip(action, env.action_space.low, env.action_space.high)
        state, reward, done, _ = env.step(action_clipped)
        rewards += reward
        if render:
            env.render()
        if done:
            break
        if render:
            time.sleep(1/60)
    if render:
        env.close()
        env = env_create()
    return rewards

In [9]:
# Seed the replay buffer with one long rollout before any training.
replays = play(
    env, m, max_steps=30000, n_prev_states=4,
    reward_steps=3, epsilon=0.2, verbose=False)
for i in range(300):
    # Collect fresh experience with the current policy.
    samples = play(
        env, m, max_steps=3000, n_prev_states=4,
        reward_steps=3, epsilon=0.2, verbose=True)
    replays.extend(samples)
    # Keep only the most recent 100000 samples.
    if len(replays) > 100000:
        replays = list(replays[-100000:])
    for j in range(300):
        # Prioritised sampling: probability proportional to sqrt(priority).
        # The small constant keeps the distribution valid when every
        # priority is zero and lets zero-priority samples be revisited.
        sample_p = np.array([r.priority for r in replays], dtype=np.float64)
        sample_p = np.power(sample_p, 0.5) + 1e-6
        sample_p = sample_p / np.sum(sample_p)
        sample_indices = np.random.choice(
            len(replays), p=sample_p, size=128)
        # Index the list directly; converting the whole buffer to a NumPy
        # array on every step is O(len(replays)) and breaks on ragged data.
        samples = [replays[idx] for idx in sample_indices]
        # Refresh the stored priorities of the samples about to be trained on.
        fresh_p = m.get_sample_priority(samples)
        for s, p in zip(sample_indices, fresh_p):
            replays[s] = replays[s]._replace(priority=p)
        m.train(samples, epochs=1, verbose=(j+1) % 100 == 0)
    print('epoch {} completed'.format(i))
    # Three evaluation runs per epoch; only the first one is rendered.
    test_result = [test(m, max_steps=3000,
        n_prev_states=4, render=(k == 0)) for k in range(3)]
    print('test result',
          'mean', np.mean(test_result),
          'std', np.std(test_result, ddof=1))
    if np.mean(test_result) == env_max_possible_score and \
        np.std(test_result, ddof=1) == 0:
        print('the network always gets full score, early exit')
        break

KeyboardInterrupt: 

In [None]:
# Watch the current policy: one rendered evaluation run.
test(m, render=True, n_prev_states=4)