In [1]:
import time
import collections
import random
import numpy as np
import gym
import keras.layers
import keras.models
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Create the continuous Mountain Car environment and show an initial
# observation together with the observation and action spaces.
env = gym.make('MountainCarContinuous-v0')
print(env.reset(), env.observation_space, env.action_space, sep='\n')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[-0.41720302  0.        ]
Box(2,)
Box(1,)


In [3]:
# One training record: the state (or state history), the action taken and
# the reward (later overwritten with the discounted return).
model_sample = collections.namedtuple('model_sample', 'state action reward')

In [4]:
class model:
    """Actor-critic network with a shared LSTM trunk and a Gaussian policy.

    One LSTM layer feeds three heads: the policy mean, the policy standard
    deviation (softplus-activated) and a scalar state value. Several Keras
    models are assembled from the *same* layer instances, so they share
    weights:

    * a policy model returning ``[mu, sigma]`` concatenated,
    * a value model returning the state value,
    * a training model returning ``[value, mu, sigma]`` concatenated, which is
      the only one fitted (with the custom actor-critic loss).
    """

    def __init__(self, input_n, output_n, beta=0.0001):
        """Build the shared layers and the policy / value / training models.

        Args:
            input_n: size of a single observation vector.
            output_n: number of action dimensions.
            beta: weight of the entropy bonus in the training loss.
        """
        self.__input_n = input_n
        self.__output_n = output_n
        self.__l_shared = [
            keras.layers.LSTM(16, return_sequences=False)]
        self.__l_policy_mu = [
            keras.layers.Dense(16, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(self.__output_n)]
        self.__l_policy_sigma = [
            keras.layers.Dense(16, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(self.__output_n, kernel_initializer='he_uniform'),
            keras.layers.Activation('softplus')]
        self.__l_value = [
            keras.layers.Dense(16, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(1)]

        def apply_layers(x, layers):
            """Chain `layers` on top of tensor `x` and return the last output."""
            last_layer = x
            for l in layers:
                last_layer = l(last_layer)
            return last_layer

        # Policy-only model: input sequence -> [mu, sigma].
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = apply_layers(m, self.__l_shared)
        m_mu = apply_layers(m, self.__l_policy_mu)
        m_sigma = apply_layers(m, self.__l_policy_sigma)
        m = keras.layers.Concatenate()([m_mu, m_sigma])
        self.__m_policy = keras.models.Model([m_input], [m])
        self.__m_policy.compile('adam', 'mse')

        # Value-only model: input sequence -> state value.
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = apply_layers(m, self.__l_shared)
        m = apply_layers(m, self.__l_value)
        self.__m_value = keras.models.Model([m_input], [m])
        self.__m_value.compile('adam', 'mse')

        # Combined model with two outputs: value and [mu, sigma].
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m_shared = apply_layers(m, self.__l_shared)
        m_policy_mu = apply_layers(m_shared, self.__l_policy_mu)
        m_policy_sigma = apply_layers(m_shared, self.__l_policy_sigma)
        m_policy = keras.layers.Concatenate()([m_policy_mu, m_policy_sigma])
        m_value = apply_layers(m_shared, self.__l_value)
        self.__m_value_policy = keras.models.Model([m_input], [m_value, m_policy])
        self.__m_value_policy.compile('adam', 'mse')

        # Training model: single output [value, mu, sigma] so that one custom
        # loss can see all three quantities at once.
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m_value, m_policy = self.__m_value_policy(m)
        m = keras.layers.Concatenate()([m_value, m_policy])
        self.__m_optimizer = keras.optimizers.Adam(clipnorm=5.)
        self.__m_train = keras.models.Model([m_input], [m])
        self.__m_train.compile(self.__m_optimizer,
            lambda y_true, y_pred: model.__loss(y_true, y_pred, beta, self.__output_n))

        self.__m_policy.summary()
        self.__m_value.summary()
        self.__m_train.summary()

    @staticmethod
    def __loss(y_true, y_pred, beta, output_size):
        """Actor-critic loss for a diagonal Gaussian policy.

        Args:
            y_true: tensor laid out as ``[return, action..., padding...]``
                (see `train`); only the first ``1 + output_size`` columns
                are read.
            y_pred: tensor laid out as ``[value, mu..., sigma...]``.
            beta: weight of the entropy bonus.
            output_size: number of action dimensions.

        Returns:
            Scalar tensor ``policy_loss + value_loss - beta * entropy``.
        """
        r, action = y_true[:, :1], y_true[:, 1:1+output_size]
        value = y_pred[:, :1]
        policy_mu = y_pred[:, 1:1+output_size]
        policy_sigma = y_pred[:, 1+output_size:1+output_size*2]
        advantage = r - value
        variance = K.square(policy_sigma) + K.epsilon()
        # Log-density of the taken action under N(mu, sigma^2), summed over
        # action dimensions. keepdims=True keeps the result (batch, 1) so it
        # multiplies element-wise with the (batch, 1) advantage instead of
        # broadcasting to a (batch, batch) matrix.
        log_chosen_action_prob = K.sum(
            -0.5 * K.square(action - policy_mu) / variance
            - 0.5 * K.log(2 * np.pi * variance),
            axis=-1, keepdims=True)
        # The advantage is only a weight for the policy gradient; the critic
        # is trained exclusively through value_loss below.
        action_loss = -K.mean(log_chosen_action_prob * K.stop_gradient(advantage))
        value_loss = 0.5 * K.mean(K.square(advantage))
        entropy = K.mean(K.sum(0.5*K.log(
            2*np.pi*np.e*K.square(policy_sigma)+K.epsilon()), axis=-1))
        return action_loss + value_loss - beta * entropy

    def train(self, samples, epochs=1, verbose=False):
        """Fit the training model on a list of samples.

        Args:
            samples: sequence of records with ``state`` (sequence of
                observation vectors), ``action`` and ``reward`` (the return
                used as the critic target) attributes.
            epochs: number of passes over `samples`.
            verbose: forwarded to Keras ``fit``.

        Does nothing when `samples` is empty.
        """
        if not samples:
            # Nothing to fit; also avoids indexing samples[0] below.
            return
        n_samples = len(samples)
        rewards = np.reshape(
            np.array([s.reward for s in samples], dtype=np.float32), (-1, 1))
        actions = np.reshape(
            np.array([s.action for s in samples], dtype=np.float32),
            (n_samples, -1))
        # Zero columns pad the target to the width of the model output
        # ([value, mu, sigma]); the loss never reads them.
        padding = np.zeros((n_samples, actions.shape[1]), dtype=np.float32)
        self.__m_train.fit(
            x=np.array([s.state for s in samples], dtype=np.float32),
            y=np.hstack([rewards, actions, padding]),
            batch_size=64,
            epochs=epochs,
            verbose=verbose)

    def evalute_value(self, state, verbose=False):
        """Return the predicted value for one state history.

        Args:
            state: sequence of observation vectors (a single, unbatched input).
            verbose: when true, print the value.
        """
        v = self.__m_value.predict(
            np.array([state], dtype=np.float32))[0,0]
        if verbose:
            print(v)
        return v

    def get_action_prob(self, state, verbose=False):
        """Return the policy parameters ``(mu, sigma)`` for one state history.

        Args:
            state: sequence of observation vectors (a single, unbatched input).
            verbose: when true, print both arrays.

        Returns:
            Tuple of two arrays: the first ``output_n`` entries of the policy
            output (mean) and the remaining entries (standard deviation).
        """
        action_prob = self.__m_policy.predict(
            np.array([state], dtype=np.float32))[0]
        action_prob_mu = action_prob[:self.__output_n]
        action_prob_sigma = action_prob[self.__output_n:]
        if verbose:
            print(action_prob_mu, action_prob_sigma)
        return action_prob_mu, action_prob_sigma

In [5]:
def play(env, m, gamma=0.98, max_steps=1000, n_prev_states=8, verbose=False):
    """Run the current policy for `max_steps` steps and build training samples.

    Episodes are restarted whenever the environment reports `done`. Every
    returned sample holds the zero-padded window of the last `n_prev_states`
    observations, the action taken, and the discounted return from that step.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        m: agent exposing ``get_action_prob(states)`` and
            ``evalute_value(states)``.
        gamma: discount factor.
        max_steps: total number of environment steps across all episodes.
        n_prev_states: length of the observation window fed to the agent.
        verbose: when true, print the spread of the policy outputs and the
            longest episode length.

    Returns:
        List of `model_sample` records.
    """
    state_0 = env.reset()
    state_null = np.zeros_like(state_0)
    state_queue = []
    episode = []
    samples = []
    action_probs = []
    gamelen = 0
    gamelen_max = 0

    def get_prev_states(episode, idx):
        """Window of observations ending at `idx`, left-padded with zeros."""
        # Clamp the start: a negative slice start would wrap around to the
        # end of the episode and yield an empty window for early steps.
        start = max(0, idx - n_prev_states + 1)
        states = [e.state for e in episode[start:idx+1]]
        return [state_null]*(max(0, n_prev_states-len(states))) + states

    def add_to_samples(episode, done):
        """Convert raw transitions into samples with discounted returns."""
        last = len(episode) - 1
        last_states = get_prev_states(episode, last)
        if done:
            # Terminal step: its return is exactly the reward it received.
            discounted_reward = episode[last].reward
        else:
            # Truncated rollout: bootstrap from the critic's estimate for the
            # last visited state; that step's own reward is not used.
            discounted_reward = m.evalute_value(last_states)
        episode[last] = model_sample(
            last_states, episode[last].action, discounted_reward)
        # Walk backwards so entries before `i` still hold raw observations
        # when their window is assembled.
        for i in reversed(range(last)):
            discounted_reward = episode[i].reward + gamma * discounted_reward
            episode[i] = model_sample(
                get_prev_states(episode, i),
                episode[i].action,
                discounted_reward)
        samples.extend(episode)

    for i in range(max_steps):
        state_queue.append(state_0)
        if len(state_queue) > n_prev_states:
            state_queue.pop(0)
        state_queue_padded = \
            [state_null]*(max(0, n_prev_states-len(state_queue))) + state_queue
        action_prob_mu, action_prob_sigma = m.get_action_prob(state_queue_padded)
        action_probs.append(np.hstack([action_prob_mu, action_prob_sigma]))
        # Sample the action from N(mu, sigma^2).
        action = action_prob_mu + \
            np.random.randn(*action_prob_sigma.shape) * action_prob_sigma
        state_1, reward, done, _ = env.step(action)
        episode.append(model_sample(state_0, action, reward))
        state_0 = state_1
        gamelen += 1
        if done:
            add_to_samples(episode, True)
            episode = []
            state_0 = env.reset()
            state_null = np.zeros_like(state_0)
            state_queue = []
            gamelen_max = max(gamelen_max, gamelen)
            gamelen = 0
    if episode:
        # The step budget ran out in the middle of an episode.
        add_to_samples(episode, False)
        gamelen_max = max(gamelen_max, gamelen)
        gamelen = 0
    if verbose:
        print('std[action_prob]', np.mean(np.std(action_probs, ddof=1, axis=0)))
        print('max game len', gamelen_max)
    return samples

In [6]:
# Start from a clean backend session, then build the agent for the
# 2-dimensional observation and 1-dimensional action reported by the
# environment above.
K.clear_session()
m = model(input_n=2, output_n=1, beta=0.0001)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 2)      0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 16)           1216        input_1[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 16)           272         lstm_1[0][0]                     
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 16)           272         lstm_1[0][0]                     
__________________________________________________________________________________________________
batch_norm

In [7]:
def test(m, render=False, n_prev_states=8):
    """Run one episode with the deterministic (mean) action and return the score.

    Args:
        m: agent exposing ``get_action_prob(states)``.
        render: when true, render every step at roughly 30 frames per second
            and afterwards close and recreate the global `env`.
        n_prev_states: length of the observation window fed to the agent.

    Returns:
        Sum of the rewards collected during the episode.
    """
    global env
    state = env.reset()
    state_null = np.zeros_like(state)
    state_queue = []
    rewards = 0.
    while True:
        state_queue.append(state)
        if len(state_queue) > n_prev_states:
            state_queue.pop(0)
        state_queue_padded = \
            [state_null]*(max(0, n_prev_states-len(state_queue))) + state_queue
        # Feed the zero-padded window, i.e. the same fixed-length input the
        # agent receives in `play`, rather than the raw, shorter queue.
        action, _ = m.get_action_prob(state_queue_padded)
        state, reward, done, _ = env.step(action)
        rewards += reward
        if render:
            env.render()
        if done:
            break
        if render:
            time.sleep(1/30)
    if render:
        # Close the render window and hand back a fresh environment.
        env.close()
        env = gym.make('MountainCarContinuous-v0')
    return rewards

In [None]:
# Training loop: collect a rollout, fit on the whole replay buffer, and prune
# the buffer once it grows beyond 10000 samples.
replays = []
for i in range(500):
    samples = play(env, m, max_steps=1000, verbose=True)
    replays += samples
    m.train(replays, epochs=1, verbose=True)
    if len(replays) > 10000:
        # Keep the 3000 highest-return and 3000 lowest-return samples plus a
        # random 4000 of everything in between.
        replays.sort(key=lambda sample: sample.reward, reverse=True)
        replays_best, replays_middle, replays_worst = \
            replays[:3000], replays[3000:-3000], replays[-3000:]
        random.shuffle(replays_middle)
        replays = replays_best + replays_worst + replays_middle[:4000]
    print('epoch {} completed'.format(i))

std[action_prob] 0.011637534
max game len 999
Epoch 1/1

In [None]:
# Play one rendered episode with the trained agent; the cell displays the
# total reward returned by `test`.
test(m, render=True)