In [1]:
import time
import collections
import random
import numpy as np
import gym
import keras.layers
import keras.models
from keras import backend as K

Using TensorFlow backend.


In [2]:
env = gym.make('BipedalWalker-v2')
print(env.reset())
print(env.observation_space)
print(env.action_space, env.action_space.low, env.action_space.high)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[ 2.74546980e-03  1.28543901e-05 -1.67614549e-03 -1.60000849e-02
  9.25640911e-02  3.89107061e-03  8.59707221e-01 -1.69117749e-03
  1.00000000e+00  3.28603275e-02  3.89091601e-03  8.53495002e-01
 -2.64752253e-03  1.00000000e+00  4.40813392e-01  4.45819497e-01
  4.61422116e-01  4.89549488e-01  5.34102023e-01  6.02460206e-01
  7.09147871e-01  8.85930538e-01  1.00000000e+00  1.00000000e+00]
Box(24,)
Box(4,) [-1. -1. -1. -1.] [1. 1. 1. 1.]


In [3]:
model_sample = collections.namedtuple('model_sample',
    ['state', 'action', 'reward'])

In [4]:
class model:
    
    def __init__(self, input_n, output_n, dis_output_ns=[], beta=0.0001):
        
        self.__input_n = input_n
        self.__output_n = output_n
        self.__dis_output_ns = dis_output_ns
        self.__l_shared = [
            keras.layers.BatchNormalization(),
            keras.layers.LSTM(128, return_sequences=False),
            keras.layers.Dropout(0.5)]
        self.__l_policy_mu = [
            keras.layers.Dense(64, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(self.__output_n)]
        self.__l_policy_sigma = [
            keras.layers.Dense(64, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(self.__output_n, kernel_initializer='he_uniform'),
            keras.layers.Activation('softplus')]
        self.__l_policy_dis = [[
            keras.layers.Dense(64, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(dis),
            keras.layers.BatchNormalization(),
            keras.layers.Softmax()]
            for dis in self.__dis_output_ns]
        self.__l_value = [
            keras.layers.Dense(64, kernel_initializer='he_uniform'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dense(1)]
        def apply_layers(x, layers):
            last_layer = x
            for l in layers:
                last_layer = l(last_layer)
            return last_layer
        
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = apply_layers(m, self.__l_shared)
        m_mu = apply_layers(m, self.__l_policy_mu)
        m_sigma = apply_layers(m, self.__l_policy_sigma)
        m_dis = [apply_layers(m, dis) for dis in self.__l_policy_dis]
        m = keras.layers.Concatenate()([m_mu, m_sigma] + m_dis)
        self.__m_policy = keras.models.Model([m_input], [m])
        self.__m_policy.compile('adam', 'mse')
        
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m = apply_layers(m, self.__l_shared)
        m = apply_layers(m, self.__l_value)
        self.__m_value = keras.models.Model([m_input], [m])
        self.__m_value.compile('adam', 'mse')
        
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m_shared = apply_layers(m, self.__l_shared)
        m_policy_mu = apply_layers(m_shared, self.__l_policy_mu)
        m_policy_sigma = apply_layers(m_shared, self.__l_policy_sigma)
        m_policy_dis = [apply_layers(m_shared, dis) for dis in self.__l_policy_dis]
        m_policy = keras.layers.Concatenate()([m_policy_mu, m_policy_sigma] + m_policy_dis)
        m_value = apply_layers(m_shared, self.__l_value)
        self.__m_value_policy = keras.models.Model([m_input], [m_value, m_policy])
        self.__m_value_policy.compile('adam', 'mse')
        
        m = m_input = keras.layers.Input((None, self.__input_n,))
        m_value, m_policy = self.__m_value_policy(m)
        m = keras.layers.Concatenate()([m_value, m_policy])
        self.__m_optimizer = keras.optimizers.Adam(clipnorm=5.)
        self.__m_train = keras.models.Model([m_input], [m])
        self.__m_train.compile(self.__m_optimizer,
            lambda y_true, y_pred: model.__loss(
                y_true, y_pred, beta, self.__output_n, self.__dis_output_ns))
        
        self.__m_policy.summary()
        self.__m_value.summary()
        self.__m_train.summary()
    
    @staticmethod
    def __loss(y_true, y_pred, beta, output_size, dis_sizes):
        
        r, action, dis_actions, = y_true[:,:1], y_true[:,1:1+output_size], [y_true[:, \
            1+output_size*2+sum(dis_sizes[:i]):
            1+output_size*2+sum(dis_sizes[:i+1])]
            for i in range(len(dis_sizes))]
        value, policy_mu, policy_sigma, policy_dis = \
            y_pred[:,:1], y_pred[:,1:1+output_size], \
            y_pred[:,1+output_size:1+output_size*2], [y_pred[:, \
            1+output_size*2+sum(dis_sizes[:i]):
            1+output_size*2+sum(dis_sizes[:i+1])]
            for i in range(len(dis_sizes))]
        
        advantage = r - value
        value_loss = 0.5 * K.mean(K.square(advantage))
        
        log_choosen_action_prob = K.sum(K.exp( \
            -K.square(action - policy_mu)/ \
            (2*K.square(policy_sigma)+K.epsilon())), axis=-1, keepdims=True)
        action_loss = -K.mean(log_choosen_action_prob * advantage)
        entropy = K.mean(K.sum(0.5*K.log( \
            2*np.pi*np.e*K.square(policy_sigma)+K.epsilon()), axis=-1, keepdims=True))
        
        for action_onehot, policy in zip(dis_actions, policy_dis):
            log_policy = K.log(policy + K.epsilon())
            log_choosen_action_prob = K.sum(action_onehot * log_policy, axis=-1, keepdims=True)
            action_loss += -K.mean(log_choosen_action_prob * advantage)
            entropy += K.mean(-K.sum(policy * log_policy, axis=-1, keepdims=True))
        
        return action_loss + value_loss - beta * entropy
    
    def train(self, samples, epochs=1, verbose=False):
        self.__m_train.fit(
            x=np.array([s.state for s in samples], dtype=np.float32),
            y=np.hstack([
                np.reshape(np.array([s.reward for s in samples], dtype=np.float32), (-1, 1)),
                np.array([s.action[0] for s in samples]),
                np.zeros((len(samples),len(samples[0].action)))] + [
                np.array([s.action[i+1] for s in samples])
                for i in range(len(self.__dis_output_ns))]),
            batch_size=64,
            epochs=epochs,
            verbose=verbose)
    
    def evalute_value(self, state, verbose=False):
        v = self.__m_value.predict(
            np.array([state], dtype=np.float32))[0,0]
        if verbose:
            print(v)
        return v
    
    def get_action_prob(self, state, verbose=False):
        action_prob = self.__m_policy.predict(
            np.array([state], dtype=np.float32))[0]
        action_prob_mu = action_prob[:self.__output_n]
        action_prob_sigma = action_prob[self.__output_n:self.__output_n*2]
        action_prob_dis = [action_prob[
            self.__output_n*2+sum(self.__dis_output_ns[:i]):
            self.__output_n*2+sum(self.__dis_output_ns[:i+1])]
            for i in range(len(self.__dis_output_ns))]
        if verbose:
            print(action_prob_mu, action_prob_sigma, action_prob_dis)
        return action_prob_mu, action_prob_sigma, action_prob_dis

In [5]:
def discrete_to_continuous(discretes, sample=True):
    if sample:
        choosen_actions = [np.random.choice(
            list(range(dis.shape[-1])), p=dis)
            for dis in discretes]
    else:
        choosen_actions = [np.argmax(dis) for dis in discretes]
    return np.array([
        env.action_space.low[i] + \
        (choosen_actions[i]/(discretes[i].shape[-1]-1)) * \
        (env.action_space.high[i]-env.action_space.low[i])
        if discretes[i].shape[-1] > 1 else \
        (env.action_space.high[i]+env.action_space.low[i])/2
        for i in range(len(discretes))])

In [6]:
def play(env, m, gamma=0.98, max_steps=1000, n_prev_states=32, verbose=False):
    state_0 = env.reset()
    state_null = np.zeros_like(state_0)
    state_queue = []
    episode = []
    samples = []
    action_probs = []
    gamelen = 0
    gamelen_max = 0
    def get_prev_states(episode, idx):
        states = [e.state for e in episode[max(0,(idx-n_prev_states)+1):idx+1]]
        states = [state_null]*(max(0,n_prev_states-len(states))) + states
        return states
    def add_to_samples(episode, done):
        if done:
            discounted_reward = 0.
        else:
            discounted_reward = m.evalute_value(get_prev_states(episode, len(episode)-1))
        episode[-1] = model_sample(
                get_prev_states(episode, len(episode)-1),
                episode[-1].action,
                discounted_reward)
        for i in reversed(range(len(episode)-1)):
            discounted_reward = episode[i].reward + \
                gamma * discounted_reward
            episode[i] = model_sample(
                get_prev_states(episode, i),
                episode[i].action,
                discounted_reward)
        samples.extend(episode)
    for i in range(max_steps):
        state_queue.append(state_0)
        if len(state_queue) > n_prev_states:
            state_queue.pop(0)
        state_queue_padded = \
            [state_null]*(max(0,n_prev_states-len(state_queue))) + state_queue
        action_prob_mu, action_prob_sigma, action_prob_dis = \
            m.get_action_prob(state_queue_padded)
        action_probs.append(np.hstack(action_prob_dis))
        action = action_prob_mu + \
            np.random.randn(*action_prob_sigma.shape) * action_prob_sigma
        state_1, reward, done, _ = \
            env.step(action + discrete_to_continuous(action_prob_dis, sample=True))
        episode.append(model_sample(state_0, [action]+action_prob_dis, reward))
        state_0 = state_1
        gamelen += 1
        if done:
            add_to_samples(episode, True)
            episode = []
            state_0 = env.reset()
            state_null = np.zeros_like(state_0)
            state_queue = []
            gamelen_max = max(gamelen_max, gamelen)
            gamelen = 0
    if episode:
        add_to_samples(episode, False)
        gamelen_max = max(gamelen_max, gamelen)
        gamelen = 0
    if verbose:
        print('std[action_prob]', np.mean(np.std(action_probs, ddof=1, axis=0)))
        print('max game len', gamelen_max)
    return samples

In [7]:
K.clear_session()
m = model(24, 4, [15]*4, beta=0.01)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 24)     0                                            
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, None, 24)     96          input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 128)          78336       batch_normalization_1[0][0]      
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 128)          0           lstm_1[0][0]                     
__________________________________________________________________________________________________
dense_5 (D

In [8]:
def test(m, render=False, n_prev_states=32, sample=False):
    global env
    state = env.reset()
    state_null = np.zeros_like(state)
    state_queue = []
    rewards = 0.
    while True:
        state_queue.append(state)
        if len(state_queue) > n_prev_states:
            state_queue.pop(0)
        state_queue_padded = \
            [state_null]*(max(0,n_prev_states-len(state_queue))) + state_queue 
        action_mu, action_sigma, action_dis = m.get_action_prob(state_queue)
        if sample:
            action = action_mu + np.random.randn(*action_sigma.shape) * action_sigma
        else:
            action = action_mu
        state, reward, done, _ = \
            env.step(action + discrete_to_continuous(action_dis, sample=sample))
        rewards += reward
        if render:
            env.render()
        if done:
            break
        if render:
            time.sleep(1/30)
    if render:
        env.close()
        env = gym.make('BipedalWalker-v2')
    return rewards

In [None]:
for i in range(500):
    samples = play(env, m, max_steps=1000, verbose=True)
    m.train(samples, epochs=1, verbose=True)
    print('epoch {} completed'.format(i))
    if (i+1) % 10 == 0:
        test_result = [test(m, render=False) for _ in range(3)]
        print('test result',
              'mean', np.mean(test_result),
              'std', np.std(test_result, ddof=1))

std[action_prob] 0.0063002533
max game len 279
Epoch 1/1
epoch 0 completed
std[action_prob] 0.032328524
max game len 294
Epoch 1/1
epoch 1 completed
std[action_prob] 0.0180429
max game len 1000
Epoch 1/1
epoch 2 completed


In [21]:
test(m, render=True, sample=False)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


-136.425818450002