In [1]:
# Third-party "shutup" helper, called once at notebook start-up.
# NOTE(review): presumably this silences Python warnings for the rest of the
# session - confirm against the package documentation.
import shutup
shutup.please()

In [2]:
import numpy as np

In [45]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # '-1': run on CPU, '0': use the first GPU; set here, before TensorFlow is imported below
import random
import gym
import pybullet_envs
import pylab
import numpy as np
import tensorflow as tf
from tensorboardX import SummaryWriter

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop, Adagrad, Adadelta
from tensorflow.keras import backend as K
import copy

from threading import Thread, Lock
from multiprocessing import Process, Pipe
import time

# Switch TensorFlow out of eager execution for the whole notebook (author's note: usually the fastest option).
# NOTE(review): PpoCritic.ppo_loss_2 closes over a Keras Input tensor, which presumably also relies on this - confirm.
tf.compat.v1.disable_eager_execution()

# Helpers

# gaussian_likelihood - TODO: see whether this can be a single global function (the two original variants are kept below for reference)
# Keras loss
# def gaussian_likelihood(self, actions, pred): # for keras custom loss
#     log_std = -0.5 * np.ones(self.action_space, dtype=np.float32)
#     pre_sum = -0.5 * (((actions-pred)/(K.exp(log_std)+1e-8))**2 + 2*log_std + K.log(2*np.pi))
#     return K.sum(pre_sum, axis=1)
#
# # Agent
# def gaussian_likelihood(self, action, pred, log_std):
#     # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/sac/policies.py
#     pre_sum = -0.5 * (((action-pred)/(np.exp(log_std)+1e-8))**2 + 2*log_std + np.log(2*np.pi))
#     return np.sum(pre_sum, axis=1)
def gaussian_likelihood(log_std, lib="keras"): # for keras custom loss
    """Build a log-likelihood function for a diagonal Gaussian with fixed ``log_std``.

    The returned callable ``fn(actions, pred)`` evaluates, per row, the log
    density of ``actions`` under a Gaussian centred on ``pred`` whose
    per-dimension log standard deviation is ``log_std``, summed over axis 1.

    Args:
        log_std: per-action-dimension log standard deviation (array-like).
        lib: ``"keras"`` to build the expression from Keras backend ops (for
            use inside a Keras loss) or ``"numpy"`` to evaluate it on NumPy
            arrays.

    Returns:
        A function ``fn(actions, pred)`` returning one log-likelihood per row.

    Raises:
        ValueError: if ``lib`` is neither ``"keras"`` nor ``"numpy"``.
    """
    # Resolve the math backend explicitly: an unknown value used to fall back to
    # Keras silently, and the Keras backend was touched even for lib="numpy".
    if lib == "numpy":
        _exp, _log, _sum = np.exp, np.log, np.sum
    elif lib == "keras":
        _exp, _log, _sum = K.exp, K.log, K.sum
    else:
        raise ValueError("lib must be 'keras' or 'numpy', got {!r}".format(lib))

    def fn(actions,pred):
        # -0.5 * (((a - mu) / sigma)^2 + 2*log(sigma) + log(2*pi)) per dimension;
        # the 1e-8 keeps the division finite if exp(log_std) underflows to 0.
        pre_sum = -0.5 * (((actions-pred)/(_exp(log_std)+1e-8))**2 + 2*log_std + _log(2*np.pi))
        return _sum(pre_sum, axis=1)

    return fn

# Continuous
class PpoActorContinuous:
    """PPO policy network for continuous action spaces.

    The network maps a state to a tanh-squashed mean action; the log standard
    deviation is fixed at -0.5 for every action dimension. The model is
    compiled with :meth:`ppo_loss`, which expects the training targets packed
    into a single ``y_true`` array (see that method for the layout).
    """

    def __init__(self, input_shape, action_space, lr, optimizer,loss_clipping = 0.2):
        """Build and compile the actor model.

        Args:
            input_shape: shape of one observation, passed to ``Input``.
            action_space: number of action dimensions (size of the output layer).
            lr: learning rate handed to ``optimizer``.
            optimizer: optimizer class; instantiated as ``optimizer(learning_rate=lr)``.
            loss_clipping: PPO clip range used by :meth:`ppo_loss`.
        """
        self.action_space = action_space
        self.loss_clipping = loss_clipping
        self.log_std = -0.5 * np.ones(self.action_space , dtype=np.float32)

        # Keras-backend version of the log-likelihood, used inside the loss graph.
        self.gaussian_likelihood = gaussian_likelihood(self.log_std, lib="keras")

        X_input = Input(input_shape)

        X = Dense(512, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X_input)
        X = Dense(256, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        X = Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        output = Dense(self.action_space, activation="tanh")(X)

        self.Actor = Model(inputs = X_input, outputs = output)
        self.Actor.compile(loss=self.ppo_loss, optimizer=optimizer(learning_rate=lr))

    def ppo_loss(self, y_true, y_pred):
        """Clipped PPO surrogate loss for the Gaussian policy.

        Args:
            y_true: packed targets, one row per sample:
                column 0 is the advantage, columns ``1 .. action_space`` are
                the taken action, and column ``1 + action_space`` is the log
                probability of that action under the old policy.
            y_pred: predicted mean actions, one row per sample.

        Returns:
            Scalar loss (negative mean of the clipped objective).
        """
        # Slice the advantage as rank-1 (batch,) so it lines up with `ratio`,
        # which is rank-1 because the log-likelihood sums over the action axis.
        # Slicing it as y_true[:, :1] gave shape (batch, 1), and
        # (batch,) * (batch, 1) broadcasts to (batch, batch), mixing every
        # sample's ratio with every other sample's advantage.
        advantages = y_true[:, 0]
        actions = y_true[:, 1:1+self.action_space]
        logp_old_ph = y_true[:, 1+self.action_space]

        logp = self.gaussian_likelihood(actions, y_pred)

        ratio = K.exp(logp - logp_old_ph)

        p1 = ratio * advantages
        # Largest value the objective may take for this sample: (1 +/- clip) * advantage.
        p2 = tf.where(advantages > 0, (1.0 + self.loss_clipping)*advantages, (1.0 - self.loss_clipping)*advantages)

        actor_loss = -K.mean(K.minimum(p1, p2))

        return actor_loss

    def predict(self, state):
        """Return the model's mean action(s) for ``state`` (delegates to ``Actor.predict``)."""
        return self.Actor.predict(state)

# Discrete
class PpoActorDiscrete:
    """PPO policy network for discrete action spaces (softmax over actions)."""

    def __init__(self, input_shape, action_space, lr, optimizer,loss_clipping=0.2,loss_entropy=0.001):
        """Build and compile the actor model.

        Args:
            input_shape: shape of one observation, passed to ``Input``.
            action_space: number of discrete actions (size of the softmax layer).
            lr: learning rate handed to ``optimizer``.
            optimizer: optimizer class; instantiated as ``optimizer(learning_rate=lr)``.
            loss_clipping: PPO clip range used by :meth:`ppo_loss`.
            loss_entropy: weight of the entropy term in :meth:`ppo_loss`.
        """
        self.action_space = action_space
        self.loss_clipping = loss_clipping
        self.loss_entropy = loss_entropy

        X_input = Input(input_shape)

        # Three ReLU hidden layers, each with its own small-stddev normal initializer.
        hidden = X_input
        for units in (512, 256, 64):
            hidden = Dense(units, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(hidden)
        output = Dense(self.action_space, activation="softmax")(hidden)

        self.Actor = Model(inputs = X_input, outputs = output)
        self.Actor.compile(loss=self.ppo_loss, optimizer=optimizer(learning_rate=lr))

    def ppo_loss(self, y_true, y_pred):
        """Clipped PPO surrogate loss with an entropy term (https://arxiv.org/abs/1707.06347).

        Args:
            y_true: packed targets, one row per sample: column 0 is the
                advantage, the next ``action_space`` columns are the one-hot
                taken action, and the remaining columns are the old policy's
                action probabilities.
            y_pred: current policy's action probabilities.

        Returns:
            Scalar loss: clipped surrogate loss minus the weighted entropy term.
        """
        n_actions = self.action_space
        advantages = y_true[:, :1]
        actions = y_true[:, 1:1 + n_actions]
        prediction_picks = y_true[:, 1 + n_actions:]

        # Mask both distributions with the one-hot action, then clip away exact
        # zeros so the logarithms below stay finite.
        new_prob = K.clip(actions * y_pred, 1e-10, 1.0)
        old_prob = K.clip(actions * prediction_picks, 1e-10, 1.0)

        ratio = K.exp(K.log(new_prob) - K.log(old_prob))

        unclipped = ratio * advantages
        clipped = K.clip(ratio, min_value=1 - self.loss_clipping, max_value=1 + self.loss_clipping) * advantages
        actor_loss = -K.mean(K.minimum(unclipped, clipped))

        # Entropy term: mean of -p*log(p) over all entries, scaled by loss_entropy.
        entropy_term = self.loss_entropy * K.mean(-(y_pred * K.log(y_pred + 1e-10)))

        return actor_loss - entropy_term

    def predict(self, state):
        """Return action probabilities for ``state`` (delegates to ``Actor.predict``)."""
        return self.Actor.predict(state)


# PPO Critic for discrete or continuous only differs in the initializer
class PpoCritic:
    """State-value network shared by the discrete and continuous PPO agents.

    The model takes two inputs, the state and a column of previous value
    estimates; the second input is only consumed by the clipped loss
    (:meth:`ppo_loss_2`).
    """

    def __init__(self, input_shape, action_space, lr, optimizer,loss_function_version=1, loss_clipping=0.2,kernel_initializer=False,continuous_action_space=False):
        """Build and compile the critic model.

        Args:
            input_shape: shape of one observation, passed to ``Input``.
            action_space: accepted for signature parity with the actors; not used here.
            lr: learning rate handed to ``optimizer``.
            optimizer: optimizer class; instantiated as ``optimizer(learning_rate=lr)``.
            loss_function_version: ``1`` selects the plain squared-error loss,
                any other value selects the clipped loss.
            loss_clipping: clip range used by the clipped loss.
            kernel_initializer: initializer for the hidden layers; ``False``
                picks a default based on ``continuous_action_space``.
            continuous_action_space: chooses the default initializer
                (``'he_uniform'`` when false, small-stddev normal otherwise).
        """
        self.loss_clipping = loss_clipping

        X_input = Input(input_shape)
        old_values = Input(shape=(1,))

        # Fill in the default initializer only when the caller did not supply one.
        if kernel_initializer == False:
            kernel_initializer = (
                'he_uniform'
                if continuous_action_space == False
                else tf.random_normal_initializer(stddev=0.01)
            )

        loss_function = self.ppo_loss if loss_function_version == 1 else self.ppo_loss_2(old_values)

        V = X_input
        for units in (512, 256, 64):
            V = Dense(units, activation="relu", kernel_initializer=kernel_initializer)(V)
        value = Dense(1, activation=None)(V)

        self.Critic = Model(inputs=[X_input, old_values], outputs = value)
        self.Critic.compile(loss=[loss_function], optimizer=optimizer(learning_rate=lr))

    def ppo_loss(self, y_true, y_pred):
        """Plain value loss: mean squared error between targets and predictions."""
        squared_error = (y_true - y_pred) ** 2
        return K.mean(squared_error)

    def ppo_loss_2(self, values):
        """Return a clipped value loss bound to the previous estimates ``values``.

        The returned ``loss(y_true, y_pred)`` limits how far ``y_pred`` may move
        from ``values`` (by ``self.loss_clipping`` in either direction) and
        takes the larger of the clipped and unclipped squared errors.
        """
        def loss(y_true, y_pred):
            bound = self.loss_clipping
            clipped_pred = values + K.clip(y_pred - values, -bound, bound)
            err_clipped = (y_true - clipped_pred) ** 2
            err_plain = (y_true - y_pred) ** 2
            return 0.5 * K.mean(K.maximum(err_clipped, err_plain))
        return loss

    def predict(self, state):
        """Predict state values; the old-values input is filled with zeros."""
        placeholder_old_values = np.zeros((state.shape[0], 1))
        return self.Critic.predict([state, placeholder_old_values])

class PpoBuffer:
    """Container for one rollout, stored as six parallel Python lists.

    Attributes (all lists, appended to by the caller):
        states, next_states, actions, rewards, predictions, dones
    """

    def __init__(self):
        # A new buffer starts out empty; reset() creates every field.
        self.reset()

    def reset(self):
        """Discard all stored transitions by rebinding each field to a new empty list."""
        for field in ("states", "next_states", "actions", "rewards", "predictions", "dones"):
            setattr(self, field, [])
        
# PPO PPOAgent
class PpoAgent:
    """PPO agent: collects fixed-size rollouts from a gym environment and trains on them.

    Depending on ``continuous_action_space`` the agent uses either
    ``PpoActorContinuous`` or ``PpoActorDiscrete`` as the policy, plus a
    ``PpoCritic`` value network. :meth:`run_batch` is the training entry point.
    """

    def __init__(self, env_name, training_batch=4000, epochs=80, episodes=1000,lr=0.00025,shuffle=False,target_kl = 0.01, continuous_action_space=False):
        """Create the environment and the actor/critic networks.

        Args:
            env_name: id passed to ``gym.make``.
            training_batch: number of environment steps collected per update.
            epochs: epochs passed to ``fit`` for both networks on each update.
            episodes: training stops once this many episodes have finished.
            lr: initial learning rate for both networks.
            shuffle: ``shuffle`` flag passed to ``fit``.
            target_kl: stored on the agent as ``self.target_kl``.
            continuous_action_space: selects the continuous or discrete actor.
        """
        # Environment and PPO parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        # Keep the caller's value; it used to be overwritten with a hard-coded 0.01.
        # NOTE(review): no method of this class reads target_kl yet.
        self.target_kl = target_kl

        if continuous_action_space:
            self.action_size = self.env.action_space.shape[0]
        else:
            self.action_size = self.env.action_space.n

        self.state_size = self.env.observation_space.shape
        self.EPISODES = episodes # total number of episodes to train for
        self.episode = 0 # number of episodes finished so far
        self.max_average = 0 # best running average seen; checkpoint() fires when the average reaches it
        self.lr = lr
        self.epochs = epochs # training epochs per update
        self.shuffle = shuffle
        self.Training_batch = training_batch
        #self.optimizer = RMSprop
        self.optimizer = Adam
        self.replay_count = 0
        self.continuous_action_space=continuous_action_space

        # Score history, kept for plotting
        self.scores_, self.episodes_, self.average_ = [], [], []

        if continuous_action_space:
            self.Actor= PpoActorContinuous(self.state_size, self.action_size, lr=self.lr, optimizer = self.optimizer,loss_clipping = 0.2)
        else:
            self.Actor= PpoActorDiscrete(self.state_size, self.action_size, lr=self.lr, optimizer = self.optimizer,loss_clipping=0.2,loss_entropy=0.001)

        self.Critic = PpoCritic(self.state_size, self.action_size, lr=self.lr, optimizer = self.optimizer,loss_clipping=0.2,kernel_initializer=False,continuous_action_space=continuous_action_space)

        # do not change below: must match the log_std used inside PpoActorContinuous
        self.log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
        self.std = np.exp(self.log_std)

        # NumPy version of the Gaussian log-likelihood, used outside the Keras graph
        self.gaussian_likelihood = gaussian_likelihood(self.log_std, lib="numpy")

    def act(self, state):
        """Sample an action for ``state`` from the current policy.

        Returns:
            Continuous case: ``(action, action_batch, logp)`` where ``action``
            is the 1-D action for the environment, ``action_batch`` is the same
            action with a leading batch axis, and ``logp`` is its log-likelihood.
            Discrete case: ``(action_index, one_hot_action, probabilities)``.
        """
        if self.continuous_action_space:
            # Mean action predicted by the network
            prediction = self.Actor.predict(state)

            low, high = -1.0, 1.0 # -1 and 1 are boundaries of tanh
            # NOTE(review): the exploration noise is uniform while logp_t below is a
            # Gaussian log-likelihood; left unchanged here - confirm this is intended.
            action = prediction + np.random.uniform(low, high, size=prediction.shape) * self.std
            action = np.clip(action, low, high)

            logp_t = self.gaussian_likelihood(action, prediction)

            return action[0], action , logp_t[0]
        else:
            prediction = self.Actor.predict(state)[0]
            action = np.random.choice(self.action_size, p=prediction)
            action_onehot = np.zeros([self.action_size])
            action_onehot[action] = 1
            return action, action_onehot, prediction

    def reshape_state(self,state):
        """Return ``state`` reshaped to a single-row batch ``(1, state_size[0])``."""
        return np.reshape(state, [1, self.state_size[0]])

    def run_batch(self):
        """Train until ``self.EPISODES`` episodes have finished.

        Repeatedly collects ``self.Training_batch`` environment steps (episodes
        may span batch boundaries), then calls :meth:`replay` on the collected
        buffer. Closes the environment when done.
        """
        state = self.env.reset()
        state = self.reshape_state(state)
        done, score = False, 0
        while True:
            # Fresh rollout memory for this batch
            buffer = PpoBuffer()

            for t in range(self.Training_batch):
                #self.env.render()
                # Actor picks an action
                action, action_data, prediction = self.act(state)

                # Retrieve new state, reward, and whether the state is terminal
                next_state, reward, done, _ = self.env.step(action)
                next_state = self.reshape_state(next_state)

                # Memorize the transition for training
                buffer.states.append(state)
                buffer.next_states.append(next_state)
                buffer.actions.append(action_data)
                buffer.rewards.append(reward)
                buffer.dones.append(done)
                buffer.predictions.append(prediction)

                # Advance to the next state
                state = next_state
                score += reward
                if done:
                    self.episode += 1
                    average = self.checkpoint(score, self.episode)
                    print("episode: {}/{}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, score, average, ''))
                    state, done, score = self.env.reset(), False, 0
                    state = self.reshape_state(state)

            # Stop before training on the last batch once the episode budget is used up
            if self.episode >= self.EPISODES:
                break

            print()
            print(action_data)
            print()
            self.replay(buffer)

        self.env.close()

    def replay(self, buffer):
        """Run one PPO update on the transitions stored in ``buffer``.

        Computes GAE advantages and value targets from the critic's estimates,
        fits the actor and critic for ``self.epochs`` epochs each, prints KL
        and entropy estimates in the continuous case, and resets the buffer.
        """
        # Stack per-step entries into batch arrays
        states = np.vstack(buffer.states)
        next_states = np.vstack(buffer.next_states)
        actions = np.vstack(buffer.actions)
        predictions = np.vstack(buffer.predictions)
        rewards = buffer.rewards
        dones = buffer.dones

        # Critic estimates for the current and next states, shape (N, 1)
        values = self.Critic.predict(states)
        next_values = self.Critic.predict(next_states)

        # Index the single value column instead of np.squeeze, so a one-step
        # batch stays 1-D rather than collapsing to a 0-d array.
        advantages, target = self.get_gaes(rewards, dones, values[:, 0], next_values[:, 0])

        # Pack advantages, actions and old predictions into y_true; the actor's
        # custom loss unpacks them again.
        y_true = np.hstack([advantages, actions, predictions])

        # Train actor and critic networks
        self.Actor.Actor.fit(states, y_true, epochs=self.epochs, verbose=0, shuffle=self.shuffle)
        self.Critic.Critic.fit([states, values], target, epochs=self.epochs, verbose=0, shuffle=self.shuffle)

        if self.continuous_action_space:
            # Diagnostics computed outside the loss (author's note: no working way
            # was found to do this inside the loss with eager execution disabled).
            pred = self.Actor.predict(states)
            logp = self.gaussian_likelihood(actions, pred)
            # predictions is (N, 1) and logp is (N,): subtracting them directly
            # would broadcast to an N x N matrix. Use the single column instead;
            # the mean is the same.
            approx_kl = np.mean(predictions[:, 0] - logp)
            approx_ent = np.mean(-logp)
            print()
            print('approx_kl',approx_kl)
            print('approx_ent',approx_ent)
            print()
        self.replay_count += 1

        buffer.reset()

    def get_gaes(self, rewards, dones, values, next_values, gamma = 0.99, lamda = 0.90, normalize=True):
        """Compute Generalized Advantage Estimates and value targets.

        Args:
            rewards: per-step rewards.
            dones: per-step terminal flags.
            values: 1-D critic estimates for each state.
            next_values: 1-D critic estimates for each next state.
            gamma: discount factor.
            lamda: GAE smoothing factor.
            normalize: if true, standardize the returned advantages.

        Returns:
            ``(advantages, targets)``, each a column array of shape ``(N, 1)``.
            Targets are the un-normalized advantages plus ``values``.
        """
        # One-step TD errors; bootstrapping is switched off on terminal steps.
        deltas = [r + gamma * (1 - d) * nv - v for r, d, nv, v in zip(rewards, dones, next_values, values)]
        gaes = np.stack(deltas)
        # Backward accumulation; the last step keeps its plain TD error.
        for t in reversed(range(len(gaes) - 1)):
            gaes[t] = gaes[t] + (1 - dones[t]) * gamma * lamda * gaes[t + 1]

        target = gaes + values
        if normalize:
            gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
        return np.vstack(gaes), np.vstack(target)

    def load(self):
        """Placeholder: weight loading is currently disabled (see commented code)."""
#         self.Actor.Actor.load_weights(self.Actor_name)
#         self.Critic.Critic.load_weights(self.Critic_name)
        pass

    def save(self):
        """Placeholder: weight saving is currently disabled (see commented code)."""
#         self.Actor.Actor.save_weights(self.Actor_name)
#         self.Critic.Critic.save_weights(self.Critic_name)
        pass

    def checkpoint(self, score, episode):
        """Record a finished episode and return the running average score.

        The average covers the last 50 scores. When it reaches or exceeds the
        best average so far, :meth:`save` is called, the learning rate of both
        networks is multiplied by 0.95, and 'New record' is printed.
        """
        self.scores_.append(score)
        self.episodes_.append(episode)
        recent_scores = self.scores_[-50:]
        self.average_.append(sum(recent_scores) / len(recent_scores))

        # New best running average
        if self.average_[-1] >= self.max_average:
            self.max_average = self.average_[-1]
            self.save()
            # decrease the learning rate on every new record
            self.lr *= 0.95
            K.set_value(self.Actor.Actor.optimizer.learning_rate, self.lr)
            K.set_value(self.Critic.Critic.optimizer.learning_rate, self.lr)
            print()
            print('New record')
            print()

        return self.average_[-1]


In [39]:
# Train the discrete-action agent on CartPole.
discrete_agent = PpoAgent(
    'CartPole-v1',
    training_batch=4000,
    epochs=80,
    lr=3e-4,
    episodes=500,
    continuous_action_space=False,
)
discrete_agent.run_batch()


New record

episode: 1/500, score: 9.0, average: 9.00 

New record

episode: 2/500, score: 14.0, average: 11.50 

New record

episode: 3/500, score: 17.0, average: 13.33 

New record

episode: 4/500, score: 18.0, average: 14.50 

New record

episode: 5/500, score: 28.0, average: 17.20 

New record

episode: 6/500, score: 22.0, average: 18.00 
episode: 7/500, score: 11.0, average: 17.00 
episode: 8/500, score: 12.0, average: 16.38 
episode: 9/500, score: 26.0, average: 17.44 
episode: 10/500, score: 17.0, average: 17.40 
episode: 11/500, score: 15.0, average: 17.18 
episode: 12/500, score: 10.0, average: 16.58 

New record

episode: 13/500, score: 35.0, average: 18.00 

New record

episode: 14/500, score: 20.0, average: 18.14 
episode: 15/500, score: 18.0, average: 18.13 
episode: 16/500, score: 11.0, average: 17.69 
episode: 17/500, score: 25.0, average: 18.12 
episode: 18/500, score: 10.0, average: 17.67 

New record

episode: 19/500, score: 33.0, average: 18.47 

New record

episode

episode: 176/500, score: 11.0, average: 20.80 
episode: 177/500, score: 11.0, average: 20.42 
episode: 178/500, score: 24.0, average: 20.24 
episode: 179/500, score: 22.0, average: 20.36 

[1. 0.]



KeyboardInterrupt: 

In [None]:
# Train the continuous-action agent on LunarLander.
continuous_agent = PpoAgent(
    'LunarLanderContinuous-v2',
    training_batch=4000,
    epochs=80,
    episodes=500,
    continuous_action_space=True,
)
# Alternative environment, kept for reference:
#continuous_agent = PpoAgent('InvertedPendulumBulletEnv-v0',training_batch=4000, epochs=80,episodes=500, continuous_action_space=True)
continuous_agent.run_batch()

# action logp_t
# [[-0.54015928  0.29224633]] [-1.34975245]


action logp_t
[[0.32941785 0.21218028]] [-1.04692631]


action logp_t
[[ 0.12699455 -0.29439818]] [-0.97795182]


action logp_t
[[ 0.11797611 -0.53656225]] [-1.24855627]


action logp_t
[[-0.05975853  0.01397618]] [-0.84289899]


action logp_t
[[ 0.23653878 -0.0991624 ]] [-0.92769181]


action logp_t
[[-0.1022898  -0.18129503]] [-0.89670594]


action logp_t
[[-0.09951301 -0.14349646]] [-0.87924085]


action logp_t
[[0.4352326 0.2154001]] [-1.15894721]


action logp_t
[[-0.12683911  0.4165998 ]] [-1.09526031]


action logp_t
[[-0.05431579  0.50605974]] [-1.18967589]


action logp_t
[[ 0.19171254 -0.56245385]] [-1.31827881]


action logp_t
[[0.45608635 0.48326046]] [-1.43850427]


action logp_t
[[0.12507219 0.30867312]] [-0.98870745]


action logp_t
[[0.56983221 0.27860498]] [-1.38544275]


action logp_t
[[ 0.07601657 -0.12984789]] [-0.86880699]


action logp_t
[[-0.52911905  0.06979651]] [-1.22420062]


action logp_t
[[0.57162735 0.15878225]] [-1.31704838]


action logp_t
[[0.12775098 


action logp_t
[[-0.13516283  0.34188321]] [-1.02129946]


action logp_t
[[ 0.58107888 -0.25458971]] [-1.38565088]


action logp_t
[[-0.07635034 -0.32258305]] [-0.98724866]


action logp_t
[[-0.12718875  0.26560517]] [-0.95551182]


action logp_t
[[ 0.52408768 -0.43261067]] [-1.46630164]


action logp_t
[[-0.45995921 -0.42772378]] [-1.37366413]


action logp_t
[[-0.04530767  0.00402282]] [-0.84063477]


action logp_t
[[ 0.09507942 -0.22868032]] [-0.92141866]


action logp_t
[[-0.48724622  0.3789326 ]] [-1.3550209]


action logp_t
[[-0.47658139  0.258884  ]] [-1.23703004]


action logp_t
[[0.43211275 0.46857524]] [-1.39047171]


action logp_t
[[0.28703961 0.45423046]] [-1.2305205]


action logp_t
[[ 0.36164177 -0.03915445]] [-1.01816719]


action logp_t
[[-0.18229336 -0.40681343]] [-1.10784284]


action logp_t
[[0.24635326 0.12882485]] [-0.94319819]


action logp_t
[[0.37788501 0.38493687]] [-1.23374306]


action logp_t
[[-0.55259875  0.42348877]] [-1.49588771]


action logp_t
[[-0.1329


action logp_t
[[-0.1482318   0.40597577]] [-1.09182525]


action logp_t
[[-0.16856688 -0.53606883]] [-1.26678654]


action logp_t
[[-0.04370545  0.48918623]] [-1.16588979]


action logp_t
[[ 0.53353899 -0.18216426]] [-1.26990677]


action logp_t
[[0.1707245  0.60394797]] [-1.37351783]

episode: 3/500, score: -279.5078772400842, average: -285.49 

action logp_t
[[-0.42283409  0.03151505]] [-1.08147332]


action logp_t
[[-0.08412317  0.26398644]] [-0.94185937]


action logp_t
[[-0.44642813 -0.55336457]] [-1.52460513]


action logp_t
[[-0.55201816  0.51048786]] [-1.60488109]


action logp_t
[[0.05308888 0.39085075]] [-1.0491317]


action logp_t
[[-0.1597026  -0.54708668]] [-1.27948173]


action logp_t
[[ 0.1070193  -0.06205551]] [-0.85890888]


action logp_t
[[-0.50653025  0.46820292]] [-1.48331207]


action logp_t
[[ 0.59699194 -0.20475211]] [-1.38042658]


action logp_t
[[ 0.52579582 -0.49634652]] [-1.54974474]


action logp_t
[[ 0.20596279 -0.40420837]] [-1.1182612]


action logp_t
[[


action logp_t
[[0.34236839 0.35383678]] [-1.16758939]


action logp_t
[[0.14394784 0.05810547]] [-0.87077274]


action logp_t
[[0.10397079 0.18839059]] [-0.90083016]


action logp_t
[[-0.45774377 -0.12283208]] [-1.14268726]


action logp_t
[[0.34563886 0.10790243]] [-1.01642173]


action logp_t
[[-0.60270874 -0.56282287]] [-1.76171452]


action logp_t
[[0.13483752 0.4981916 ]] [-1.19983534]


action logp_t
[[ 0.00152954 -0.27433569]] [-0.94030313]


action logp_t
[[0.37868602 0.15371847]] [-1.06525244]


action logp_t
[[0.29378652 0.11964889]] [-0.97491353]


action logp_t
[[ 0.27707873 -0.08173261]] [-0.95164895]


action logp_t
[[-0.48766685 -0.46954462]] [-1.46045037]


action logp_t
[[-0.21217091 -0.16025476]] [-0.93380789]


action logp_t
[[0.27026271 0.14152226]] [-0.96460141]


action logp_t
[[0.38357293 0.4431764 ]] [-1.30501567]


action logp_t
[[0.49962972 0.47217453]] [-1.48051897]


action logp_t
[[0.04767493 0.45162089]] [-1.11807784]


action logp_t
[[-0.59684114  0.0117


action logp_t
[[0.50628948 0.5045742 ]] [-1.53297725]


action logp_t
[[-0.3345974   0.44438449]] [-1.2581328]


action logp_t
[[0.42391719 0.1939977 ]] [-1.13381231]


action logp_t
[[0.52998325 0.30126072]] [-1.34367635]


action logp_t
[[0.37037441 0.32232463]] [-1.1660371]


action logp_t
[[ 0.30574803 -0.14492119]] [-0.99379499]


action logp_t
[[0.01577146 0.175595  ]] [-0.88019176]


action logp_t
[[ 0.38132177 -0.35408097]] [-1.20624585]


action logp_t
[[-0.29152109 -0.24836472]] [-1.03679596]


action logp_t
[[-0.45378598  0.48436235]] [-1.43624712]


action logp_t
[[0.26873509 0.34525807]] [-1.09849584]


action logp_t
[[-0.52313305 -0.18450904]] [-1.25539365]


action logp_t
[[0.07474312 0.37603774]] [-1.03790664]


action logp_t
[[-0.31201065  0.36804891]] [-1.1540755]


action logp_t
[[-0.35582686 -0.12058284]] [-1.02923065]


action logp_t
[[ 0.16931622 -0.57397084]] [-1.32453823]


action logp_t
[[0.25499489 0.29182509]] [-1.04245842]


action logp_t
[[ 0.60198433 -0.5


action logp_t
[[ 0.16524391 -0.24049996]] [-0.954085]


action logp_t
[[0.38412689 0.24883915]] [-1.12311657]


action logp_t
[[-0.3329905  -0.55447379]] [-1.40621682]


action logp_t
[[-0.53436718 -0.02682905]] [-1.22599174]


action logp_t
[[0.29032324 0.3638763 ]] [-1.13266091]


action logp_t
[[ 0.2188863  -0.33101265]] [-1.05254079]


action logp_t
[[-0.40890294  0.11693046]] [-1.08290008]


action logp_t
[[-0.28884512 -0.33466398]] [-1.10321854]


action logp_t
[[-0.23269672  0.54746647]] [-1.3180724]


action logp_t
[[-0.43574048 -0.43131665]] [-1.34831761]


action logp_t
[[0.14012768 0.53079542]] [-1.24741453]


action logp_t
[[ 0.34703819 -0.55912436]] [-1.42732288]


action logp_t
[[-0.48608788  0.24540961]] [-1.23994733]


action logp_t
[[0.56954936 0.39145741]] [-1.48772381]


action logp_t
[[ 0.29140736 -0.23075118]] [-1.02624831]


action logp_t
[[-0.01026447 -0.3830685 ]] [-1.03765927]


action logp_t
[[-0.60615241 -0.59883527]] [-1.82404546]


action logp_t
[[-0.29382


action logp_t
[[-0.29681232 -0.57826996]] [-1.41188687]


action logp_t
[[0.41456652 0.43832741]] [-1.3329265]


action logp_t
[[-0.26089535  0.52711846]] [-1.3078221]


action logp_t
[[-0.43623136 -0.46415747]] [-1.38896553]


action logp_t
[[0.42301289 0.3477598 ]] [-1.24581431]


action logp_t
[[0.05869811 0.46262674]] [-1.13353585]


action logp_t
[[0.54717118 0.44407081]] [-1.51330508]


action logp_t
[[-0.18558091  0.12758196]] [-0.90667521]


action logp_t
[[-0.28170395 -0.45395982]] [-1.22552928]


action logp_t
[[-0.11579423  0.27860756]] [-0.96156111]


action logp_t
[[ 0.29704435 -0.03767326]] [-0.95995578]


action logp_t
[[-0.40025484  0.23841364]] [-1.13261491]


action logp_t
[[0.03280953 0.60459415]] [-1.33634707]


action logp_t
[[0.45105563 0.45239855]] [-1.39304102]


action logp_t
[[0.58574052 0.32622797]] [-1.44938151]


action logp_t
[[0.35049055 0.39526322]] [-1.21757638]


action logp_t
[[-0.22751781  0.12695724]] [-0.9300111]


action logp_t
[[0.14942901 0.352


action logp_t
[[0.15491897 0.41124178]] [-1.1005435]


action logp_t
[[0.29786219 0.10729868]] [-0.97456267]


action logp_t
[[-0.23542397  0.60600142]] [-1.41190647]


action logp_t
[[-0.48228578 -0.01752296]] [-1.15367888]


action logp_t
[[-0.42572958  0.18192567]] [-1.12852626]


action logp_t
[[-0.19758683 -0.13292189]] [-0.91464471]

episode: 10/500, score: -234.70988803526876, average: -194.98 

action logp_t
[[0.1417014  0.04861163]] [-0.86865075]


action logp_t
[[-0.21377301  0.35513486]] [-1.07063718]


action logp_t
[[-0.01733856  0.21207773]] [-0.89920058]


action logp_t
[[0.14709415 0.44020858]] [-1.13061525]


action logp_t
[[ 0.4221036  -0.07235166]] [-1.08811178]


action logp_t
[[-0.11365723 -0.17019329]] [-0.89470248]


action logp_t
[[-0.2716549  -0.30858904]] [-1.06728369]


action logp_t
[[-0.54344251 -0.01933282]] [-1.23866755]


action logp_t
[[0.13792103 0.16224491]] [-0.89966598]


action logp_t
[[ 0.50517752 -0.33162325]] [-1.33549398]


action logp_t
[[0.1


action logp_t
[[-0.04937988 -0.26029351]] [-0.93326568]


action logp_t
[[ 0.52469866 -0.09956528]] [-1.22638217]


action logp_t
[[-0.19361231 -0.00125613]] [-0.88852131]


action logp_t
[[-0.27088726  0.207771  ]] [-0.99581313]


action logp_t
[[-0.37474183  0.27394105]] [-1.13010139]


action logp_t
[[-0.29635072  0.26003666]] [-1.04864566]


action logp_t
[[0.22291986 0.46909205]] [-1.20475691]


action logp_t
[[ 0.59583233 -0.34319937]] [-1.48144187]


action logp_t
[[0.39034905 0.49732837]] [-1.38169167]


action logp_t
[[-0.4210168   0.03042878]] [-1.0793614]


action logp_t
[[-0.44304182  0.17844555]] [-1.14720805]


action logp_t
[[ 0.09166988 -0.39305967]] [-1.05945333]


action logp_t
[[-0.31614079  0.23559751]] [-1.04863853]


action logp_t
[[ 0.09480467 -0.0267452 ]] [-0.85121697]


action logp_t
[[ 0.47534626 -0.43969311]] [-1.40850596]


action logp_t
[[0.0391022  0.21005521]] [-0.89999529]


action logp_t
[[ 0.43803961 -0.21047696]] [-1.1595904]


action logp_t
[[-0.20


action logp_t
[[-0.59282418  0.39500457]] [-1.52630073]


action logp_t
[[ 0.2064473  -0.42421583]] [-1.14104256]


action logp_t
[[ 0.51028549 -0.20641334]] [-1.25067452]


action logp_t
[[0.49147802 0.18545084]] [-1.21358263]


action logp_t
[[-0.59482804 -0.36204153]] [-1.49623531]


action logp_t
[[-0.00678921 -0.40882786]] [-1.06538062]


action logp_t
[[-0.3935322   0.54531672]] [-1.45158021]


action logp_t
[[-0.22338938  0.41694526]] [-1.14139437]


action logp_t
[[ 0.00856435 -0.46659383]] [-1.13416577]


action logp_t
[[0.45874662 0.57526599]] [-1.57399422]


action logp_t
[[ 0.29717545 -0.46951151]] [-1.25819073]


action logp_t
[[-0.32814129 -0.2120143 ]] [-1.04498715]


action logp_t
[[-0.31699957 -0.57193527]] [-1.41891572]


action logp_t
[[-0.39315992  0.36515638]] [-1.22849934]


action logp_t
[[0.47176899 0.39741994]] [-1.35547164]


action logp_t
[[0.24799584 0.33079776]] [-1.07036815]


action logp_t
[[0.17036693 0.45157465]] [-1.15450825]


action logp_t
[[-0.0680


action logp_t
[[-0.41731791  0.56559859]] [-1.50867966]


action logp_t
[[0.16350578 0.29321989]] [-0.99117975]


action logp_t
[[0.40759453 0.30718253]] [-1.19231659]


action logp_t
[[0.20072281 0.10338085]] [-0.90736464]


action logp_t
[[-0.2011916  0.4802372]] [-1.2060115]


action logp_t
[[-0.033954    0.39255425]] [-1.04876424]


action logp_t
[[0.04972425 0.37571627]] [-1.03307839]


action logp_t
[[-0.59703551 -0.33815296]] [-1.47719625]


action logp_t
[[-0.43947776  0.43599001]] [-1.35822656]


action logp_t
[[0.32848818 0.19176551]] [-1.03482401]


action logp_t
[[-0.39716647 -0.43783549]] [-1.31247331]


action logp_t
[[0.23442098 0.28829233]] [-1.02573482]


action logp_t
[[ 0.31513571 -0.29337551]] [-1.09013688]


action logp_t
[[-0.02328283  0.21415996]] [-0.90092758]


action logp_t
[[ 0.19295366 -0.06238578]] [-0.89394261]


action logp_t
[[ 0.41743784 -0.25687125]] [-1.16475533]


action logp_t
[[ 0.23643108 -0.11791213]] [-0.93295082]


action logp_t
[[0.50009174 0


action logp_t
[[ 0.34389095 -0.49523198]] [-1.3327578]


action logp_t
[[ 0.17428382 -0.49629918]] [-1.21446761]


action logp_t
[[ 0.24272487 -0.38643439]] [-1.12148892]


action logp_t
[[ 0.58584692 -0.54624021]] [-1.71107557]


action logp_t
[[ 0.02688521 -0.60355177]] [-1.33433077]


action logp_t
[[ 0.08520298 -0.38373418]] [-1.04822713]


action logp_t
[[-0.03008895  0.38646154]] [-1.04183814]


action logp_t
[[-0.55689261  0.3080118 ]] [-1.38733805]


action logp_t
[[-0.29730537 -0.50250438]] [-1.30103628]


action logp_t
[[-0.49900048 -0.21580743]] [-1.23899838]


action logp_t
[[ 0.5469699  -0.49457263]] [-1.57796354]


action logp_t
[[ 0.28178817 -0.48569919]] [-1.26706347]


action logp_t
[[ 0.30093794 -0.12388647]] [-0.98230421]


action logp_t
[[ 0.5579933  -0.45953214]] [-1.54905825]


action logp_t
[[-0.08096199  0.2104    ]] [-0.90674084]


action logp_t
[[-0.48552837  0.13393763]] [-1.18195838]


action logp_t
[[ 0.32438115 -0.06915789]] [-0.98783607]


action logp_t



action logp_t
[[ 0.40792621 -0.37634565]] [-1.25715287]

episode: 19/500, score: -50.03614121188304, average: -155.98 

action logp_t
[[0.29982445 0.23840087]] [-1.0377228]


action logp_t
[[ 0.44820464 -0.34767015]] [-1.27660205]


action logp_t
[[ 0.45712655 -0.33423378]] [-1.27512117]


action logp_t
[[-0.34949279  0.15965037]] [-1.03757676]


action logp_t
[[0.551765   0.35007684]] [-1.41909215]


action logp_t
[[-0.06512908  0.4001075 ]] [-1.06065392]


action logp_t
[[-0.28558236  0.12754772]] [-0.97007217]


action logp_t
[[0.55002091 0.07222403]] [-1.25727654]


action logp_t
[[-0.20575443  0.54325095]] [-1.29552579]


action logp_t
[[0.31314619 0.09562483]] [-0.98417173]


action logp_t
[[0.25928901 0.21970954]] [-0.99521304]


action logp_t
[[-0.23058619  0.23951961]] [-0.98739357]


action logp_t
[[ 0.04850602 -0.22011011]] [-0.90723583]


action logp_t
[[-0.28435481  0.19344904]] [-0.9978539]


action logp_t
[[ 0.25766319 -0.52495603]] [-1.30369113]


action logp_t
[[0.213


action logp_t
[[-0.52913133 -0.34383851]] [-1.3784185]


action logp_t
[[0.46288932 0.54710014]] [-1.53641484]


action logp_t
[[0.51331904 0.1979185 ]] [-1.24996603]


action logp_t
[[-0.23286338 -0.18711138]] [-0.95885989]


action logp_t
[[ 0.51516336 -0.16206949]] [-1.23513896]


action logp_t
[[-0.0837049  -0.10138528]] [-0.86126905]


action logp_t
[[ 0.38029704 -0.31652832]] [-1.17132084]


action logp_t
[[0.2898048  0.53410799]] [-1.34006272]


action logp_t
[[0.36976969 0.30469357]] [-1.15041729]


action logp_t
[[-0.42279416  0.48459882]] [-1.39915437]


action logp_t
[[-0.54725942  0.18583975]] [-1.29090232]


action logp_t
[[-0.40584183  0.48467372]] [-1.38022349]


action logp_t
[[ 0.22741926 -0.13395525]] [-0.93296124]


action logp_t
[[-0.14247752 -0.12669851]] [-0.88707393]


action logp_t
[[0.19016813 0.57117274]] [-1.33064203]


action logp_t
[[-0.32765082 -0.40056629]] [-1.20138361]


action logp_t
[[-0.52276145  0.06965996]] [-1.21504152]


action logp_t
[[ 0.07388


action logp_t
[[0.4020331  0.42883963]] [-1.30793473]


action logp_t
[[-0.26167692 -0.40395933]] [-1.15237342]


action logp_t
[[0.01627592 0.0291784 ]] [-0.83942044]


action logp_t
[[-0.4927717  -0.55755824]] [-1.59011383]


action logp_t
[[-0.00205128 -0.04724378]] [-0.8409244]

episode: 22/500, score: -47.19908779635644, average: -164.45 

action logp_t
[[0.03994586 0.18083919]] [-0.88442348]


action logp_t
[[0.5528509  0.24398846]] [-1.33493911]


action logp_t
[[-0.08578466 -0.07340488]] [-0.85511494]


action logp_t
[[0.18587912 0.0216415 ]] [-0.88576724]


action logp_t
[[ 0.34999001 -0.34871721]] [-1.17047835]


action logp_t
[[-0.35459174  0.55247372]] [-1.4226186]


action logp_t
[[-0.28859829  0.53071197]] [-1.3330385]


action logp_t
[[ 0.31025114 -0.51477638]] [-1.32971208]


action logp_t
[[ 0.41142537 -0.49595766]] [-1.40327513]


action logp_t
[[-0.0274787   0.36880825]] [-1.02346823]


action logp_t
[[ 0.18789262 -0.06107978]] [-0.89128385]


action logp_t
[[0.1988


action logp_t
[[ 0.14676098 -0.30180986]] [-0.99115893]


action logp_t
[[ 0.22048364 -0.52998061]] [-1.28603526]


action logp_t
[[ 0.28205049 -0.05794348]] [-0.95085363]


action logp_t
[[-0.54536154  0.48681563]] [-1.56356716]


action logp_t
[[ 0.44086226 -0.00472967]] [-1.1024993]


action logp_t
[[0.38132992 0.21822832]] [-1.10056654]


action logp_t
[[ 0.04412181 -0.01115386]] [-0.84073824]


action logp_t
[[0.56563238 0.24700541]] [-1.3561531]


action logp_t
[[ 0.51492102 -0.23015249]] [-1.27079936]


action logp_t
[[-0.0482088 -0.5560636]] [-1.26134802]


action logp_t
[[-0.11007241  0.12353439]] [-0.8749502]


action logp_t
[[ 0.52638394 -0.50688831]] [-1.56430316]


action logp_t
[[0.38673239 0.0711843 ]] [-1.04841809]


action logp_t
[[0.32740793 0.05046298]] [-0.98735292]


action logp_t
[[0.29649032 0.20612998]] [-1.01536719]


action logp_t
[[-0.32895819 -0.49437863]] [-1.31688936]


action logp_t
[[-0.47512693  0.10663677]] [-1.15965719]


action logp_t
[[-0.38592025 


action logp_t
[[-0.09405265  0.52290237]] [-1.22081278]


action logp_t
[[0.17522638 0.13440331]] [-0.90440932]


action logp_t
[[0.0130522  0.29032332]] [-0.95242775]


action logp_t
[[-0.18910093 -0.31845901]] [-1.02421125]


action logp_t
[[-0.48995631  0.35538111]] [-1.3344257]


action logp_t
[[0.07454947 0.29128205]] [-0.96064739]


action logp_t
[[0.38564334 0.16578777]] [-1.07804382]


action logp_t
[[-0.00832969 -0.27510629]] [-0.9410622]


action logp_t
[[ 0.13585425 -0.29864909]] [-0.98473647]


action logp_t
[[-0.18501099 -0.38599592]] [-1.08683106]


action logp_t
[[-0.48916301 -0.3100659 ]] [-1.29296175]


action logp_t
[[-0.08268369 -0.38929093]] [-1.05328095]


action logp_t
[[-0.04281002  0.53385299]] [-1.22720291]


action logp_t
[[-0.55383397 -0.51951639]] [-1.62079258]


action logp_t
[[0.48489437 0.20132928]] [-1.21342728]


action logp_t
[[0.41210699 0.18447452]] [-1.11570521]


action logp_t
[[-0.45703346 -0.5087842 ]] [-1.47299562]


action logp_t
[[-0.57373883


action logp_t
[[ 0.58087754 -0.57867535]] [-1.75246077]


action logp_t
[[ 0.29733238 -0.18533876]] [-1.00512659]


action logp_t
[[-0.04378752  0.25044305]] [-0.92559644]


action logp_t
[[ 0.15771014 -0.01360786]] [-0.87211386]


action logp_t
[[-0.53659257 -0.2781865 ]] [-1.33389742]


action logp_t
[[-0.33213917 -0.12414815]] [-1.00843927]


action logp_t
[[-0.23878488 -0.21699585]] [-0.97917848]


action logp_t
[[0.09845638 0.12267409]] [-0.87157775]


action logp_t
[[ 0.28323853 -0.14832615]] [-0.97713672]


action logp_t
[[ 0.33999646 -0.39017689]] [-1.20233121]


action logp_t
[[-0.27078056 -0.49167905]] [-1.26595701]


action logp_t
[[-0.13877267 -0.19231583]] [-0.91422906]


action logp_t
[[ 0.00681091 -0.15723497]] [-0.87157572]


action logp_t
[[-0.51523861  0.43870735]] [-1.45974824]


action logp_t
[[-0.04756631 -0.31643595]] [-0.9770388]


action logp_t
[[-0.20495586  0.60583453]] [-1.39360858]


action logp_t
[[-0.15782465  0.3258258 ]] [-1.01589351]


action logp_t
[[


action logp_t
[[0.47680069 0.04954897]] [-1.15091973]


action logp_t
[[ 0.23197325 -0.54948409]] [-1.322095]


action logp_t
[[ 0.31188545 -0.12276309]] [-0.99115099]


action logp_t
[[ 0.2386797  -0.52133015]] [-1.2854404]


action logp_t
[[ 0.4458845 -0.5776941]] [-1.56282183]


action logp_t
[[-0.46504517 -0.48846907]] [-1.45566415]


action logp_t
[[ 0.17650987 -0.54961883]] [-1.2914748]


action logp_t
[[-0.01817194  0.41668511]] [-1.07397972]


action logp_t
[[-0.11441262 -0.08322699]] [-0.86494701]


action logp_t
[[0.03255158 0.34285537]] [-0.99890475]


action logp_t
[[-0.14437962 -0.3180429 ]] [-1.00366231]


action logp_t
[[-0.35020186  0.38487072]] [-1.20505036]


action logp_t
[[ 0.27639161 -0.46103215]] [-1.23134208]


action logp_t
[[ 0.34379012 -0.40879923]] [-1.22648022]


action logp_t
[[-0.24556603  0.01508005]] [-0.91973438]


action logp_t
[[0.49520657 0.18548759]] [-1.21861512]


action logp_t
[[ 0.26254398 -0.17531963]] [-0.97386206]


action logp_t
[[-0.550658


action logp_t
[[-0.36438424  0.29501256]] [-1.13613074]


action logp_t
[[-0.05048883 -0.36903702]] [-1.02650229]


action logp_t
[[ 0.12925841 -0.58855941]] [-1.33168091]


action logp_t
[[0.11829027 0.54184128]] [-1.2559044]


action logp_t
[[0.03966514 0.23077159]] [-0.91238229]


action logp_t
[[-0.40744075 -0.55029103]] [-1.47480501]


action logp_t
[[0.11973802 0.16560232]] [-0.89471937]


action logp_t
[[0.22453464 0.36107479]] [-1.0837452]


action logp_t
[[-0.19701955 -0.31518099]] [-1.0255167]


action logp_t
[[0.56097926 0.23361006]] [-1.34023429]


action logp_t
[[-0.43073719 -0.17611516]] [-1.13184546]


action logp_t
[[-0.4434732  -0.42904409]] [-1.35502279]


action logp_t
[[-0.04585324 -0.44479246]] [-1.1096044]


action logp_t
[[0.53495559 0.42170524]] [-1.46895074]


action logp_t
[[0.49856625 0.29360549]] [-1.293268]


action logp_t
[[ 0.36255343 -0.33149848]] [-1.16616875]


action logp_t
[[ 0.41598006 -0.29357045]] [-1.19052032]


action logp_t
[[-0.56784454 -0.22


action logp_t
[[-0.01360153 -0.21433168]] [-0.90073452]


action logp_t
[[-0.18845479 -0.15763014]] [-0.9196535]


action logp_t
[[-0.25600713  0.00050267]] [-0.92639568]


action logp_t
[[-0.24150653  0.10777789]] [-0.93231234]


action logp_t
[[-0.5764785   0.30053034]] [-1.41078376]


action logp_t
[[-0.18747752 -0.36342335]] [-1.06506295]


action logp_t
[[ 0.14082307 -0.08386897]] [-0.87477192]


action logp_t
[[-0.52116946 -0.42351777]] [-1.45003873]


action logp_t
[[-0.35406366  0.32573679]] [-1.15142536]


action logp_t
[[-0.09399367 -0.41841938]] [-1.08796966]


action logp_t
[[ 0.22495262 -0.59388515]] [-1.38699041]


action logp_t
[[-0.07472559  0.06745704]] [-0.85143289]


action logp_t
[[0.02692216 0.18431485]] [-0.88494494]


action logp_t
[[ 0.19026613 -0.06593397]] [-0.89345605]


action logp_t
[[-0.46960666 -0.57126391]] [-1.58060448]


action logp_t
[[ 0.33282813 -0.04823757]] [-0.99236122]


action logp_t
[[ 0.18772824 -0.22572366]] [-0.95561746]


action logp_t
[[


action logp_t
[[0.22180735 0.14749852]] [-0.93444454]


action logp_t
[[-0.44352579 -0.1108597 ]] [-1.1217388]


action logp_t
[[ 0.44266513 -0.37600947]] [-1.29640685]


action logp_t
[[0.43357194 0.18686931]] [-1.14105613]


action logp_t
[[-0.01014357  0.16331604]] [-0.87431146]


action logp_t
[[ 0.00703361 -0.46914835]] [-1.13695453]


action logp_t
[[ 0.38181292 -0.41648771]] [-1.27178645]


action logp_t
[[ 0.55607669 -0.04253246]] [-1.26078856]


action logp_t
[[ 0.27759244 -0.425149  ]] [-1.18826087]


action logp_t
[[ 0.10971879 -0.18918527]] [-0.90287899]


action logp_t
[[-0.18001679  0.38229482]] [-1.08064992]


action logp_t
[[-0.03950843 -0.51365292]] [-1.19875668]


action logp_t
[[-0.19594955  0.09218627]] [-0.90156612]


action logp_t
[[-0.16297855 -0.06197559]] [-0.87918376]


action logp_t
[[-0.30022525 -0.4774716 ]] [-1.26987607]

episode: 33/500, score: -192.7230684458045, average: -177.93 

action logp_t
[[0.38442584 0.37559677]] [-1.23092543]


action logp_t
[[


action logp_t
[[-0.59071669 -0.17850366]] [-1.35476261]


action logp_t
[[-0.39612952 -0.58488589]] [-1.51595925]


action logp_t
[[-0.05167184  0.0476503 ]] [-0.84449247]


action logp_t
[[0.44865744 0.60672632]] [-1.61195926]


action logp_t
[[ 0.54276937 -0.32545899]] [-1.38313278]


action logp_t
[[-0.40149277 -0.16306698]] [-1.09270702]


action logp_t
[[0.17795904 0.59657152]] [-1.36448608]


action logp_t
[[-0.03265533  0.29027616]] [-0.95364115]


action logp_t
[[-0.20329979 -0.42217165]] [-1.13628967]


action logp_t
[[ 0.03573938 -0.19742712]] [-0.89274138]


action logp_t
[[0.31583389 0.52826979]] [-1.35281126]


action logp_t
[[0.49850819 0.19868816]] [-1.22974556]


action logp_t
[[-0.50479305 -0.39390407]] [-1.39474656]


action logp_t
[[-0.34593774  0.56387996]] [-1.43201241]


action logp_t
[[-0.16541378 -0.13857659]] [-0.90105525]


action logp_t
[[ 0.1552719  -0.13257343]] [-0.89476185]


action logp_t
[[ 0.33200814 -0.46737613]] [-1.28515287]


action logp_t
[[ 0.31


action logp_t
[[ 0.42630655 -0.35091999]] [-1.25305224]


action logp_t
[[-0.54917655  0.25058634]] [-1.3321951]


action logp_t
[[-0.19397579 -0.33886066]] [-1.04495508]


action logp_t
[[0.14360083 0.40466202]] [-1.08849817]


action logp_t
[[-0.60344673 -0.04057147]] [-1.33417756]


action logp_t
[[-0.55843279 -0.40509523]] [-1.48411495]


action logp_t
[[ 0.45758519 -0.17232132]] [-1.16353884]


action logp_t
[[-0.4662987   0.53319007]] [-1.51892087]


action logp_t
[[-0.10820936 -0.10073085]] [-0.86746085]


action logp_t
[[ 0.59285092 -0.43069191]] [-1.56866478]


action logp_t
[[0.15132624 0.47462008]] [-1.17525083]


action logp_t
[[0.44279634 0.34099806]] [-1.26293852]


action logp_t
[[ 0.12120526 -0.40368556]] [-1.07961005]


action logp_t
[[-0.57434654  0.4808201 ]] [-1.59949198]


action logp_t
[[0.41403991 0.05203613]] [-1.0751228]


action logp_t
[[0.30519614 0.43483013]] [-1.22178338]


action logp_t
[[-0.03525563 -0.20437972]] [-0.8963366]


action logp_t
[[-0.3138946


action logp_t
[[-0.12445947 -0.55442253]] [-1.27700257]


action logp_t
[[-0.12520532  0.34965568]] [-1.02474238]


action logp_t
[[-0.05110294 -0.20948836]] [-0.90117786]


action logp_t
[[-0.33747441  0.0819148 ]] [-1.001017]


action logp_t
[[-0.46902351  0.08492328]] [-1.14562657]


action logp_t
[[-0.27226762 -0.28484269]] [-1.04862657]


action logp_t
[[-0.38707195 -0.14671359]] [-1.07011893]


action logp_t
[[-0.01266648  0.29276138]] [-0.95428143]


action logp_t
[[-0.23159114 -0.53208992]] [-1.29560447]


action logp_t
[[-0.54670272  0.60030289]] [-1.73219817]


action logp_t
[[-0.05805354 -0.58300248]] [-1.30482272]


action logp_t
[[0.11649243 0.03869954]] [-0.85855864]


action logp_t
[[0.1332494 0.4328863]] [-1.11657672]


action logp_t
[[0.50269623 0.19879187]] [-1.23589118]


action logp_t
[[-0.2946611   0.45617723]] [-1.23771603]


action logp_t
[[-0.30849487 -0.34421116]] [-1.12793777]


action logp_t
[[-0.11596873 -0.31510842]] [-0.9911438]


action logp_t
[[ 0.17561


action logp_t
[[ 0.32406476 -0.34748065]] [-1.14530237]


action logp_t
[[0.56533433 0.01282606]] [-1.27315861]


action logp_t
[[-0.00597803  0.03094095]] [-0.83920303]


action logp_t
[[-0.37188987  0.48238031]] [-1.34142112]


action logp_t
[[-0.54015928  0.29224633]] [-1.34975245]


action logp_t
[[-0.24687675 -0.42978425]] [-1.17167637]


action logp_t
[[-0.40337356 -0.50103499]] [-1.39997017]


[[-0.40337356 -0.50103499]]



In [47]:
# Quick sanity check of the action space: build a LunarLanderContinuous-v2
# environment and draw one random action from it. The sample is the cell's
# last expression, so it is what the notebook displays below.
e= gym.make('LunarLanderContinuous-v2')
e.action_space.sample()

array([-0.7120607 ,  0.16744614], dtype=float32)

In [44]:
# Compare `a` and `b` through their string representations rather than by value.
# NOTE(review): `a` and `b` are not defined in the visible part of the notebook,
# and this cell's execution count (44) is lower than the cell above it (47) --
# confirm it still works after Restart Kernel -> Run All.
str(a)== str(b)

True