In [1]:
import shutup
shutup.please()

In [2]:
import numpy as np

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # -1:cpu, 0:first gpu
import random
import gym
import pybullet_envs
import pylab
import numpy as np
import tensorflow as tf
from tensorboardX import SummaryWriter

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop, Adagrad, Adadelta
from tensorflow.keras import backend as K
import copy

from threading import Thread, Lock
from multiprocessing import Process, Pipe
import time

tf.compat.v1.disable_eager_execution() # usually using this for fastest performance

# Helpers

# gaussian_likelihood - ver se consigo uma função global
# Keras loss
# def gaussian_likelihood(self, actions, pred): # for keras custom loss
#     log_std = -0.5 * np.ones(self.action_space, dtype=np.float32)
#     pre_sum = -0.5 * (((actions-pred)/(K.exp(log_std)+1e-8))**2 + 2*log_std + K.log(2*np.pi))
#     return K.sum(pre_sum, axis=1)
#
# # Agent
# def gaussian_likelihood(self, action, pred, log_std):
#     # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/sac/policies.py
#     pre_sum = -0.5 * (((action-pred)/(np.exp(log_std)+1e-8))**2 + 2*log_std + np.log(2*np.pi))
#     return np.sum(pre_sum, axis=1)
def gaussian_likelihood(log_std, lib="keras"): # for keras custom loss
    _exp = K.exp
    _log = K.log
    _sum = K.sum
    if lib == "numpy":
        _exp = np.exp
        _log = np.log
        _sum = np.sum
    
    def fn(actions,pred):
        pre_sum = -0.5 * (((actions-pred)/(_exp(log_std)+1e-8))**2 + 2*log_std + _log(2*np.pi))
        return _sum(pre_sum, axis=1)
    
    return fn

# Continuous
class PpoActorContinuous:
    def __init__(self, input_shape, action_space, lr, optimizer,loss_clipping = 0.2):

        self.action_space = action_space
        self.loss_clipping = loss_clipping
        self.log_std = -0.5 * np.ones(self.action_space , dtype=np.float32)

        self.gaussian_likelihood = gaussian_likelihood(self.log_std, lib="keras")
        
        X_input = Input(input_shape)
        

        X = Dense(512, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X_input)
        X = Dense(256, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        X = Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        output = Dense(self.action_space, activation="tanh")(X)

        self.Actor = Model(inputs = X_input, outputs = output)
        self.Actor.compile(loss=self.ppo_loss, optimizer=optimizer(learning_rate=lr))

    def ppo_loss(self, y_true, y_pred):
        advantages, actions, logp_old_ph, = y_true[:, :1], y_true[:, 1:1+self.action_space], y_true[:, 1+self.action_space]

        logp = self.gaussian_likelihood(actions, y_pred)

        ratio = K.exp(logp - logp_old_ph)

        p1 = ratio * advantages
        p2 = tf.where(advantages > 0, (1.0 + self.loss_clipping)*advantages, (1.0 - self.loss_clipping)*advantages) # minimum advantage

        actor_loss = -K.mean(K.minimum(p1, p2))

        return actor_loss

#     def gaussian_likelihood(self, actions, pred): # for keras custom loss
        
        
#         #log_std = -0.5 * np.ones(self.action_space, dtype=np.float32)
#         pre_sum = -0.5 * (((actions-pred)/(K.exp(self.log_std)+1e-8))**2 + 2*self.log_std + K.log(2*np.pi))
#         return K.sum(pre_sum, axis=1)

    def predict(self, state):
        return self.Actor.predict(state)

# Discrete
class PpoActorDiscrete:
    def __init__(self, input_shape, action_space, lr, optimizer,loss_clipping=0.2,loss_entropy=0.001):

        self.action_space = action_space
        self.loss_clipping = loss_clipping
        self.loss_entropy = loss_entropy

        X_input = Input(input_shape)


        X = Dense(512, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X_input)
        X = Dense(256, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        X = Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        output = Dense(self.action_space, activation="softmax")(X)

        self.Actor = Model(inputs = X_input, outputs = output)
        self.Actor.compile(loss=self.ppo_loss, optimizer=optimizer(learning_rate=lr))

    def ppo_loss(self, y_true, y_pred):
        # Defined in https://arxiv.org/abs/1707.06347
        #advantages, prediction_picks, actions = y_true[:, :1], y_true[:, 1:1+self.action_space], y_true[:, 1+self.action_space:]
        advantages,  actions, prediction_picks = y_true[:, :1], y_true[:, 1:1+self.action_space], y_true[:, 1+self.action_space:]

        prob = actions * y_pred
        old_prob = actions * prediction_picks

        prob = K.clip(prob, 1e-10, 1.0)
        old_prob = K.clip(old_prob, 1e-10, 1.0)

        ratio = K.exp(K.log(prob) - K.log(old_prob))

        p1 = ratio * advantages
        p2 = K.clip(ratio, min_value=1 - self.loss_clipping, max_value=1 + self.loss_clipping) * advantages

        actor_loss = -K.mean(K.minimum(p1, p2))

        entropy = -(y_pred * K.log(y_pred + 1e-10))
        entropy = self.loss_entropy * K.mean(entropy)

        total_loss = actor_loss - entropy

        return total_loss

    def predict(self, state):
        return self.Actor.predict(state)


# PPO Critic for discrete or continuous only differs in the initializer
class PpoCritic:
    def __init__(self, input_shape, action_space, lr, optimizer,loss_function_version=1, loss_clipping=0.2,kernel_initializer=False,continuous_action_space=False):

        self.loss_clipping = loss_clipping
        
        X_input = Input(input_shape)
        old_values = Input(shape=(1,))

        if kernel_initializer == False:
            if continuous_action_space == False:
                kernel_initializer = 'he_uniform'
            else:
                kernel_initializer=tf.random_normal_initializer(stddev=0.01)
                
        if loss_function_version == 1:
            loss_function = self.ppo_loss
        else:
            loss_function = self.ppo_loss_2(old_values)

        V = Dense(512, activation="relu", kernel_initializer=kernel_initializer)(X_input)
        V = Dense(256, activation="relu", kernel_initializer=kernel_initializer)(V)
        V = Dense(64, activation="relu", kernel_initializer=kernel_initializer)(V)
        value = Dense(1, activation=None)(V)

        self.Critic = Model(inputs=[X_input, old_values], outputs = value)
        self.Critic.compile(loss=[loss_function], optimizer=optimizer(learning_rate=lr))
        
    def ppo_loss(self, y_true, y_pred):
        value_loss = K.mean((y_true - y_pred) ** 2) # standard PPO loss
        return value_loss

    def ppo_loss_2(self, values):
        def loss(y_true, y_pred):

            clipped_value_loss = values + K.clip(y_pred - values, -self.loss_clipping, self.loss_clipping)
            v_loss1 = (y_true - clipped_value_loss) ** 2
            v_loss2 = (y_true - y_pred) ** 2

            value_loss = 0.5 * K.mean(K.maximum(v_loss1, v_loss2))
            #value_loss = K.mean((y_true - y_pred) ** 2) # standard PPO loss
            return value_loss
        return loss

    def predict(self, state):
        return self.Critic.predict([state, np.zeros((state.shape[0], 1))])

class PpoBuffer:
    def __init__(self):
        self.reset()
        
    def reset(self):
        self.states=[] 
        self.next_states=[] 
        self.actions=[] 
        self.rewards=[] 
        self.predictions=[] 
        self.dones=[] 
        
# PPO PPOAgent
class PpoAgent:
    # PPO Main Optimization Algorithm
    def __init__(self, env_name, training_batch=4000, epochs=80, episodes=1000,lr=0.00025,shuffle=False,target_kl = 0.01, continuous_action_space=False):
        # Initialization
        # Environment and PPO parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.target_kl = 0.01
        

        if continuous_action_space:
            self.action_size = self.env.action_space.shape[0]
        else:
            self.action_size = self.env.action_space.n
            
        self.state_size = self.env.observation_space.shape
        self.EPISODES = episodes # total episodes to train through all environments
        self.episode = 0 # used to track the episodes total count of episodes played through all thread environments
        self.max_average = 0 # when average score is above 0 model will be saved
        self.lr = lr
        self.epochs = epochs # training epochs
        self.shuffle = shuffle
        self.Training_batch = training_batch
        #self.optimizer = RMSprop
        self.optimizer = Adam
        self.replay_count = 0
        self.continuous_action_space=continuous_action_space

        # Instantiate plot memory
        self.scores_, self.episodes_, self.average_ = [], [], [] # used in matplotlib plots

        if continuous_action_space:
            self.Actor= PpoActorContinuous(self.state_size, self.action_size, lr=self.lr, optimizer = self.optimizer,loss_clipping = 0.2)
        else:
            self.Actor= PpoActorDiscrete(self.state_size, self.action_size, lr=self.lr, optimizer = self.optimizer,loss_clipping=0.2,loss_entropy=0.001)

        self.Critic = PpoCritic(self.state_size, self.action_size, lr=self.lr, optimizer = self.optimizer,loss_clipping=0.2,kernel_initializer=False,continuous_action_space=continuous_action_space)

        # do not change bellow
        self.log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
        self.std = np.exp(self.log_std)
        
        # Bind gaussian likelihood
        self.gaussian_likelihood = gaussian_likelihood(self.log_std, lib="numpy")

    def act(self, state):
        if self.continuous_action_space:
            # Use the network to predict the next action to take, using the model
            prediction = self.Actor.predict(state)

            low, high = -1.0, 1.0 # -1 and 1 are boundaries of tanh
            action = prediction + np.random.uniform(low, high, size=prediction.shape) * self.std
            action = np.clip(action, low, high)

            logp_t = self.gaussian_likelihood(action, prediction)

            return action[0], action , logp_t[0]
        else:
            prediction = self.Actor.predict(state)[0]
            action = np.random.choice(self.action_size, p=prediction)
            action_onehot = np.zeros([self.action_size])
            action_onehot[action] = 1
            return action, action_onehot, prediction

    def reshape_state(self,state):
        return np.reshape(state, [1, self.state_size[0]])
    
    def run_batch(self):
        state = self.env.reset()
        state = self.reshape_state(state)
        done, score = False, 0
        while True:
            # Instantiate or reset games memory
            buffer = PpoBuffer()

            for t in range(self.Training_batch):
                #self.env.render()
                # Actor picks an action
                action, action_data, prediction = self.act(state)

                # Retrieve new state, reward, and whether the state is terminal

                next_state, reward, done, _ = self.env.step(action) 
                next_state = self.reshape_state(next_state)
                # Memorize (state, next_states, action, reward, done, logp_ts) for training
                buffer.states.append(state)
                buffer.next_states.append(next_state)
  
                buffer.actions.append(action_data)
                buffer.rewards.append(reward)
                buffer.dones.append(done)
  
                buffer.predictions.append(prediction)

                # Update current state shape
                state = next_state
                score += reward
                if done:
                    self.episode += 1
                    average = self.checkpoint(score, self.episode)
                    #if str(self.episode)[-2:] == "00":
                    print("episode: {}/{}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, score, average, ''))
                    state, done, score = self.env.reset(), False, 0
                    state = self.reshape_state(state) #np.reshape(state, [1, self.state_size[0]])


            if self.episode >= self.EPISODES:
                break
            
            print()
            print(action_data)
            print()
            self.replay(buffer)

  
        self.env.close()


    def replay(self, buffer):
        # reshape memory to appropriate shape for training
        states = np.vstack(buffer.states)
        next_states = np.vstack(buffer.next_states)
        actions = np.vstack(buffer.actions)
        predictions = np.vstack(buffer.predictions)
        rewards = buffer.rewards
        dones = buffer.dones
        
        # Get Critic network predictions
        values = self.Critic.predict(states)
        next_values = self.Critic.predict(next_states)

        # Compute discounted rewards and advantages
        advantages, target = self.get_gaes(rewards, dones, np.squeeze(values), np.squeeze(next_values))

        # stack everything to numpy array pack all advantages, predictions and actions to y_true and when they are received in custom loss function we unpack it

        y_true = np.hstack([advantages, actions, predictions])

        # training Actor and Critic networks
        a_loss = self.Actor.Actor.fit(states, y_true, epochs=self.epochs, verbose=0, shuffle=self.shuffle)
        c_loss = self.Critic.Critic.fit([states, values], target, epochs=self.epochs, verbose=0, shuffle=self.shuffle)

        if self.continuous_action_space:
            # calculate loss parameters (should be done in loss, but couldn't find working way how to do that with disabled eager execution)
            pred = self.Actor.predict(states)
            #log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
            #logp = self.gaussian_likelihood(actions, pred, log_std)
            logp = self.gaussian_likelihood(actions, pred)
            approx_kl = np.mean(predictions - logp)
            approx_ent = np.mean(-logp)
            print()
            print('approx_kl',approx_kl)
            print('approx_ent',approx_ent)
            print()
        self.replay_count += 1
        
        buffer.reset()

    ### Equal fns
#     def gaussian_likelihood(self, action, pred):
#         # for continuous only
#         # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/sac/policies.py
#         pre_sum = -0.5 * (((action-pred)/(np.exp(self.log_std)+1e-8))**2 + 2*self.log_std + np.log(2*np.pi))
#         return np.sum(pre_sum, axis=1)

    def get_gaes(self, rewards, dones, values, next_values, gamma = 0.99, lamda = 0.90, normalize=True):
        deltas = [r + gamma * (1 - d) * nv - v for r, d, nv, v in zip(rewards, dones, next_values, values)]
        deltas = np.stack(deltas)
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(deltas) - 1)):
            gaes[t] = gaes[t] + (1 - dones[t]) * gamma * lamda * gaes[t + 1]

        target = gaes + values
        if normalize:
            gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
        return np.vstack(gaes), np.vstack(target)

    def load(self):
#         self.Actor.Actor.load_weights(self.Actor_name)
#         self.Critic.Critic.load_weights(self.Critic_name)
        pass

    def save(self):
#         self.Actor.Actor.save_weights(self.Actor_name)
#         self.Critic.Critic.save_weights(self.Critic_name)
        pass

    def checkpoint(self, score, episode):
        self.scores_.append(score)
        self.episodes_.append(episode)
        self.average_.append(sum(self.scores_[-50:]) / len(self.scores_[-50:]))
        saving = False
        # saving best models
        if self.average_[-1] >= self.max_average:
            self.max_average = self.average_[-1]
            self.save()
            # decreaate learning rate every saved model
            self.lr *= 0.95
            K.set_value(self.Actor.Actor.optimizer.learning_rate, self.lr)
            K.set_value(self.Critic.Critic.optimizer.learning_rate, self.lr)
            saving = True
            print()
            print('New record')
            print()

        if str(episode)[-2:] == "00":# much faster than episode % 100
            # Do some logging
            pass

        return self.average_[-1]


In [4]:
discrete_agent = PpoAgent('CartPole-v1', training_batch=4000,epochs=80,lr=3e-4,episodes=500, continuous_action_space=False)
discrete_agent.run_batch()


New record

episode: 1/500, score: 37.0, average: 37.00 
episode: 2/500, score: 12.0, average: 24.50 
episode: 3/500, score: 39.0, average: 29.33 
episode: 4/500, score: 10.0, average: 24.50 
episode: 5/500, score: 15.0, average: 22.60 
episode: 6/500, score: 21.0, average: 22.33 
episode: 7/500, score: 34.0, average: 24.00 
episode: 8/500, score: 34.0, average: 25.25 
episode: 9/500, score: 19.0, average: 24.56 
episode: 10/500, score: 16.0, average: 23.70 
episode: 11/500, score: 52.0, average: 26.27 
episode: 12/500, score: 13.0, average: 25.17 
episode: 13/500, score: 20.0, average: 24.77 
episode: 14/500, score: 17.0, average: 24.21 
episode: 15/500, score: 12.0, average: 23.40 
episode: 16/500, score: 26.0, average: 23.56 
episode: 17/500, score: 15.0, average: 23.06 
episode: 18/500, score: 42.0, average: 24.11 
episode: 19/500, score: 14.0, average: 23.58 
episode: 20/500, score: 13.0, average: 23.05 
episode: 21/500, score: 30.0, average: 23.38 
episode: 22/500, score: 9.0, a

episode: 178/500, score: 42.0, average: 24.18 
episode: 179/500, score: 39.0, average: 24.34 
episode: 180/500, score: 25.0, average: 24.32 
episode: 181/500, score: 16.0, average: 24.40 
episode: 182/500, score: 35.0, average: 24.82 
episode: 183/500, score: 29.0, average: 25.12 
episode: 184/500, score: 30.0, average: 25.22 
episode: 185/500, score: 36.0, average: 25.48 
episode: 186/500, score: 14.0, average: 25.24 
episode: 187/500, score: 20.0, average: 25.20 
episode: 188/500, score: 18.0, average: 25.28 
episode: 189/500, score: 61.0, average: 25.84 
episode: 190/500, score: 13.0, average: 25.56 
episode: 191/500, score: 16.0, average: 25.46 
episode: 192/500, score: 19.0, average: 25.60 
episode: 193/500, score: 21.0, average: 25.62 
episode: 194/500, score: 13.0, average: 25.64 
episode: 195/500, score: 54.0, average: 25.78 
episode: 196/500, score: 57.0, average: 26.52 
episode: 197/500, score: 33.0, average: 26.62 
episode: 198/500, score: 10.0, average: 26.34 
episode: 199/


New record

episode: 343/500, score: 174.0, average: 94.94 

New record

episode: 344/500, score: 200.0, average: 97.40 

New record

episode: 345/500, score: 122.0, average: 98.08 

New record

episode: 346/500, score: 146.0, average: 100.22 

New record

episode: 347/500, score: 205.0, average: 103.46 

New record

episode: 348/500, score: 200.0, average: 103.58 
episode: 349/500, score: 149.0, average: 101.90 
episode: 350/500, score: 169.0, average: 102.82 
episode: 351/500, score: 107.0, average: 102.18 

New record

episode: 352/500, score: 194.0, average: 105.12 

New record

episode: 353/500, score: 185.0, average: 107.70 

New record

episode: 354/500, score: 196.0, average: 110.86 

New record

episode: 355/500, score: 110.0, average: 112.18 

New record

episode: 356/500, score: 135.0, average: 114.08 
episode: 357/500, score: 15.0, average: 111.62 
episode: 358/500, score: 159.0, average: 110.00 

New record

episode: 359/500, score: 254.0, average: 114.52 

New record

ep

episode: 495/500, score: 194.0, average: 186.74 
episode: 496/500, score: 198.0, average: 187.60 
episode: 497/500, score: 169.0, average: 186.72 
episode: 498/500, score: 238.0, average: 187.06 
episode: 499/500, score: 437.0, average: 191.30 
episode: 500/500, score: 186.0, average: 191.56 
episode: 501/500, score: 152.0, average: 190.80 
episode: 502/500, score: 163.0, average: 190.74 
episode: 503/500, score: 169.0, average: 190.20 
episode: 504/500, score: 202.0, average: 190.54 
episode: 505/500, score: 201.0, average: 191.44 
episode: 506/500, score: 199.0, average: 190.74 


In [5]:
continuous_agent= PpoAgent('LunarLanderContinuous-v2',training_batch=4000, epochs=80,episodes=500, continuous_action_space=True)
#continuous_agent = PpoAgent('InvertedPendulumBulletEnv-v0',training_batch=4000, epochs=80,episodes=500, continuous_action_space=True)
continuous_agent.run_batch()

# action logp_t
# [[-0.54015928  0.29224633]] [-1.34975245]
# [-0.54015928  0.29224633] , [[-0.54015928  0.29224633]], -1.34975245

episode: 1/500, score: -73.80934093550005, average: -73.81 
episode: 2/500, score: -110.71513130168515, average: -92.26 
episode: 3/500, score: -56.99671323749743, average: -80.51 
episode: 4/500, score: -68.42991247259818, average: -77.49 
episode: 5/500, score: -146.27684908612105, average: -91.25 
episode: 6/500, score: -203.26603902440834, average: -109.92 
episode: 7/500, score: -108.71473057548275, average: -109.74 
episode: 8/500, score: -42.867126607946986, average: -101.38 
episode: 9/500, score: -112.45305632368402, average: -102.61 
episode: 10/500, score: -90.92765935575524, average: -101.45 
episode: 11/500, score: -335.14904398837626, average: -122.69 
episode: 12/500, score: -105.82560744671177, average: -121.29 
episode: 13/500, score: -94.59168094207932, average: -119.23 
episode: 14/500, score: -173.66428454240173, average: -123.12 
episode: 15/500, score: -87.25436812944359, average: -120.73 
episode: 16/500, score: -246.5841846271375, average: -128.60 
episode: 17/5

episode: 130/500, score: -49.36924549623862, average: -160.60 
episode: 131/500, score: -175.1805806086728, average: -157.04 
episode: 132/500, score: -63.547330982820625, average: -151.77 
episode: 133/500, score: -75.99584930577714, average: -151.92 
episode: 134/500, score: -56.181170950049975, average: -145.12 
episode: 135/500, score: -102.86201460711688, average: -140.13 
episode: 136/500, score: -84.99976549535069, average: -139.90 
episode: 137/500, score: -62.21469721966856, average: -140.46 
episode: 138/500, score: -130.42939165196722, average: -142.17 
episode: 139/500, score: -59.68774749110422, average: -143.23 
episode: 140/500, score: -98.20480200319341, average: -143.97 
episode: 141/500, score: -83.57874142935049, average: -138.91 
episode: 142/500, score: -92.36156280673595, average: -138.97 
episode: 143/500, score: -91.59804376101715, average: -135.01 
episode: 144/500, score: -119.27935262314917, average: -135.32 
episode: 145/500, score: -48.51234084408595, avera

episode: 256/500, score: -22.221691715910225, average: -97.35 
episode: 257/500, score: 3.6511632455323166, average: -94.93 
episode: 258/500, score: -191.62110750698838, average: -94.20 
episode: 259/500, score: -195.1981381939554, average: -94.01 
episode: 260/500, score: -211.74862701166126, average: -97.05 
episode: 261/500, score: -55.75196932586094, average: -96.70 
episode: 262/500, score: -52.762040253276155, average: -95.78 
episode: 263/500, score: -79.5678640571608, average: -96.03 
episode: 264/500, score: -33.591500893903145, average: -94.23 
episode: 265/500, score: -99.39698373265256, average: -95.47 

[[-1.          0.37920969]]


approx_kl 0.2535750060772152
approx_ent 1.4194493652942124

episode: 266/500, score: -204.4554616126003, average: -98.46 
episode: 267/500, score: 15.457937400960915, average: -94.10 
episode: 268/500, score: -144.70547273037363, average: -94.37 
episode: 269/500, score: -14.365874662357896, average: -93.08 
episode: 270/500, score: -230.70902

episode: 381/500, score: -26.69796867022025, average: -183.49 
episode: 382/500, score: -60.159008010710416, average: -179.51 
episode: 383/500, score: -160.2071906956552, average: -170.60 
episode: 384/500, score: -102.0595194065881, average: -172.03 

[[-0.95896574 -0.12970419]]


approx_kl 0.13210237864274457
approx_ent 1.269770905512089

episode: 385/500, score: -71.33429826785463, average: -158.74 
episode: 386/500, score: -1.765801177150479, average: -148.64 
episode: 387/500, score: -30.823434376069272, average: -140.52 
episode: 388/500, score: -64.87751197637265, average: -135.44 
episode: 389/500, score: -115.36008627032457, average: -136.17 
episode: 390/500, score: -65.44174156990317, average: -131.45 
episode: 391/500, score: 1.0944754925242393, average: -123.22 
episode: 392/500, score: -29.26478695923406, average: -123.89 
episode: 393/500, score: -40.78981906488062, average: -116.79 
episode: 394/500, score: -33.74301195103972, average: -116.04 
episode: 395/500, score:

episode: 508/500, score: -173.8598667677232, average: -86.62 
episode: 509/500, score: -40.3465990366748, average: -84.97 
episode: 510/500, score: -100.7774437559279, average: -83.33 
episode: 511/500, score: -101.00082713084304, average: -82.85 
episode: 512/500, score: -128.50500832493884, average: -82.64 


In [10]:
e= gym.make('LunarLanderContinuous-v2')
e.action_space.sample(),e.action_space.sample().shape

(array([0.80637634, 0.94653237], dtype=float32), (2,))

In [7]:
e.reset()
e.step(np.array([ 0.0255991 , -0.17774109]))

(array([-0.01228723,  1.4177923 , -0.6244158 ,  0.15316963,  0.01370645,
         0.13240685,  0.        ,  0.        ], dtype=float32),
 -1.7243087057170101,
 False,
 {})

In [8]:
e.step([ 0.0255991 , -0.17774109])

(array([-0.01842794,  1.4214106 , -0.6209765 ,  0.16072994,  0.02053215,
         0.13652691,  0.        ,  0.        ], dtype=float32),
 -1.0561277959531197,
 False,
 {})

In [11]:
e.continuous

True

In [14]:
e.spec.name

'LunarLanderContinuous'