In [1]:
import shutup
shutup.please()

In [2]:
import numpy as np

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # -1:cpu, 0:first gpu
import random
import gym
import pybullet_envs
import pylab
import numpy as np
import tensorflow as tf
from tensorboardX import SummaryWriter

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop, Adagrad, Adadelta
from tensorflow.keras import backend as K
import copy

from threading import Thread, Lock
from multiprocessing import Process, Pipe
import time

tf.compat.v1.disable_eager_execution() # usually using this for fastest performance

# Helpers

# gaussian_likelihood - TODO: see whether this can be turned into a single global function
# Keras loss
# def gaussian_likelihood(self, actions, pred): # for keras custom loss
#     log_std = -0.5 * np.ones(self.action_space, dtype=np.float32)
#     pre_sum = -0.5 * (((actions-pred)/(K.exp(log_std)+1e-8))**2 + 2*log_std + K.log(2*np.pi))
#     return K.sum(pre_sum, axis=1)
#
# # Agent
# def gaussian_likelihood(self, action, pred, log_std):
#     # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/sac/policies.py
#     pre_sum = -0.5 * (((action-pred)/(np.exp(log_std)+1e-8))**2 + 2*log_std + np.log(2*np.pi))
#     return np.sum(pre_sum, axis=1)


# Continuous
class PpoActorContinuous:
    """PPO actor network for continuous action spaces.

    Builds a dense network (512 -> 256 -> 64, ReLU) with a ``tanh`` output
    layer of ``action_space`` units and compiles it with the clipped PPO
    surrogate loss implemented in :meth:`ppo_loss`.
    """

    def __init__(self, input_shape, action_space, lr, optimizer,loss_clipping = 0.2):
        """Create and compile the actor model.

        Args:
            input_shape: shape passed to the Keras ``Input`` layer.
            action_space: number of action dimensions (output units).
            lr: learning rate handed to ``optimizer``.
            optimizer: optimizer class/factory, called as
                ``optimizer(learning_rate=lr)``.
            loss_clipping: PPO clipping range used by :meth:`ppo_loss`.
        """
        self.loss_clipping = loss_clipping
        self.action_space = action_space

        X_input = Input(input_shape)

        X = X_input
        for units in (512, 256, 64):
            X = Dense(units, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        output = Dense(self.action_space, activation="tanh")(X)

        self.Actor = Model(inputs = X_input, outputs = output)
        self.Actor.compile(loss=self.ppo_loss, optimizer=optimizer(learning_rate=lr))

    def ppo_loss(self, y_true, y_pred):
        """Clipped PPO surrogate loss for a Gaussian policy.

        ``y_true`` is a packed tensor whose columns are, in order:
        1 advantage column, ``action_space`` action columns, and 1 column
        holding the log-probability of the action under the old policy.
        ``y_pred`` is the network output, used as the Gaussian mean.

        Returns:
            Scalar tensor: negative mean of ``min(ratio * adv, clipped adv)``.
        """
        # Every per-sample quantity is kept rank-1, shape (batch,).
        # Slicing the advantages as y_true[:, :1] would give (batch, 1), and
        # multiplying that with the (batch,) ratio broadcasts to a
        # (batch, batch) matrix that mixes the ratio of one sample with the
        # advantage of every other sample.
        advantages = y_true[:, 0]
        actions = y_true[:, 1:1 + self.action_space]
        logp_old_ph = y_true[:, 1 + self.action_space]

        logp = self.gaussian_likelihood(actions, y_pred)

        ratio = K.exp(logp - logp_old_ph)

        p1 = ratio * advantages
        # Upper bound of the surrogate: (1 + eps) * A for positive advantages,
        # (1 - eps) * A for the rest.
        p2 = tf.where(
            advantages > 0,
            (1.0 + self.loss_clipping) * advantages,
            (1.0 - self.loss_clipping) * advantages,
        )

        return -K.mean(K.minimum(p1, p2))

    def gaussian_likelihood(self, actions, pred): # for keras custom loss
        """Log-density of ``actions`` under a diagonal Gaussian centred at ``pred``.

        The log standard deviation is fixed at -0.5 for every action
        dimension. The per-dimension terms are summed over axis 1, so the
        result has one value per row.
        """
        log_std = -0.5 * np.ones(self.action_space, dtype=np.float32)
        pre_sum = -0.5 * (((actions-pred)/(K.exp(log_std)+1e-8))**2 + 2*log_std + K.log(2*np.pi))
        return K.sum(pre_sum, axis=1)

    def predict(self, state):
        """Return the wrapped model's prediction for ``state``."""
        return self.Actor.predict(state)

# Discrete
class PpoActorDiscrete:
    """PPO actor network for discrete action spaces.

    A dense network (512 -> 256 -> 64, ReLU) followed by a softmax layer with
    one unit per action, compiled with the clipped PPO loss plus an entropy
    term (see :meth:`ppo_loss`).
    """

    def __init__(self, input_shape, action_space, lr, optimizer,loss_clipping=0.2,loss_entropy=0.001):
        """Create and compile the actor model.

        Args:
            input_shape: shape passed to the Keras ``Input`` layer.
            action_space: number of discrete actions (softmax units).
            lr: learning rate handed to ``optimizer``.
            optimizer: optimizer class/factory, called as
                ``optimizer(learning_rate=lr)``.
            loss_clipping: PPO clipping range for the probability ratio.
            loss_entropy: weight of the entropy term in the loss.
        """
        self.action_space = action_space
        self.loss_clipping = loss_clipping
        self.loss_entropy = loss_entropy

        state_in = Input(input_shape)

        hidden = state_in
        for width in (512, 256, 64):
            hidden = Dense(
                width,
                activation="relu",
                kernel_initializer=tf.random_normal_initializer(stddev=0.01),
            )(hidden)
        action_probs = Dense(self.action_space, activation="softmax")(hidden)

        self.Actor = Model(inputs=state_in, outputs=action_probs)
        self.Actor.compile(loss=self.ppo_loss, optimizer=optimizer(learning_rate=lr))

    def ppo_loss(self, y_true, y_pred):
        """Clipped PPO loss (https://arxiv.org/abs/1707.06347) with entropy term.

        ``y_true`` is a packed tensor whose columns are, in order:
        1 advantage column, ``action_space`` action columns, and the remaining
        columns holding the old policy's action probabilities.
        ``y_pred`` is the current softmax output.

        Returns:
            Scalar tensor: negative mean clipped surrogate minus the weighted
            mean entropy term.
        """
        n_actions = self.action_space
        advantages = y_true[:, :1]
        actions = y_true[:, 1:1 + n_actions]
        prediction_picks = y_true[:, 1 + n_actions:]

        # Element-wise masks of the new/old probabilities by the action
        # columns (assumes `actions` is one-hot, as produced by PPOAgent.act),
        # clipped away from zero so the logs below stay finite.
        new_prob = K.clip(actions * y_pred, 1e-10, 1.0)
        old_prob = K.clip(actions * prediction_picks, 1e-10, 1.0)

        ratio = K.exp(K.log(new_prob) - K.log(old_prob))

        lower = 1 - self.loss_clipping
        upper = 1 + self.loss_clipping
        unclipped = ratio * advantages
        clipped = K.clip(ratio, min_value=lower, max_value=upper) * advantages
        surrogate_loss = -K.mean(K.minimum(unclipped, clipped))

        # Entropy term averaged over every element of the prediction.
        entropy_term = self.loss_entropy * K.mean(-(y_pred * K.log(y_pred + 1e-10)))

        return surrogate_loss - entropy_term

    def predict(self, state):
        """Return the wrapped model's prediction for ``state``."""
        return self.Actor.predict(state)


# PPO Critic for discrete or continuous only differs in the initializer
class PpoCritic:
    """PPO value network shared by the discrete and continuous agents.

    The model takes two inputs: the state and the value estimates recorded
    before the update (``old_values``). The second input is only consumed by
    the loss, where it anchors the clipping of the new prediction.
    """

    def __init__(self, input_shape, action_space, lr, optimizer,loss_clipping=0.2,kernel_initializer=False,continuous=False):
        """Create and compile the critic model.

        Args:
            input_shape: shape passed to the Keras ``Input`` layer for states.
            action_space: accepted for signature parity with the actors;
                not used by the critic.
            lr: learning rate handed to ``optimizer``.
            optimizer: optimizer class/factory, called as
                ``optimizer(learning_rate=lr)``.
            loss_clipping: clipping range applied to the value update.
            kernel_initializer: initializer for the hidden layers. Any falsy
                value (``False`` or ``None``) selects a default:
                a normal initializer with stddev 0.01 when ``continuous`` is
                true, ``'he_uniform'`` otherwise.
            continuous: selects the default initializer (see above).
        """
        self.loss_clipping = loss_clipping

        if not kernel_initializer:
            if continuous:
                kernel_initializer = tf.random_normal_initializer(stddev=0.01)
            else:
                kernel_initializer = 'he_uniform'

        X_input = Input(input_shape)
        old_values = Input(shape=(1,))

        V = X_input
        for units in (512, 256, 64):
            V = Dense(units, activation="relu", kernel_initializer=kernel_initializer)(V)
        value = Dense(1, activation=None)(V)

        self.Critic = Model(inputs=[X_input, old_values], outputs = value)
        self.Critic.compile(loss=[self.ppo_loss(old_values)], optimizer=optimizer(learning_rate=lr))

    def ppo_loss(self, values):
        """Build the clipped value loss, closing over the old ``values`` tensor.

        Returns:
            A Keras-compatible ``loss(y_true, y_pred)`` function.
        """
        def loss(y_true, y_pred):
            # New prediction restricted to +/- loss_clipping around the old value.
            clipped_prediction = values + K.clip(y_pred - values, -self.loss_clipping, self.loss_clipping)
            v_loss1 = (y_true - clipped_prediction) ** 2
            v_loss2 = (y_true - y_pred) ** 2

            # Take the larger of the clipped and unclipped squared errors.
            # The unclipped alternative would be K.mean((y_true - y_pred) ** 2).
            return 0.5 * K.mean(K.maximum(v_loss1, v_loss2))
        return loss

    def predict(self, state):
        """Predict values for ``state``.

        The ``old_values`` input is filled with zeros, one row per state row;
        it only matters to the loss, not to the forward pass.
        """
        return self.Critic.predict([state, np.zeros((state.shape[0], 1))])


# PPO PPOAgent

class PPOAgent:
    """PPO agent: collects fixed-size rollouts from one gym environment and
    trains an actor and a critic network on each rollout.

    Works with either a discrete actor (``PpoActorDiscrete``) or a continuous
    one (``PpoActorContinuous``), selected by the ``continuous`` flag.
    """

    # PPO Main Optimization Algorithm
    def __init__(self, env_name, training_batch=1024, epochs=10, episodes=1000, continuous=False):
        """Create the environment and both networks.

        Args:
            env_name: id passed to ``gym.make``.
            training_batch: number of environment steps collected per update.
            epochs: ``fit`` epochs per update for both networks.
            episodes: training stops once this many episodes have finished
                (checked after each update).
            continuous: True for a continuous (Box-like) action space,
                False for a discrete one.
        """
        # Environment and PPO parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        if continuous:
            self.action_size = self.env.action_space.shape[0]
        else:
            self.action_size = self.env.action_space.n

        self.state_size = self.env.observation_space.shape
        self.EPISODES = episodes # total episodes to train for
        self.episode = 0 # number of episodes finished so far
        self.max_average = 0 # best running average seen; checkpoint() saves when it is matched or beaten
        self.lr = 0.001
        self.epochs = epochs # training epochs per update
        self.shuffle = True
        self.Training_batch = training_batch
        #self.optimizer = RMSprop
        self.optimizer = Adam
        self.replay_count = 0
        self.continuous_action_space = continuous

        # History of scores / episode numbers / running averages
        self.scores_, self.episodes_, self.average_ = [], [], []

        if continuous:
            self.Actor = PpoActorContinuous(self.state_size, self.action_size, lr=self.lr, optimizer=self.optimizer, loss_clipping=0.2)
        else:
            self.Actor = PpoActorDiscrete(self.state_size, self.action_size, lr=self.lr, optimizer=self.optimizer, loss_clipping=0.2, loss_entropy=0.001)

        self.Critic = PpoCritic(self.state_size, self.action_size, lr=self.lr, optimizer=self.optimizer, loss_clipping=0.2, kernel_initializer=False, continuous=continuous)

        # Do not change below: must match the log_std hard-coded in
        # PpoActorContinuous.gaussian_likelihood.
        self.log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
        self.std = np.exp(self.log_std)

    def act(self, state):
        """Choose an action for ``state``.

        Returns a 3-tuple ``(env_action, stored_action, policy_info)``:

        * continuous: the action row to step the environment with, the full
          ``(1, action_size)`` action array, and the log-likelihood of that
          action.
        * discrete: the sampled action index, its one-hot encoding, and the
          probability vector predicted by the actor.
        """
        if self.continuous_action_space:
            # Use the network output as the centre of the action distribution.
            prediction = self.Actor.predict(state)

            low, high = -1.0, 1.0 # -1 and 1 are boundaries of tanh
            # NOTE(review): the exploration noise is uniform, while the
            # log-probability below uses a Gaussian density - confirm this
            # mismatch is intended before changing it.
            action = prediction + np.random.uniform(low, high, size=prediction.shape) * self.std
            action = np.clip(action, low, high)

            logp_t = self.gaussian_likelihood(action, prediction, self.log_std)

            return action[0], action, logp_t[0]

        prediction = self.Actor.predict(state)[0]
        action = np.random.choice(self.action_size, p=prediction)
        action_onehot = np.zeros([self.action_size])
        action_onehot[action] = 1
        return action, action_onehot, prediction

    def _reset_env(self):
        """Reset the environment and return the state as a (1, state_dim) array."""
        return np.reshape(self.env.reset(), [1, self.state_size[0]])

    def run_batch(self):
        """Run the collect-then-train loop until ``EPISODES`` episodes have finished.

        Each iteration gathers ``Training_batch`` environment steps (episodes
        may start and end inside a batch), then calls :meth:`replay`.
        The environment is closed on exit, including when an error or an
        interrupt aborts training.
        """
        try:
            state = self._reset_env()
            score = 0
            while True:
                # Fresh rollout memory for this update
                states, next_states, actions, rewards, predictions, dones = [], [], [], [], [], []

                for _ in range(self.Training_batch):
                    #self.env.render()
                    # Actor picks an action
                    action, action_data, prediction = self.act(state)

                    # Retrieve new state, reward, and whether the state is terminal
                    next_state, reward, done, _ = self.env.step(action)
                    next_state = np.reshape(next_state, [1, self.state_size[0]])

                    # Memorize the transition for training
                    states.append(state)
                    next_states.append(next_state)
                    actions.append(action_data)
                    rewards.append(reward)
                    dones.append(done)
                    predictions.append(prediction)

                    state = next_state
                    score += reward
                    if done:
                        self.episode += 1
                        average = self.checkpoint(score, self.episode)
                        if self.episode % 100 == 0:
                            print(f"episode: {self.episode}/{self.EPISODES}, score: {score}, average: {average:.2f} ")
                        state, score = self._reset_env(), 0

                self.replay(states, actions, rewards, dones, next_states, predictions)

                if self.episode >= self.EPISODES:
                    break
        finally:
            self.env.close()

    def replay(self, states, actions, rewards, dones, next_states, predictions):
        """Train actor and critic on one collected rollout.

        Args:
            states, next_states: lists of ``(1, state_dim)`` arrays.
            actions: stored actions as returned by :meth:`act` (second item).
            rewards, dones: per-step reward and terminal flag.
            predictions: policy info as returned by :meth:`act` (third item).
        """
        # reshape memory to appropriate shape for training
        states = np.vstack(states)
        next_states = np.vstack(next_states)
        actions = np.vstack(actions)
        predictions = np.vstack(predictions)

        # Get Critic network predictions
        values = self.Critic.predict(states)
        next_values = self.Critic.predict(next_states)

        # Compute advantages and value targets
        advantages, target = self.get_gaes(rewards, dones, np.squeeze(values), np.squeeze(next_values))

        # Pack advantages, actions and policy info into y_true; the actor's
        # custom loss unpacks them in this same column order.
        y_true = np.hstack([advantages, actions, predictions])

        # training Actor and Critic networks
        self.Actor.Actor.fit(states, y_true, epochs=self.epochs, verbose=0, shuffle=self.shuffle)
        self.Critic.Critic.fit([states, values], target, epochs=self.epochs, verbose=0, shuffle=self.shuffle)

        # Approximate KL / entropy diagnostics are not computed here.

        self.replay_count += 1

    ### Equal fns
    def gaussian_likelihood(self, action, pred, log_std):
        """NumPy log-density of ``action`` under a diagonal Gaussian (continuous only).

        Mean is ``pred``, log standard deviation is ``log_std``; per-dimension
        terms are summed over axis 1.
        """
        # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/sac/policies.py
        pre_sum = -0.5 * (((action-pred)/(np.exp(log_std)+1e-8))**2 + 2*log_std + np.log(2*np.pi))
        return np.sum(pre_sum, axis=1)

    def get_gaes(self, rewards, dones, values, next_values, gamma = 0.99, lamda = 0.90, normalize=True):
        """Compute generalized advantage estimates and value targets.

        Args:
            rewards, dones: per-step reward and terminal flag sequences.
            values, next_values: critic estimates for each state / next state.
            gamma: discount factor.
            lamda: GAE smoothing factor.
            normalize: if True, advantages are standardized (the targets are
                always computed from the un-normalized advantages).

        Returns:
            ``(advantages, targets)``, each stacked into a column array.
        """
        # One-step TD errors; the bootstrap term is dropped on terminal steps.
        deltas = np.stack([
            r + gamma * (1 - d) * nv - v
            for r, d, nv, v in zip(rewards, dones, next_values, values)
        ])
        gaes = deltas.copy()
        # Backward accumulation, cut at episode boundaries.
        for t in reversed(range(len(deltas) - 1)):
            gaes[t] = gaes[t] + (1 - dones[t]) * gamma * lamda * gaes[t + 1]

        target = gaes + values
        if normalize:
            gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
        return np.vstack(gaes), np.vstack(target)

    def load(self):
        """Placeholder: weight loading is not implemented; this is a no-op."""
        # Intended implementation (needs self.Actor_name / self.Critic_name,
        # which this class never sets):
        #     self.Actor.Actor.load_weights(self.Actor_name)
        #     self.Critic.Critic.load_weights(self.Critic_name)

    def save(self):
        """Placeholder: weight saving is not implemented; this is a no-op."""
        # Intended implementation (needs self.Actor_name / self.Critic_name,
        # which this class never sets):
        #     self.Actor.Actor.save_weights(self.Actor_name)
        #     self.Critic.Critic.save_weights(self.Critic_name)

    def checkpoint(self, score, episode):
        """Record a finished episode and return the running average score.

        The average covers the last 50 recorded scores. When it matches or
        beats the best average so far, :meth:`save` is called and the
        learning rate of both optimizers is multiplied by 0.95.
        """
        self.scores_.append(score)
        self.episodes_.append(episode)
        recent = self.scores_[-50:]
        average = sum(recent) / len(recent)
        self.average_.append(average)

        # saving best models
        if average >= self.max_average:
            self.max_average = average
            self.save()
            # decrease learning rate every time a model is saved
            self.lr *= 0.95
            K.set_value(self.Actor.Actor.optimizer.learning_rate, self.lr)
            K.set_value(self.Critic.Critic.optimizer.learning_rate, self.lr)

        # Periodic logging (e.g. every 100 episodes) could be added here.

        return average


In [None]:
# Train the discrete-action agent on CartPole-v1: 1024 environment steps per
# update, 10 fit epochs per update, stopping once 3000 episodes have finished.
discrete_agent = PPOAgent('CartPole-v1', training_batch=1024, epochs=10, episodes=3000, continuous=False)
discrete_agent.run_batch()

episode: 100/3000, score: 56.0, average: 53.92 
episode: 200/3000, score: 130.0, average: 157.42 
episode: 300/3000, score: 114.0, average: 105.08 
episode: 400/3000, score: 101.0, average: 99.30 
episode: 500/3000, score: 94.0, average: 82.38 
episode: 600/3000, score: 98.0, average: 86.54 
episode: 700/3000, score: 96.0, average: 89.78 
episode: 800/3000, score: 101.0, average: 80.92 
episode: 900/3000, score: 94.0, average: 88.36 
episode: 1000/3000, score: 95.0, average: 88.64 
episode: 1100/3000, score: 90.0, average: 89.24 
episode: 1200/3000, score: 93.0, average: 91.10 
episode: 1300/3000, score: 105.0, average: 92.12 
episode: 1400/3000, score: 104.0, average: 99.78 
episode: 1500/3000, score: 135.0, average: 107.36 
episode: 1600/3000, score: 128.0, average: 125.70 
episode: 1700/3000, score: 122.0, average: 143.04 
episode: 1800/3000, score: 185.0, average: 254.36 
episode: 1900/3000, score: 155.0, average: 190.76 
episode: 2000/3000, score: 141.0, average: 177.94 
episode: 

In [None]:
# Train the continuous-action agent on InvertedPendulumBulletEnv-v0 (the
# environment id comes from the pybullet_envs import): 512 environment steps
# per update, 5 fit epochs per update, stopping once 3000 episodes have finished.
continuous_agent = PPOAgent('InvertedPendulumBulletEnv-v0',training_batch=512, epochs=5,episodes=3000, continuous=True)
continuous_agent.run_batch()