In [1]:
import shutup
shutup.please()

In [2]:
import numpy as np

In [3]:

from src.environments.discrete.cartpole import environment

In [10]:
import tensorflow as tf

from multiprocessing import cpu_count
from tensorflow.keras.optimizers import Adam

from src.agents.agent import Agent
from src.agents.ppo.buffer import PpoBuffer
from src.agents.ppo.networks import get_actor_model,get_critic_model,act

from src.utils.networks import gaussian_likelihood
from src.utils.temporal_difference import discounted_rewards
from src.utils.temporal_difference import generalized_advantage_estimation

tf.compat.v1.disable_eager_execution() 

class PpoAgent(Agent):
    def __init__(self, 
                environment,
                epochs = 10, # training epochs
                shuffle=True,
                batch_size = 1000,
                gamma=0.99,
                lamda=0.9,
                loss_clipping = 0.2,
                loss_entropy = 0.001,
                actor_optimizer=Adam,
                critic_optimizer=Adam,
                actor_learning_rate=0.001,
                critic_learning_rate=0.001,
                actor_kernel_initializer=False, #tf.random_normal_initializer(stddev=0.01),
                critic_kernel_initializer=False, #"he_uniform",
                policy="mlp"
                ):
        super(PpoAgent, self).__init__(environment,args=locals())
        
        # HP
        self.epochs=epochs  
        self.shuffle=shuffle
        self.batch_size=batch_size 
        self.gamma=gamma
        self.lamda=lamda
        self.loss_clipping=loss_clipping
        self.loss_entropy=loss_entropy 
        self.actor_optimizer=actor_optimizer
        self.actor_learning_rate=actor_learning_rate
        self.actor_kernel_initializer=actor_kernel_initializer
        self.critic_optimizer=critic_optimizer
        self.critic_learning_rate=critic_learning_rate
        self.critic_kernel_initializer=critic_kernel_initializer
        self.policy=policy
        
        self.log_std = -0.5 * np.ones(self.n_actions, dtype=np.float32)
        self.std = np.exp(self.log_std)
        
        self.__init_networks()
        self.__init_buffers()
        self.__init_actions()
        
        self._add_models_to_config([self.actor,self.critic])
        self._init_tensorboard()
        
    def __init_networks(self):
        self.actor = get_actor_model(
            self.observation_shape,
            self.n_actions,
            self.loss_clipping,
            self.loss_entropy,
            optimizer=self.actor_optimizer,
            learning_rate=self.actor_learning_rate,
            kernel_initializer=self.actor_kernel_initializer,
            policy=self.policy,
            continuous=self.action_space_mode == 'continuous',
        )
        
        self.critic = get_critic_model(
            self.observation_shape,
            self.loss_clipping,
            optimizer=self.critic_optimizer,
            learning_rate=self.critic_learning_rate,
            kernel_initializer=self.critic_kernel_initializer,
            policy=self.policy,
        )
    
    def __init_buffers(self):
        self.buffer = PpoBuffer()
        
    def __init_actions(self):
        self.act = act(
            self.actor,
            self.n_actions, 
            std=self.std, 
            log_std=self.log_std, 
            continuous=self.action_space_mode == 'continuous'
        )
        
        self.discount_rewards = discounted_rewards(self.gamma)
        self.get_gaes = generalized_advantage_estimation(self.gamma,self.lamda)
        self.gaussian_likelihood = gaussian_likelihood(self.n_actions,lib="numpy")
        
        
    def critic_predict(self, state):
        #print(state.shape,np.zeros((state.shape[0], 1)).shape)
        return self.critic.predict([state, np.zeros((state.shape[0], 1))])
        
    def replay(self,buffer):
        print('replay')
        
        states = np.array(buffer.states,dtype=np.float32)
        actions = np.array(buffer.actions,dtype=np.float32)
        rewards = np.array(buffer.rewards,dtype=np.float32)
        predictions = np.array(buffer.predictions,dtype=np.float32)
        dones= buffer.dones
        next_states= np.array(buffer.next_states,dtype=np.float32)

        states = np.vstack(states)
        next_states = np.vstack(next_states)
        actions = np.vstack(actions)
        predictions = np.vstack(predictions)


        # Get Critic network predictions 
        values = self.critic_predict(states)
        next_values = self.critic_predict(next_states)

        # Compute discounted rewards and advantages
        #discounted_r = self.discount_rewards(rewards)
        #advantages = np.vstack(discounted_r - values)
        advantages, target = self.get_gaes(rewards, dones, np.squeeze(values), np.squeeze(next_values))
    
        print('advantages',advantages[0])
        print('preds',predictions[0],np.mean(predictions))
        print('actions',actions[0],np.mean(actions))
        print('states',states[0])
        y_true = np.array(np.hstack([advantages, predictions, actions]),dtype=np.float32)
        
        print('states')
        print(states[0:3], type(states))
        print('y_true')
        print(y_true[0:3])
        # training Actor and Critic networks
        a_loss = self.actor.fit(states, y_true, epochs=self.epochs, verbose=1, shuffle=self.shuffle)
        c_loss = self.critic.fit([states, values], target, epochs=self.epochs, verbose=1, shuffle=self.shuffle)

        print()
        print('actor loss', np.mean(a_loss.history['loss']))
        print('critic loss',np.mean(c_loss.history['loss']))
        
        print()
        print()
#         self.writer.add_scalar('Data/actor_loss_per_replay', np.sum(a_loss.history['loss']), self.replay_count)
#         self.writer.add_scalar('Data/critic_loss_per_replay', np.sum(c_loss.history['loss']), self.replay_count)
        
        if self.action_space_mode == "continuous":
            # calculate loss parameters (should be done in loss, but couldn't find working way how to do that with disabled eager execution)
            pred = self.actor.predict(states)
            log_std = -0.5 * np.ones(self.n_actions, dtype=np.float32)
            logp = self.gaussian_likelihood(actions, pred, log_std)
            approx_kl = np.mean(predictions - logp)
            approx_ent = np.mean(-logp)
            
#             self.writer.add_scalar('Data/approx_kl_per_replay', approx_kl, self.replay_count)
#             self.writer.add_scalar('Data/approx_ent_per_replay', approx_ent, self.replay_count)
        
        self.replay_count += 1
        
    def learn_single_process(self,timesteps=-1, plot_results=True, reset=False, success_threshold=False, log_level=1, log_each_n_episodes=50,):
        episode = 0
        episodes=100000
        state = self.env.reset()
        #state = np.expand_dims(state,axis=0) #np.reshape(state, [1, self.state_size[0]])
        done, score = False, 0
        avg = []
        while True:
            # Instantiate or reset games memory
            self.buffer.reset()
                               
            for t in range(self.batch_size):
                # self.env.render()
                # Actor picks an action
                
                state = np.expand_dims(state,axis=0)   
                
                # Action data is action for continuous and onehot for discrete
                # prediction_data is logp_t[0] for continuous and prediction for discrete
                action, action_data, prediction_data = self.act(state)
                
                #print("action",action, "action_data",action_data, "prediction_data",prediction_data)
                
                #print(state,action,action_data,prediction_data)
                # Retrieve new state, reward, and whether the state is terminal
                next_state, reward, done, _ = self.env.step(action)
                
                # Memorize (state, next_states, action, reward, done, logp_ts) for training
                
                self.buffer.states.append(state)
                self.buffer.next_states.append(np.expand_dims(next_state,axis=0))
                # on continuous add action on discrete one_hots
                self.buffer.actions.append(action_data)
                self.buffer.rewards.append(reward)
                self.buffer.dones.append(done)
                self.buffer.predictions.append(prediction_data)
                
                # Update current state shape
                #state = np.expand_dims(state,axis=0) #np.reshape(next_state, [1, self.state_size[0]])
                score += reward
                state = next_state
                if done:
                    
                    avg.append(score)
                    if episode % 50 ==0:
                        print('episode',episode,'score',score, 'avg', np.mean(avg[-100:]))
                    episode += 1
#                     average, SAVING = self.PlotModel(score, self.episode)
#                     print("episode: {}/{}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, score, average, SAVING))
#                     self.writer.add_scalar(f'Workers:{1}/score_per_episode', score, self.episode)
#                     self.writer.add_scalar(f'Workers:{1}/learning_rate', self.lr, self.episode)
#                     self.writer.add_scalar(f'Workers:{1}/average_score',  average, self.episode)
                    
                     # Step reward, tensorboard log score, print progress
                    
                    #self.on_learn_episode_end(score,log_each_n_episodes,log_level,success_threshold)
                    state, done, score = self.env.reset(), False, 0
                    #state = np.expand_dims(state,axis=0) #np.reshape(state, [1, self.state_size[0]])

            self.replay(self.buffer)
            
            if episode >= episodes:
                break

        self.env.close()

        
    def learn_multi_process(self,timesteps=-1, plot_results=True, reset=False, success_threshold=False, log_level=1, log_each_n_episodes=50,n_workers=cpu_count()):
        pass
    
    
    def learn(self, timesteps=-1, plot_results=True, reset=False, success_threshold=False, log_level=1, log_each_n_episodes=50, n_workers=cpu_count()):
        self.replay_count = 0
        if n_workers == 1:
            self.learn_single_process(timesteps, plot_results, reset, success_threshold, log_level, log_each_n_episodes)
        else: 
            self.learn_multi_process(timesteps, plot_results, reset, success_threshold, log_level, log_each_n_episodes,n_workers=1)

In [11]:
#from src.environments.discrete.cartpole import environment

#agent = PpoAgent(environment,batch_size=1024)
#agent.learn(log_each_n_episodes=100,n_workers=1, success_threshold=200)

In [12]:
from src.environments.continuous.inverted_pendulum import environment
agent_1 = PpoAgent(
    environment,
    batch_size=1024,
    actor_learning_rate=0.1,
    critic_learning_rate=0.1,
    epochs=10)
agent_1.learn(log_each_n_episodes=1,n_workers=1)


    | ---------------------------------
    | InvertedPendulumBulletEnv-v0
    | 
    | Action space: Continuous with high action-space
    | Environment beated threshold: 200
    | Dev notes:
    |   * Doesn't work with multiprocessing
    | ----------------------------------------------------------   

    
episode 0 score 26.0 avg 26.0
replay
advantages [0.90173694]
preds [-0.46798822] -0.58594644
actions [0.1902129] 0.009857004
states [0.         0.         0.9999627  0.00863607 0.        ]
states
[[ 0.0000000e+00  0.0000000e+00  9.9996269e-01  8.6360741e-03
   0.0000000e+00]
 [ 4.1279980e-04  2.5018169e-02  9.9996936e-01  7.8284256e-03
  -4.8950087e-02]
 [ 1.7069184e-03  7.8431427e-02  9.9998635e-01  5.2264696e-03
  -1.5769768e-01]] <class 'numpy.ndarray'>
y_true
[[ 0.9017369  -0.46798822  0.1902129 ]
 [ 0.8776804  -0.6387278   0.4023589 ]
 [ 0.8506782  -0.7494096   0.49329892]]
Train on 1024 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epo

Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 1024 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

actor loss 0.332649026112631
critic loss 18.976146069169044


episode 200 score 25.0 avg 23.13
replay
advantages [-1.82465012]
preds [-0.4877426] -0.57945365
actions [0.36607558] 0.15054226
states [ 0.04433046  0.5096462   0.98517805 -0.17153482 -1.3422103 ]
states
[[ 0.04433046  0.5096462   0.98517805 -0.17153482 -1.3422103 ]
 [ 0.05363625  0.5639869   0.9805943  -0.19604808 -1.511442  ]
 [ 0.          0.          0.9985802  -0.05326881  0.        ]] <class 'numpy.ndarray'>
y_true
[[-1.8246502  -0.4877426   0.36607558]
 [-2.2136989  -0.41901734  0.13346517]
 [ 0.85379314 -0.74733657  0.63263   ]]
Train on 1024 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 1024 samples
Epoch 1/10
Epoch 

Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

actor loss 0.3336546417558566
critic loss 18.09419158399105


episode 300 score 16.0 avg 24.7
replay
advantages [0.8732991]
preds [-0.41924423] -0.58134085
actions [0.13417193] 0.1531798
states [0.         0.         0.9985525  0.05378614 0.        ]
states
[[ 0.0000000e+00  0.0000000e+00  9.9855250e-01  5.3786136e-02
   0.0000000e+00]
 [ 2.5814815e-04  1.5645342e-02  9.9856484e-01  5.3556282e-02
  -1.3950666e-02]
 [ 2.0568182e-03  1.0901030e-01  9.9872839e-01  5.0414551e-02
  -1.9066590e-01]] <class 'numpy.ndarray'>
y_true
[[ 0.8732991  -0.41924423  0.13417193]
 [ 0.8247879  -0.85504013  0.71561897]
 [ 0.7703421  -0.52009964 -0.12365004]]
Train on 1024 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 1024 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

actor loss 0.3340

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 1024 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

actor loss 0.3342961736023426
critic loss 15.86817529797554


replay
advantages [-1.60231295]
preds [-0.5612245] -0.58089864
actions [-0.22630444] 0.08500254
states [-0.02046729 -0.12489657  0.98424953 -0.17678486 -0.5398339 ]
states
[[-0.02046729 -0.12489657  0.98424953 -0.17678486 -0.5398339 ]
 [-0.02290708 -0.14786601  0.98260134 -0.18572712 -0.5510857 ]
 [-0.02467427 -0.10710258  0.98040146 -0.1970102  -0.69670427]] <class 'numpy.ndarray'>
y_true
[[-1.6023129  -0.5612245  -0.22630444]
 [-1.9425474  -0.4512374   0.25140747]
 [-2.3244042  -0.5964176  -0.26410997]]
Train on 1024 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 1024 samples
Epoch 1/10
Epoch 2/10
Epoch

KeyboardInterrupt: 

In [None]:
xxxxxxxxxxxxxxxx

In [None]:
agent_1.log_std

In [None]:
states= np.array([[-0.00518034, -0.08861306,  0.9955604,  -0.0941247,   0.05484154],
 [-0.007649 ,  -0.14961523 , 0.99579173 ,-0.09164527 , 0.15092096],
 [-0.0087505 , -0.06675814 , 0.99571204 ,-0.09250706 ,-0.05245256]],dtype=np.float32) 
y_true= np.array([[ 0.59678435, -0.74026847, -0.48596317,],
 [ 0.5485846,  -0.8950667 ,  0.59213954],
 [ 0.49422464, -0.45643705, -0.16582818]],dtype=np.float32)

In [None]:
agent_1.actor.fit(states,y_true,epochs=1000)

In [None]:
agent_2.Actor.Actor.fit(states,y_true,epochs=1000)

In [None]:
state = agent.env.reset()
np.reshape(state, [1, state.shape[0]])

In [None]:
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

In [None]:
import gfootball.env as football_env
import numpy as np

import gym
import copy
import tensorflow as tf

from keras.callbacks import TensorBoard
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras import backend as K
from keras.applications.mobilenet_v2 import MobileNetV2
from tensorflow.python.framework.ops import disable_eager_execution

disable_eager_execution()

clipping_val = 0.2
critic_discount = 0.5
entropy_beta = 0.001
gamma = 0.99
lmbda = 0.95


env = gym.make('CartPole-v1')
    
state = env.reset()
state_dims = env.observation_space.shape

state_size = state_dims
n_actions = env.action_space.n
action_space = n_actions
input_shape = env.observation_space.shape

dummy_n = np.zeros((1, 1, n_actions))
dummy_1 = np.zeros((1, 1, 1))

tensor_board = TensorBoard(log_dir='./logs/')


target_reached = False
best_reward = 0
iters = 0
max_iters = 50
training_batch=128
episode = 0
replay_count = 0
shuffle = False
epochs = 10



def get_advantages(values, masks, rewards):
    returns = []
    gae = 0
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + gamma * values[i + 1] * masks[i] - values[i]
        gae = delta + gamma * lmbda * masks[i] * gae
        returns.insert(0, gae + values[i])

    adv = np.array(returns) - values[:-1]
    return returns, (adv - np.mean(adv)) / (np.std(adv) + 1e-10)

def critic_PPO2_loss(values):
    def loss(y_true, y_pred):
        LOSS_CLIPPING = clipping_val
        clipped_value_loss = values + K.clip(y_pred - values, -LOSS_CLIPPING, LOSS_CLIPPING)
        v_loss1 = (y_true - clipped_value_loss) ** 2
        v_loss2 = (y_true - y_pred) ** 2
            
        value_loss = 0.5 * K.mean(K.maximum(v_loss1, v_loss2))
        #value_loss = K.mean((y_true - y_pred) ** 2) # standard PPO loss
        return value_loss
    return loss

def critic_ppo_loss(y_true, y_pred):
    value_loss = K.mean((y_true - y_pred) ** 2) # standard PPO loss
    return value_loss

def actor_ppo_loss(y_true, y_pred):
    # Defined in https://arxiv.org/abs/1707.06347
    advantages, prediction_picks, actions = y_true[:, :1], y_true[:, 1:1+action_space], y_true[:, 1+action_space:]
    LOSS_CLIPPING = clipping_val
    ENTROPY_LOSS = entropy_beta
        
    prob = actions * y_pred
    old_prob = actions * prediction_picks

    prob = K.clip(prob, 1e-10, 1.0)
    old_prob = K.clip(old_prob, 1e-10, 1.0)

    ratio = K.exp(K.log(prob) - K.log(old_prob))
        
    p1 = ratio * advantages
    p2 = K.clip(ratio, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING) * advantages

    actor_loss = -K.mean(K.minimum(p1, p2))

    entropy = -(y_pred * K.log(y_pred + 1e-10))
    entropy = ENTROPY_LOSS * K.mean(entropy)
        
    total_loss = actor_loss - entropy

    return total_loss

def actor_ppo_loss_continuous(y_true, y_pred):
    advantages, logp_old_ph, actions  = y_true[:, :1], y_true[:, 1:1+action_space], y_true[:, 1+action_space]
    LOSS_CLIPPING = clipping_val
    logp = gaussian_likelihood(actions, y_pred)

    ratio = K.exp(logp - logp_old_ph)

    p1 = ratio * advantages
    p2 = tf.where(advantages > 0, (1.0 + LOSS_CLIPPING)*advantages, (1.0 - LOSS_CLIPPING)*advantages) # minimum advantage

    actor_loss = -K.mean(K.minimum(p1, p2))

    return actor_loss

def gaussian_likelihood(self, actions, pred): # for keras custom loss
    log_std = -0.5 * np.ones(self.action_space, dtype=np.float32)
    pre_sum = -0.5 * (((actions-pred)/(K.exp(log_std)+1e-8))**2 + 2*log_std + K.log(2*np.pi))
    return K.sum(pre_sum, axis=1)
    
def get_common_layer(X_input, model="MLP"):
    # Shared CNN layers:
    if model=="CNN":
        X = Conv1D(filters=64, kernel_size=6, padding="same", activation="tanh")(X_input)
        X = MaxPooling1D(pool_size=2)(X)
        X = Conv1D(filters=32, kernel_size=3, padding="same", activation="tanh")(X)
        X = MaxPooling1D(pool_size=2)(X)
        X = Flatten()(X)

    # Shared LSTM layers:
    elif model=="LSTM":
        X = LSTM(512, return_sequences=True)(X_input)
        X = LSTM(256)(X)

    # Shared Dense layers:
    else:
        X = Flatten()(X_input)
        X = Dense(512, activation="relu")(X)
        
    return X

def get_model_actor_simple(input_shape=input_shape, n_actions=n_actions, continuous=False):

    X_input = Input(input_shape)
    X = get_common_layer(X_input)
    X = Dense(512, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X_input)
    X = Dense(256, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
    X = Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
    if continuous:
        output = Dense(n_actions,activation="tanh")(X)
    else:
        output = Dense(n_actions, activation="softmax")(X)

    model = Model(inputs = X_input, outputs = output)
    model.compile(loss=actor_ppo_loss, optimizer=Adam(lr=0.00025))
    return model


def get_model_critic_simple(input_shape=input_shape):
    X_input = Input(input_shape)
    V = get_common_layer(X_input)
    V = Dense(512, activation="relu", kernel_initializer='he_uniform')(X_input)
    V = Dense(256, activation="relu", kernel_initializer='he_uniform')(V)
    V = Dense(64, activation="relu", kernel_initializer='he_uniform')(V)
    value = Dense(1, activation=None)(V)

    model = Model(inputs=X_input, outputs = value)
    model.compile(loss=critic_ppo_loss, optimizer=Adam(lr=0.00025))
    return model
    
actor = get_model_actor_simple()
critic = get_model_critic_simple()

def act(state):
    """ example:
    pred = np.array([0.05, 0.85, 0.1])
    action_size = 3
    np.random.choice(a, p=pred)
    result>>> 1, because it have the highest probability to be taken
    """
    # Use the network to predict the next action to take, using the model
    prediction = actor.predict(state)[0]
    action = np.random.choice(n_actions, p=prediction)
    action_onehot = np.zeros([n_actions])
    action_onehot[action] = 1
    return action, action_onehot, prediction

def discount_rewards(reward):#gaes is better
    # Compute the gamma-discounted rewards over an episode
    # We apply the discount and normalize it to avoid big variability of rewards
    gamma = 0.99    # discount rate
    running_add = 0
    discounted_r = np.zeros_like(reward)
    for i in reversed(range(0,len(reward))):
        running_add = running_add * gamma + reward[i]
        discounted_r[i] = running_add

    discounted_r -= np.mean(discounted_r) # normalizing the result
    discounted_r /= (np.std(discounted_r) + 1e-8) # divide by standard deviation
    return discounted_r

def get_gaes(rewards, dones, values, next_values, gamma = 0.99, lamda = 0.9, normalize=True):
    deltas = [r + gamma * (1 - d) * nv - v for r, d, nv, v in zip(rewards, dones, next_values, values)]
    deltas = np.stack(deltas)
    gaes = copy.deepcopy(deltas)
    for t in reversed(range(len(deltas) - 1)):
        gaes[t] = gaes[t] + (1 - dones[t]) * gamma * lamda * gaes[t + 1]

    target = gaes + values
    if normalize:
        gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
    return np.vstack(gaes), np.vstack(target)

def test_reward():
    state = env.reset()
    done = False
    total_reward = 0
   
    limit = 0
    while not done:
        state_input = K.expand_dims(state, 0)
        action_probs = model_actor.predict([state_input, dummy_n, dummy_1, dummy_1, dummy_1], steps=1)
        action = np.argmax(action_probs)
        next_state, reward, done, _ = env.step(action)
        state = next_state
        total_reward += reward
        #print('test reward',reward)
        limit += 1
        if limit > 20:
            break
    print('testing...', total_reward)
    return total_reward

def critic_predict(state):
    return critic.predict([state, np.zeros((state.shape[0], 1))])

def replay(states, actions, rewards, predictions, dones, next_states):
    
    print('states.shape0',np.array(states).shape)
    # reshape memory to appropriate shape for training
    states = np.vstack(states)
    next_states = np.vstack(next_states)
    actions = np.vstack(actions)
    predictions = np.vstack(predictions)

    print('states.shape1',np.array(states).shape)
    # Get Critic network predictions 
    values = critic_predict(states)
    next_values = critic_predict(next_states)

    # Compute discounted rewards and advantages
    #discounted_r = self.discount_rewards(rewards)
    #advantages = np.vstack(discounted_r - values)
    advantages, target = get_gaes(rewards, dones, np.squeeze(values), np.squeeze(next_values))
    
#     print('advantages')
#     print(advantages)
        
#     print('')
#     print('targets')
#     print(target)
    '''
        pylab.plot(advantages,'.')
        pylab.plot(target,'-')
        ax=pylab.gca()
        ax.grid(True)
        pylab.subplots_adjust(left=0.05, right=0.98, top=0.96, bottom=0.06)
        pylab.show()
        '''
    # stack everything to numpy array
    # pack all advantages, predictions and actions to y_true and when they are received
    # in custom PPO loss function we unpack it
    y_true = np.hstack([advantages, predictions, actions])
        
    # training Actor and Critic networks
        
    print('advantages',advantages[0])
    print('preds',predictions[0])
    print('actions',actions[0])
    print()
 
    a_loss = actor.fit(states, y_true, epochs=epochs, verbose=0, shuffle=shuffle)
    print('actor loss',np.mean(a_loss.history['loss']))
    print()

    c_loss = critic.fit(states, target, epochs=epochs, verbose=0, shuffle=shuffle)
    print('critic loss',np.mean(c_loss.history['loss']))
    print()
#     self.writer.add_scalar('Data/actor_loss_per_replay', np.sum(a_loss.history['loss']), self.replay_count)
#     self.writer.add_scalar('Data/critic_loss_per_replay', np.sum(c_loss.history['loss']), self.replay_count)
#     self.replay_count += 1
    #replay_count += 1

def run_batch(): # train every self.Training_batch episodes
    scores_ = []
    episodes_ = []
    averages_= [] 
    episode = 0
    state = env.reset()
    state = np.reshape(state, [1,state_size[0]])
    done, score, SAVING = False, 0, ''
    while True:
        # Instantiate or reset games memory
        states, next_states, actions, rewards, predictions, dones = [], [], [], [], [], []
        for t in range(training_batch):
           # env.render()
            # Actor picks an action
            action, action_onehot, prediction = act(state)
                # Retrieve new state, reward, and whether the state is terminal
            next_state, reward, done, _ = env.step(action)
            # Memorize (state, action, reward) for training
            states.append(state)
            next_states.append(np.reshape(next_state, [1, state_size[0]]))
            actions.append(action_onehot)
            rewards.append(reward)
            dones.append(done)
            predictions.append(prediction)
            # Update current state
            state = np.reshape(next_state, [1, state_size[0]])
            score += reward
            if done:
                episode += 1
                SAVING = False
                scores_.append(score)
                averages_.append(sum(scores_[-50:]) / len(scores_[-50:]))

                print('score', averages_[-1])
#                     average, SAVING = self.PlotModel(score, self.episode)
#                     print("episode: {}/{}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, score, average, SAVING))
#                     self.writer.add_scalar(f'Workers:{1}/score_per_episode', score, self.episode)
#                     self.writer.add_scalar(f'Workers:{1}/learning_rate', self.lr, self.episode)

                state, done, score, SAVING = env.reset(), False, 0, ''
                state = np.reshape(state, [1, state_size[0]])
   
        replay(states, actions, rewards, predictions, dones, next_states)
        if episode >= 1000:
            break
    env.close()  




In [None]:

run_batch() # train as PPO, train every epesode

In [None]:
state_size

In [None]:
type(critic)

In [None]:


import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # -1:cpu, 0:first gpu
import random
import gym
import pylab
import numpy as np
import tensorflow as tf
from tensorboardX import SummaryWriter
#tf.config.experimental_run_functions_eagerly(True) # used for debuging and development
tf.compat.v1.disable_eager_execution() # usually using this for fastest performance
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import copy

from threading import Thread, Lock
from multiprocessing import Process, Pipe
import time

gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    print(f'GPUs {gpus}')
    try: tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError: pass

class Environment(Process):
    def __init__(self, env_idx, child_conn, env_name, state_size, action_size, visualize=False):
        super(Environment, self).__init__()
        self.env = gym.make(env_name)
        self.is_render = visualize
        self.env_idx = env_idx
        self.child_conn = child_conn
        self.state_size = state_size
        self.action_size = action_size

    def run(self):
        super(Environment, self).run()
        state = self.env.reset()
        state = np.reshape(state, [1, self.state_size])
        self.child_conn.send(state)
        while True:
            action = self.child_conn.recv()
            if self.is_render and self.env_idx == 0:
                self.env.render()

            state, reward, done, info = self.env.step(action)
            state = np.reshape(state, [1, self.state_size])

            if done:
                state = self.env.reset()
                state = np.reshape(state, [1, self.state_size])

            self.child_conn.send([state, reward, done, info])


class Actor_Model:
    def __init__(self, input_shape, action_space, lr, optimizer):
        X_input = Input(input_shape)
        self.action_space = action_space

        X = Dense(512, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X_input)
        X = Dense(256, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        X = Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        output = Dense(self.action_space, activation="softmax")(X)

        self.Actor = Model(inputs = X_input, outputs = output)
        self.Actor.compile(loss=self.ppo_loss, optimizer=optimizer(lr=lr))

    def ppo_loss(self, y_true, y_pred):
        # Defined in https://arxiv.org/abs/1707.06347
        advantages, prediction_picks, actions = y_true[:, :1], y_true[:, 1:1+self.action_space], y_true[:, 1+self.action_space:]
        LOSS_CLIPPING = 0.2
        ENTROPY_LOSS = 0.001
        
        prob = actions * y_pred
        old_prob = actions * prediction_picks

        prob = K.clip(prob, 1e-10, 1.0)
        old_prob = K.clip(old_prob, 1e-10, 1.0)

        ratio = K.exp(K.log(prob) - K.log(old_prob))
        
        p1 = ratio * advantages
        p2 = K.clip(ratio, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING) * advantages

        actor_loss = -K.mean(K.minimum(p1, p2))

        entropy = -(y_pred * K.log(y_pred + 1e-10))
        entropy = ENTROPY_LOSS * K.mean(entropy)
        
        total_loss = actor_loss - entropy

        return total_loss

    def predict(self, state):
        return self.Actor.predict(state)


class Critic_Model:
    def __init__(self, input_shape, action_space, lr, optimizer):
        X_input = Input(input_shape)
        old_values = Input(shape=(1,))

        V = Dense(512, activation="relu", kernel_initializer='he_uniform')(X_input)
        V = Dense(256, activation="relu", kernel_initializer='he_uniform')(V)
        V = Dense(64, activation="relu", kernel_initializer='he_uniform')(V)
        value = Dense(1, activation=None)(V)

        self.Critic = Model(inputs=[X_input, old_values], outputs = value)
        self.Critic.compile(loss=[self.critic_PPO2_loss(old_values)], optimizer=optimizer(lr=lr))

    def critic_PPO2_loss(self, values):
        def loss(y_true, y_pred):
            LOSS_CLIPPING = 0.2
            clipped_value_loss = values + K.clip(y_pred - values, -LOSS_CLIPPING, LOSS_CLIPPING)
            v_loss1 = (y_true - clipped_value_loss) ** 2
            v_loss2 = (y_true - y_pred) ** 2
            
            value_loss = 0.5 * K.mean(K.maximum(v_loss1, v_loss2))
            #value_loss = K.mean((y_true - y_pred) ** 2) # standard PPO loss
            return value_loss
        return loss

    def predict(self, state):
        return self.Critic.predict([state, np.zeros((state.shape[0], 1))])

class PPOAgent:
    # PPO Main Optimization Algorithm
    def __init__(self, env_name):
        # Initialization
        # Environment and PPO parameters
        self.env_name = env_name       
        self.env = gym.make(env_name)
        self.action_size = self.env.action_space.n
        self.state_size = self.env.observation_space.shape
        self.EPISODES = 10000 # total episodes to train through all environments
        self.episode = 0 # used to track the episodes total count of episodes played through all thread environments
        self.max_average = 0 # when average score is above 0 model will be saved
        self.lr = 0.00025
        self.epochs = 10 # training epochs
        self.shuffle=False
        self.Training_batch = 1000
        #self.optimizer = RMSprop
        self.optimizer = Adam

        self.replay_count = 0
        self.writer = SummaryWriter(comment="_"+self.env_name+"_"+self.optimizer.__name__+"_"+str(self.lr))
        
        # Instantiate plot memory
        self.scores_, self.episodes_, self.average_ = [], [], [] # used in matplotlib plots

        # Create Actor-Critic network models
        self.Actor = Actor_Model(input_shape=self.state_size, action_space = self.action_size, lr=self.lr, optimizer = self.optimizer)
        self.Critic = Critic_Model(input_shape=self.state_size, action_space = self.action_size, lr=self.lr, optimizer = self.optimizer)
        
        self.Actor_name = f"{self.env_name}_PPO_Actor.h5"
        self.Critic_name = f"{self.env_name}_PPO_Critic.h5"

        
    def act(self, state):
        """ example:
        pred = np.array([0.05, 0.85, 0.1])
        action_size = 3
        np.random.choice(a, p=pred)
        result>>> 1, because it have the highest probability to be taken
        """
        # Use the network to predict the next action to take, using the model
        prediction = self.Actor.predict(state)[0]
        action = np.random.choice(self.action_size, p=prediction)
        action_onehot = np.zeros([self.action_size])
        action_onehot[action] = 1
        return action, action_onehot, prediction

    def discount_rewards(self, reward):#gaes is better
        # Compute the gamma-discounted rewards over an episode
        # We apply the discount and normalize it to avoid big variability of rewards
        gamma = 0.99    # discount rate
        running_add = 0
        discounted_r = np.zeros_like(reward)
        for i in reversed(range(0,len(reward))):
            running_add = running_add * gamma + reward[i]
            discounted_r[i] = running_add

        discounted_r -= np.mean(discounted_r) # normalizing the result
        discounted_r /= (np.std(discounted_r) + 1e-8) # divide by standard deviation
        return discounted_r

    def get_gaes(self, rewards, dones, values, next_values, gamma = 0.99, lamda = 0.9, normalize=True):
        deltas = [r + gamma * (1 - d) * nv - v for r, d, nv, v in zip(rewards, dones, next_values, values)]
        deltas = np.stack(deltas)
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(deltas) - 1)):
            gaes[t] = gaes[t] + (1 - dones[t]) * gamma * lamda * gaes[t + 1]

        target = gaes + values
        if normalize:
            gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
        return np.vstack(gaes), np.vstack(target)

    def replay(self, states, actions, rewards, predictions, dones, next_states):
        print('replay')
        # reshape memory to appropriate shape for training
        states = np.vstack(states)
        next_states = np.vstack(next_states)
        actions = np.vstack(actions)
        predictions = np.vstack(predictions)

        # Get Critic network predictions 
        values = self.Critic.predict(states)
        next_values = self.Critic.predict(next_states)

        # Compute discounted rewards and advantages
        #discounted_r = self.discount_rewards(rewards)
        #advantages = np.vstack(discounted_r - values)
        advantages, target = self.get_gaes(rewards, dones, np.squeeze(values), np.squeeze(next_values))
        '''
        pylab.plot(advantages,'.')
        pylab.plot(target,'-')
        ax=pylab.gca()
        ax.grid(True)
        pylab.subplots_adjust(left=0.05, right=0.98, top=0.96, bottom=0.06)
        pylab.show()
        '''
        # stack everything to numpy array
        # pack all advantages, predictions and actions to y_true and when they are received
        # in custom PPO loss function we unpack it
        y_true = np.hstack([advantages, predictions, actions])
        
            
        print('advantages',advantages[0])
        print('preds',predictions[0])
        print('actions',actions[0])
        # training Actor and Critic networks
        a_loss = self.Actor.Actor.fit(states, y_true, epochs=self.epochs, verbose=0, shuffle=self.shuffle)
        c_loss = self.Critic.Critic.fit([states, values], target, epochs=self.epochs, verbose=0, shuffle=self.shuffle)

        #a_loss = actor.fit(states, y_true, epochs=epochs, verbose=0, shuffle=shuffle)
        print('actor loss',np.mean(a_loss.history['loss']))
        print()

        #c_loss = critic.fit(states, target, epochs=epochs, verbose=0, shuffle=shuffle)
        print('critic loss',np.mean(c_loss.history['loss']))
        print()
        self.writer.add_scalar('Data/actor_loss_per_replay', np.sum(a_loss.history['loss']), self.replay_count)
        self.writer.add_scalar('Data/critic_loss_per_replay', np.sum(c_loss.history['loss']), self.replay_count)
        self.replay_count += 1
 
    def load(self):
        self.Actor.Actor.load_weights(self.Actor_name)
        self.Critic.Critic.load_weights(self.Critic_name)

    def save(self):
#         self.Actor.Actor.save_weights(self.Actor_name)
#         self.Critic.Critic.save_weights(self.Critic_name)
        pass

    pylab.figure(figsize=(18, 9))
    pylab.subplots_adjust(left=0.05, right=0.98, top=0.96, bottom=0.06)
    
    def PlotModel(self, score, episode):
        self.scores_.append(score)
        self.episodes_.append(episode)
        self.average_.append(sum(self.scores_[-50:]) / len(self.scores_[-50:]))
        if str(episode)[-2:] == "00":# much faster than episode % 100
            pylab.plot(self.episodes_, self.scores_, 'b')
            pylab.plot(self.episodes_, self.average_, 'r')
            pylab.title(self.env_name+" PPO training cycle", fontsize=18)
            pylab.ylabel('Score', fontsize=18)
            pylab.xlabel('Steps', fontsize=18)
            try:
                pylab.grid(True)
                pylab.savefig(self.env_name+".png")
            except OSError:
                pass
        # saving best models
        if self.average_[-1] >= self.max_average:
            self.max_average = self.average_[-1]
            self.save()
            SAVING = "SAVING"
            # decreaate learning rate every saved model
            self.lr *= 0.95
            K.set_value(self.Actor.Actor.optimizer.learning_rate, self.lr)
            K.set_value(self.Critic.Critic.optimizer.learning_rate, self.lr)
        else:
            SAVING = ""

        return self.average_[-1], SAVING
    
    def run(self): # train only when episode is finished
        state = self.env.reset()
        state = np.reshape(state, [1, self.state_size[0]])
        done, score, SAVING = False, 0, ''
        while True:
            # Instantiate or reset games memory
            states, next_states, actions, rewards, predictions, dones = [], [], [], [], [], []
            while not done:
                #self.env.render()
                # Actor picks an action
                action, action_onehot, prediction = self.act(state)
                # Retrieve new state, reward, and whether the state is terminal
                next_state, reward, done, _ = self.env.step(action)
                # Memorize (state, action, reward) for training
                states.append(state)
                next_states.append(np.reshape(next_state, [1, self.state_size[0]]))
                actions.append(action_onehot)
                rewards.append(reward)
                dones.append(done)
                predictions.append(prediction)
                # Update current state
                state = np.reshape(next_state, [1, self.state_size[0]])
                score += reward
                if done:
                    self.episode += 1
                    average, SAVING = self.PlotModel(score, self.episode)
                    print("episode: {}/{}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, score, average, SAVING))
                    self.writer.add_scalar(f'Workers:{1}/score_per_episode', score, self.episode)
                    self.writer.add_scalar(f'Workers:{1}/learning_rate', self.lr, self.episode)
                    
                    self.replay(states, actions, rewards, predictions, dones, next_states)

                    state, done, score, SAVING = self.env.reset(), False, 0, ''
                    state = np.reshape(state, [1, self.state_size[0]])

            if self.episode >= self.EPISODES:
                break
        self.env.close()

    def run_batch(self): # train every self.Training_batch episodes
        state = self.env.reset()
        state = np.reshape(state, [1, self.state_size[0]])
        done, score, SAVING = False, 0, ''
        while True:
            # Instantiate or reset games memory
            states, next_states, actions, rewards, predictions, dones = [], [], [], [], [], []
            for t in range(self.Training_batch):
                #self.env.render()
                # Actor picks an action
                action, action_onehot, prediction = self.act(state)
                
                print(state,action,action_onehot,prediction)
                # Retrieve new state, reward, and whether the state is terminal
                next_state, reward, done, _ = self.env.step(action)
                # Memorize (state, action, reward) for training
                states.append(state)
                next_states.append(np.reshape(next_state, [1, self.state_size[0]]))
                actions.append(action_onehot)
                rewards.append(reward)
                dones.append(done)
                predictions.append(prediction)
                # Update current state
                state = np.reshape(next_state, [1, self.state_size[0]])
                score += reward
                if done:
                    self.episode += 1
                    average, SAVING = self.PlotModel(score, self.episode)
                    print("episode: {}/{}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, score, average, SAVING))
                    self.writer.add_scalar(f'Workers:{1}/score_per_episode', score, self.episode)
                    self.writer.add_scalar(f'Workers:{1}/learning_rate', self.lr, self.episode)

                    state, done, score, SAVING = self.env.reset(), False, 0, ''
                    state = np.reshape(state, [1, self.state_size[0]])
                    
            self.replay(states, actions, rewards, predictions, dones, next_states)
            if self.episode >= self.EPISODES:
                break
        self.env.close()  

        
    def run_multiprocesses(self, num_worker = 4):
        works, parent_conns, child_conns = [], [], []
        for idx in range(num_worker):
            parent_conn, child_conn = Pipe()
            work = Environment(idx, child_conn, self.env_name, self.state_size[0], self.action_size, True)
            work.start()
            works.append(work)
            parent_conns.append(parent_conn)
            child_conns.append(child_conn)

        states =        [[] for _ in range(num_worker)]
        next_states =   [[] for _ in range(num_worker)]
        actions =       [[] for _ in range(num_worker)]
        rewards =       [[] for _ in range(num_worker)]
        dones =         [[] for _ in range(num_worker)]
        predictions =   [[] for _ in range(num_worker)]
        score =         [0 for _ in range(num_worker)]

        state = [0 for _ in range(num_worker)]
        for worker_id, parent_conn in enumerate(parent_conns):
            state[worker_id] = parent_conn.recv()

        while self.episode < self.EPISODES:
            predictions_list = self.Actor.predict(np.reshape(state, [num_worker, self.state_size[0]]))
            actions_list = [np.random.choice(self.action_size, p=i) for i in predictions_list]

            for worker_id, parent_conn in enumerate(parent_conns):
                parent_conn.send(actions_list[worker_id])
                action_onehot = np.zeros([self.action_size])
                action_onehot[actions_list[worker_id]] = 1
                actions[worker_id].append(action_onehot)
                predictions[worker_id].append(predictions_list[worker_id])

            for worker_id, parent_conn in enumerate(parent_conns):
                next_state, reward, done, _ = parent_conn.recv()

                states[worker_id].append(state[worker_id])
                next_states[worker_id].append(next_state)
                rewards[worker_id].append(reward)
                dones[worker_id].append(done)
                state[worker_id] = next_state
                score[worker_id] += reward

                if done:
                    average, SAVING = self.PlotModel(score[worker_id], self.episode)
                    print("episode: {}/{}, worker: {}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, worker_id, score[worker_id], average, SAVING))
                    self.writer.add_scalar(f'Workers:{num_worker}/score_per_episode', score[worker_id], self.episode)
                    self.writer.add_scalar(f'Workers:{num_worker}/learning_rate', self.lr, self.episode)
                    score[worker_id] = 0
                    if(self.episode < self.EPISODES):
                        self.episode += 1
                        
            for worker_id in range(num_worker):
                if len(states[worker_id]) >= self.Training_batch:
                    self.replay(states[worker_id], actions[worker_id], rewards[worker_id], predictions[worker_id], dones[worker_id], next_states[worker_id])
                    
                    states[worker_id] = []
                    next_states[worker_id] = []
                    actions[worker_id] = []
                    rewards[worker_id] = []
                    dones[worker_id] = []
                    predictions[worker_id] = []

        # terminating processes after while loop
        works.append(work)
        for work in works:
            work.terminate()
            print('TERMINATED:', work)
            work.join()
            

    def test(self, test_episodes = 100):
        self.load()
        for e in range(100):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size[0]])
            done = False
            score = 0
            while not done:
                self.env.render()
                action = np.argmax(self.Actor.predict(state)[0])
                state, reward, done, _ = self.env.step(action)
                state = np.reshape(state, [1, self.state_size[0]])
                score += reward
                if done:
                    print("episode: {}/{}, score: {}".format(e, test_episodes, score))
                    break
        self.env.close()

if __name__ == "__main__":
    env_name = 'CartPole-v1'
    agent = PPOAgent(env_name)
    agent.run_batch() # train as PPO, train every epesode
    #agent.run_batch() # train as PPO, train every batch, trains better
    #agent.run_multiprocesses(num_worker = 8)  # train PPO multiprocessed (fastest)
    #agent.test()


In [None]:
#================================================================
#
#   File name   : BipedalWalker-v3_PPO
#   Author      : PyLessons
#   Created date: 2020-10-18
#   Website     : https://pylessons.com/
#   GitHub      : https://github.com/pythonlessons/Reinforcement_Learning
#   Description : BipedalWalker-v3 PPO continuous agent
#   TensorFlow  : 2.3.1
#
#================================================================
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # -1:cpu, 0:first gpu
import sys
import random
import gym
import pylab
import numpy as np
import tensorflow as tf
from tensorboardX import SummaryWriter
#tf.config.experimental_run_functions_eagerly(True) # used for debuging and development
tf.compat.v1.disable_eager_execution() # usually using this for fastest performance
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop, Adagrad, Adadelta
from tensorflow.keras import backend as K
import copy

from threading import Thread, Lock
from multiprocessing import Process, Pipe
import time

gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    print(f'GPUs {gpus}')
    try: tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError: pass

class Environment(Process):
    def __init__(self, env_idx, child_conn, env_name, state_size, action_size, visualize=False):
        super(Environment, self).__init__()
        self.env = gym.make(env_name)
        self.is_render = visualize
        self.env_idx = env_idx
        self.child_conn = child_conn
        self.state_size = state_size
        self.action_size = action_size

    def run(self):
        super(Environment, self).run()
        state = self.env.reset()
        state = np.reshape(state, [1, self.state_size])
        self.child_conn.send(state)
        while True:
            action = self.child_conn.recv()
            #if self.is_render and self.env_idx == 0:
                #self.env.render()

            state, reward, done, info = self.env.step(action)
            state = np.reshape(state, [1, self.state_size])

            if done:
                state = self.env.reset()
                state = np.reshape(state, [1, self.state_size])

            self.child_conn.send([state, reward, done, info])


class Actor_Model:
    def __init__(self, input_shape, action_space, lr, optimizer):
        X_input = Input(input_shape)
        self.action_space = action_space
        
        X = Dense(512, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X_input)
        X = Dense(256, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        X = Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        output = Dense(self.action_space, activation="tanh")(X)

        self.Actor = Model(inputs = X_input, outputs = output)
        self.Actor.compile(loss=self.ppo_loss_continuous, optimizer=optimizer(lr=lr))
        #print(self.Actor.summary())

    def ppo_loss_continuous(self, y_true, y_pred):
        tf.print("\n y_true:", type(y_true), output_stream=sys.stdout)
        advantages, actions, logp_old_ph, = y_true[:, :1], y_true[:, 1:1+self.action_space], y_true[:, 1+self.action_space]
        LOSS_CLIPPING = 0.2
        logp = self.gaussian_likelihood(actions, y_pred)

        ratio = K.exp(logp - logp_old_ph)

        p1 = ratio * advantages
        p2 = tf.where(advantages > 0, (1.0 + LOSS_CLIPPING)*advantages, (1.0 - LOSS_CLIPPING)*advantages) # minimum advantage

        actor_loss = -K.mean(K.minimum(p1, p2))

        return actor_loss

    def gaussian_likelihood(self, actions, pred): # for keras custom loss
        log_std = -0.5 * np.ones(self.action_space, dtype=np.float32)
        pre_sum = -0.5 * (((actions-pred)/(K.exp(log_std)+1e-8))**2 + 2*log_std + K.log(2*np.pi))
        return K.sum(pre_sum, axis=1)

    def predict(self, state):
        return self.Actor.predict(state)


class Critic_Model:
    def __init__(self, input_shape, action_space, lr, optimizer):
        X_input = Input(input_shape)
        old_values = Input(shape=(1,))

        V = Dense(512, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X_input)
        V = Dense(256, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(V)
        V = Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(V)
        value = Dense(1, activation=None)(V)

        self.Critic = Model(inputs=[X_input, old_values], outputs = value)
        self.Critic.compile(loss=[self.critic_PPO2_loss(old_values)], optimizer=optimizer(lr=lr))

    def critic_PPO2_loss(self, values):
        def loss(y_true, y_pred):
            LOSS_CLIPPING = 0.2
            clipped_value_loss = values + K.clip(y_pred - values, -LOSS_CLIPPING, LOSS_CLIPPING)
            v_loss1 = (y_true - clipped_value_loss) ** 2
            v_loss2 = (y_true - y_pred) ** 2
            
            value_loss = 0.5 * K.mean(K.maximum(v_loss1, v_loss2))
            #value_loss = K.mean((y_true - y_pred) ** 2) # standard PPO loss
            return value_loss
        return loss

    def predict(self, state):
        return self.Critic.predict([state, np.zeros((state.shape[0], 1))])
    

class PPOAgent:
    # PPO Main Optimization Algorithm
    def __init__(self, env_name, model_name=""):
        # Initialization
        # Environment and PPO parameters
        self.env_name = env_name       
        self.env = gym.make(env_name)
        self.action_size = self.env.action_space.shape[0]
        self.state_size = self.env.observation_space.shape
        self.EPISODES = 200000 # total episodes to train through all environments
        self.episode = 0 # used to track the episodes total count of episodes played through all thread environments
        self.max_average = 0 # when average score is above 0 model will be saved
        self.lr = 0.00025
        self.epochs = 10 # training epochs
        self.shuffle = True
        self.Training_batch = 512
        #self.optimizer = RMSprop
        self.optimizer = Adam

        self.replay_count = 0
        self.writer = SummaryWriter(comment="_"+self.env_name+"_"+self.optimizer.__name__+"_"+str(self.lr))
        
        # Instantiate plot memory
        self.scores_, self.episodes_, self.average_ = [], [], [] # used in matplotlib plots

        # Create Actor-Critic network models
        self.Actor = Actor_Model(input_shape=self.state_size, action_space = self.action_size, lr=self.lr, optimizer = self.optimizer)
        self.Critic = Critic_Model(input_shape=self.state_size, action_space = self.action_size, lr=self.lr, optimizer = self.optimizer)
        
        self.Actor_name = f"{self.env_name}_PPO_Actor.h5"
        self.Critic_name = f"{self.env_name}_PPO_Critic.h5"
        #self.load() # uncomment to continue training from old weights

        # do not change bellow
        self.log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
        self.std = np.exp(self.log_std)


    def act(self, state):
        # Use the network to predict the next action to take, using the model
        pred = self.Actor.predict(state)

        low, high = -1.0, 1.0 # -1 and 1 are boundaries of tanh
        action = pred + np.random.uniform(low, high, size=pred.shape) * self.std
        action = np.clip(action, low, high)
        
        logp_t = self.gaussian_likelihood(action, pred, self.log_std)
        print('act',self.std, self.log_std)
        print('logp_t',logp_t,'action',action, 'pred',pred)
        print()
        return action, logp_t

    def gaussian_likelihood(self, action, pred, log_std):
        # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/sac/policies.py
        pre_sum = -0.5 * (((action-pred)/(np.exp(log_std)+1e-8))**2 + 2*log_std + np.log(2*np.pi)) 
        return np.sum(pre_sum, axis=1)

    def discount_rewards(self, reward):#gaes is better
        # Compute the gamma-discounted rewards over an episode
        # We apply the discount and normalize it to avoid big variability of rewards
        gamma = 0.99    # discount rate
        running_add = 0
        discounted_r = np.zeros_like(reward)
        for i in reversed(range(0,len(reward))):
            running_add = running_add * gamma + reward[i]
            discounted_r[i] = running_add

        discounted_r -= np.mean(discounted_r) # normalizing the result
        discounted_r /= (np.std(discounted_r) + 1e-8) # divide by standard deviation
        return discounted_r

    def get_gaes(self, rewards, dones, values, next_values, gamma = 0.99, lamda = 0.90, normalize=True):
        deltas = [r + gamma * (1 - d) * nv - v for r, d, nv, v in zip(rewards, dones, next_values, values)]
        deltas = np.stack(deltas)
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(deltas) - 1)):
            gaes[t] = gaes[t] + (1 - dones[t]) * gamma * lamda * gaes[t + 1]

        target = gaes + values
        if normalize:
            gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
        return np.vstack(gaes), np.vstack(target)

    def replay(self, states, actions, rewards, dones, next_states, logp_ts):
        # reshape memory to appropriate shape for training
        states = np.vstack(states)
        next_states = np.vstack(next_states)
        actions = np.vstack(actions)
        logp_ts = np.vstack(logp_ts)

        # Get Critic network predictions 
        values = self.Critic.predict(states)
        next_values = self.Critic.predict(next_states)

        # Compute discounted rewards and advantages
        #discounted_r = self.discount_rewards(rewards)
        #advantages = np.vstack(discounted_r - values)
        advantages, target = self.get_gaes(rewards, dones, np.squeeze(values), np.squeeze(next_values))
        '''
        pylab.plot(adv,'.')
        pylab.plot(target,'-')
        ax=pylab.gca()
        ax.grid(True)
        pylab.subplots_adjust(left=0.05, right=0.98, top=0.96, bottom=0.06)
        pylab.show()
        if str(episode)[-2:] == "00": pylab.savefig(self.env_name+"_"+self.episode+".png")
        '''
        # stack everything to numpy array
        # pack all advantages, predictions and actions to y_true and when they are received
        # in custom loss function we unpack it
        y_true = np.hstack([advantages, actions, logp_ts])
        print('states')
        print(states[0:3])
        print('y_true')
        print(y_true[0:3])
            
        # training Actor and Critic networks
        a_loss = self.Actor.Actor.fit(states, y_true, epochs=self.epochs, verbose=1, shuffle=self.shuffle)
        c_loss = self.Critic.Critic.fit([states, values], target, epochs=self.epochs, verbose=1, shuffle=self.shuffle)

        print('advantages',advantages[0])
        print('preds',logp_ts[0], np.mean(logp_ts))
        print('actions',actions[0], np.mean(actions))
        print()
        print('actor loss', np.mean(a_loss.history['loss']))
        print('critic loss',np.mean(c_loss.history['loss']))
        print(a_loss.history['loss'])
        print()
        # calculate loss parameters (should be done in loss, but couldn't find working way how to do that with disabled eager execution)
        pred = self.Actor.predict(states)
        log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
        logp = self.gaussian_likelihood(actions, pred, log_std)
        approx_kl = np.mean(logp_ts - logp)
        approx_ent = np.mean(-logp)

        self.writer.add_scalar('Data/actor_loss_per_replay', np.sum(a_loss.history['loss']), self.replay_count)
        self.writer.add_scalar('Data/critic_loss_per_replay', np.sum(c_loss.history['loss']), self.replay_count)
        self.writer.add_scalar('Data/approx_kl_per_replay', approx_kl, self.replay_count)
        self.writer.add_scalar('Data/approx_ent_per_replay', approx_ent, self.replay_count)
        self.replay_count += 1
 
    def load(self):
        self.Actor.Actor.load_weights(self.Actor_name)
        self.Critic.Critic.load_weights(self.Critic_name)

    def save(self):
#         self.Actor.Actor.save_weights(self.Actor_name)
#         self.Critic.Critic.save_weights(self.Critic_name)
        pass

    def run_batch(self):
        state = self.env.reset()
        state = np.reshape(state, [1, self.state_size[0]])
        done, score, SAVING = False, 0, ''
        while True:
            # Instantiate or reset games memory
            states, next_states, actions, rewards, dones, logp_ts = [], [], [], [], [], []
            for t in range(self.Training_batch):
                #self.env.render()
                # Actor picks an action
                action, logp_t = self.act(state)
                # Retrieve new state, reward, and whether the state is terminal
                next_state, reward, done, _ = self.env.step(action[0])
                # Memorize (state, next_states, action, reward, done, logp_ts) for training
                states.append(state)
                next_states.append(np.reshape(next_state, [1, self.state_size[0]]))
                actions.append(action)
                rewards.append(reward)
                dones.append(done)
                logp_ts.append(logp_t[0])
                # Update current state shape
                state = np.reshape(next_state, [1, self.state_size[0]])
                score += reward
                if done:
                    self.episode += 1
                    average, SAVING = self.PlotModel(score, self.episode)
                    print("episode: {}/{}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, score, average, SAVING))
                    self.writer.add_scalar(f'Workers:{1}/score_per_episode', score, self.episode)
                    self.writer.add_scalar(f'Workers:{1}/learning_rate', self.lr, self.episode)
                    self.writer.add_scalar(f'Workers:{1}/average_score',  average, self.episode)
                    
                    state, done, score, SAVING = self.env.reset(), False, 0, ''
                    state = np.reshape(state, [1, self.state_size[0]])

            self.replay(states, actions, rewards, dones, next_states, logp_ts)
            if self.episode >= self.EPISODES:
                break

        self.env.close()



            
import pybullet_envs

ENV_NAME = 'InvertedPendulumBulletEnv-v0'

# newest gym fixed bugs in 'BipedalWalker-v2' and now it's called 'BipedalWalker-v3'
env_name = ENV_NAME
agent_2 = PPOAgent(env_name)
agent_2.run_batch() # train as PPO
    #agent.run_multiprocesses(num_worker = 16)  # train PPO multiprocessed (fastest)
    #agent.test()