In [1]:
import shutup
shutup.please()

In [2]:
import os
import numpy as np
import random
from datetime import datetime
from multiprocessing import cpu_count
from threading import Thread

from src.agents.agent import Agent
from src.utils.buffer import Buffer

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
import tensorflow_probability as tfp
from tensorflow.keras.layers import Input, Dense, Concatenate, Lambda
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import Model

from tensorflow.python.framework.ops import disable_eager_execution

disable_eager_execution()


In [3]:
import numpy as np

class ReplayBuffer:
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.size = 0

    def reset(self):
        self.size = 0
        self.states.clear()
        self.actions.clear()
        self.rewards.clear()

    def remember(self, state, action_onehot, reward ):
        self.size +=1
        self.states.append(state)
        self.actions.append(action_onehot)
        self.rewards.append(reward)

#     def sample(self, batch_size=64):
#         max_mem = min(self.buffer_counter, self.buffer_size)

#         batch = np.random.choice(max_mem, batch_size)

#         states = self.state_memory[batch]
#         states_ = self.new_state_memory[batch]
#         actions = self.action_memory[batch]
#         rewards = self.reward_memory[batch]
#         dones = self.done_memory[batch]

#         return states, actions, rewards, states_, dones

In [8]:
GLOBAL_EPISODE_NUM = 0

class A3CWorker(Thread):
    def __init__(self, 
                worker_id,
                env, 
                global_actor, 
                global_critic, 
                action_space_mode,
                observation_shape,
                policy,
                n_actions,
                actor_optimizer,
                critic_optimizer,
                std_bound,
                max_episodes = 10000):
        Thread.__init__(self)
        
        self.worker_id = worker_id
        self.env = env
        self.action_space_mode = action_space_mode
        self.observation_shape = observation_shape
        self.policy = policy
        self.n_actions = n_actions
        self.actor_optimizer = actor_optimizer
        self.critic_optimizer = critic_optimizer
        self.std_bound = std_bound
        self.max_episodes = 10000

        self.global_actor = global_actor
        self.global_critic = global_critic
        self.__init_networks()

        #self.actor.set_weights(self.global_actor.get_weights())
        #self.critic.set_weights(self.global_critic.get_weights())

    def __init_networks(self):
        X_input = Input(shape=self.observation_shape) 
        X = CommonLayer(X_input,self.policy,rename=False)
        
        action = Dense(self.n_actions, activation="softmax", kernel_initializer='he_uniform')(X)
        value = Dense(1, kernel_initializer='he_uniform')(X)
        
        if self.action_space_mode == "discrete":
            action = Dense(self.n_actions, activation="softmax", kernel_initializer='he_uniform')(X)
            self.actor = Model(inputs = X_input, outputs = action)
            self.actor.compile(loss='categorical_crossentropy', optimizer=self.actor_optimizer)
        else:
            mu = Dense(self.n_actions, activation="tanh", kernel_initializer='he_uniform')(X)
            mu = Lambda(lambda x: x * self.action_bound)(mu)
            sigma = Dense(self.n_actions, activation="softplus", kernel_initializer='he_uniform')(X)
            
            self.actor = Model(inputs = X_input, outputs = Concatenate()([mu,sigma]))
            self.actor.compile(loss=self.continuous_actor_loss, optimizer=self.actor_optimizer)
        
        self.critic = Model(inputs = X_input, outputs = value)
        self.critic.compile(loss='mse', optimizer=self.critic_optimizer)
    
    def __init_buffers(self):
        self.buffer = ReplayBuffer()
        
    def log_pdf(self,mu, sigma, action):
        std = tf.clip_by_value(sigma, self.std_bound[0], self.std_bound[1])
        var = std ** 2
        log_policy_pdf = -0.5 * (action - mu) ** 2 / var - 0.5 * tf.math.log(
            var * 2 * np.pi
        )
        return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)
    
    def continuous_actor_loss(self, y_true, y_pred):
        actions, advantages = y_true[:, :1], y_true[:, 1:]
        mu,sigma = y_pred[:,:1], y_pred[:,1:]
        log_policy_pdf = self.log_pdf(mu,sigma,actions)
        loss_policy = log_policy_pdf * advantages
        
        return tf.reduce_sum(-loss_policy)

    def act(self,state):
  
        if self.action_space_mode == "discrete":
            prediction = self.global_actor.predict(state)[0]
            action = np.random.choice(self.n_actions, p=prediction)
            action_onehot = np.zeros([self.n_actions])
            action_onehot[action] = 1
        else:
            prediction = self.global_actor.predict(state)[0]
            mu = prediction[0]
            sigma = prediction[1]
            action = np.random.normal(mu, sigma,self.n_actions)
            action = np.clip(action, -self.action_bound, self.action_bound)
            action_onehot = action
        return action, action_onehot, prediction
    
    def discount_rewards(self, reward):
        # Compute the gamma-discounted rewards over an episode
        gamma = 0.99    # discount rate
        running_add = 0
        discounted_r = np.zeros_like(reward)
        for i in reversed(range(0,len(reward))):
            running_add = running_add * self.gamma + reward[i]
            discounted_r[i] = running_add

        discounted_r -= np.mean(discounted_r) # normalizing the result
        discounted_r /= (np.std(discounted_r) + 1e-8) # divide by standard deviation
        
        return discounted_r
    
    def replay(self):

        if self.buffer.size > 1:
            # reshape memory to appropriate shape for training
            states = np.vstack(self.buffer.states)
            actions = np.vstack(self.buffer.actions)

            # Compute discounted rewards
            discounted_r = self.discount_rewards(self.buffer.rewards)

            # Get Critic network predictions
            values = self.global_critic.predict(states)[:, 0]
            # Compute advantages
            advantages = discounted_r - values
            # training Actor and Critic networks


            if self.action_space_mode == "discrete":
                self.global_actor.fit(states, actions, sample_weight=advantages, epochs=1, verbose=0)
            else:
                self.global_actor.fit(states,np.concatenate([actions,np.reshape(advantages,newshape=(len(advantages),1))],axis=1), epochs=1,verbose=0)

            self.global_critic.fit(states, discounted_r, epochs=1, verbose=0)
            
            # Reset weights
            self.actor.set_weights(self.global_actor.get_weights())
            self.critic.set_weights(
                self.global_critic.get_weights()
            )
            # reset training memory
            self.buffer.reset()
        

    def learn(self):
        global GLOBAL_EPISODE_NUM
        while self.max_episodes >= GLOBAL_EPISODE_NUM:
            state_batch = []
            action_batch = []
            reward_batch = []
            episode_reward, done = 0, False

            state = self.env.reset()

            while not done:
                # self.env.render()
                state = np.expand_dims(state, axis=0)
                action = self.act(state)
                action = np.clip(action, -self.action_bound, self.action_bound)

                next_state, reward, done, _ = self.env.step(action)

                state = np.reshape(state, [1, self.state_dim])
                action = np.reshape(action, [1, 1])
                next_state = np.reshape(next_state, [1, self.state_dim])
                reward = np.reshape(reward, [1, 1])
                state_batch.append(state)
                action_batch.append(action)
                reward_batch.append(reward)

                if len(state_batch) >= args.update_interval or done:
                    states = np.array([state.squeeze() for state in state_batch])
                    actions = np.array([action.squeeze() for action in action_batch])
                    rewards = np.array([reward.squeeze() for reward in reward_batch])
                    next_v_value = self.critic.model.predict(next_state)
                    td_targets = self.n_step_td_target(rewards, next_v_value, done)
                    advantages = td_targets - self.critic.model.predict(states)

                    actor_loss = self.global_actor.train(states, actions, advantages)
                    critic_loss = self.global_critic.train(states, td_targets)

                    self.actor.model.set_weights(self.global_actor.model.get_weights())
                    self.critic.model.set_weights(
                        self.global_critic.model.get_weights()
                    )

                    state_batch = []
                    action_batch = []
                    reward_batch = []

                episode_reward += reward[0][0]
                state = next_state[0]

            print(f"Episode#{GLOBAL_EPISODE_NUM}, Worker#{self.worker_id}, Reward:{episode_reward}")
            tf.summary.scalar("episode_reward", episode_reward, step=GLOBAL_EPISODE_NUM)
            GLOBAL_EPISODE_NUM += 1

    def run(self):
        self.learn()

In [9]:
from src.agents.agent import Agent
from src.utils.networks import CommonLayer
    

class A3CAgent(Agent):
    def __init__(self,
        environment,
        gamma = 0.99,
        policy="mlp",
        actor_optimizer=RMSprop(0.0001),
        critic_optimizer=RMSprop(0.0001),
        std_bound = [1e-2, 1.0],
        batch_size=64,
        n_workers=cpu_count()
    ):
        
        super(A3CAgent, self).__init__(environment,args=locals())
        
        
        # Args
        self.environment = environment
        self.gamma = gamma
        self.std_bound = std_bound
        self.batch_size = batch_size
        self.policy = policy 
        self.actor_optimizer=actor_optimizer
        self.critic_optimizer=critic_optimizer
        self.n_workers = n_workers

        # Bootstrap
        self.__init_networks()
        self.__init_buffers()
        self._add_models_to_config([self.global_actor,self.global_critic])
        
    def __init_networks(self):
        X_input = Input(shape=self.observation_shape) 
        X = CommonLayer(X_input,self.policy)
        
        action = Dense(self.n_actions, activation="softmax", kernel_initializer='he_uniform')(X)
        value = Dense(1, kernel_initializer='he_uniform')(X)
        
        if self.action_space_mode == "discrete":
            action = Dense(self.n_actions, activation="softmax", kernel_initializer='he_uniform')(X)
            self.global_actor = Model(inputs = X_input, outputs = action)
            self.global_actor.compile(loss='categorical_crossentropy', optimizer=self.actor_optimizer)
        else:
            mu = Dense(self.n_actions, activation="tanh", kernel_initializer='he_uniform')(X)
            mu = Lambda(lambda x: x * self.action_bound)(mu)
            sigma = Dense(self.n_actions, activation="softplus", kernel_initializer='he_uniform')(X)
            
            self.global_actor = Model(inputs = X_input, outputs = Concatenate()([mu,sigma]))
            self.global_actor.compile(loss=self.continuous_actor_loss, optimizer=self.actor_optimizer)
        
        self.global_critic = Model(inputs = X_input, outputs = value)
        self.global_critic.compile(loss='mse', optimizer=self.critic_optimizer)
    
    def __init_buffers(self):
        self.buffer = ReplayBuffer()
        
    def log_pdf(self,mu, sigma, action):
        std = tf.clip_by_value(sigma, self.std_bound[0], self.std_bound[1])
        var = std ** 2
        log_policy_pdf = -0.5 * (action - mu) ** 2 / var - 0.5 * tf.math.log(
            var * 2 * np.pi
        )
        return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)
    
    def continuous_actor_loss(self, y_true, y_pred):
        actions, advantages = y_true[:, :1], y_true[:, 1:]
        mu,sigma = y_pred[:,:1], y_pred[:,1:]
        log_policy_pdf = self.log_pdf(mu,sigma,actions)
        loss_policy = log_policy_pdf * advantages
        
        return tf.reduce_sum(-loss_policy)

    

    def learn(self, timesteps=-1, plot_results=True, reset=False, success_threshold=False, log_level=1, log_each_n_episodes=50,max_episodes=10000):
        workers = []

        for i in range(self.n_workers):
            env = self.environment()
            workers.append(
                A3CWorker(
                    i,
                    self.env, 
                    self.global_actor, 
                    self.global_critic, 
                    self.action_space_mode,
                    self.observation_shape,
                    self.policy,
                    self.n_actions,
                    self.actor_optimizer,
                    self.critic_optimizer,
                    self.std_bound,
                    max_episodes
                )
            )

        for worker in workers:
            worker.start()

        for worker in workers:
            worker.join()

In [10]:
from src.environments.discrete.cartpole import environment
agent = A3CAgent(environment, n_workers=1)
agent.learn()

Exception in thread Thread-6:
Traceback (most recent call last):
  File "C:\Users\filip\anaconda3\envs\ai_4\lib\threading.py", line 1009, in _bootstrap_inner
    self.run()
  File "C:\Users\filip\AppData\Local\Temp\ipykernel_25144\3697331783.py", line 195, in run
  File "C:\Users\filip\AppData\Local\Temp\ipykernel_25144\3697331783.py", line 154, in learn
  File "C:\Users\filip\AppData\Local\Temp\ipykernel_25144\3697331783.py", line 81, in act
  File "C:\Users\filip\anaconda3\envs\ai_4\lib\site-packages\keras\engine\training_v1.py", line 970, in predict
    return func.predict(
  File "C:\Users\filip\anaconda3\envs\ai_4\lib\site-packages\keras\engine\training_arrays_v1.py", line 700, in predict
    return predict_loop(
  File "C:\Users\filip\anaconda3\envs\ai_4\lib\site-packages\keras\engine\training_arrays_v1.py", line 178, in model_iteration
    f = _make_execution_function(model, mode)
  File "C:\Users\filip\anaconda3\envs\ai_4\lib\site-packages\keras\engine\training_arrays_v1.py", l


| ---------------------------------
| CartPole-v1
| Action space: Discrete with high state-space
| Environment beated threshold: 200
| Dev notes:
|   * Agents that track State/Action combinations like 
|     Q learning will fail due to high state space
| ----------------------------------------------------------   



| ---------------------------------
| CartPole-v1
| Action space: Discrete with high state-space
| Environment beated threshold: 200
| Dev notes:
|   * Agents that track State/Action combinations like 
|     Q learning will fail due to high state space
| ----------------------------------------------------------   


<keras.engine.functional.Functional object at 0x000002169B45FF10>


In [7]:
#from src.environments.continuous.inverted_pendulum import environment
#agent = A2CAgent(environment)
#agent.learn()



NameError: name 'A2CAgent' is not defined