## **Overview**

This code is adapted from a YouTube tutorial by brthor. (Source: https://youtu.be/q-IPTPS9hgs?si=qZvMP9lkqZYbyfvO, retrieved in January 2023.)

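Asynchronous Methods for Deep Reinforcement Learning (paper, Mnih et al., 2016): the A3C algorithm introduced by DeepMind. (DeepMind's earlier paper, "Human-level control through deep reinforcement learning", introduced DQN rather than A3C.)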

The A3C (Asynchronous Advantage Actor-Critic) algorithm is a deep reinforcement learning method that learns directly from high-dimensional sensory inputs, such as images in Atari games.

It combines policy-based and value-based methods to improve sample efficiency and stability.

A3C agents are saved using serialization libraries and can be visualized using TensorBoard to monitor training dynamics and algorithm convergence. Preprocessing steps are necessary for Atari games.


## **Set up**

In [1]:
!pip install "gym[atari, accept-rom-license]"



# **Importing all the necessary Libraries:**

In [2]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.optimizers import Adam
import threading
import os
import cv2
import multiprocessing

# **Hyperparameters**
Learning Rate (LR): Learning rate for the optimizer

Discount Factor (GAMMA): Discount factor for future rewards

Number of Workers (NUM_WORKERS): Number of worker threads (parallel agents)

Number of Steps (NUM_STEPS): Number of steps each worker takes before updating the network

Maximum Gradient Norm (MAX_GRAD_NORM): Maximum gradient norm for gradient clipping

Value Loss Coefficient (VALUE_LOSS_COEFF): Value function loss coefficient

Entropy Coefficient (ENTROPY_COEFF): Entropy coefficient to encourage exploration

Replay Buffer Size (BUFFER_SIZE): Size of the replay buffer

In [3]:
# Hyperparameters
#
# NOTE(review): none of the code visible in this notebook reads these names
# (A3CAgent sets its own learning rate and frame shape) -- confirm where
# they are consumed before relying on them.

# Optimisation settings: optimizer step size, discount for future rewards,
# and the entropy coefficient.
learning_rate = 0.001
discount_factor = 0.99
entropy_beta = 0.01

# Observation shape, assuming frames pre-processed to 84x84 and stacked 4 deep.
# NOTE(review): preprocess() yields 80x80x1 frames -- confirm which is intended.
state_size = (84, 84, 4)

# Experience replay: memory capacity and sampling batch size.
memory_size = 10_000
batch_size = 32


  and should_run_async(code)


# **Pre-processing Function:**

In [4]:
def preprocess(frame):
    """Turn a raw frame into a scaled 80x80 single-channel image.

    The frame is resized to 80x80 with bilinear interpolation, converted
    from RGB to grayscale, and divided by 255.0.

    Args:
        frame: Image array accepted by ``cv2.resize``; the colour
            conversion treats its channels as RGB.

    Returns:
        Array of shape (80, 80, 1): the scaled grayscale image with a
        trailing channel axis.
    """
    small = cv2.resize(frame, (80, 80), interpolation=cv2.INTER_LINEAR)
    gray = cv2.cvtColor(small, cv2.COLOR_RGB2GRAY)
    # Index with np.newaxis to append the channel dimension.
    return (gray / 255.0)[:, :, np.newaxis]

# **Defining Actor-Critic Network**

In [5]:
class ActorCriticModel(Model):
    """Two-headed network built on one shared hidden layer.

    The input is flattened, passed through a 128-unit ReLU layer, and fed
    to two linear heads: one emitting ``action_size`` policy logits and one
    emitting a single value.
    """

    def __init__(self, action_size):
        """Create the layers.

        Args:
            action_size: Number of units in the policy-logit head.
        """
        super().__init__()
        # Shared trunk.
        self.dense1 = Dense(128, activation='relu')
        self.flatten = Flatten()
        # Output heads (no activation).
        self.policy_logits = Dense(action_size)
        self.values = Dense(1)

    def call(self, inputs):
        """Run a forward pass.

        Args:
            inputs: Batch of observations; flattened before the dense layer.

        Returns:
            Tuple ``(logits, values)`` from the policy head and value head.
        """
        hidden = self.dense1(self.flatten(inputs))
        return self.policy_logits(hidden), self.values(hidden)


# **Class definition and training phase of the agent**:
This code creates an A3C agent that holds a global model and uses several worker threads for asynchronous training. The train function coordinates the creation, start-up, and joining of the worker threads.

In [6]:
class A3CAgent:
    """Holds the shared (global) actor-critic model and launches the workers.

    ``__init__`` sets ``env_name``, ``env``, ``state_size``, ``action_size``,
    ``optimizer`` and ``global_model``; ``train`` hands the model and the
    optimizer to every ``Worker`` thread it creates.
    """

    def __init__(self, env_name, learning_rate=0.001):
        """Create the environment, the optimizer and the global model.

        Args:
            env_name: Gym environment id passed to ``gym.make``.
            learning_rate: Step size for the Adam optimizer. Defaults to
                0.001, the value that was previously hard-coded.
        """
        self.env_name = env_name
        # This environment is only used here to read the action count.
        self.env = gym.make(env_name)
        self.state_size = (80, 80, 1)  # Processed frame dimensions
        self.action_size = self.env.action_space.n
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        self.optimizer = Adam(learning_rate=learning_rate)
        self.global_model = ActorCriticModel(self.action_size)
        # Call the model once on a dummy batch so its layers create weights.
        dummy_batch = np.random.random((1, *self.state_size))
        self.global_model(tf.convert_to_tensor(dummy_batch, dtype=tf.float32))

    def train(self, max_episodes=1000, num_workers=None):
        """Start the worker threads and block until all of them finish.

        Args:
            max_episodes: Number of episodes each worker runs.
            num_workers: Number of worker threads. ``None`` (the default)
                uses ``multiprocessing.cpu_count()``, as before.

        Raises:
            ValueError: If ``num_workers`` is given and is less than 1.
        """
        if num_workers is None:
            num_workers = multiprocessing.cpu_count()
        if num_workers < 1:
            raise ValueError(f"num_workers must be >= 1, got {num_workers}")

        workers = [
            Worker(
                self.global_model,
                self.optimizer,
                self.env_name,
                f'worker{i}',
                max_episodes,
                self.action_size,
                self.state_size,
            )
            for i in range(num_workers)
        ]

        for worker in workers:
            worker.start()

        for worker in workers:
            worker.join()

# **Worker thread in A3C:**
Worker threads act as independent agents, collecting experience and potentially contributing to model updates through gradients or experience sharing.

In [7]:
class Worker(threading.Thread):
    """Thread that plays episodes in its own environment with a local model.

    NOTE(review): ``run`` only samples actions from ``local_model`` and
    steps the environment. ``global_model`` and ``optimizer`` are stored but
    never used and rewards are discarded, so no learning update is performed
    by this class as written.
    """

    def __init__(self, global_model, optimizer, env_name, name, max_episodes, action_size, state_size):
        """Store the shared objects and create a private env and model.

        Args:
            global_model: Shared model (stored, not used by ``run``).
            optimizer: Shared optimizer (stored, not used by ``run``).
            env_name: Gym environment id passed to ``gym.make``.
            name: Thread name; also the prefix of the progress message.
            max_episodes: Number of episodes ``run`` plays.
            action_size: Number of discrete actions to sample from.
            state_size: Processed frame shape (stored, not used by ``run``).
        """
        super().__init__()
        self.global_model = global_model
        self.optimizer = optimizer
        self.env = gym.make(env_name)
        self.local_model = ActorCriticModel(action_size)
        self.name = name
        self.max_episodes = max_episodes
        self.state_size = state_size
        self.action_size = action_size

    @staticmethod
    def _reset_env(env):
        """Reset *env* and return only the initial observation."""
        result = env.reset()
        # Gym >= 0.26 returns (observation, info); older releases return the
        # observation alone.
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            return result[0]
        return result

    @staticmethod
    def _step_env(env, action):
        """Step *env* and return ``(observation, reward, done)``."""
        result = env.step(action)
        if len(result) == 5:
            # Gym >= 0.26: the episode ends on termination or truncation.
            observation, reward, terminated, truncated, _ = result
            return observation, reward, bool(terminated or truncated)
        observation, reward, done, _ = result
        return observation, reward, bool(done)

    def run(self):
        """Play ``max_episodes`` episodes, sampling actions from the local model.

        The environment is closed when the loop ends, including when an
        exception escapes it.
        """
        try:
            for episode in range(self.max_episodes):
                current_state = preprocess(self._reset_env(self.env))
                done = False
                while not done:
                    # Add a leading batch axis before calling the model.
                    batch = tf.convert_to_tensor(current_state[None, :], dtype=tf.float32)
                    logits, _ = self.local_model(batch)
                    probs = tf.nn.softmax(logits).numpy()[0]
                    action = np.random.choice(self.action_size, p=probs)
                    new_state, _reward, done = self._step_env(self.env, action)
                    current_state = preprocess(new_state)
                print(f"{self.name} Finished Episode {episode}")
        finally:
            self.env.close()


In [None]:
if __name__ == "__main__":
    # Build an agent for the 'Breakout-v0' environment, then train with
    # train()'s default arguments; the call returns once every worker
    # thread has been joined.
    agent = A3CAgent('Breakout-v0')
    agent.train()

  logger.warn(
  deprecation(
  deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.deprecation(


worker1 Finished Episode 0
worker0 Finished Episode 0
worker1 Finished Episode 1
worker1 Finished Episode 2
worker0 Finished Episode 1
worker1 Finished Episode 3
worker0 Finished Episode 2
worker1 Finished Episode 4
worker0 Finished Episode 3
worker1 Finished Episode 5
worker0 Finished Episode 4
worker1 Finished Episode 6
worker0 Finished Episode 5
worker1 Finished Episode 7
worker0 Finished Episode 6
worker1 Finished Episode 8
worker1 Finished Episode 9
worker0 Finished Episode 7
worker1 Finished Episode 10
worker0 Finished Episode 8
worker1 Finished Episode 11
worker1 Finished Episode 12
worker0 Finished Episode 9
worker1 Finished Episode 13
worker0 Finished Episode 10
worker1 Finished Episode 14
worker0 Finished Episode 11
worker1 Finished Episode 15
worker1 Finished Episode 16
worker0 Finished Episode 12
worker0 Finished Episode 13
worker1 Finished Episode 17
worker0 Finished Episode 14
worker1 Finished Episode 18
worker1 Finished Episode 19
worker0 Finished Episode 15
worker1 Fini