Imports

In [8]:
import numpy as np
import tensorflow as tf
from keras.models import Model
from sympy.core import function
from collections import deque
import matplotlib.pyplot as plt
from datetime import date, datetime
import gym
import random

Global variables

In [20]:
# When True, the training code prints verbose diagnostics and the progress bar is suppressed.
DEBUG_PRINT = False
# Maximum number of experiences kept in each Agent's replay buffer.
REPLAY_BUFFER_LEN = 10000
# Default discount factor (gamma) of the learning functions.
DISCOUNT_FACTOR = 0.99
# Default number of experiences sampled from the replay buffer per training step.
BATCH_SIZE = 64
NUM_BATCHES = 4  # NOTE(review): not referenced in the visible part of this file
# Default upper bound on the number of environment steps in one episode.
STEPS_LIMIT = 1000
# Prefix of the TensorBoard log directories; the learning functions append
# '/<run name>/...' to it, so the empty default yields paths starting with '/'.
LOG_PATH = ''

# gym environment ids used by the "classic" and "atari" learning functions.
CLASSIC_ENVIRONMENT_NAME = 'CartPole-v1'
ATARI_ENVIRONMENT_NAME = 'DemonAttack-v4'

# Defaults of the learning functions' keyword arguments of the same name.
policy_learning_rate = 1e-3
value_learning_rate = 1e-3
episodes_no = 500


Policies

In [10]:

def random_policy(action_space, state, model):
    """
    Policy that ignores the state and the model and draws an action
    directly from the action space.

    Arguments:
    @ action_space - object exposing sample()
    @ state - unused, kept so all policies share one signature
    @ model - unused, kept so all policies share one signature
    """
    sampled_action = action_space.sample()
    return sampled_action

def network_model_policy(action_space, state, model):
    """
    Policy that samples an action index from the probabilities returned by the model.

    Arguments:
    @ action_space - object exposing n, the number of discrete actions
    @ state - single observation; a leading batch axis is added before the model call
    @ model - callable returning action probabilities for a batch of states
    """
    batched_state = tf.expand_dims(state, 0)
    action_probs = model(batched_state)
    if DEBUG_PRINT:
        print(f"Action probs: {action_probs}")
    # Drop the batch axis so the probabilities form a 1-D vector for the sampler
    probabilities = np.squeeze(action_probs)
    return np.random.choice(action_space.n, p=probabilities)



Losses and metrics

In [11]:
def policy_loss(target, output_value):
    """
    Policy-gradient loss: -target * log(output_value), element-wise.

    Arguments:
    @ target - weight of each sample (return or advantage)
    @ output_value - probability the policy assigned to the action taken
    """
    log_probability = tf.math.log(output_value)
    return tf.multiply(-target, log_probability)


def value_loss(target, output_value):
    """
    Surrogate loss of the value network: -target * output_value, element-wise.

    Arguments:
    @ target - weight of each sample (the callers pass the error delta)
    @ output_value - value predicted by the network
    """
    negated_target = -target
    return tf.multiply(negated_target, output_value)

def value_mse(target, output_value):
    """
    Element-wise squared difference (target - output_value)^2.
    """
    squared_error = tf.math.squared_difference(target, output_value)
    return squared_error


Network builders

In [12]:

def dense_policy_network(input_shape, outputs_no):
    """
    Build a fully connected policy network: one hidden relu layer of 128 units
    followed by a softmax over outputs_no actions.

    Arguments:
    @ input_shape - shape of a single observation (without the batch axis)
    @ outputs_no - number of actions, i.e. size of the softmax output
    """
    layers = tf.keras.layers
    observations = layers.Input(shape=input_shape)
    hidden = layers.Dense(
        128,
        activation='relu',
        kernel_initializer='random_normal',
        bias_initializer='zeros',
    )(observations)
    logits = layers.Dense(
        outputs_no,
        kernel_initializer='random_normal',
        bias_initializer='zeros',
    )(hidden)
    probabilities = layers.Softmax()(logits)
    return Model(inputs=observations, outputs=probabilities)


def conv_policy_network(input_shape, outputs_no):
    """
    Build a convolutional policy network for image observations.

    Pixels are cast to float32 and divided by 255, passed through two relu
    convolutions (64 filters each, 4x4 then 8x8 kernels, 'same' padding),
    flattened, and mapped by a 128-unit relu layer to a softmax over
    outputs_no actions.

    Arguments:
    @ input_shape - shape of a single observation (without the batch axis)
    @ outputs_no - number of actions, i.e. size of the softmax output
    """
    layers = tf.keras.layers
    observations = layers.Input(shape=input_shape)
    scaled = layers.Lambda(lambda obs: tf.cast(obs, np.float32)/255.)(observations)
    features = layers.Conv2D(filters=64, kernel_size=(4, 4), padding='same', activation='relu')(scaled)
    features = layers.Conv2D(filters=64, kernel_size=(8, 8), padding='same', activation='relu')(features)
    flat = layers.Flatten()(features)
    hidden = layers.Dense(128, activation='relu')(flat)
    logits = layers.Dense(outputs_no)(hidden)
    probabilities = layers.Softmax()(logits)
    return Model(inputs=observations, outputs=probabilities)


def dense_value_network(input_shape):
    """
    Build a fully connected state-value network: one hidden relu layer of
    128 units followed by a single linear output.

    Arguments:
    @ input_shape - shape of a single observation (without the batch axis)
    """
    layers = tf.keras.layers
    observations = layers.Input(shape=input_shape)
    hidden = layers.Dense(128, activation='relu', kernel_initializer='random_normal')(observations)
    state_value = layers.Dense(1)(hidden)
    return Model(inputs=observations, outputs=state_value)


def conv_value_network(input_shape):
    """
    Build a convolutional state-value network for image observations.

    Pixels are cast to float32 and divided by 255, passed through two relu
    convolutions (64 filters each, 4x4 then 8x8 kernels, 'same' padding),
    flattened, and mapped by a 128-unit relu layer to a single linear output.

    Arguments:
    @ input_shape - shape of a single observation (without the batch axis)
    """
    x = tf.keras.layers.Input(shape = input_shape)
    h = tf.keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32)/255.)(x)
    # Conv2D has no activation by default; without one the two convolutions
    # compose into a single linear map. Use relu like conv_policy_network does.
    h = tf.keras.layers.Conv2D(filters = 64, kernel_size = (4,4), padding='same', activation = 'relu')(h)
    h = tf.keras.layers.Conv2D(filters = 64, kernel_size = (8,8), padding='same', activation = 'relu')(h)
    h = tf.keras.layers.Flatten()(h)
    h = tf.keras.layers.Dense(128, activation='relu')(h)
    y = tf.keras.layers.Dense(1)(h)
    model = Model(inputs = x, outputs = y)
    return model


Network training functions

In [13]:
def policy_training_step(model, states, actions, targets, optimizer):
    """
    One gradient update of the policy network.

    Arguments:
    @ model - policy network returning action probabilities per state
    @ states - batch of states
    @ actions - index of the action taken in each state
    @ targets - weight of each sample, passed to policy_loss
    @ optimizer - optimizer that applies the gradients to the model

    Returns the sum of the per-sample losses.
    """
    with tf.GradientTape() as tape:
        output_values = model(states)
        # Keep only the probability of the action that was actually performed
        action_outputs = []
        for sample_index in range(len(states)):
            action_outputs.append(output_values[sample_index][actions[sample_index]])
        loss = policy_loss(targets, action_outputs)
        if DEBUG_PRINT:
            print(f" |||| Policy |||| Targets: {targets}, outputs: {np.array(action_outputs)} and loss: {loss}")

    trainable = model.trainable_variables
    policy_grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(policy_grads, trainable))
    return tf.reduce_sum(loss)

def value_training_step(model, states, targets, optimizer, metrics):
    """
    Training step for value network for reinforce with baseline

    Performs the semi-gradient update w <- w + alpha * delta * grad v(S,w)
    with delta = G_t - v(S,w) by minimising -delta * v(S,w), where delta is
    held constant.

    Arguments:
    @ model - value network, one output per state
    @ states - batch of states
    @ targets - return G_t for each state; any shape with one element per state
    @ optimizer - optimizer that applies the gradients to the model
    @ metrics - metric object updated with (targets, predictions)

    Returns the sum of the per-sample surrogate losses.
    """
    # Flatten both sides to (batch,). Subtracting a (batch, 1) prediction from
    # a (batch,) target would otherwise broadcast to (batch, batch).
    targets = tf.reshape(tf.cast(targets, tf.float32), [-1])
    with tf.GradientTape() as tape:
        output_values = tf.reshape(model(states), [-1])
        # delta = G_t - v(S,w); it must not be differentiated, otherwise the
        # loss -(G - v) * v has its minimum at v = G / 2 instead of v = G
        deltas = tf.stop_gradient(targets - output_values)
        loss = value_loss(deltas, output_values)
        if DEBUG_PRINT:
            print(f" |||| Value |||| Targets: {targets}, outputs: {output_values}, deltas: {deltas} and loss: {loss}")
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    # Track the error between the returns and the predictions used for this update
    metrics.update_state(targets, output_values)
    return tf.reduce_sum(loss)

def a2c_value_training_step(model, states, rewards, next_states, dones,  optimizer, metrics, discount_factor):
    """
    Training step for value network in actor critic

    Performs the semi-gradient TD(0) update by minimising -delta * v(S,w),
    where delta = target - v(S,w) is held constant and
    target = R + gamma * v(S',w), or target = R if S' is terminal.

    Arguments:
    @ model - value network, one output per state
    @ states - batch of states S
    @ rewards - reward R received after acting in each state
    @ next_states - batch of successor states S'
    @ dones - flag per sample, true when S' is terminal
    @ optimizer - optimizer that applies the gradients to the model
    @ metrics - metric object updated with (targets, predictions)
    @ discount_factor - gamma

    Returns the sum of the per-sample surrogate losses.
    """
    # Work on flat (batch,) tensors so that (batch,) rewards and (batch, 1)
    # network outputs do not broadcast to (batch, batch)
    rewards = tf.reshape(tf.cast(rewards, tf.float32), [-1])
    # 1.0 for non-terminal successors, 0.0 for terminal ones; unlike `~dones`
    # this is also correct for plain Python bools
    not_done = 1.0 - tf.reshape(tf.cast(dones, tf.float32), [-1])
    next_values = tf.reshape(model(next_states), [-1])
    # The bootstrap target is treated as a constant (semi-gradient method)
    targets = tf.stop_gradient(rewards + discount_factor * next_values * not_done)
    with tf.GradientTape() as tape:
        output_values = tf.reshape(model(states), [-1])
        # delta = target - v(S,w); must not be differentiated through
        deltas = tf.stop_gradient(targets - output_values)
        loss = value_loss(deltas, output_values)
        if DEBUG_PRINT:
            print(f" |||| Value |||| Targets: {targets}, outputs: {output_values}, deltas: {deltas} and loss: {loss}")
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    # Track the error between the TD targets and the predictions used for this update
    metrics.update_state(targets, output_values)
    return tf.reduce_sum(loss)

Definition of Agent and ReplayBuffer classes

In [24]:
class ReplayBuffer:
    """
    Bounded store of experience tuples with uniform random sampling
    """

    def __init__(self, max_length) -> None:
        # A deque with maxlen discards its oldest entry when a new one
        # is appended to a full buffer
        self.buffer = deque(maxlen=max_length)

    def add_experience(self, experience : tuple):
        """
        Append a single experience tuple to the buffer
        """
        self.buffer.append(experience)

    def get_random_experience_batch(self, batch_size):
        """
        Draw batch_size experiences uniformly at random, with replacement,
        and regroup them field by field.
        Arguments:
        @ batch_size - number of desired samples in batch
        Returns a list with one numpy array per experience field.
        """
        picked = [random.choice(self.buffer) for _ in range(batch_size)]
        fields_no = len(picked[0])
        columns = []
        for field_index in range(fields_no):
            column = [experience[field_index] for experience in picked]
            columns.append(np.array(column))
        return columns

    def clear(self):
        """
        Remove every stored experience
        """
        self.buffer.clear()

    @property
    def len(self):
        # Number of experiences currently stored
        return len(self.buffer)


class Agent:
    """
    Actor that steps through an environment, records the trajectory of the
    current episode and optionally stores it in a replay buffer.

    Attributes:
    @ state - current observation
    @ environment - environment exposing action_space and step(action)
    @ state_history - states in which an action was taken this episode
    @ action_history - action taken in each of those states
    @ rewards_history - reward received after each of those actions
    @ replay_buffer - ReplayBuffer holding experiences across episodes
    """

    def __init__(self, environment, start_state):
        self.state = start_state
        self.environment = environment
        self.state_history = []
        self.action_history = []
        self.rewards_history = []
        self.replay_buffer = ReplayBuffer(max_length=REPLAY_BUFFER_LEN)

    def take_action(self, policy, policy_model):
        """
        Select an action with the policy, perform it and record the transition.

        Arguments:
        @ policy - callable(action_space, state, model) returning an action
        @ policy_model - model handed to the policy

        Returns the done flag reported by the environment.
        """
        action = policy(action_space=self.environment.action_space, model = policy_model, state=self.state)
        # The environment is expected to return a 4-tuple from step()
        next_state, reward, done, info = self.environment.step(action)
        if DEBUG_PRINT:
            print(f"Selected action: {action}, reward {reward}")
        self.state_history.append(self.state)
        self.state = next_state
        self.action_history.append(action)
        self.rewards_history.append(reward)
        return done

    def play_episode(self, policy, policy_model, steps_limit=STEPS_LIMIT, experience_replay = False, discount_factor = 1):
        """
        Act until the environment reports done or steps_limit steps were taken.

        Arguments:
        @ policy - callable(action_space, state, model) returning an action
        @ policy_model - model handed to the policy
        @ steps_limit - maximum number of steps to take
        @ experience_replay - when True, store the episode in the replay buffer
        @ discount_factor - gamma used for the stored returns

        Returns the index of the last step taken (0 if no step was taken).
        """
        step = 0  # keeps the return value defined when steps_limit is 0
        for step in range(steps_limit):
            if self.take_action(policy=policy, policy_model=policy_model):
                break
        if experience_replay:
            self._store_episode(discount_factor)
        return step

    def _discounted_returns(self, discount_factor):
        """
        Return G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ... for every
        step t, where r_t = rewards_history[t] is the reward received after
        the action taken in state_history[t].
        """
        returns = [0.0] * len(self.rewards_history)
        running_return = 0.0
        # Backward recursion G_t = r_t + gamma * G_{t+1}: linear in episode length
        for t in reversed(range(len(self.rewards_history))):
            running_return = self.rewards_history[t] + discount_factor * running_return
            returns[t] = running_return
        return returns

    def _store_episode(self, discount_factor):
        """
        Add one (state, action, G_t, next_state, is_last) experience per step
        of the current episode to the replay buffer.
        """
        returns = self._discounted_returns(discount_factor)
        # The successor of the last recorded state is the current observation.
        # Storing a real observation (not an empty list) keeps every
        # next-state entry the same shape, so sampled batches stack cleanly.
        next_states = self.state_history[1:] + [self.state]
        last_index = len(self.state_history) - 1
        for t, state in enumerate(self.state_history):
            G_t = tf.constant(returns[t], dtype=tf.float32)
            # The final step of the episode is flagged as terminal
            experience = (state, self.action_history[t], G_t, next_states[t], t == last_index)
            if DEBUG_PRINT:
                print(f"Adding experience: {experience}")
            self.replay_buffer.add_experience(experience)

    def reset_episode(self, state):
        """
        Start a new episode from state; the replay buffer is kept.
        """
        self.state = state
        self.state_history = []
        self.action_history = []
        self.rewards_history = []

Supporting functions



In [27]:
def progress_bar(current, total, reward, bar_length=20):
    """
    Print a one-line text progress bar with the completed percentage and
    the given reward.

    The line ends with a carriage return so the next call overwrites it;
    a newline is printed only when current equals total.
    """
    fraction = current / total
    dashes = int(fraction * bar_length - 1)
    arrow = '-' * dashes + '>'
    padding = ' ' * int(bar_length - len(arrow))
    if current == total:
        ending = '\n'
    else:
        ending = '\r'
    print(f'Progress: [{arrow}{padding}] {int(fraction * 100)}%, cumulative reward: {reward}', end=ending)


def save_to_file(data, file_path):
    """
    Write data as comma-separated text to '<file_path>.csv'.

    Returns an empty string without writing anything when file_path is not
    a string, and None otherwise.
    """
    if isinstance(file_path, str):
        destination = file_path + '.csv'
        np.savetxt(destination, data, delimiter=',')
        return None
    return ""


def load_from_file(file_path):
    """
    Read comma-separated numeric text from file_path into a numpy array.
    """
    loaded = np.loadtxt(file_path, delimiter=',')
    return loaded


def make_plot(x_data, y_data, x_label, y_label, title):
    """
    Draw y_data against x_data as a line plot with the given axis labels
    and title, then show the figure.
    """
    plt.plot(x_data, y_data)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

def file_name(function_name : str, learning_rate, object, discount_factor = DISCOUNT_FACTOR, batch_size = BATCH_SIZE):
    """
    Build a '.csv' file name that encodes the run parameters and the
    current day-month_hour-minute timestamp.

    The result has the form
    '<object>\\<function_name>_lr<..>_gamma<..>_batch<..>_<timestamp>.csv';
    note the backslash used as the directory separator.
    """
    dt_string = datetime.now().strftime("%d-%m_%H-%M")
    name = f"{object}\\{function_name}_lr{learning_rate}_gamma{discount_factor}_batch{batch_size}_{dt_string}.csv"
    return name




Learning functions

In [28]:
def reinforce_classic(episodes_no = episodes_no,
    policy_learning_rate = policy_learning_rate,
    value_learning_rate = value_learning_rate,
    discount_factor = DISCOUNT_FACTOR, 
    batch_size = BATCH_SIZE):

    """
    Function to perform the training of the agent with the REINFORCE algorithm in the classic environment with experience replay

    Every episode is stored in the agent's replay buffer. From episode 20 on,
    one batch is sampled per episode and used for a single policy update.
    Losses and rewards are written to TensorBoard, and the per-episode rewards
    are saved to a csv file at the end.

    Arguments:
    @ episodes_no - number of episodes to play
    @ policy_learning_rate - learning rate of the policy optimizer
    @ value_learning_rate - only used in the name of the log directory
    @ discount_factor - gamma used for the returns stored in the replay buffer
    @ batch_size - number of experiences sampled per update
    """

    # Creation of file writer for tensorboard
    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    # Label the run with the batch size actually used, not the module default
    train_log_dir = f'{LOG_PATH}/reinforce_classic/lr{[value_learning_rate, policy_learning_rate]}_batch{batch_size}_' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Environment initialisation
    env = gym.make(CLASSIC_ENVIRONMENT_NAME)
    # Agent initialisation
    agent = Agent(environment=env, start_state=env.reset())
    # Policy network creation
    policy_model = dense_policy_network(input_shape = env.observation_space.shape, outputs_no = env.action_space.n)
    policy_optimizer = tf.keras.optimizers.Adam(learning_rate = policy_learning_rate)

    rewards_mem = []
    steps_mem = []

    for episode in range(episodes_no):
        if DEBUG_PRINT:
            print(f"================== Episode {episode} =====================")
        step = agent.play_episode(policy = network_model_policy, policy_model= policy_model, experience_replay=True, discount_factor = discount_factor)
        episode_reward = tf.reduce_sum(agent.rewards_history)
        # After some episodes when we have enough samples in the buffer - start training
        if episode >= 20:
            states, actions, targets, _ , _ = agent.replay_buffer.get_random_experience_batch(batch_size = batch_size)
            total_loss = policy_training_step(model = policy_model, states = states, actions = actions, targets = targets, optimizer=policy_optimizer)
            # Write data to tensorboard
            with train_summary_writer.as_default():
                tf.summary.scalar('policy mean loss', total_loss/batch_size, step = episode)
                tf.summary.scalar('cumulative reward', episode_reward, step = episode)
        rewards_mem.append(episode_reward)
        steps_mem.append(step)
        agent.reset_episode(state = env.reset())
        if not DEBUG_PRINT:
            # episode is 0-based; report completed episodes so the bar reaches
            # 100% and prints its final newline
            progress_bar(episode + 1, episodes_no, rewards_mem[-1], 80)

    env.close() 
    # make_plot(range(1, episodes_no + 1, 1), rewards_mem, 'Episode', 'Total reward', f'Cumulative reward for reinforce - {CLASSIC_ENVIRONMENT_NAME}')
    np.savetxt(file_name('reinforce_classic\\reinforce_test_classic', policy_learning_rate, 'rewards', discount_factor=discount_factor, batch_size=batch_size), rewards_mem, delimiter=',')



def reinforce_baseline_classic(episodes_no = episodes_no,
                                policy_learning_rate = policy_learning_rate,
                                value_learning_rate = value_learning_rate, 
                                discount_factor = DISCOUNT_FACTOR, 
                                batch_size = BATCH_SIZE):
    """
    Function to perform the training of the agent with the REINFORCE algorithm in the classic environment with baseline and experience replay

    Every episode is stored in the agent's replay buffer. From episode 20 on,
    one batch is sampled per episode; the value network is updated towards the
    stored returns and the policy network is updated with the advantages
    delta = G_t - v(S,w). Results are written to TensorBoard and plotted.

    Arguments:
    @ episodes_no - number of episodes to play
    @ policy_learning_rate - learning rate of the policy optimizer
    @ value_learning_rate - learning rate of the value optimizer
    @ discount_factor - gamma used for the returns stored in the replay buffer
    @ batch_size - number of experiences sampled per update
    """

    # Creation of file writer for tensorboard
    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    # Label the run with the batch size actually used, not the module default
    train_log_dir = f'{LOG_PATH}/baseline_classic/lr{[value_learning_rate, policy_learning_rate]}_batch{batch_size}_' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Environment initialisation
    env = gym.make(CLASSIC_ENVIRONMENT_NAME)
    # Agent initialisation
    agent = Agent(environment=env, start_state=env.reset())
    # Policy and value networks creation
    value_model = dense_value_network(input_shape=env.observation_space.shape)
    policy_model = dense_policy_network(input_shape = env.observation_space.shape, outputs_no = env.action_space.n)
    # Optimizers and metrics initialisation
    policy_optimizer = tf.keras.optimizers.Adam(learning_rate = policy_learning_rate)
    value_optimizer = tf.keras.optimizers.Adam(learning_rate = value_learning_rate)
    value_metrics = tf.keras.metrics.MeanSquaredError()

    rewards_mem = []
    steps_mem = []

    for episode in range(episodes_no):
        step = agent.play_episode(policy = network_model_policy, policy_model= policy_model, experience_replay=True, discount_factor = discount_factor)
        episode_reward = tf.reduce_sum(agent.rewards_history)
        if episode >= 20:
            states, actions, targets, _ , _ = agent.replay_buffer.get_random_experience_batch(batch_size = batch_size)
            # The value network ends in Dense(1), so its output is (batch, 1).
            # Give the returns the same column shape; a flat (batch,) vector
            # would broadcast against it to a (batch, batch) matrix.
            targets = tf.reshape(tf.cast(targets, tf.float32), [-1, 1])
            # Value network training step
            total_value_loss = value_training_step(model = value_model, states = states, targets = targets, optimizer=value_optimizer, metrics=value_metrics)
            # Calculation of baselines
            baselines = value_model(states)
            # delta = G_t - v(S,w), flattened to one advantage per sample
            deltas = tf.reshape(tf.math.subtract(targets, baselines), [-1])
            # Policy network training step
            total_policy_loss = policy_training_step(model = policy_model, states = states, actions = actions, targets = deltas, optimizer=policy_optimizer)
            # Writing data to tensorboard
            with train_summary_writer.as_default():
                tf.summary.scalar('policy mean loss', total_policy_loss/batch_size, step = episode)
                tf.summary.scalar('value mean loss', total_value_loss/batch_size, step = episode)
                tf.summary.scalar('cumulative reward', episode_reward, step = episode)
                tf.summary.scalar('value mse', value_metrics.result(), step=episode)

        rewards_mem.append(episode_reward)
        steps_mem.append(step)
        agent.reset_episode(state = env.reset())
        if not DEBUG_PRINT:
            # episode is 0-based; report completed episodes so the bar reaches
            # 100% and prints its final newline
            progress_bar(episode + 1, episodes_no, rewards_mem[-1], 80)

    env.close() 
    make_plot(range(1, episodes_no + 1, 1), rewards_mem, 'Episode', 'Total reward', f'Cumulative reward for reinforce baseline - {CLASSIC_ENVIRONMENT_NAME}')
    # np.savetxt(file_name('baseline_classic\\reinforce_baseline_test_classic', [value_learning_rate, policy_learning_rate], 'rewards', discount_factor=discount_factor, batch_size=batch_size), rewards_mem, delimiter=',')


def reinforce_baseline_classic_no_batch(episodes_no = episodes_no, 
                                        value_learning_rate = value_learning_rate, 
                                        policy_learning_rate = policy_learning_rate, 
                                        discount_factor = DISCOUNT_FACTOR, 
                                        batch_size = BATCH_SIZE):

    """
    Function to perform the training of the agent with the REINFORCE algorithm with baseline in the classic environment

    After each episode, every step t of that episode is used for one value
    update towards the return G_t and one policy update weighted by
    delta_t = G_t - v(S_t,w). Results are written to TensorBoard and plotted.

    Arguments:
    @ episodes_no - number of episodes to play
    @ value_learning_rate - learning rate of the value optimizer
    @ policy_learning_rate - learning rate of the policy optimizer
    @ discount_factor - gamma used to compute the returns
    @ batch_size - only used through the module default in the log directory name
    """
    # For tensorflow
    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = f'{LOG_PATH}/baseline_classic_nb/lr{[value_learning_rate, policy_learning_rate]}_batch{BATCH_SIZE}_' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    env = gym.make(CLASSIC_ENVIRONMENT_NAME)
    agent = Agent(environment=env, start_state=env.reset())

    # building networks
    value_model = dense_value_network(input_shape=env.observation_space.shape)
    policy_model = dense_policy_network(input_shape = env.observation_space.shape, outputs_no = env.action_space.n)

    policy_optimizer = tf.keras.optimizers.Adam(learning_rate = policy_learning_rate)
    value_optimizer = tf.keras.optimizers.Adam(learning_rate = value_learning_rate)
    value_metrics = tf.keras.metrics.MeanSquaredError()

    rewards_mem = []
    steps_mem = []

    for episode in range(episodes_no):
        if DEBUG_PRINT:
            print(f"================== Episode {episode} =====================")
        # The replay buffer is never sampled in this function, so do not fill it
        step = agent.play_episode(policy = network_model_policy, policy_model= policy_model, experience_replay=False)
        total_policy_loss = 0
        total_value_loss = 0
        steps_no = len(agent.state_history)

        # Returns G_t = r_t + gamma * G_{t+1}, computed backwards in one pass.
        # rewards_history[t] is the reward for the action taken in
        # state_history[t], so it belongs to G_t.
        returns = [0.0] * steps_no
        running_return = 0.0
        for t in reversed(range(steps_no)):
            running_return = agent.rewards_history[t] + discount_factor * running_return
            returns[t] = running_return

        # learning
        for t in range(steps_no):
            state_t = tf.expand_dims(agent.state_history[t], 0)
            action_t = tf.expand_dims(agent.action_history[t], 0)
            G_t = tf.constant([returns[t]], dtype=tf.float32)
            if DEBUG_PRINT:
                print(f"G_t: {G_t}")
            # value approximator training step
            total_value_loss += value_training_step(model = value_model,
                                                    states = state_t,
                                                    targets = G_t,
                                                    optimizer = value_optimizer,
                                                    metrics = value_metrics)

            # calculate baseline as current approximated state value
            baseline = value_model(state_t)
            if DEBUG_PRINT:
                print(f"Value after: {baseline}")

            # delta = G_t - v(S,w), flattened to a single advantage
            delta_t = tf.reshape(G_t - baseline, [-1])
            total_policy_loss += policy_training_step(model = policy_model, states = state_t, actions = action_t, targets = delta_t, optimizer = policy_optimizer)
            if DEBUG_PRINT:
                print(f"Action probs after: {policy_model(state_t)} and action was: {agent.action_history[t]}")

        # Mean over the number of updates (guarded against an empty episode)
        updates_no = max(steps_no, 1)
        mean_loss = total_policy_loss / updates_no
        value_mean_loss = total_value_loss / updates_no
        episode_reward = tf.reduce_sum(agent.rewards_history)

        with train_summary_writer.as_default():
            tf.summary.scalar('policy mean loss', mean_loss, step = episode)
            tf.summary.scalar('value mean loss', value_mean_loss, step = episode)
            tf.summary.scalar('cumulative reward', episode_reward, step = episode)
            tf.summary.scalar('value mse', value_metrics.result(), step=episode)
        
        rewards_mem.append(episode_reward)
        steps_mem.append(step)
        agent.reset_episode(state = env.reset())
        if not DEBUG_PRINT:
            # episode is 0-based; report completed episodes so the bar reaches
            # 100% and prints its final newline
            progress_bar(episode + 1, episodes_no, rewards_mem[-1], 80)

    env.close() 
    make_plot(range(1, episodes_no + 1, 1), rewards_mem, 'Episode', 'Total reward', f'Cumulative reward for reinforce baseline - {CLASSIC_ENVIRONMENT_NAME}')
    # np.savetxt(file_name('baseline_classic\\reinforce_baseline_test_classic_nb', [value_learning_rate, policy_learning_rate], 'rewards', discount_factor=discount_factor, batch_size=1), rewards_mem, delimiter=',')

def reinforce_classic_no_batch(episodes_no = episodes_no, 
                                policy_learning_rate = policy_learning_rate,
                                value_learning_rate = value_learning_rate, 
                                discount_factor = DISCOUNT_FACTOR, 
                                batch_size = BATCH_SIZE):

    """
    Function to perform the training of the agent with the REINFORCE algorithm in the classic environment without experience replay

    After each episode, every step t of that episode is used for one policy
    update weighted by the return G_t. Results are written to TensorBoard and
    plotted.

    Arguments:
    @ episodes_no - number of episodes to play
    @ policy_learning_rate - learning rate of the policy optimizer
    @ value_learning_rate - only used in the name of the log directory
    @ discount_factor - gamma used to compute the returns
    @ batch_size - only used through the module default in the log directory name
    """

    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = f'{LOG_PATH}/classic_nb/lr{[value_learning_rate, policy_learning_rate]}_batch{BATCH_SIZE}_' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    env = gym.make(CLASSIC_ENVIRONMENT_NAME)
    agent = Agent(environment=env, start_state=env.reset())

    policy_model = dense_policy_network(input_shape = env.observation_space.shape, outputs_no = env.action_space.n)
    policy_optimizer = tf.keras.optimizers.Adam(learning_rate = policy_learning_rate)

    rewards_mem = []
    steps_mem = []

    for episode in range(episodes_no):
        step = agent.play_episode(policy = network_model_policy, policy_model= policy_model, experience_replay=False)
        total_loss = 0
        steps_no = len(agent.state_history)

        # Returns G_t = r_t + gamma * G_{t+1}, computed backwards in one pass.
        # rewards_history[t] is the reward for the action taken in
        # state_history[t], so it belongs to G_t.
        returns = [0.0] * steps_no
        running_return = 0.0
        for t in reversed(range(steps_no)):
            running_return = agent.rewards_history[t] + discount_factor * running_return
            returns[t] = running_return

        for t in range(steps_no):
            state_t = tf.expand_dims(agent.state_history[t], 0)
            action_t = tf.expand_dims(agent.action_history[t], 0)
            G_t = tf.constant([returns[t]], dtype=tf.float32)
            total_loss += policy_training_step(model = policy_model, states = state_t, actions = action_t, targets = G_t, optimizer = policy_optimizer)
            if DEBUG_PRINT:
                print(f"Action probs after: {policy_model(state_t)} and action was: {agent.action_history[t]}, total_loss {total_loss}")

        # Mean over the number of updates (guarded against an empty episode)
        mean_loss = total_loss / max(steps_no, 1)
        if DEBUG_PRINT:
            print(f"And mean loss: {mean_loss}")
        episode_reward = tf.reduce_sum(agent.rewards_history)

        with train_summary_writer.as_default():
            tf.summary.scalar('policy mean loss', mean_loss, step = episode)
            tf.summary.scalar('cumulative reward', episode_reward, step = episode)

        rewards_mem.append(episode_reward)
        steps_mem.append(step)
        agent.reset_episode(state = env.reset())
        if not DEBUG_PRINT:
            # episode is 0-based; report completed episodes so the bar reaches
            # 100% and prints its final newline
            progress_bar(episode + 1, episodes_no, rewards_mem[-1], 80)

    env.close() 
    # This variant trains without a baseline, so the title must not claim one
    make_plot(range(1, episodes_no + 1, 1), rewards_mem, 'Episode', 'Total reward', f'Cumulative reward for reinforce - {CLASSIC_ENVIRONMENT_NAME}')
    # np.savetxt(file_name('reinforce_classic\\reinforce_classic_nb', policy_learning_rate, 'rewards', discount_factor=discount_factor, batch_size=1), rewards_mem, delimiter=',')

def reinforce_baseline_atari(episodes_no = episodes_no, 
                            policy_learning_rate = policy_learning_rate, 
                            value_learning_rate = value_learning_rate, 
                            discount_factor = DISCOUNT_FACTOR, 
                            batch_size = BATCH_SIZE):
  
    """
    Function to perform the training of the agent with the REINFORCE algorithm with baseline in the atari environment with experience replay

    Observations are resized to 84x84 and stacked in groups of 4 frames.
    Every episode is stored in the agent's replay buffer. After the first 5%
    of the episodes, one batch is sampled per episode; the value network is
    updated towards the stored returns and the policy network is updated with
    the advantages delta = G_t - v(S,w). Results are written to TensorBoard
    and plotted.

    Arguments:
    @ episodes_no - number of episodes to play
    @ policy_learning_rate - learning rate of the policy optimizer
    @ value_learning_rate - learning rate of the value optimizer
    @ discount_factor - gamma used for the returns stored in the replay buffer
    @ batch_size - number of experiences sampled per update
    """
   
    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    # Label the run with the batch size actually used, not the module default
    train_log_dir = f'{LOG_PATH}/baseline_atari/lr{[value_learning_rate, policy_learning_rate]}_batch{batch_size}_' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    env = gym.make(ATARI_ENVIRONMENT_NAME, full_action_space = False)
    env = gym.wrappers.ResizeObservation(env, (84, 84))
    env = gym.wrappers.FrameStack(env, 4)    
    agent = Agent(environment=env, start_state=env.reset())

    value_model = conv_value_network(input_shape=env.observation_space.shape)
    policy_model = conv_policy_network(input_shape = env.observation_space.shape, outputs_no = env.action_space.n)

    policy_optimizer = tf.keras.optimizers.Adam(learning_rate = policy_learning_rate)
    value_optimizer = tf.keras.optimizers.Adam(learning_rate = value_learning_rate)
    value_metrics = tf.keras.metrics.MeanSquaredError()

    rewards_mem = []
    steps_mem = []

    for episode in range(episodes_no):
        step = agent.play_episode(policy = network_model_policy, policy_model= policy_model, experience_replay=True, discount_factor = discount_factor)
        episode_reward = tf.reduce_sum(agent.rewards_history)
        print(f"Episode {episode+1}/{episodes_no} finished in {step} steps with reward {episode_reward}")
        if episode >= 0.05 * episodes_no:
            states, actions, targets, _ , _ = agent.replay_buffer.get_random_experience_batch(batch_size = batch_size)
            # The value network ends in Dense(1), so its output is (batch, 1).
            # Give the returns the same column shape; a flat (batch,) vector
            # would broadcast against it to a (batch, batch) matrix.
            targets = tf.reshape(tf.cast(targets, tf.float32), [-1, 1])
            total_value_loss = value_training_step(model = value_model, states = states, targets = targets, optimizer=value_optimizer, metrics=value_metrics)
            baselines = value_model(states)
            # delta = G_t - v(S,w), flattened to one advantage per sample
            deltas = tf.reshape(tf.math.subtract(targets, baselines), [-1])
            total_policy_loss = policy_training_step(model = policy_model, states = states, actions = actions, targets = deltas, optimizer=policy_optimizer)
            with train_summary_writer.as_default():
                # The training steps return summed losses; divide by the batch
                # size so the logged values really are means
                tf.summary.scalar('policy mean loss', total_policy_loss/batch_size, step = episode)
                tf.summary.scalar('value mean loss', total_value_loss/batch_size, step = episode)
                tf.summary.scalar('cumulative reward', episode_reward, step = episode)
                tf.summary.scalar('value mse', value_metrics.result(), step=episode)

        rewards_mem.append(episode_reward)
        steps_mem.append(step)
        agent.reset_episode(state = env.reset())
        # if not DEBUG_PRINT:
        #     progress_bar(episode, episodes_no, rewards_mem[-1], 80)

    env.close() 
    # Title names the environment that was actually trained on
    make_plot(range(1, episodes_no + 1, 1), rewards_mem, 'Episode', 'Total reward', f'Cumulative reward for reinforce baseline - {ATARI_ENVIRONMENT_NAME}')
    # np.savetxt(file_name('reinforce_baseline_test_classic', policy_learning_rate, 'rewards'), rewards_mem, delimiter=',')
    # value_model.save_weights(f'value_{value_learning_rate}')
    # policy_model.save_weights(f'policy_{policy_learning_rate}')

def reinforce_baseline_atari_nb(episodes_no = episodes_no,
                                value_learning_rate = value_learning_rate, 
                                policy_learning_rate = policy_learning_rate, 
                                discount_factor = DISCOUNT_FACTOR, 
                                batch_size = BATCH_SIZE):
    """
    Function to perform the training of the agent with the REINFORCE algorithm with baseline in the atari environment without experience replay

    Observations are resized to 80x80 and stacked in groups of 4 frames.
    After each episode, every step t of that episode is used for one value
    update towards the return G_t and one policy update weighted by
    delta_t = G_t - v(S_t,w). Results are written to TensorBoard and plotted.

    Arguments:
    @ episodes_no - number of episodes to play
    @ value_learning_rate - learning rate of the value optimizer
    @ policy_learning_rate - learning rate of the policy optimizer
    @ discount_factor - gamma used to compute the returns
    @ batch_size - not used by this function
    """

    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = f'{LOG_PATH}/reinforce_baseline_atari' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    
    env = gym.make(ATARI_ENVIRONMENT_NAME, full_action_space = False)
    env = gym.wrappers.ResizeObservation(env, (80, 80))
    env = gym.wrappers.FrameStack(env, 4)    
    agent = Agent(environment=env, start_state=env.reset())

    value_model = conv_value_network(input_shape=env.observation_space.shape)
    policy_model = conv_policy_network(input_shape = env.observation_space.shape, outputs_no = env.action_space.n)

    policy_optimizer = tf.keras.optimizers.Adam(learning_rate = policy_learning_rate)
    value_optimizer = tf.keras.optimizers.Adam(learning_rate = value_learning_rate)
    value_metrics = tf.keras.metrics.MeanSquaredError()

    rewards_mem = []
    steps_mem = []

    for episode in range(episodes_no):
        step = agent.play_episode(policy = network_model_policy, policy_model= policy_model, experience_replay=False, discount_factor = discount_factor)
        episode_reward = tf.reduce_sum(agent.rewards_history)
        print(f"Episode {episode} finished in {step} steps with reward {episode_reward}")
        total_policy_loss = 0
        total_value_loss = 0
        steps_no = len(agent.state_history)

        # Returns G_t = r_t + gamma * G_{t+1}, computed backwards in one pass.
        # rewards_history[t] is the reward for the action taken in
        # state_history[t], so it belongs to G_t.
        returns = [0.0] * steps_no
        running_return = 0.0
        for t in reversed(range(steps_no)):
            running_return = agent.rewards_history[t] + discount_factor * running_return
            returns[t] = running_return

        for t in range(steps_no):
            state_t = tf.expand_dims(agent.state_history[t], 0)
            action_t = tf.expand_dims(agent.action_history[t], 0)
            G_t = tf.constant([returns[t]], dtype=tf.float32)
            total_value_loss += value_training_step(model = value_model,
                                                    states = state_t,
                                                    targets = G_t,
                                                    optimizer = value_optimizer,
                                                    metrics = value_metrics)

            baseline = value_model(state_t)
            if DEBUG_PRINT:
                print(f"Value after: {baseline}")
            # delta = G_t - v(S,w), flattened to a single advantage
            delta_t = tf.reshape(G_t - baseline, [-1])
            total_policy_loss += policy_training_step(model = policy_model,
                                                      states = state_t,
                                                      actions = action_t,
                                                      targets = delta_t,
                                                      optimizer = policy_optimizer)
            if DEBUG_PRINT:
                print(f"Action probs after: {policy_model(state_t)} and action was: {agent.action_history[t]}")

        # Mean over the number of updates (guarded against an empty episode)
        updates_no = max(steps_no, 1)
        mean_loss = total_policy_loss / updates_no
        value_mean_loss = total_value_loss / updates_no

        with train_summary_writer.as_default():
            tf.summary.scalar('policy mean loss', mean_loss, step = episode)
            tf.summary.scalar('value mean loss', value_mean_loss, step = episode)
            tf.summary.scalar('cumulative reward', episode_reward, step = episode)
            tf.summary.scalar('value mse', value_metrics.result(), step=episode)

        rewards_mem.append(episode_reward)
        steps_mem.append(step)
        agent.reset_episode(state = env.reset())
        if not DEBUG_PRINT:
            # episode is 0-based; report completed episodes so the bar reaches
            # 100% and prints its final newline
            progress_bar(episode + 1, episodes_no, rewards_mem[-1], 80)

    env.close() 
    make_plot(range(1, episodes_no + 1, 1), rewards_mem, 'Episode', 'Total reward', f'Cumulative reward for reinforce baseline - {ATARI_ENVIRONMENT_NAME}')
    # np.savetxt(file_name('reinforce_baseline_test_atari', policy_learning_rate, 'rewards'), rewards_mem, delimiter=',')
    # value_model.save_weights(f'value_{value_learning_rate}')
    # policy_model.save_weights(f'policy_{policy_learning_rate}')

def actor_critic_classic_no_batch(episodes_no = episodes_no,
                                    value_learning_rate = value_learning_rate,
                                    policy_learning_rate = policy_learning_rate,
                                    discount_factor = DISCOUNT_FACTOR,
                                    batch_size = BATCH_SIZE):
    """Train a one-step actor-critic agent on ``CLASSIC_ENVIRONMENT_NAME``.

    After every environment step the value network is trained towards the
    target ``R + gamma * v(S')`` (just ``R`` when the step ended the episode),
    and the policy network is then trained with ``target - v(S)`` as its
    target. Per-episode statistics are written to a TensorBoard summary; once
    all episodes are played the reward curve is plotted and saved with
    ``np.savetxt``.

    Args:
        episodes_no: Number of episodes to play.
        value_learning_rate: Adam learning rate for the value network.
        policy_learning_rate: Adam learning rate for the policy network.
        discount_factor: Discount (gamma) applied to the bootstrapped value
            of the next state.
        batch_size: Not read by this function (updates happen after every
            step). It is accepted so the function can be called with the same
            keyword arguments as the other training functions.
    """
    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = f'{LOG_PATH}/actor_critic_nb' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    env = gym.make(CLASSIC_ENVIRONMENT_NAME)
    agent = Agent(environment=env, start_state=env.reset())

    # building networks
    value_model = dense_value_network(input_shape=env.observation_space.shape)
    policy_model = dense_policy_network(input_shape=env.observation_space.shape,
                                        outputs_no=env.action_space.n)

    policy_optimizer = tf.keras.optimizers.Adam(learning_rate=policy_learning_rate)
    value_optimizer = tf.keras.optimizers.Adam(learning_rate=value_learning_rate)
    value_metrics = tf.keras.metrics.MeanSquaredError()

    rewards_mem = []
    # Index of the last step of each episode; recorded but not read again here.
    steps_mem = []

    for episode in range(episodes_no):
        if DEBUG_PRINT:
            print(f"================== Episode {episode} =====================")
        total_loss = 0
        total_value_loss = 0
        # play an episode while learning
        for step in range(STEPS_LIMIT):
            done = agent.take_action(policy=network_model_policy, policy_model=policy_model)
            state = agent.state_history[-1]  # S
            action = agent.action_history[-1]  # A taken in S
            reward = agent.rewards_history[-1]  # R received after taking action A in S
            # The same single-element batch of S feeds every model call below.
            state_batch = tf.expand_dims(state, 0)

            if done:
                # S' is terminal: nothing to bootstrap from.
                target = reward
            else:
                # S' next state
                next_state = agent.state
                # target = R + gamma * v(S',w)
                target = reward + discount_factor * value_model(tf.expand_dims(next_state, 0))[0]
            # NOTE(review): the terminal branch yields the raw reward while the
            # other branch yields a tensor sliced from the value network, so
            # the two may differ in rank -- confirm value_training_step and
            # policy_training_step accept both.

            if DEBUG_PRINT:
                print(f"State: {state}, action {action}, reward {reward}")
                print(f"State value before: {value_model(state_batch)}, action probs before: {policy_model(state_batch)}")

            total_value_loss += value_training_step(model=value_model,
                                                    states=state_batch,
                                                    targets=tf.expand_dims(target, 0),
                                                    optimizer=value_optimizer,
                                                    metrics=value_metrics)
            # delta = R + gamma * v(S',w) - v(S,w'), i.e. the value of S is
            # re-evaluated after the value network update above.
            delta = target - value_model(state_batch)
            total_loss += policy_training_step(model=policy_model,
                                               states=state_batch,
                                               actions=tf.expand_dims(action, 0),
                                               targets=tf.expand_dims(delta, 0),
                                               optimizer=policy_optimizer)
            if DEBUG_PRINT:
                # Printed after both updates, hence "after" for both values.
                print(f"State value after: {value_model(state_batch)}, action probs after: {policy_model(state_batch)}")
            if done:
                break

        episode_reward = tf.reduce_sum(agent.rewards_history)
        steps_mem.append(step)
        rewards_mem.append(episode_reward)

        # NOTE(review): the "mean" losses are normalised by the episode's total
        # reward, not by the number of steps; the two coincide only when every
        # step yields a reward of 1 -- confirm this is intended.
        reward_total = np.sum(agent.rewards_history)
        mean_loss = total_loss / reward_total
        value_mean_loss = total_value_loss / reward_total

        with train_summary_writer.as_default():
            # reduce_mean leaves a scalar unchanged and collapses a non-scalar
            # loss to the scalar that tf.summary.scalar requires.
            tf.summary.scalar('policy mean loss', tf.reduce_mean(mean_loss), step=episode)
            tf.summary.scalar('value mean loss', tf.reduce_mean(value_mean_loss), step=episode)
            tf.summary.scalar('cumulative reward', episode_reward, step=episode)
            tf.summary.scalar('value mse', value_metrics.result(), step=episode)

        agent.reset_episode(state=env.reset())
        if not DEBUG_PRINT:
            progress_bar(episode, episodes_no, rewards_mem[-1], 80)

    env.close()
    make_plot(range(1, episodes_no + 1, 1), rewards_mem, 'Episode', 'Total reward', f'Cumulative reward for actor critic- {CLASSIC_ENVIRONMENT_NAME}')
    np.savetxt(file_name('a2c\\actor_critic_nb', [policy_learning_rate, value_learning_rate], 'rewards', discount_factor=discount_factor, batch_size=1), rewards_mem, delimiter=',')



Test code

In [None]:

import itertools

# Numeric id -> training routine, so a run can be selected by number.
functions_dict = {
    1 : reinforce_classic,
    2 : reinforce_baseline_classic,
    3 : reinforce_classic_no_batch,
    4 : reinforce_baseline_classic_no_batch,
    5 : reinforce_baseline_atari,
    6 : actor_critic_classic_no_batch,
}

# To run a single routine instead of the sweep below, look it up in
# functions_dict (or name it directly) and call it with the desired arguments.

# Hyper-parameter grid explored by the sweep.
learning_rates = [1e-5, 1e-3]
batch_sizes = [8, 64, 128]
discount_factors = [0.8, 0.95, 0.99, 1]

# itertools.product advances the right-most iterable fastest, so the runs are
# visited in the same order as nested loops over function id, discount factor,
# batch size, policy learning rate and value learning rate (outer to inner).
# NOTE(review): actor_critic_classic_no_batch (id 6) never reads batch_size, so
# its runs across batch_sizes differ only by randomness -- confirm the repeats
# are wanted.
for fun_no, df, bs, policy_lr, value_lr in itertools.product(
        [1, 2, 3, 6], discount_factors, batch_sizes, learning_rates, learning_rates):
    test_fun = functions_dict[fun_no]
    print("-------------------------------------------------------------------------------------------------------------------")
    print(f"Testing {test_fun.__name__}, with gamma = {df}, batch = {bs}, lrs = {policy_lr, value_lr}")
    test_fun(episodes_no = episodes_no,
             value_learning_rate = value_lr,
             policy_learning_rate = policy_lr,
             discount_factor = df,
             batch_size = bs)

-------------------------------------------------------------------------------------------------------------------
Testing reinforce_classic, with gamma = 0.8, batch = 8, lrs = (1e-05, 1e-05)




-------------------------------------------------------------------------------------------------------------------
Testing reinforce_classic, with gamma = 0.8, batch = 8, lrs = (1e-05, 0.001)
-------------------------------------------------------------------------------------------------------------------
Testing reinforce_classic, with gamma = 0.8, batch = 8, lrs = (0.001, 1e-05)
-------------------------------------------------------------------------------------------------------------------
Testing reinforce_classic, with gamma = 0.8, batch = 8, lrs = (0.001, 0.001)
-------------------------------------------------------------------------------------------------------------------
Testing reinforce_classic, with gamma = 0.8, batch = 64, lrs = (1e-05, 1e-05)
-------------------------------------------------------------------------------------------------------------------
Testing reinforce_classic, with gamma = 0.8, batch = 64, lrs = (1e-05, 0.001)
---------------------------------