In [1]:
# %pip install matplotlib
# %pip install numpy
# %pip install gymnasium
# #%pip install gymnasium[mujoco]
# # %pip install tensorflow
# %pip install pandas
# %pip install seaborn
# %pip install torch

In [2]:
#%
# %pip install gymnasium
#%pip uninstall mujoco-py
#%pip uninstall -y -r <(pip freeze)

In [3]:
from __future__ import annotations

import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from torch.distributions.normal import Normal

import gymnasium as gym
import mujoco

# Show which version of the MuJoCo Python bindings this kernel loaded.
print(mujoco.__version__)
# Default size (width, height in inches) for every matplotlib figure created later.
plt.rcParams["figure.figsize"] = (10, 5)


2.3.0


In [4]:
import os
import sys
import time
# Show the interpreter version and build the kernel is running on.
print(sys.version)

3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]


In [5]:
import keras
import tensorflow as tf
from collections import deque
from random import sample
from keras.models import Model
from keras.layers import Dense, Input, Concatenate, BatchNormalization
from keras.optimizers import Adam
from keras.losses import MeanSquaredError
from keras.initializers import RandomUniform

In [49]:
class ActorCriticHandler():
    """Builds the actor/critic Keras models and runs their update steps.

    The handler reads the observation/action shapes and the action bounds
    from a Gymnasium-style environment, constructs the online and target
    networks, performs the soft (Polyak) target update with rate ``beta`` and
    adds exploration noise to actions.

    NOTE(review): the two ``@tf.function`` gradient steps read several names
    from the notebook's global scope instead of from ``self`` or their
    parameters (``target_actor``, ``gamma``, ``critic``, ``critic_optimizer``
    and ``actor_optimizer``).  Those globals must exist before the first call.
    Their signatures are left unchanged so existing callers keep working.
    """

    def __init__(self, env, beta):
        """Capture the environment's shapes/bounds and the soft-update rate.

        Args:
            env: environment exposing ``observation_space.shape``,
                ``action_space.shape``, ``action_space.high`` and
                ``action_space.low``.
            beta: interpolation factor used by the target-network update;
                each update sets ``target = beta * base + (1 - beta) * target``.
        """
        self.state_size = env.observation_space.shape
        self.action_size = env.action_space.shape
        # Only the first component of each bound is kept, so every action
        # dimension is scaled/clipped with the same pair of limits.
        self.upper_action_bound = env.action_space.high[0]
        self.lower_action_bound = env.action_space.low[0]
        self.beta = beta

    def _create_actor(self):
        """Return a new actor model mapping a state to a bounded action.

        Two 256-unit ReLU layers feed a tanh output layer whose result is
        multiplied by ``upper_action_bound``.
        """
        # Small uniform initial weights on the last layer keep the initial
        # tanh pre-activations close to zero.
        last_init = keras.initializers.RandomUniform(minval=-0.003, maxval=0.003)

        inputs = Input(shape=self.state_size)
        out = Dense(256, activation="relu")(inputs)
        out = Dense(256, activation="relu")(out)
        outputs = Dense(self.action_size[0], activation="tanh", kernel_initializer=last_init)(out)

        # tanh yields values in [-1, 1]; rescale to the environment's upper bound.
        outputs = outputs * self.upper_action_bound
        actor = keras.Model(inputs, outputs)

        return actor

    def _create_critic(self):
        """Return a new critic model mapping ``[state, action]`` to one value.

        The state and the action each go through their own small dense branch
        before being concatenated and passed through two 256-unit ReLU layers
        and a single linear output unit.
        """
        state_input = Input(shape=self.state_size)
        state_out = Dense(16, activation="relu")(state_input)
        state_out = Dense(32, activation="relu")(state_out)

        action_input = Input(shape=self.action_size)
        action_out = Dense(32, activation="relu")(action_input)

        concat = Concatenate()([state_out, action_out])

        out = Dense(256, activation="relu")(concat)
        out = Dense(256, activation="relu")(out)
        outputs = Dense(1)(out)

        critic = Model([state_input, action_input], outputs)

        return critic

    def create_networks(self=None):
        """Create the online and target networks, with targets cloned from online.

        This can be called in either of two ways:

        * ``handler.create_networks()`` builds the networks from ``handler``.
        * ``ActorCriticHandler.create_networks()`` (the original, class-level
          call with no argument) falls back to the module-level
          ``actor_critic_handler`` instance, exactly as before.

        Returns:
            Tuple ``(actor, target_actor, critic, target_critic)``.
        """
        # ``self`` is None only for the legacy class-level call.
        handler = actor_critic_handler if self is None else self

        actor = handler._create_actor()
        target_actor = handler._create_actor()
        target_actor.set_weights(actor.get_weights())

        critic = handler._create_critic()
        target_critic = handler._create_critic()
        target_critic.set_weights(critic.get_weights())

        return actor, target_actor, critic, target_critic

    def _calculate_new_target_weights(self, base_network, target_network):
        """Move ``target_network``'s weights a fraction ``beta`` towards ``base_network``'s."""
        blended_weights = [
            self.beta * base + (1 - self.beta) * target
            for base, target in zip(base_network.get_weights(), target_network.get_weights())
        ]
        target_network.set_weights(blended_weights)

    def _update_target_networks(self, actor, target_actor, critic, target_critic):
        """Apply the soft update to both target networks."""
        self._calculate_new_target_weights(actor, target_actor)
        self._calculate_new_target_weights(critic, target_critic)

    def _add_gaussian_noise(self, action, noise_std=0.1):
        """Return ``action`` with zero-mean Gaussian noise added, clipped to the bounds.

        Args:
            action: tensor exposing ``.numpy()``; a scalar is promoted to a
                one-element array.
            noise_std: standard deviation of the added noise (default 0.1,
                the value that was previously hard-coded).

        Returns:
            NumPy array clipped to ``[lower_action_bound, upper_action_bound]``.
        """
        action = action.numpy()
        if action.shape == ():
            action = np.array([action])

        # In-place add keeps the action's original dtype.
        action += np.random.normal(0, 1, len(action)) * noise_std
        action = np.clip(action, self.lower_action_bound, self.upper_action_bound)
        return action

    @tf.function
    def _gradient_descent_critic(self, critic, target_critic, batch_states, batch_actions, batch_rewards, batch_states_prime):
        """Run one optimizer step on the critic's mean-squared TD error.

        The target is ``batch_rewards + gamma * target_critic(s', target_actor(s'))``.

        NOTE(review): ``target_actor``, ``gamma`` and ``critic_optimizer`` are
        not parameters; they are looked up in the notebook's global scope.
        The target always bootstraps from the next state because no
        terminal flag is passed in.
        """
        with tf.GradientTape() as tape:
            # https://keras.io/examples/rl/ddpg_pendulum/
            target_actions = target_actor(batch_states_prime)

            y = batch_rewards + gamma * target_critic([batch_states_prime, target_actions])
            critic_prediction = critic([batch_states, batch_actions])

            critic_loss = keras.ops.mean(keras.ops.square(y - critic_prediction))

        # Gradient of the loss with respect to the online critic's parameters only.
        critic_gradient = tape.gradient(critic_loss, critic.trainable_variables)
        critic_optimizer.apply_gradients(zip(critic_gradient, critic.trainable_variables))

    @tf.function
    def _gradient_ascent_actor(self, actor, target_actor, batch_states, batch_actions, batch_rewards, batch_states_prime):
        """Run one optimizer step that increases the critic's value of the actor's actions.

        The loss is the negated mean critic value of ``actor(batch_states)``,
        so minimising it performs gradient ascent on that value.

        NOTE(review): ``critic`` and ``actor_optimizer`` are looked up in the
        notebook's global scope.  The ``target_actor``, ``batch_actions``,
        ``batch_rewards`` and ``batch_states_prime`` parameters are accepted
        but not used.
        """
        with tf.GradientTape() as tape:
            policy_actions = actor(batch_states)
            critic_values = critic([batch_states, policy_actions])
            actor_loss = -keras.ops.mean(critic_values)

        actor_gradient = tape.gradient(actor_loss, actor.trainable_variables)
        actor_optimizer.apply_gradients(zip(actor_gradient, actor.trainable_variables))

class ReplayMemory():
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    A transition is a sequence of fields (for example
    ``[state, action, reward, next_state]``).  Each field lives in its own
    pre-allocated ring buffer, so inserting costs O(1) and does not copy the
    whole memory on every step.  Once ``max_size`` transitions are stored, the
    oldest one is overwritten.
    """

    def __init__(self, max_size, minibatch_size):
        """Create an empty memory.

        Args:
            max_size: maximum number of transitions kept (must be >= 1).
            minibatch_size: number of transitions returned by ``random_batch``.

        Raises:
            ValueError: if ``max_size`` is smaller than 1.
        """
        if max_size < 1:
            raise ValueError("max_size must be at least 1")
        self.max_size = max_size
        self.minibatch_size = minibatch_size
        self.num_items = 0
        # One array of shape (max_size, *field_shape) per field, allocated on
        # the first call to add() once the field shapes are known.
        self._buffers = []
        # Row that the next transition is written to; once the memory is full
        # this is also the position of the oldest stored transition.
        self._write_index = 0

    @property
    def individual_memories(self):
        """Stored fields as a list of arrays, oldest transition first.

        Returns an empty list before the first ``add``.  Each array has one
        row per stored transition; scalar fields are stored with width 1.
        """
        if self.num_items < self.max_size:
            return [buffer[:self.num_items] for buffer in self._buffers]
        # Full buffer: rotate so the oldest row (at _write_index) comes first.
        return [np.roll(buffer, -self._write_index, axis=0) for buffer in self._buffers]

    def add(self, to_add):
        """Store one transition, evicting the oldest one when the memory is full.

        Args:
            to_add: sequence of fields; scalars are stored as width-1 rows.

        Raises:
            ValueError: if ``to_add`` is empty or its number of fields differs
                from the transitions already stored.
        """
        rows = [np.atleast_1d(np.asarray(item)) for item in to_add]
        if not rows:
            raise ValueError("cannot store an empty transition")

        if not self._buffers:
            self._buffers = [
                np.zeros((self.max_size,) + row.shape, dtype=row.dtype) for row in rows
            ]
        elif len(rows) != len(self._buffers):
            raise ValueError(
                "expected {} fields per transition, got {}".format(len(self._buffers), len(rows))
            )

        for i, row in enumerate(rows):
            buffer = self._buffers[i]
            # Widen the buffer if a later sample needs a larger dtype (e.g. an
            # int reward followed by a float one) instead of truncating it.
            promoted = np.result_type(buffer.dtype, row.dtype)
            if promoted != buffer.dtype:
                buffer = self._buffers[i] = buffer.astype(promoted)
            buffer[self._write_index] = row

        self._write_index = (self._write_index + 1) % self.max_size
        self.num_items = min(self.num_items + 1, self.max_size)

    def random_batch(self):
        """Sample ``minibatch_size`` transitions uniformly, with replacement.

        Returns:
            List with one tensor per field, in the order the fields were
            added.  Width-1 fields (such as scalar rewards) are cast to
            float32; other fields keep their stored dtype.

        Raises:
            ValueError: if the memory is empty.
        """
        if self.num_items == 0:
            raise ValueError("cannot sample from an empty replay memory")

        # Rows [0, num_items) are always the valid ones in the ring buffer.
        batch_indices = np.random.randint(0, self.num_items, size=self.minibatch_size)

        return_items = []
        for buffer in self._buffers:
            batch = keras.ops.convert_to_tensor(buffer[batch_indices])
            if buffer.shape[1] == 1:
                batch = keras.ops.cast(batch, dtype="float32")
            return_items.append(batch)

        return return_items

In [50]:
# Display the installed Keras version; the classes above rely on the `keras.ops` API.
keras.__version__

'3.2.1'

In [None]:
# --- Environment and replay memory -------------------------------------------
# For visual debugging use instead:
#   gym.make('Ant-v4', render_mode="human", healthy_z_range=(0.3, 1))
env = gym.make('Ant-v4')

replay_memory = ReplayMemory(max_size = 50000, minibatch_size = 64)

# --- Hyper-parameters ----------------------------------------------------------
# NOTE: `gamma`, `critic_optimizer`, `actor_optimizer`, `actor_critic_handler`,
# `critic` and `target_actor` are read as globals by ActorCriticHandler's
# methods, so these names must not be renamed.
critic_lr = 0.002
actor_lr = 0.001
gamma = 0.99
num_episodes = 5000

critic_optimizer = keras.optimizers.Adam(critic_lr)
actor_optimizer = keras.optimizers.Adam(actor_lr)

actor_critic_handler = ActorCriticHandler(env, beta = 0.005)

actor, target_actor, critic, target_critic = ActorCriticHandler.create_networks()

# Make sure the checkpoint directory exists before the first save at episode 0.
save_dir = './Saved Models'
os.makedirs(save_dir, exist_ok=True)

episode_rewards = []
average_episode_rewards = []

try:
    # for each episode
    for i in range(0, num_episodes):
        # initialise a current state
        current_observation, info = env.reset()

        # haven't ended episode yet
        terminated = False
        truncated = False

        total_episode_reward = 0
        episode_len = 0
        startTime = time.time()

        # while episode hasn't ended
        while not terminated and not truncated:
            episode_len += 1
            # input state into the actor network to get an action (batch dimension added, then removed)
            action = actor(keras.ops.expand_dims(keras.ops.convert_to_tensor(current_observation), 0))
            action = actor_critic_handler._add_gaussian_noise(keras.ops.squeeze(action))

            # execute the action with noise, observe reward + next state
            next_observation, reward, terminated, truncated, info = env.step(action)

            # add reward to total reward
            total_episode_reward += reward

            # store the transition in D
            # NOTE(review): `terminated` is not stored, so the critic target
            # bootstraps from the next state even for terminal transitions.
            replay_memory.add([current_observation, action, reward, next_observation])

            current_observation = next_observation

            # start learning once more than one minibatch worth of transitions is stored
            if replay_memory.num_items > replay_memory.minibatch_size:
                batch_states, batch_actions, batch_rewards, batch_states_prime = replay_memory.random_batch()

                actor_critic_handler._gradient_descent_critic(critic, target_critic, batch_states, batch_actions, batch_rewards, batch_states_prime)
                actor_critic_handler._gradient_ascent_actor(actor, target_actor, batch_states, batch_actions, batch_rewards, batch_states_prime)

                actor_critic_handler._update_target_networks(actor, target_actor, critic, target_critic)

        episode_rewards.append(total_episode_reward)
        # Mean of last 40 episodes
        new_average = np.mean(episode_rewards[-40:])
        print("Episode * {} * Time taken * {} * Reward was * {} * Avg Reward is ==> {}".format(i, time.time() - startTime, total_episode_reward, new_average))
        average_episode_rewards.append(new_average)

        # save weights if episode multiple of 500
        if i % 500 == 0:
            actor.save(f'{save_dir}/Actor{i}.keras')
            target_actor.save(f'{save_dir}/ActorTarget{i}.keras')
            critic.save(f'{save_dir}/Critic{i}.keras')
            target_critic.save(f'{save_dir}/CriticTarget{i}.keras')

        # Rewrite both reward logs every episode so partial runs are not lost.
        # Per-episode rewards go to rewards.csv and the running averages to
        # averageRewards.csv (the two file names were previously swapped).
        try:
            np.savetxt("rewards.csv", np.array(episode_rewards), delimiter=",")
            np.savetxt("averageRewards.csv", np.array(average_episode_rewards), delimiter=",")
        except PermissionError as err:
            # Typically the file is open in another program; training continues
            # and the full history is written again after the next episode.
            print("Could not write reward CSVs this episode ({}); will retry next episode.".format(err))
finally:
    # Release the environment even if training is interrupted or fails.
    env.close()

Episode * 0 * Time taken * 21.077702522277832 * Reward was * 496.97273722615756 * Avg Reward is ==> 496.97273722615756
Episode * 1 * Time taken * 21.031168460845947 * Reward was * 835.4878989810925 * Avg Reward is ==> 666.230318103625
Episode * 2 * Time taken * 20.31258535385132 * Reward was * 853.3210561267844 * Avg Reward is ==> 728.5938974446781
Episode * 3 * Time taken * 20.030256986618042 * Reward was * 623.0706169312666 * Avg Reward is ==> 702.2130773163252
Episode * 4 * Time taken * 20.823976278305054 * Reward was * 715.693914063241 * Avg Reward is ==> 704.9092446657083
Episode * 5 * Time taken * 3.0878865718841553 * Reward was * 109.8207287288654 * Avg Reward is ==> 605.7278253429012
Episode * 6 * Time taken * 17.45344352722168 * Reward was * 348.4592363709317 * Avg Reward is ==> 568.975169775477
Episode * 7 * Time taken * 22.610170602798462 * Reward was * 590.8819922387985 * Avg Reward is ==> 571.7135225833922
Episode * 8 * Time taken * 23.5241379737854 * Reward was * 438.6266