In [None]:
import sys
import torch
from Agent import SACAgent
from Networks import Critic, Actor

# Select the compute device once; later cells pass `device` to SACAgent and torch.load.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
if "google.colab" in sys.modules:
    from google.colab import drive
    drive.mount("/content/drive")
    %cd "/content/drive/MyDrive/Python/Bath University/RL1_CW/Louie/SAC"
    !pip install swig
    !pip install gymnasium[box2d]

# Training - Stage 1 - Complete Normal Mode

In [None]:
# Stage 1: build new networks and train with hardcore=False.
# Construction order (actor, critic1, critic2) is kept as-is because each
# constructor may draw from the global RNG when initialising weights.
actor = Actor()
critic1 = Critic()
critic2 = Critic()

stage1_agent_config = {
    "critic_network1": critic1,
    "critic_network2": critic2,
    "actor_network": actor,
    "log_alpha": 0.0,
    "device": device,
    "hardcore": False,
    "max_buffer_length": 1_000_000,
}
agent = SACAgent(**stage1_agent_config)

# Run-level settings: episode budget, exploration schedule, video cadence and
# whether optimiser / buffer state is reset at the start of this call.
stage1_run_settings = {
    "n_episodes": 2000,
    "discount_factor": 0.99,
    "minibatch_size": 256,
    "tau": 0.005,
    "random_exploration_steps": 10_000,
    "actor_exploration_steps": 1000,
    "vid_every": 50,
    "stop_after": 1,
    "reset_optim": True,
    "reset_buffer": True,
}

# Optimisation settings: learning rates, gradient clipping and update ratio.
stage1_optim_settings = {
    "critic_lr": 3e-4,
    "actor_lr": 3e-4,
    "alpha_lr": 3e-4,
    "critic_grad_clip": 1.0,
    "actor_grad_clip": 1.0,
    "alpha_grad_clip": 0.0,
    "updates_per_step": 1,
}

agent.learn(**stage1_run_settings, **stage1_optim_settings)

# Training - Stage 2 - Go Faster

In [None]:
# Stage 2: keep training the `agent` created in the Stage 1 cell.
# No exploration steps are requested and both reset flags are off.
# NOTE(review): presumably this carries over the optimiser state and replay
# buffer from Stage 1 — confirm against SACAgent.learn.
stage2_run_settings = {
    "n_episodes": 500,
    "discount_factor": 0.99,
    "minibatch_size": 256,
    "tau": 0.005,
    "random_exploration_steps": 0,
    "actor_exploration_steps": 0,
    "vid_every": 50,
    "stop_after": None,
    "reset_optim": False,
    "reset_buffer": False,
}

# Optimisation settings: learning rates, gradient clipping and update ratio.
stage2_optim_settings = {
    "critic_lr": 3e-4,
    "actor_lr": 3e-4,
    "alpha_lr": 3e-4,
    "critic_grad_clip": 1.0,
    "actor_grad_clip": 1.0,
    "alpha_grad_clip": 0.0,
    "updates_per_step": 1,
}

agent.learn(**stage2_run_settings, **stage2_optim_settings)

# Training - Stage 3 - Complete Hardcore Mode

In [None]:
# Stage 3: new critics, actor weights restored from the Stage 2 checkpoint,
# and a new agent created with hardcore=True.
# Construction order (critic1, critic2, actor) is kept as-is because each
# constructor may draw from the global RNG when initialising weights.
critic1 = Critic()
critic2 = Critic()

stage2_actor = Actor()
# map_location places the saved tensors on whichever device this session uses.
stage2_actor_weights = torch.load("outputs/stage 2/actor_network.pth", map_location=device)
stage2_actor.load_state_dict(stage2_actor_weights)

stage3_agent_config = {
    "critic_network1": critic1,
    "critic_network2": critic2,
    "actor_network": stage2_actor,
    "log_alpha": 0.0,
    "device": device,
    "hardcore": True,
    "max_buffer_length": 1_000_000,
}
agent = SACAgent(**stage3_agent_config)

# Run-level settings: episode budget, exploration schedule, video cadence and
# whether optimiser / buffer state is reset at the start of this call.
stage3_run_settings = {
    "n_episodes": 2000,
    "discount_factor": 0.99,
    "minibatch_size": 256,
    "tau": 0.005,
    "random_exploration_steps": 10_000,
    "actor_exploration_steps": 1000,
    "vid_every": 50,
    "stop_after": 1,
    "reset_optim": True,
    "reset_buffer": True,
}

# Optimisation settings: learning rates, gradient clipping and update ratio.
stage3_optim_settings = {
    "critic_lr": 3e-4,
    "actor_lr": 3e-4,
    "alpha_lr": 3e-4,
    "critic_grad_clip": 1.0,
    "actor_grad_clip": 1.0,
    "alpha_grad_clip": 0.0,
    "updates_per_step": 1,
}

agent.learn(**stage3_run_settings, **stage3_optim_settings)

# Training - Stage 4 - Complete Hardcore Consistently

In [None]:
# Stage 4: keep training the `agent` created in the Stage 3 cell.
# No exploration steps are requested and both reset flags are off.
# NOTE(review): presumably this carries over the optimiser state and replay
# buffer from Stage 3 — confirm against SACAgent.learn.
stage4_run_settings = {
    "n_episodes": 500,
    "discount_factor": 0.99,
    "minibatch_size": 256,
    "tau": 0.005,
    "random_exploration_steps": 0,
    "actor_exploration_steps": 0,
    "vid_every": 50,
    "stop_after": None,
    "reset_optim": False,
    "reset_buffer": False,
}

# Optimisation settings: learning rates, gradient clipping and update ratio.
stage4_optim_settings = {
    "critic_lr": 3e-4,
    "actor_lr": 3e-4,
    "alpha_lr": 3e-4,
    "critic_grad_clip": 1.0,
    "actor_grad_clip": 1.0,
    "alpha_grad_clip": 0.0,
    "updates_per_step": 1,
}

agent.learn(**stage4_run_settings, **stage4_optim_settings)