In [None]:
%pip install --upgrade pip
%pip install tensorflow
%pip install --upgrade 'gymnasium[atari]' 'gymnasium[accept-rom-license]' moviepy

In [None]:
import csv
import random
import gymnasium as gym
import numpy as np
import tensorflow as tf

from time import perf_counter
from datetime import datetime
from collections import deque
from glob import glob
from gymnasium.wrappers import (
    RecordVideo,
    GrayScaleObservation,
    ResizeObservation,
    FrameStack,
    NormalizeObservation,
)
from pathlib import Path
from keras.layers import Conv2D, Flatten, Dense

In [None]:
# Sanity check: print how many physical devices of type "GPU" TensorFlow reports.
# A count of 0 means no GPU is visible to TensorFlow in this kernel.
print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))

In [None]:
def create_env(env_id="ALE/AirRaid-v5", capture_video: bool = False) -> gym.Env:
    """Create an environment with some standard wrappers. These are similar to the wrappers used in the
    Atari Preprocessing wrapper.

    The wrappers are applied in this order: ``ResizeObservation(shape=84)``,
    ``GrayScaleObservation``, ``NormalizeObservation`` and ``FrameStack(4)``.

    Parameters
    ----------
    env_id : str, optional
        Environment ID of the gym environment, by default "ALE/AirRaid-v5"
    capture_video : bool, optional
        If True, the environment is created with ``render_mode="rgb_array"`` and every 50th
        episode (episode 0, 50, 100, ...) is recorded as a video, by default False

    Returns
    -------
    gym.Env
        Gym environment with wrappers applied
    """
    if capture_video:
        run_name = datetime.now().strftime("%Y%m%d-%H%M%S")
        env = gym.make(env_id, render_mode="rgb_array")
        env = RecordVideo(
            env,
            # NOTE(review): RecordVideo's second argument is the output *folder*, so this
            # creates a directory whose name ends in ".mp4"; kept as-is to preserve paths.
            f"videos/{run_name}.mp4",
            # The trigger must return True only for the episodes to record. The previous
            # `x % 50` was truthy for every episode EXCEPT multiples of 50.
            episode_trigger=lambda episode_index: episode_index % 50 == 0,
        )
    else:
        env = gym.make(env_id)
    env = ResizeObservation(env, shape=84)
    env = GrayScaleObservation(env)
    env = NormalizeObservation(env)
    env = FrameStack(env, 4)
    return env

In [None]:
# Run identifier (timestamp) and output locations for this session
run_id = datetime.now().strftime("%Y%m%d-%H%M%S")  # names the CSV log file written by log_episode_to_csv
model_path = Path("models")  # directory save_model writes weights to and load_model reads them from
log_path = Path("logs")  # directory log_episode_to_csv writes the per-run CSV to

# Build the environment and read the state and action space sizes from it
env = create_env()
state, _ = env.reset()
action_space = env.action_space.n  # number of discrete actions (6 for the default environment)
state_space = state.shape  # (4, 84, 84) 4 frames, 84x84 pixels

In [None]:
# Hyperparameters
input_shape = state_space  # NOTE: reassigned (with an extra channel dimension) before the models are built
num_episodes = 500  # upper bound (inclusive) of the episode range in the training loop
batch_size = 128  # number of transitions sampled from replay memory per training step
learning_rate = 1e-2  # passed to the Adam optimizer when the main model is compiled
discount_rate = 0.99  # gamma: weight of the best next-state Q-value in the training target
exploration_rate = 1  # epsilon: random actions are taken when a uniform draw is <= this value
max_exploration_rate = 1  # NOTE(review): not referenced anywhere else in the visible code
min_exploration_rate = 0.01  # floor applied when the exploration rate is decayed
exploration_decay_rate = 0.99  # multiplicative decay applied to the exploration rate once per episode
update_target_network_steps = 10_000  # copy main-model weights into the target model every N environment steps
replay_memory = deque(maxlen=25_000)  # bounded buffer of (state, action, reward, next_state, done) tuples

In [None]:
def log_episode_to_csv(episode_data: dict):
    """Print one episode's metrics and append them as a row to this run's CSV log.

    The log file is ``log_path / f"{run_id}.csv"``. The directory is created on demand, and
    the header row is written only when the file is new (or still empty).

    Parameters
    ----------
    episode_data : dict
        Mapping of column name to value for a single episode. The keys are used as the CSV
        field names, so every call within a run should pass the same keys in the same order.
    """
    print(episode_data)

    # The log directory is not created anywhere else; without this the very first
    # append fails with FileNotFoundError after a full episode has already been played.
    log_path.mkdir(parents=True, exist_ok=True)
    filename = log_path / f"{run_id}.csv"

    # A header is needed when the file does not exist yet or exists but has no content
    needs_header = not filename.exists() or filename.stat().st_size == 0

    # Open the file in append mode so earlier episodes are preserved
    with open(filename, "a", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(episode_data.keys()))

        if needs_header:
            writer.writeheader()

        # Write the episode data
        writer.writerow(episode_data)

In [None]:
def create_model(input_shape: tuple, output_shape: int) -> tf.keras.Model:
    """Build the (uncompiled) convolutional Q-network.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single input sample (without the batch dimension); it is passed as
        ``input_shape`` to the first convolutional layer.
    output_shape : int
        Number of units in the final linear layer, i.e. one output per action.

    Returns
    -------
    tf.keras.Model
        A sequential model with the following layers:
        - Conv2D with 32 filters, kernel size of 8, stride of 4, and relu activation
        - Conv2D with 64 filters, kernel size of 4, stride of 2, and relu activation
        - Conv2D with 64 filters, kernel size of 3, stride of 1, and relu activation
        - Flatten layer
        - Dense layer with 4096 units and relu activation
        - Dense layer with 512 units and relu activation
        - Dense layer with ``output_shape`` units and linear activation
    """
    # (filters, kernel_size, strides) for each convolutional layer, in order
    conv_specs = (
        (32, (8, 8), (4, 4)),
        (64, (4, 4), (2, 2)),
        (64, (3, 3), (1, 1)),
    )

    network = tf.keras.Sequential()
    for position, (filters, kernel_size, strides) in enumerate(conv_specs):
        # Only the first layer declares the input shape
        extra_kwargs = {"input_shape": input_shape} if position == 0 else {}
        network.add(
            Conv2D(
                filters,
                kernel_size,
                strides=strides,
                activation="relu",
                **extra_kwargs,
            )
        )

    network.add(Flatten())
    for units in (4096, 512):
        network.add(Dense(units, activation="relu"))
    network.add(Dense(output_shape, activation="linear"))
    return network

In [None]:
input_shape = (
    *state_space,
    1,
)  # state_space plus a trailing channel dimension, e.g. (4, 84, 84) -> (4, 84, 84, 1)
# Two networks with the same architecture: `model` is trained, `target_model` supplies
# the next-state Q-values used to build the training targets.
model = create_model(input_shape, action_space)
target_model = create_model(input_shape, action_space)

# Compile the model with an optimizer and loss function.
# Only `model` is compiled; `target_model` is used for prediction and weight copies only.
# NOTE(review): `tf.keras.optimizers.legacy` is not available in every TensorFlow/Keras
# release — confirm against the installed version.
model.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate), loss="mse"
)

# Initially, set the target model weights equal to the model's weights
target_model.set_weights(model.get_weights())

In [None]:
def train(
    model: tf.keras.Model,
    target_model: tf.keras.Model,
    minibatch: list,
    discount_rate: float,
):
    """Run one gradient update of the main model on a minibatch of transitions.

    For every transition the target for the action that was taken is
    ``reward`` when the transition is terminal, otherwise
    ``reward + discount_rate * max_a target_model(next_state)[a]``. The targets for all
    other actions are the model's own current predictions, so they contribute no error.

    Parameters
    ----------
    model : tf.keras.Model
        The main neural network model that is being trained.
    target_model : tf.keras.Model
        The target neural network model that is used to predict the Q-values for the next state.
    minibatch : list
        Sequence of ``(state, action, reward, next_state, done)`` tuples, e.g. the result of
        ``random.sample(replay_memory, batch_size)``.
    discount_rate : float
        The discount rate to use when calculating the Q-values.
    """
    # Split the transitions into per-field arrays; the leading axis is the batch
    states = np.array([transition[0] for transition in minibatch])
    actions = np.array([transition[1] for transition in minibatch])
    rewards = np.array([transition[2] for transition in minibatch], dtype=np.float64)
    next_states = np.array([transition[3] for transition in minibatch])
    dones = np.array([transition[4] for transition in minibatch], dtype=bool)

    # Predict Q-values for starting state and next state
    current_q_values = model.predict(states, verbose=0)
    next_q_values = target_model.predict(next_states, verbose=0)

    # Bellman targets for the whole batch at once. np.where (rather than multiplying by
    # a 0/1 mask) keeps terminal targets equal to the bare reward even if a next-state
    # prediction is non-finite.
    best_next_q = next_q_values.max(axis=1)
    bellman_targets = np.where(dones, rewards, rewards + discount_rate * best_next_q)

    # Only the entry of the action actually taken is replaced in each row
    target_q_values = current_q_values.copy()
    target_q_values[np.arange(len(minibatch)), actions] = bellman_targets

    model.fit(states, target_q_values, epochs=1, verbose=0, batch_size=len(minibatch))

In [None]:
def save_model(model: tf.keras.Model, name_prefix: str, episode: int):
    """Save the model weights to ``model_path / f"{name_prefix}_episode_{episode}.h5"``.

    The ``model_path`` directory is created if it does not exist yet.

    Parameters
    ----------
    model : tf.keras.Model
        The model to save the weights of.
    name_prefix : str
        Prefix to use when saving the model weights.
    episode : int
        The episode number to use when saving the model weights.
    """
    # The checkpoint directory is not created anywhere else; writing the .h5 file into a
    # missing directory fails, so make sure it exists first.
    model_path.mkdir(parents=True, exist_ok=True)
    model.save_weights(model_path / f"{name_prefix}_episode_{episode}.h5")

In [None]:
def load_model() -> int:
    """Load the most recent saved weights into the global ``model`` and ``target_model``.

    Checkpoints are looked up in ``model_path`` under the names
    ``main_episode_<N>.h5`` and ``target_episode_<N>.h5``. The highest episode ``N`` for
    which BOTH files exist is loaded. Files that do not follow this naming scheme are
    ignored.

    Returns
    -------
    int
        The episode number that the model weights were loaded from, or 0 when no complete
        checkpoint pair was found (in which case no weights are loaded).
    """
    # Map episode number -> checkpoint file, separately for each network
    checkpoints = {}
    for prefix in ("main", "target"):
        stem_prefix = f"{prefix}_episode_"
        found = {}
        for model_file in glob(str(model_path / f"{stem_prefix}*.h5")):
            suffix = Path(model_file).stem[len(stem_prefix):]
            # Skip anything whose suffix is not a plain episode number
            if suffix.isdigit():
                found[int(suffix)] = Path(model_file)
        checkpoints[prefix] = found

    # Only episodes with both a main and a target checkpoint can be resumed
    resumable_episodes = checkpoints["main"].keys() & checkpoints["target"].keys()
    if not resumable_episodes:
        return 0

    episode = max(resumable_episodes)
    print(f"Loading models from episode {episode}...")
    model.load_weights(checkpoints["main"][episode])
    target_model.load_weights(checkpoints["target"][episode])
    return episode

In [None]:
# Main DQN training loop: play episodes with an epsilon-greedy policy, store every
# transition in replay memory, train on a random minibatch after each step, and
# periodically sync the target network and checkpoint both networks.
step_counter = 0
start_episode = load_model()

for episode in range(start_episode, num_episodes + 1):
    state, _ = env.reset()
    done = False
    start_time = perf_counter()
    episode_reward = 0
    exploitation_steps = 0
    exploration_steps = 0
    while not done:
        # Exploration-exploitation trade-off
        exploration_threshold = random.uniform(0, 1)
        # If exploration_threshold > exploration_rate, then exploitation
        if exploration_threshold > exploration_rate:
            exploitation_steps += 1
            q_values = model.predict(
                np.expand_dims(state, axis=0), verbose=0
            )  # add batch dimension
            action = np.argmax(q_values[0])
        else:
            exploration_steps += 1
            action = env.action_space.sample()

        # Increment the step counter
        step_counter += 1

        # Take action and observe the next state and reward.
        # The episode ends when the environment terminates OR truncates it; ignoring
        # `truncated` would keep stepping an environment that needs a reset.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        # Add the experience to replay memory. Only true termination is stored as the
        # "done" flag so that truncated transitions still bootstrap from the next state.
        replay_memory.append((state, action, reward, next_state, terminated))

        # Advance to the new observation; without this every action and every stored
        # transition would be based on the observation returned by env.reset().
        state = next_state

        # Sample a minibatch from the replay buffer
        if len(replay_memory) > batch_size:
            minibatch = random.sample(replay_memory, batch_size)
            # Train the model on the minibatch
            train(model, target_model, minibatch, discount_rate)

        if step_counter % update_target_network_steps == 0:
            # Update the target network with new weights
            target_model.set_weights(model.get_weights())

    # End of episode
    # Decay the exploration rate
    exploration_rate = max(
        min_exploration_rate, exploration_rate * exploration_decay_rate
    )
    log_episode_to_csv(
        {
            "episode": episode,
            "duration": f"{perf_counter() - start_time:.2f}",
            "exploration_rate": f"{exploration_rate:.2f}",
            "exploitation_steps": exploitation_steps,
            "exploration_steps": exploration_steps,
            "total_steps": step_counter,
            "episode_reward": episode_reward,
        }
    )
    if episode % 50 == 0:
        # Save the model weights every 50 episodes
        save_model(model, "main", episode)
        save_model(target_model, "target", episode)

# Save the final model weights
save_model(model, "main", episode)
save_model(target_model, "target", episode)