In [None]:
import random
import gymnasium as gym
import numpy as np
import tensorflow as tf

from glob import glob
from time import perf_counter
from collections import deque
from pathlib import Path
from keras.layers import Conv2D, Flatten, Dense
from skimage.color import rgb2gray

In [None]:
# Report how many GPU devices TensorFlow detects on this machine.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# Create the Tetris environment (registered under the "ALE/Tetris-v5" id) through Gymnasium
env = gym.make("ALE/Tetris-v5")

In [None]:
# Directory that holds the saved weight files
model_path = Path("models")

# Shape of a raw observation and number of discrete actions exposed by the environment
state_space, action_space = (
    env.observation_space.shape,  # (210, 160, 3)
    env.action_space.n,  # 5
)

In [None]:
# --- Hyperparameters ---

# Network input: height and width of the raw frame plus a single grayscale channel,
# i.e. (210, 160, 1)
input_shape = (state_space[0], state_space[1], 1)

# Training schedule
num_episodes = 10_000
batch_size = 256
update_target_network_steps = 10_000

# Optimisation and return discounting
learning_rate = 1e-4
discount_rate = 0.99

# Epsilon-greedy exploration: start at the maximum and decay multiplicatively per step
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.99999
exploration_rate = max_exploration_rate

In [None]:
def create_model() -> tf.keras.Model:
    """Build the convolutional Q-network.

    The network reads a frame of shape ``input_shape`` and emits one linear
    output per action (``action_space`` outputs in total).

    Returns
    -------
    tf.keras.Model
        An uncompiled sequential model made of, in order:
        - Conv2D, 32 filters, 8x8 kernel, stride 4, relu
        - Conv2D, 64 filters, 4x4 kernel, stride 2, relu
        - Conv2D, 64 filters, 3x3 kernel, stride 1, relu
        - Flatten
        - Dense, 512 units, relu
        - Dense, ``action_space`` units, linear
    """
    q_network = tf.keras.Sequential()

    # Convolutional feature extractor; only the first layer needs the input shape.
    q_network.add(
        Conv2D(filters=32, kernel_size=8, strides=4, activation='relu', input_shape=input_shape)
    )
    q_network.add(Conv2D(filters=64, kernel_size=4, strides=2, activation='relu'))
    q_network.add(Conv2D(filters=64, kernel_size=3, strides=1, activation='relu'))

    # Fully connected head producing one value per action.
    q_network.add(Flatten())
    q_network.add(Dense(units=512, activation='relu'))
    q_network.add(Dense(units=action_space, activation='linear'))

    return q_network

In [None]:
# Build the online network first, then the target network with the same architecture
model, target_model = create_model(), create_model()

# Only the online network is compiled (Adam optimizer, mean squared error loss)
model.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate),
)

# Start the target network as an exact copy of the online network
target_model.set_weights(model.get_weights())

In [None]:
def train(model: tf.keras.Model, target_model: tf.keras.Model, minibatch: list[tuple], discount_rate: float):
    """Run one gradient update of the online network on a minibatch of transitions.

    For every transition the target of the action that was taken becomes
    ``reward`` if the transition is terminal, otherwise
    ``reward + discount_rate * max_a target_model(next_state)[a]``.
    The targets of all other actions are left at the online network's current
    predictions, so they contribute no error.

    Parameters
    ----------
    model : tf.keras.Model
        The main neural network model that is being trained.
    target_model : tf.keras.Model
        The target neural network model that is used to predict the Q-values for the next state.
    minibatch : list[tuple]
        Sequence of ``(state, action, reward, next_state, done)`` transitions.
        ``action`` must be an integer index into the model's outputs.
        An empty minibatch is a no-op.
    discount_rate : float
        The discount rate to use when calculating the Q-values.
    """
    batch_len = len(minibatch)
    if batch_len == 0:
        # Nothing to learn from; avoid calling predict/fit on empty arrays.
        return

    # Transpose the list of transitions into one array per field, each with a
    # leading batch axis of length batch_len.
    states, actions, rewards, next_states, dones = (
        np.array(column) for column in zip(*minibatch)
    )
    dones = dones.astype(bool)

    # Predict Q-values for starting state and next state
    current_q_values = model.predict(states, verbose=0)
    next_q_values = target_model.predict(next_states, verbose=0)

    # Bootstrapped return for the taken action; terminal transitions do not bootstrap.
    best_next_q = np.amax(next_q_values, axis=1)
    td_targets = np.where(dones, rewards, rewards + discount_rate * best_next_q)

    # Overwrite only the entry of the action actually taken in each row.
    target_q_values = current_q_values.copy()
    target_q_values[np.arange(batch_len), actions] = td_targets

    model.fit(states, target_q_values, epochs=1, verbose=0, batch_size=batch_len)

In [None]:
def save_model(model: tf.keras.Model, name_prefix: str, episode: int):
    """Save the model weights as ``<name_prefix>_episode_<episode>.h5`` under ``model_path``.

    The ``model_path`` directory is created first if it does not exist yet, so
    the first save of a fresh run does not fail on a missing folder.

    Parameters
    ----------
    model : tf.keras.Model
        The model to save the weights of.
    name_prefix : str
        Prefix to use when saving the model weights.
    episode : int
        The episode number to use when saving the model weights.
    """
    # Writing the .h5 file does not create missing parent directories.
    model_path.mkdir(parents=True, exist_ok=True)
    model.save_weights(model_path / f"{name_prefix}_episode_{episode}.h5")

In [None]:
def load_models(episode: int | None = None) -> int:
    """Load the weights of the global ``model`` and ``target_model`` from ``model_path``.

    Parameters
    ----------
    episode : int | None
        The episode to load the model weights from. If None, the latest episode
        for which both a ``main_episode_<n>.h5`` and a ``target_episode_<n>.h5``
        file exist is used.

    Returns
    -------
    int
        The episode number that the model weights were loaded from.

    Raises
    ------
    FileNotFoundError
        If ``episode`` is None and no episode has both weight files saved.
    """
    if episode is None:
        def saved_episodes(prefix: str) -> set[int]:
            """Collect the episode numbers of files named '<prefix>_episode_<n>.h5'."""
            numbers = set()
            for weights_file in model_path.glob(f"{prefix}_episode_*.h5"):
                suffix = weights_file.stem.rsplit("_", 1)[-1]
                # Ignore files whose trailing token is not a plain episode number.
                if suffix.isdigit():
                    numbers.add(int(suffix))
            return numbers

        # Both networks are loaded below, so only episodes with both files qualify.
        resumable = saved_episodes("main") & saved_episodes("target")
        if not resumable:
            raise FileNotFoundError(
                f"No matching main/target weight files found in '{model_path}'"
            )
        episode = max(resumable)
    print(f"Loading models from episode {episode}...")
    model.load_weights(model_path / f"main_episode_{episode}.h5")
    target_model.load_weights(model_path / f"target_episode_{episode}.h5")
    return episode

In [None]:
# Initialize a step counter and replay memory
step_counter = 0
replay_memory = deque(maxlen=10000)

# Make sure the checkpoint directory exists before anything is saved into it.
model_path.mkdir(parents=True, exist_ok=True)

# Resume after the newest checkpoint when one exists. On a fresh run there is
# nothing to load, so start at episode 0 instead of failing inside load_models().
if any(model_path.glob("main_episode_*.h5")):
    start_episode = load_models() + 1
else:
    start_episode = 0
# NOTE(review): exploration_rate and step_counter are not stored in checkpoints,
# so a resumed run continues with whatever values the kernel currently holds.

for episode in range(start_episode, num_episodes):
    state, _ = env.reset()
    # Grayscale frames are kept as float32 to halve the replay-memory footprint.
    state = rgb2gray(state).astype(np.float32)
    done = False
    start_time = perf_counter()
    while not done:
        # Exploration-exploitation trade-off
        exploration_threshold = random.uniform(0, 1)
        # If exploration_threshold > exploration_rate, then exploitation
        if exploration_threshold > exploration_rate:
            q_values = model.predict(np.expand_dims(state, axis=0), verbose=0) # add batch dimension
            action = np.argmax(q_values[0])
        else:
            action = env.action_space.sample()

        # Increment the step counter
        step_counter += 1

        # Take action and observe the next state and reward.
        # The episode ends on either flag, but only `terminated` marks a true
        # terminal state; a truncated episode should still bootstrap in train().
        next_state, reward, terminated, truncated, _ = env.step(action)
        next_state = rgb2gray(next_state).astype(np.float32)
        done = terminated or truncated

        # Add the experience to replay memory
        replay_memory.append((state, action, reward, next_state, terminated))

        # Advance to the observed frame so the next action and the next stored
        # transition start from the current state, not from the reset frame.
        state = next_state

        # Sample a minibatch from the replay buffer
        if len(replay_memory) > batch_size:
            minibatch = random.sample(replay_memory, batch_size)
            # Train the model on the minibatch
            train(model, target_model, minibatch, discount_rate)

        if step_counter % update_target_network_steps == 0:
            # Update the target network with new weights
            target_model.set_weights(model.get_weights())

        # Decay the exploration rate
        exploration_rate = max(min_exploration_rate, exploration_rate * exploration_decay_rate)
    # End of episode
    print(f'Episode: {episode}, Exploration Rate: {exploration_rate:.2f}, Time: {perf_counter() - start_time:.2f}s')
    if episode % 100 == 0:
        # Save the model weights every 100 episodes
        save_model(model, "main", episode)
        save_model(target_model, "target", episode)

# Save the final model weights. Skipped when no episode ran in this cell
# (already past num_episodes), because `episode` would then be undefined.
if start_episode < num_episodes:
    save_model(model, "main", episode)
    save_model(target_model, "target", episode)

In [None]:
# Visualize the agent playing the game (disabled: uncomment the lines below to run it)
# env = gym.make("ALE/Tetris-v5", render_mode='human')
# state, _ = env.reset()
# state = rgb2gray(state)
# done = False
# while not done:
#     q_values = target_model.predict(np.expand_dims(state, axis=0), verbose=0) # add batch dimension
#     action = np.argmax(q_values[0])
#     next_state, reward, done, _, _ = env.step(action)
#     next_state = rgb2gray(next_state)
#     env.render()
#     state = next_state