# In [None]:  (Jupyter cell marker, kept as a comment so the file parses as Python)
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
from keras import layers
from gymnasium.wrappers import FrameStack
from gymnasium.experimental.wrappers import GrayscaleObservationV0
import gymnasium as gym
import numpy as np
import tensorflow as tf

# Fixed seed so environment resets and randomly sampled actions are repeatable.
seed = 42

env = gym.make('ALE/Frogger-v5')

# Collapse each RGB frame to a single grayscale channel.
env = GrayscaleObservationV0(env)
print(env.observation_space.shape)  # Print observation space dimensions

# Stack the 4 most recent frames into one observation.
env = FrameStack(env, 4)

# Shape of a stacked observation; the leading axis is the number of frames.
# NOTE(review): the two trailing axes are named (width, height) because later
# code refers to these names; the frames are presumably laid out as
# (rows, columns) -- confirm before relying on the names.
frames, width, height = env.observation_space.shape

# Gymnasium environments are seeded through reset(seed=...); they do not
# expose an env.seed() method. Seeding once is enough: later reset() calls
# made without a seed keep using the same random generator.
env.reset(seed=seed)
env.action_space.seed(seed)

# Define the create_CNN function
def create_CNN(input_shape, num_actions, *, frames_first=True):
    """Build the convolutional Q-network used by the DQN agent.

    The network maps one stacked observation to one Q-value per action.

    Args:
        input_shape: Shape of a single observation, without the batch axis.
            Must have three dimensions.
        num_actions: Number of units in the linear output layer (one Q-value
            per action).
        frames_first: When True (the default) ``input_shape`` is interpreted
            as ``(frames, dim_a, dim_b)`` and the frame axis is moved to the
            end before the first convolution, so the stacked frames act as
            channels. Pass False if the observations are already
            channels-last.

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    stack = [keras.Input(shape=input_shape)]

    if frames_first:
        # Conv2D uses Keras's image data format, which is channels-last
        # unless it has been reconfigured. Without this permutation the
        # convolutions would slide over the frame axis and treat one of the
        # spatial axes as channels.
        stack.append(layers.Permute((2, 3, 1)))

    stack += [
        layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu', padding='same'),
        layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu', padding='same'),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        # Linear head: raw Q-value estimates, one per action.
        layers.Dense(num_actions, activation='linear'),
    ]
    return keras.Sequential(stack)


# Number of discrete actions, read from the environment so both networks are
# guaranteed to be built with the same output width.
num_actions = int(env.action_space.n)

# The first neural net makes the predictions for Q-values, which are used to take an action.
cnn1 = create_CNN((frames, width, height), num_actions=num_actions)

# A second cnn is used to predict future rewards. Its weights are only
# refreshed from cnn1 every `update_cnn2` frames.
cnn2 = create_CNN((frames, width, height), num_actions=num_actions)

# Start the second network as an exact copy of the first. Otherwise the
# targets computed before the first periodic refresh would come from an
# unrelated random initialisation.
cnn2.set_weights(cnn1.get_weights())

# --- DQN hyperparameters ---------------------------------------------------

# Discount applied to the estimated future return in the Bellman target.
gamma = 0.99

# Bounds of the epsilon-greedy schedule and the total distance between them;
# the training loop subtracts a fraction of this span from epsilon per frame.
epsilon_min, epsilon_max = 0.1, 1.0
epsilon_interval = epsilon_max - epsilon_min

# Current probability of picking a random action; starts fully random.
epsilon = 1

# Upper bound used by the per-episode step loop.
max_steps_per_episode = 50

# Training stops once this many episodes have been played; a value of zero
# or less disables the cap.
max_episodes = 10_000

# --- Optimisation set-up ---------------------------------------------------

# Number of transitions drawn from the replay buffer for each gradient step.
batch_size = 32
# Huber loss between the target Q-values and the predicted Q-values.
loss_function = keras.losses.Huber()
# Adam with gradient-norm clipping.
optimizer = keras.optimizers.Adam(learning_rate=2.5e-4, clipnorm=1.0)

# --- Experience replay -----------------------------------------------------

# Five parallel lists; index i across all of them describes one transition.
(action_history,
 state_history,
 state_next_history,
 rewards_history,
 done_history) = ([] for _ in range(5))

# Total reward of the most recent episodes, and bookkeeping counters.
episode_reward_history = []
running_reward = episode_count = frame_count = 0

# --- Schedules -------------------------------------------------------------

# Frames during which every action is random.
epsilon_random_frames = 50_000
# Frames over which epsilon is annealed from its maximum to its minimum.
epsilon_greedy_frames = 1_000_000.0
# Maximum number of transitions kept in the replay buffer. A DeepMind paper
# reportedly suggests 1000000; a smaller value is used to limit memory use.
max_memory_length = 100_000
# Run one gradient step every this many frames.
update_after_actions = 4
# Copy cnn1's weights into cnn2 every this many frames.
update_cnn2 = 10_000


# DQN Algorithm
#
# Each outer iteration plays one episode. Every frame the agent acts
# epsilon-greedily, stores the transition in the replay lists, and every
# `update_after_actions` frames takes one gradient step on cnn1 using targets
# computed with cnn2. Training ends when the mean reward of the last 100
# episodes exceeds 40 or when `max_episodes` episodes have been played.

# Size of the discrete action set, used for random exploration and for the
# one-hot action mask. It must equal the number of outputs of cnn1.
num_actions = int(env.action_space.n)

while True:
    observation, _ = env.reset()
    state = np.array(observation)
    episode_reward = 0

    # Play at most `max_steps_per_episode` steps in this episode.
    for timestep in range(max_steps_per_episode):
        frame_count += 1

        # Act randomly during the warm-up frames and afterwards with
        # probability epsilon; otherwise take the greedy action from cnn1.
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            action = np.random.choice(num_actions)
        else:
            state_tensor = keras.ops.convert_to_tensor(state)
            state_tensor = keras.ops.expand_dims(state_tensor, 0)
            action_probs = cnn1(state_tensor, training=False)
            action = keras.ops.argmax(action_probs[0]).numpy()

        # Linearly anneal epsilon, never going below epsilon_min.
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        state_next, reward, terminated, truncated, _ = env.step(action)
        state_next = np.array(state_next)
        # The episode is over when the environment reports either flag;
        # stepping again after that would require a reset first.
        done = terminated or truncated

        episode_reward += reward

        # NOTE(review): every transition stores two full frame stacks as
        # separate arrays; with `max_memory_length` entries this may need a
        # lot of memory -- confirm it fits on the target machine.
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        # Only a true terminal state disables bootstrapping in the target; a
        # truncated episode still has a meaningful next-state value.
        done_history.append(terminated)
        rewards_history.append(reward)
        state = state_next

        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            # Sample a minibatch of transition indices (with replacement).
            indices = np.random.choice(len(done_history), size=batch_size)

            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = keras.ops.convert_to_tensor(
                [rewards_history[i] for i in indices], dtype="float32"
            )
            action_sample = [action_history[i] for i in indices]
            done_sample = keras.ops.convert_to_tensor(
                [float(done_history[i]) for i in indices]
            )

            # Targets come from the periodically-synced second network.
            # verbose=0 stops predict() printing a progress bar every step.
            future_rewards = cnn2.predict(state_next_sample, verbose=0)
            updated_q_values = rewards_sample + gamma * keras.ops.amax(future_rewards, axis=1)

            # For terminal transitions the target is replaced by -1.
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # One-hot mask so the loss only involves the action actually taken.
            masks = keras.ops.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                q_values = cnn1(state_sample)

                # Q-value predicted for the taken action in each transition.
                q_action = keras.ops.sum(keras.ops.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            grads = tape.gradient(loss, cnn1.trainable_variables)
            optimizer.apply_gradients(zip(grads, cnn1.trainable_variables))

        # Periodically refresh the second network from the first.
        if frame_count % update_cnn2 == 0:
            cnn2.set_weights(cnn1.get_weights())

        # Drop the oldest transition once the replay lists exceed their cap.
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        if done:
            break

    # Running reward is the mean over (at most) the last 100 episodes.
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]

    running_reward = np.mean(episode_reward_history)
    episode_count += 1

    if running_reward > 40:
        print("Learned at episode {}!".format(episode_count))
        break

    if max_episodes > 0 and episode_count >= max_episodes:
        print("Stopped at episode {}!".format(episode_count))
        break
