In [1]:
import numpy as np
import tensorflow as tf

In [2]:
# Define the grid world environment
GRID_SIZE = 5
NUM_TREASURES = 5

In [3]:
# Create the treasure locations randomly
treasure_locations = np.random.randint(GRID_SIZE, size=(NUM_TREASURES, 2))

In [4]:
# Initialize the Q-network
num_actions = 4
input_shape = (GRID_SIZE, GRID_SIZE, 1)
inputs = tf.keras.Input(shape=input_shape)
x = tf.keras.layers.Flatten()(inputs)
x = tf.keras.layers.Dense(32, activation='relu')(x)
outputs = tf.keras.layers.Dense(num_actions)(x)
q_network = tf.keras.Model(inputs=inputs, outputs=outputs)

In [5]:
# Define the epsilon-greedy exploration strategy
def epsilon_greedy(action_values, epsilon):
    if np.random.rand() < epsilon:
        return np.random.randint(num_actions)
    else:
        return np.argmax(action_values)

In [6]:
# Define the loss function for training the Q-network
loss_fn = tf.keras.losses.MeanSquaredError()

In [7]:
# Define the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [8]:
# Define the training loop
num_episodes = 50
epsilon = 1.0  # Initial exploration factor
epsilon_min = 0.01  # Minimum exploration factor
epsilon_decay = 0.99  # Decay rate for exploration factor

for episode in range(num_episodes):
    state = np.zeros((GRID_SIZE, GRID_SIZE, 1))
    state[0, 0, 0] = 1  # Agent's initial position
    total_reward = 0
    done = False

    while not done:
        # Convert the state to a tensor and get the action values from the Q-network
        state_tensor = tf.convert_to_tensor(state[np.newaxis, ...], dtype=tf.float32)
        action_values = q_network(state_tensor)

        # Choose an action using epsilon-greedy exploration
        action = epsilon_greedy(action_values[0], epsilon)

        # Simulate the action and observe the next state and reward
        next_state = np.copy(state)
        reward = 0

        if action == 0:  # Move up
            if next_state[0, 0, 0] > 0:
                next_state[0, 0, 0] -= 1
        elif action == 1:  # Move down
            if next_state[0, 0, 0] < GRID_SIZE - 1:
                next_state[0, 0, 0] += 1
        elif action == 2:  # Move left
            if next_state[0, 0, 0] > 0:
                next_state[0, 0, 0] -= 1
        elif action == 3:  # Move right
            if next_state[0, 0, 0] < GRID_SIZE - 1:
                next_state[0, 0, 0] += 1

        for i in range(NUM_TREASURES):
            if np.array_equal(next_state[0, 0], treasure_locations[i]):
                reward += 1
                treasure_locations[i] = np.array([-1, -1])  # Remove the collected treasure

# Update the Q-network using the Bellman equation
        with tf.GradientTape() as tape:
            next_state_tensor = tf.convert_to_tensor(next_state[np.newaxis, ...], dtype=tf.float32)
            next_action_values = q_network(next_state_tensor)
            max_next_action_value = tf.reduce_max(next_action_values, axis=-1)
            target = tf.where(done, reward, reward + max_next_action_value)
            target_f = tf.reshape(target, shape=(1, 1))
            predicted_values = tf.reduce_sum(action_values * tf.one_hot(action, num_actions), axis=-1)
            predicted_values_f = tf.reshape(predicted_values, shape=(1, 1))
            loss = loss_fn(target_f, predicted_values_f)

        grads = tape.gradient(loss, q_network.trainable_variables)
        optimizer.apply_gradients(zip(grads, q_network.trainable_variables))

        state = next_state
        total_reward += reward

        if reward == NUM_TREASURES:
            done = True

# Decay the exploration factor after each episode
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    print("Episode:", episode, "Total Reward:", total_reward, "Epsilon:", epsilon)

KeyboardInterrupt: ignored