In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import random
from collections import deque

# Define environment: a 1-D track on which the agent walks left or right.
state_size = 1  # Network input width: the state is a 1-element array holding the position (0 to 4)
action_size = 2  # Network output width: action 1 moves right (+1), the other action moves left (-1)
goal_state = 4  # Reaching this position yields the +10 reward and ends the episode
episodes = 1000  # Number of training episodes run by the training loop

# Hyperparameters
learning_rate = 0.01  # Step size handed to the Adam optimizer
gamma = 0.95  # Discount factor applied to the best next-state Q-value
epsilon = 1.0  # Exploration rate: probability of picking a random action
epsilon_min = 0.01  # Epsilon stops being decayed once it is no longer above this value
epsilon_decay = 0.99  # Multiplicative decay applied to epsilon after each training step
batch_size = 32  # Transitions sampled per training step (also the minimum memory fill before training)
memory_size = 1000  # Capacity of the replay memory

# Build the Q-Network (Simple Neural Network)
def build_model():
    """Create and compile the Q-network.

    The network maps a state vector of width ``state_size`` to one Q-value
    per action (``action_size`` linear outputs) through two hidden ReLU
    layers of 10 units each. It is compiled with Adam (using the module's
    ``learning_rate``) and mean-squared-error loss.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential([
        # Declare the input shape with an explicit Input object rather than
        # `input_shape=` on the first Dense layer, which Keras warns about.
        keras.Input(shape=(state_size,)),
        keras.layers.Dense(10, activation="relu"),
        keras.layers.Dense(10, activation="relu"),
        keras.layers.Dense(action_size, activation="linear"),
    ])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mse",
    )
    return model

# Initialize memory and model
# Replay memory of (state, action, reward, next_state, done) tuples; the
# bounded deque discards the oldest entry once memory_size is reached.
memory = deque(maxlen=memory_size)
# The single Q-network used both for acting and for computing training targets.
model = build_model()

# Function to choose an action (Explore or Exploit)
def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action index in
    ``[0, action_size)`` is returned (explore); otherwise the action with
    the highest predicted Q-value is returned (exploit).

    Args:
        state: 1-D array-like of length ``state_size`` describing the
            current state.

    Returns:
        The chosen action index as a plain ``int``.
    """
    if np.random.rand() < epsilon:
        # Draw from the configured action space instead of a hard-coded [0, 1].
        return int(np.random.randint(action_size))  # Random action (Explore)

    # predict_on_batch avoids the per-call overhead of model.predict, which
    # is designed for large inputs, when scoring a single state.
    q_values = model.predict_on_batch(np.asarray([state], dtype=np.float32))
    return int(np.argmax(q_values[0]))  # Best action (Exploit)

# Function to train the model
def train_model():
    """Run one training step on a random minibatch from replay memory.

    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    Otherwise it samples ``batch_size`` transitions, builds the Q-learning
    targets (``reward`` for terminal transitions, else
    ``reward + gamma * max_a Q(next_state, a)``), fits the model on that
    batch once, and then decays the global ``epsilon`` towards
    ``epsilon_min``.

    Side effects:
        Updates the weights of the global ``model`` and the global
        ``epsilon``.
    """
    global epsilon
    if len(memory) < batch_size:
        return
    minibatch = random.sample(memory, batch_size)

    # Unpack the sampled transitions into column arrays so the network is
    # evaluated twice per training step instead of twice per transition.
    states = np.asarray([t[0] for t in minibatch], dtype=np.float32)
    actions = np.asarray([t[1] for t in minibatch], dtype=np.int64)
    rewards = np.asarray([t[2] for t in minibatch], dtype=np.float32)
    next_states = np.asarray([t[3] for t in minibatch], dtype=np.float32)
    dones = np.asarray([t[4] for t in minibatch], dtype=bool)

    # Start from the current predictions so that only the Q-value of the
    # action actually taken contributes to the loss. Copy to guarantee a
    # writable array.
    targets = np.array(model.predict_on_batch(states), copy=True)
    next_q_max = np.max(np.asarray(model.predict_on_batch(next_states)), axis=1)

    # Terminal transitions have no future return to bootstrap from.
    updated_q = np.where(dones, rewards, rewards + gamma * next_q_max)
    targets[np.arange(len(minibatch)), actions] = updated_q

    model.train_on_batch(states, targets)

    # Reduce exploration over time, never dropping below the floor.
    if epsilon > epsilon_min:
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Training Loop
# Each episode starts at position 0 and runs until the agent reaches
# goal_state. Every step is stored in replay memory and followed by one
# training step.
for episode in range(episodes):
    state = np.array([0])  # Start at position 0
    done = False
    total_reward = 0

    while not done:
        action = choose_action(state)
        step = 1 if action == 1 else -1  # Move left (-1) or right (+1)
        # Keep the agent on the track [0, goal_state]; use goal_state rather
        # than a hard-coded 4 so the track length is defined in one place.
        position = max(0, min(goal_state, state[0] + step))
        next_state = np.array([position])
        done = bool(position == goal_state)
        reward = 10 if done else -1  # Reward for reaching goal, step cost otherwise
        memory.append((state, action, reward, next_state, done))
        state = next_state
        total_reward += reward
        train_model()

    print(f"Episode {episode+1}: Total Reward = {total_reward}, Epsilon = {epsilon:.4f}")

# Test the trained agent
# Follow the greedy (argmax) policy from position 0. Policy and environment
# are both deterministic, so if the agent has not arrived after visiting as
# many steps as there are positions it is cycling and would never arrive;
# the walk is therefore bounded instead of looping forever.
state = np.array([0])  # Start at 0
max_test_steps = goal_state + 1  # Number of distinct positions on the track
print("\nTrained Agent Path:")
for _ in range(max_test_steps):
    if state[0] == goal_state:
        break
    action = np.argmax(model.predict(np.array([state]), verbose=0))  # Choose best action
    step = 1 if action == 1 else -1
    state = np.array([max(0, min(goal_state, state[0] + step))])
    print(f"Moved to {state[0]}")

if state[0] == goal_state:
    print("Goal Reached! 🎉")
else:
    print(f"Goal not reached within {max_test_steps} steps; the greedy policy is stuck.")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode 1: Total Reward = 7, Epsilon = 1.0000
Episode 2: Total Reward = -6, Epsilon = 1.0000
Episode 3: Total Reward = -2, Epsilon = 0.9703
Episode 4: Total Reward = -12, Epsilon = 0.7700
Episode 5: Total Reward = 1, Epsilon = 0.6964
Episode 6: Total Reward = 3, Epsilon = 0.6426
Episode 7: Total Reward = 3, Epsilon = 0.5930
Episode 8: Total Reward = 2, Epsilon = 0.5417
Episode 9: Total Reward = 3, Epsilon = 0.4998
Episode 10: Total Reward = 1, Epsilon = 0.4520
Episode 11: Total Reward = 3, Epsilon = 0.4171
Episode 12: Total Reward = 5, Epsilon = 0.3927
Episode 13: Total Reward = 3, Epsilon = 0.3624
Episode 14: Total Reward = 4, Epsilon = 0.3378
Episode 15: Total Reward = 5, Epsilon = 0.3180
Episode 16: Total Reward = 3, Epsilon = 0.2934
Episode 17: Total Reward = 5, Epsilon = 0.2763
Episode 18: Total Reward = 7, Epsilon = 0.2654
Episode 19: Total Reward = 6, Epsilon = 0.2524
Episode 20: Total Reward = 7, Epsilon = 0.2424
Episode 21: Total Reward = 7, Epsilon = 0.2329
Episode 22: Total 