In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

In [None]:
# --- 1. Define the DQN Agent ---

class DQNAgent:
    """Deep Q-Network agent with experience replay and an epsilon-greedy policy.

    The agent owns a small fully connected network that maps a state vector of
    length ``state_size`` to one Q-value per action, a bounded replay memory,
    and a single Adam optimizer that lives for the lifetime of the agent.

    Args:
        state_size: Number of features in a state vector (input width).
        action_size: Number of discrete actions (output width).
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters
        self.memory = deque(maxlen=2000)  # Experience replay memory (oldest entries dropped first)
        self.gamma = 0.95    # Discount factor for future rewards
        self.epsilon = 1.0   # Exploration rate (starts at 100%)
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001

        # The Deep Neural Network model
        self.model = self._build_model()

        # Build the optimizer and loss once. Re-creating Adam on every replay()
        # call would throw away its running moment estimates and step counter,
        # so each minibatch would start from a cold optimizer.
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.loss_fn = nn.MSELoss()

    def _build_model(self):
        """Returns a 2-hidden-layer MLP: state_size -> 24 -> 24 -> action_size."""
        model = nn.Sequential(
            nn.Linear(self.state_size, 24),
            nn.ReLU(),
            nn.Linear(24, 24),
            nn.ReLU(),
            nn.Linear(24, self.action_size)
        )
        return model

    @staticmethod
    def _as_batch(state):
        """Converts one state into a float32 tensor with a leading batch axis.

        ``torch.as_tensor`` is used instead of the legacy ``torch.FloatTensor``
        constructor, which interprets a bare integer argument as a size rather
        than as data.
        """
        return torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)

    def remember(self, state, action, reward, next_state, done):
        """Stores an experience in the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Chooses an action using the Epsilon-Greedy policy.

        Args:
            state: A single state (array-like accepted by ``torch.as_tensor``).

        Returns:
            int: Index of the chosen action in ``range(action_size)``.
        """
        # With probability epsilon, take a random action (explore)
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        # Otherwise, use the model to predict the best action (exploit)
        with torch.no_grad():
            act_values = self.model(self._as_batch(state))
        # Plain Python int rather than a NumPy scalar.
        return int(torch.argmax(act_values).item())

    def replay(self, batch_size):
        """Trains the network using a random batch from the replay memory.

        Performs one optimizer step per sampled transition, then decays
        ``epsilon`` once. Returns without training (and without decaying
        ``epsilon``) when fewer than ``batch_size`` transitions are stored.

        Args:
            batch_size: Number of transitions to sample without replacement.
        """
        if len(self.memory) < batch_size:
            return  # Don't train if memory is not full enough

        minibatch = random.sample(self.memory, batch_size)

        # Honour changes made to self.learning_rate after construction while
        # keeping the optimizer's accumulated state.
        for param_group in self.optimizer.param_groups:
            param_group["lr"] = self.learning_rate

        for state, action, reward, next_state, done in minibatch:
            # The Bellman Equation: target = reward + gamma * max(Q(next_state))
            target = reward
            if not done:
                # The bootstrap value is used as a constant, so no graph is needed.
                with torch.no_grad():
                    next_q_values = self.model(self._as_batch(next_state))
                target = reward + self.gamma * torch.max(next_q_values[0]).item()

            # Get current Q-values prediction
            current_q_values = self.model(self._as_batch(state))

            # Create the target Q-values. We only update the Q-value for the
            # action we took; the target is detached so it is a fixed label.
            target_q_values = current_q_values.detach().clone()
            target_q_values[0][action] = target

            # Standard PyTorch training step
            self.optimizer.zero_grad()
            loss = self.loss_fn(current_q_values, target_q_values)
            loss.backward()
            self.optimizer.step()

        # Decay epsilon to reduce exploration over time
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [4]:
# Environment parameters
STATE_SIZE = 1  # The agent's position on the line
ACTION_SIZE = 2 # 0: move left, 1: move right
EPISODES = 500
BATCH_SIZE = 32
MAX_STEPS = 50  # Max steps per episode
SEED = 42

# Seed every RNG the agent draws from (action sampling, minibatch sampling,
# network initialisation) so Restart & Run All reproduces the same run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Create our agent
agent = DQNAgent(STATE_SIZE, ACTION_SIZE)

# The simple 1D environment
line_size = 5 # A line from 0 to 4
goal_position = 4

for e in range(EPISODES):
    # Reset the environment for a new episode
    state = np.array([0]) # Start at position 0

    # `step` rather than `time`, to avoid shadowing the stdlib module name.
    for step in range(MAX_STEPS):
        # Agent chooses an action
        action = agent.act(state)

        # Environment reacts to the action; positions are clamped to the line.
        next_state = state.copy()
        if action == 0: # Move left
            next_state[0] = max(0, state[0] - 1)
        elif action == 1: # Move right
            next_state[0] = min(line_size - 1, state[0] + 1)

        # Determine the reward
        done = bool(next_state[0] == goal_position)  # plain bool, not a NumPy scalar
        reward = 10 if done else -1 # Big reward for reaching goal, small penalty otherwise

        # Agent stores this experience
        agent.remember(state, action, reward, next_state, done)

        # Update the state
        state = next_state

        if done:
            print(f"Episode: {e+1}/{EPISODES}, Score: {step+1}, Epsilon: {agent.epsilon:.2}")
            break

    # Train the agent with experience replay (once per episode; a no-op until
    # the memory holds at least BATCH_SIZE transitions).
    agent.replay(BATCH_SIZE)

Episode: 1/500, Score: 32, Epsilon: 1.0
Episode: 2/500, Score: 5, Epsilon: 0.99
Episode: 3/500, Score: 6, Epsilon: 0.99
Episode: 4/500, Score: 16, Epsilon: 0.99
Episode: 5/500, Score: 15, Epsilon: 0.98
Episode: 6/500, Score: 22, Epsilon: 0.98
Episode: 7/500, Score: 6, Epsilon: 0.97
Episode: 8/500, Score: 24, Epsilon: 0.97
Episode: 9/500, Score: 5, Epsilon: 0.96
Episode: 10/500, Score: 7, Epsilon: 0.96
Episode: 11/500, Score: 6, Epsilon: 0.95
Episode: 12/500, Score: 12, Epsilon: 0.95
Episode: 13/500, Score: 11, Epsilon: 0.94
Episode: 14/500, Score: 18, Epsilon: 0.94
Episode: 15/500, Score: 29, Epsilon: 0.93
Episode: 16/500, Score: 28, Epsilon: 0.93
Episode: 17/500, Score: 16, Epsilon: 0.92
Episode: 18/500, Score: 9, Epsilon: 0.92
Episode: 19/500, Score: 6, Epsilon: 0.91
Episode: 20/500, Score: 4, Epsilon: 0.91
Episode: 21/500, Score: 6, Epsilon: 0.9
Episode: 22/500, Score: 40, Epsilon: 0.9
Episode: 23/500, Score: 28, Epsilon: 0.9
Episode: 24/500, Score: 6, Epsilon: 0.89
Episode: 25/500,