## 1. Environment Setup & Demo

First, let's import libraries and test the custom ShariqQuest environment.

In [None]:
# Install dependencies (uncomment if needed)
# !pip install gymnasium numpy matplotlib pygame torch seaborn

import numpy as np
import matplotlib.pyplot as plt
import sys
import os

# Add project paths (relative to the notebook's working directory).
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel.
for project_dir in ('1-custom-environment', '2-q-learning-agent', '3-dqn-agent'):
    if project_dir not in sys.path:
        sys.path.append(project_dir)

print("✓ Libraries loaded successfully!")

### Environment Specifications

- **Grid Size:** 7×7
- **Agent Start:** Bottom-left (6, 0)
- **Goal:** Top-right (0, 6)
- **Traps:** 4 hell states
- **Obstacles:** 5 barriers

**Rewards:**
- Goal: +10
- Trap: -1
- Step: -0.1

In [None]:
# Import custom environment
import sys
import os

# Put the Q-learning agent directory (resolved against the current working
# directory) at the front of the import path so `padm_env` can be imported.
sys.path.insert(0, os.path.join(os.getcwd(), '2-q-learning-agent'))

from padm_env import create_env

# Create environment: one goal cell, four hell-state (trap) cells and five
# obstacle cells, all given as coordinate pairs.
env = create_env(
    goal_coordinates=(0, 6),
    hell_state_coordinates=[(3, 2), (2, 3), (4, 4), (3, 5)],
    obstacle_coordinates=[(3, 1), (3, 3), (4, 3), (5, 3), (1, 5)],
    render_mode=False  # Disable Pygame window in notebook
)

print("✓ Environment created!")
print(f"  Action space: {env.action_space}")
print(f"  Observation space: {env.observation_space}")

### Test Random Agent

In [None]:
# Smoke test: roll out one episode with uniformly random actions, capped at
# max_steps so the cell always finishes.
state, _ = env.reset()
done = False
total_reward = 0
steps = 0
max_steps = 50

print("Starting position:", state)

while not done and steps < max_steps:
    action = env.action_space.sample()  # Random action
    state, reward, terminated, truncated, info = env.step(action)
    # The episode is over when the env reports either flag; stepping an
    # environment that has already truncated is not meaningful.
    done = terminated or truncated
    total_reward += reward
    steps += 1

print(f"\n✓ Episode finished!")
print(f"  Steps: {steps}")
print(f"  Total reward: {total_reward:.2f}")
print(f"  Final state: {state}")
print(f"  Goal reached: {info.get('goal_reached', False)}")

---

## 2. Q-Learning Agent

**Algorithm:** Tabular Q-Learning

$$Q(s,a) \leftarrow Q(s,a) + \alpha[r + \gamma \cdot \max_{a'} Q(s',a') - Q(s,a)]$$

Where:
- α = learning rate
- γ = discount factor
- r = reward

In [None]:
# Q-Learning parameters: step size and discount first, then the
# epsilon-greedy schedule (start value, floor, per-episode multiplier).
learning_rate, gamma = 0.03, 0.99
epsilon, epsilon_min, epsilon_decay = 1.0, 0.05, 0.999

# Initialize Q-table: one zero-initialised entry per (row, column, action).
q_table = np.zeros(shape=(7, 7, 4))  # 7x7 grid, 4 actions

print("Q-Learning Agent initialized!")
print(f"Q-table shape: {q_table.shape}")

### Training Loop (Mini Demo - 1000 episodes)

In [None]:
# Mini training (1000 episodes for demo)
num_episodes = 1000
rewards_history = []  # total reward collected in each episode

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0

    while not done:
        # Epsilon-greedy action selection
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state[0], state[1]])

        # Take action
        next_state, reward, terminated, truncated, info = env.step(action)
        # End the episode on truncation as well as termination; looping on
        # the termination flag alone never exits if the env truncates.
        done = terminated or truncated

        # Q-learning update: move Q(s, a) towards r + gamma * max_a' Q(s', a')
        old_q = q_table[state[0], state[1], action]
        next_max_q = np.max(q_table[next_state[0], next_state[1]])
        new_q = old_q + learning_rate * (reward + gamma * next_max_q - old_q)
        q_table[state[0], state[1], action] = new_q

        state = next_state
        episode_reward += reward

    # Decay epsilon once per episode, never below epsilon_min
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    rewards_history.append(episode_reward)

    if (episode + 1) % 200 == 0:
        avg_reward = np.mean(rewards_history[-100:])
        print(f"Episode {episode + 1}/{num_episodes} | Avg Reward: {avg_reward:.2f} | ε: {epsilon:.4f}")

print("\n✓ Q-Learning training completed!")

### Visualize Q-Learning Performance

In [None]:
# Plot training curve: raw per-episode reward plus a 100-episode moving average.
q_window = 100
q_smoothed = np.convolve(rewards_history, np.ones(q_window) / q_window, mode='valid')
# With mode='valid' the k-th smoothed value averages episodes k..k+q_window-1.
# Plot it at the last episode of its window so the curve lines up with the
# raw rewards instead of being shifted q_window-1 episodes to the left.
q_smoothed_x = np.arange(q_window - 1, q_window - 1 + len(q_smoothed))

plt.figure(figsize=(10, 5))
plt.plot(rewards_history, alpha=0.3, label='Raw rewards')
plt.plot(q_smoothed_x, q_smoothed, label='Moving average (100)', linewidth=2)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Q-Learning Training Progress')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Final average reward: {np.mean(rewards_history[-100:]):.2f}")

### Test Trained Q-Learning Agent

In [None]:
# Test trained agent: follow the greedy policy from the Q-table for one
# episode, recording every visited state. The path-length cap stops a policy
# that cycles from running forever.
state, _ = env.reset()
done = False
path = [tuple(state)]
total_reward = 0

while not done and len(path) < 50:
    action = np.argmax(q_table[state[0], state[1]])  # Greedy
    state, reward, terminated, truncated, info = env.step(action)
    # Stop on either end-of-episode signal from the environment.
    done = terminated or truncated
    path.append(tuple(state))
    total_reward += reward

print(f"✓ Test episode completed!")
print(f"  Steps: {len(path) - 1}")
print(f"  Total reward: {total_reward:.2f}")
print(f"  Goal reached: {info.get('goal_reached', False)}")
print(f"  Path: {path[:10]}..." if len(path) > 10 else f"  Path: {path}")

---

## 3. Deep Q-Network (DQN)

**Neural Network Architecture:**
```
Input (2) → Dense(128, ReLU) → Dense(128, ReLU) → Output (4)
```

**Key Components:**
- Experience Replay Buffer (50,000 transitions)
- Target Network (updated every 20 episodes)
- Huber Loss (Smooth L1)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random

# Pick the compute device: CUDA when PyTorch reports it available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# DQN Model
class Qnet(nn.Module):
    """Fully connected Q-network mapping a state vector to per-action Q-values.

    Two hidden layers of 128 ReLU units feed a linear output layer with one
    unit per action.

    Args:
        no_states: Length of the input state vector.
        no_actions: Number of discrete actions (width of the output layer).
    """

    def __init__(self, no_states=2, no_actions=4):
        super().__init__()
        self.fc1 = nn.Linear(no_states, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, no_actions)

    def forward(self, x):
        """Return the Q-values for ``x``; the last dimension indexes actions."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def sample_action(self, state, epsilon):
        """Choose an action epsilon-greedily for a single state.

        Args:
            state: One state as a sequence or array of ``no_states`` numbers.
            epsilon: Probability of picking a uniformly random action.

        Returns:
            The chosen action index as a Python int.
        """
        if random.random() < epsilon:
            # Draw from the network's actual action count rather than a fixed
            # 0..3 range, so a model built with a different no_actions never
            # returns an index it has no output for.
            return random.randrange(self.fc3.out_features)
        with torch.no_grad():
            # Build the input on whatever device the weights live on, so the
            # method does not depend on a module-level `device` variable.
            state_t = torch.as_tensor(
                state, dtype=torch.float32, device=self.fc1.weight.device
            ).unsqueeze(0)
            q_values = self.forward(state_t)
            return q_values.argmax().item()

# Replay Buffer
class ReplayBuffer:
    """Bounded store of transitions with uniform random sampling.

    Args:
        buffer_limit: Maximum number of transitions kept at once.
    """

    def __init__(self, buffer_limit=50000):
        # A deque with maxlen drops its oldest item when a new one is
        # appended at capacity.
        self.buffer = deque(maxlen=buffer_limit)

    def put(self, transition):
        """Append one transition to the newest end of the buffer."""
        self.buffer.append(transition)

    def sample(self, n):
        """Return ``n`` stored transitions drawn without replacement."""
        stored = self.buffer
        return random.sample(stored, n)

    def size(self):
        """Return how many transitions are currently stored."""
        stored = self.buffer
        return len(stored)

# Confirmation message for the cell (check mark restored from mis-decoded bytes).
print("✓ DQN model and replay buffer defined!")

### Initialize DQN Agent

In [None]:
# Hyperparameters
# NOTE: learning_rate and gamma reuse the names of the Q-learning settings,
# so from this cell onward they hold the DQN values.
learning_rate = 0.005
gamma = 0.98
buffer_limit = 50000
batch_size = 32

# Initialize networks: the target network starts as an exact copy of the
# online network's weights.
q_net = Qnet().to(device)
q_target = Qnet().to(device)
q_target.load_state_dict(q_net.state_dict())

# Optimizer (updates the online network only) and replay memory
optimizer = optim.Adam(q_net.parameters(), lr=learning_rate)
memory = ReplayBuffer(buffer_limit)

print("✓ DQN agent initialized!")
print(f"  Parameters: {sum(p.numel() for p in q_net.parameters())}")

### DQN Training Loop (Mini Demo - 500 episodes)

In [None]:
# Mini training (500 episodes for demo)
num_episodes_dqn = 500
# NOTE(review): exploration starts at only 8% random actions; confirm that
# 0.08 (rather than e.g. 0.8) is the intended starting epsilon.
epsilon_dqn = 0.08
epsilon_end = 0.01
epsilon_decay_dqn = 0.995
target_update_freq = 20  # episodes between target-network syncs

dqn_rewards_history = []  # total reward collected in each episode

for episode in range(num_episodes_dqn):
    state, _ = env.reset()
    done = False
    episode_reward = 0

    while not done:
        # Select action
        action = q_net.sample_action(state, epsilon_dqn)

        # Take action
        next_state, reward, terminated, truncated, info = env.step(action)
        # The episode ends on either flag; looping on termination alone
        # would never exit if the environment truncates.
        done = terminated or truncated

        # Store transition. The stored flag is `terminated`, so a truncated
        # (cut short, not finished) step still bootstraps from its next state.
        memory.put((state, action, reward, next_state, terminated))

        # Train if enough samples
        if memory.size() > 200:
            # Sample mini-batch and split it into per-field tuples
            mini_batch = memory.sample(batch_size)
            b_states, b_actions, b_rewards, b_next_states, b_terminals = zip(*mini_batch)

            # Stack each field into one NumPy array before handing it to
            # torch; building tensors from Python lists of arrays is slow.
            states = torch.as_tensor(np.asarray(b_states), dtype=torch.float32, device=device)
            actions = torch.as_tensor(np.asarray(b_actions), dtype=torch.int64, device=device)
            rewards = torch.as_tensor(np.asarray(b_rewards), dtype=torch.float32, device=device)
            next_states = torch.as_tensor(np.asarray(b_next_states), dtype=torch.float32, device=device)
            dones = torch.as_tensor(np.asarray(b_terminals), dtype=torch.float32, device=device)

            # Compute Q-values of the actions that were actually taken
            q_values = q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

            # Compute target Q-values from the frozen target network;
            # (1 - dones) removes the bootstrap term for terminal transitions.
            with torch.no_grad():
                max_next_q = q_target(next_states).max(1)[0]
                target_q = rewards + gamma * max_next_q * (1 - dones)

            # Compute loss and update
            loss = F.smooth_l1_loss(q_values, target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        state = next_state
        episode_reward += reward

    # Update target network
    if (episode + 1) % target_update_freq == 0:
        q_target.load_state_dict(q_net.state_dict())

    # Decay epsilon once per episode, never below epsilon_end
    epsilon_dqn = max(epsilon_end, epsilon_dqn * epsilon_decay_dqn)
    dqn_rewards_history.append(episode_reward)

    if (episode + 1) % 100 == 0:
        avg_reward = np.mean(dqn_rewards_history[-100:])
        print(f"Episode {episode + 1}/{num_episodes_dqn} | Avg Reward: {avg_reward:.2f} | ε: {epsilon_dqn:.4f}")

print("\n✓ DQN training completed!")

### Visualize DQN Performance

In [None]:
# Plot DQN training curve: raw per-episode reward plus a 50-episode moving average.
dqn_window = 50
dqn_smoothed = np.convolve(dqn_rewards_history, np.ones(dqn_window) / dqn_window, mode='valid')
# With mode='valid' the k-th smoothed value averages episodes
# k..k+dqn_window-1. Plot it at the last episode of its window so the curve
# lines up with the raw rewards instead of being shifted to the left.
dqn_smoothed_x = np.arange(dqn_window - 1, dqn_window - 1 + len(dqn_smoothed))

plt.figure(figsize=(10, 5))
plt.plot(dqn_rewards_history, alpha=0.3, label='Raw rewards')
plt.plot(dqn_smoothed_x, dqn_smoothed, label='Moving average (50)', linewidth=2)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('DQN Training Progress')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Final average reward: {np.mean(dqn_rewards_history[-50:]):.2f}")

### Test Trained DQN Agent

In [None]:
# Test trained DQN agent: follow the greedy policy (epsilon = 0) for one
# episode, recording every visited state. The path-length cap stops a policy
# that cycles from running forever.
q_net.eval()
state, _ = env.reset()
done = False
path = [tuple(state)]
total_reward = 0

while not done and len(path) < 50:
    action = q_net.sample_action(state, epsilon=0.0)  # Greedy
    state, reward, terminated, truncated, info = env.step(action)
    # Stop on either end-of-episode signal from the environment.
    done = terminated or truncated
    path.append(tuple(state))
    total_reward += reward

print(f"✓ Test episode completed!")
print(f"  Steps: {len(path) - 1}")
print(f"  Total reward: {total_reward:.2f}")
print(f"  Goal reached: {info.get('goal_reached', False)}")
print(f"  Path: {path[:10]}..." if len(path) > 10 else f"  Path: {path}")

---

## 4. Results Visualization

### Load Pre-trained Models (if available)

In [None]:
# Load pre-trained Q-table; fall back to the table trained in this notebook
# when the file is missing. Only the load itself sits inside `try`, so an
# error raised while reporting is not mistaken for a missing file.
try:
    q_table_trained = np.load('2-q-learning-agent/q_table.npy')
except FileNotFoundError:
    print("⚠ Pre-trained Q-table not found. Using freshly trained one.")
    q_table_trained = q_table
else:
    print("✓ Loaded trained Q-table from file")
    print(f"  Shape: {q_table_trained.shape}")
    print(f"  Non-zero values: {np.count_nonzero(q_table_trained)}")

# Load pre-trained DQN model; fall back to the network trained in this
# notebook when the checkpoint file is missing.
# NOTE(security): depending on the PyTorch version and its weights_only
# setting, torch.load may unpickle arbitrary objects. Only load checkpoints
# from a source you trust.
try:
    dqn_state_dict = torch.load('3-dqn-agent/dqn.pth', map_location=device)
except FileNotFoundError:
    print("\n⚠ Pre-trained DQN model not found. Using freshly trained one.")
    q_net_trained = q_net
else:
    q_net_trained = Qnet().to(device)
    q_net_trained.load_state_dict(dqn_state_dict)
    q_net_trained.eval()
    print("\n✓ Loaded trained DQN model from file")

### Compare Q-Learning vs DQN

In [None]:
# Compare both methods side by side: raw rewards plus a moving average.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (axis, reward history, moving-average window, panel title)
panels = [
    (axes[0], rewards_history, 100, 'Q-Learning Training'),
    (axes[1], dqn_rewards_history, 50, 'DQN Training'),
]

for panel_ax, history, avg_window, panel_title in panels:
    panel_ax.plot(history, alpha=0.3)
    averaged = np.convolve(history, np.ones(avg_window) / avg_window, mode='valid')
    # With mode='valid' the k-th averaged value covers episodes
    # k..k+avg_window-1. Plot it at the last episode of its window so it
    # lines up with the raw curve instead of being shifted to the left.
    averaged_x = np.arange(avg_window - 1, avg_window - 1 + len(averaged))
    panel_ax.plot(averaged_x, averaged, linewidth=2)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Episode')
    panel_ax.set_ylabel('Total Reward')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n📊 Performance Comparison:")
print(f"Q-Learning final avg reward: {np.mean(rewards_history[-100:]):.2f}")
print(f"DQN final avg reward: {np.mean(dqn_rewards_history[-50:]):.2f}")

### Visualize Q-Values (Q-Learning)

In [None]:
import seaborn as sns

# Collapse the last axis of the table by taking its maximum, leaving one
# value per grid cell (presumably the action axis — confirm for tables
# loaded from file).
max_q_values = q_table_trained.max(axis=2)

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    max_q_values,
    annot=True,
    fmt='.2f',
    cmap='viridis',
    cbar_kws={'label': 'Max Q-Value'},
)
heatmap_ax.set_title('Q-Learning: Max Q-Values per State')
heatmap_ax.set_xlabel('Column')
heatmap_ax.set_ylabel('Row')
plt.tight_layout()
plt.show()

---

## 🎯 Summary

**What we learned:**

1. **Custom Environment:** Created a 7×7 gridworld with traps and obstacles
2. **Q-Learning:** Tabular RL method — simple but effective for small state spaces
3. **DQN:** Neural network approximation — scales to larger/continuous spaces

**Key Takeaways:**
- ✅ Q-Learning converges quickly for small grids
- ✅ DQN requires more episodes but generalizes better
- ✅ Epsilon-greedy exploration is crucial
- ✅ Reward shaping affects learning speed

**Next Steps:**
- Try different hyperparameters
- Implement Double DQN or Dueling DQN
- Scale to larger environments
- Add stochastic transitions

---

## 📚 Resources

- [Full Project Repository](https://github.com/muk0644/autonomous-agent-q-learning-dqn)
- [Sutton & Barto: Reinforcement Learning Book](http://incompleteideas.net/book/the-book.html)
- [OpenAI Spinning Up](https://spinningup.openai.com/)

---

**Author:** Shariq Khan  
**Contact:** engr.m.shariqkhan@gmail.com  
**GitHub:** [@muk0644](https://github.com/muk0644)