In [84]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

class GridWorld:
    """Square grid environment whose cells are numbered row by row.

    The agent occupies a single cell index in ``[0, size * size)``. It starts
    at index 0 (top-left) and the episode ends once it stands on the last
    index, ``size * size - 1`` (bottom-right).

    Actions: 0 = left, 1 = right, 2 = up, 3 = down. A move that would leave
    the grid, or any other action value, leaves the agent where it is.
    """

    def __init__(self, size=15):
        self.size = size
        self.reset()

    def reset(self):
        """Put the agent back on the top-left cell and return that position."""
        self.agent_position = 0
        return self.agent_position

    def step(self, action):
        """Apply ``action`` and return ``(position, reward, done)``.

        The reward is 10 on the step that ends on the goal cell and -1 on
        every other step, including blocked moves.
        """
        here = self.agent_position
        width = self.size

        # Work out whether the requested move stays on the grid and how far it
        # shifts the flat index. Only the check for the chosen action runs.
        if action == 0:      # left: not already in the first column
            allowed, shift = here % width > 0, -1
        elif action == 1:    # right: not already in the last column
            allowed, shift = here % width < width - 1, 1
        elif action == 2:    # up: not already in the first row
            allowed, shift = here >= width, -width
        elif action == 3:    # down: not already in the last row
            allowed, shift = here < width * (width - 1), width
        else:                # unknown action: stay put
            allowed, shift = False, 0

        if allowed:
            self.agent_position = here + shift

        goal = self.size * self.size - 1
        done = self.agent_position == goal
        if done:
            reward = 10
        else:
            reward = -1
        return self.agent_position, reward, done

class QLearningAgent:
    """Epsilon-greedy deep Q-learning agent over one-hot encoded discrete states.

    Two identically shaped MLPs are kept: ``q_network`` is optimised on every
    call to :meth:`train`, while ``target_network`` supplies the bootstrap
    value and only changes when :meth:`update_target_network` is called.

    Args:
        state_size: Number of discrete states (length of the one-hot input).
        action_size: Number of discrete actions (network output width).
        learning_rate: Adam learning rate for ``q_network``.
        discount_factor: Discount applied to the bootstrapped next-state value.
        exploration_rate: Probability of picking a uniformly random action.
        exploration_decay: Stored for the caller; not applied by this class.
        exploration_min: Stored for the caller; not applied by this class.
        target_update_frequency: Stored for the caller, which decides when to
            call :meth:`update_target_network`.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, state_size, action_size, learning_rate=0.001, discount_factor=0.99,
                 exploration_rate=1.0, exploration_decay=0.995, exploration_min=0.01,
                 target_update_frequency=5, hidden_size=24):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_min = exploration_min
        self.exploration_decay = exploration_decay
        self.hidden_size = hidden_size

        # Report which device the networks will live on.
        print(self.device)

        # Online network first, then the target network, which is immediately
        # overwritten with the online weights so both start identical.
        self.q_network = self._build_network(hidden_size)
        self.target_network = self._build_network(hidden_size)
        self.target_network.load_state_dict(self.q_network.state_dict())

        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.loss_fn = nn.MSELoss()
        self.target_update_frequency = target_update_frequency
        self.steps = 0  # number of completed train() calls

    def _build_network(self, hidden_size):
        """Create one state -> Q-values MLP (two hidden ReLU layers) on ``self.device``."""
        return nn.Sequential(
            nn.Linear(self.state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, self.action_size),
        ).to(self.device)

    def _state_tensor(self, state):
        """Return the one-hot encoding of ``state`` as a float32 tensor on ``self.device``."""
        return torch.tensor(self.one_hot_encode(state), dtype=torch.float32, device=self.device)

    def get_action(self, state):
        """Pick an action for ``state``.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_size)
        with torch.no_grad():
            q_values = self.q_network(self._state_tensor(state))
        return torch.argmax(q_values).item()

    def one_hot_encode(self, state):
        """Return a list of ``state_size`` zeros with a 1 at index ``state``.

        Raises:
            IndexError: If ``state`` is outside ``[0, state_size)``. Negative
                values are rejected explicitly because list indexing would
                otherwise wrap around and silently encode a different state.
        """
        if not 0 <= state < self.state_size:
            raise IndexError(f"state {state} is outside [0, {self.state_size})")
        encoding = [0] * self.state_size
        encoding[state] = 1
        return encoding

    def train(self, state, action, reward, next_state, done):
        """Run one gradient step on a single ``(s, a, r, s', done)`` transition.

        The regression target equals the current prediction for every action
        except ``action``, whose entry is replaced by
        ``reward + discount_factor * max_a' Q_target(s', a')`` (the bootstrap
        term is dropped when ``done`` is true). Only the chosen action's
        output therefore contributes to the MSE loss.
        """
        current_q_values = self.q_network(self._state_tensor(state))

        with torch.no_grad():
            max_next_q_value = self.target_network(self._state_tensor(next_state)).max()
            target_q_value = reward + self.discount_factor * max_next_q_value * (not done)

        # Detach before editing: the target is a constant for this update and
        # must not carry the online network's autograd graph.
        target = current_q_values.detach().clone()
        target[action] = target_q_value

        loss = self.loss_fn(current_q_values, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.steps += 1

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

def train_agent(episodes=1000, seed=None, max_steps_per_episode=None):
    """Train a ``QLearningAgent`` on a default ``GridWorld`` and return it.

    Every environment step is followed immediately by one ``agent.train``
    call on that transition. After each episode the exploration rate is
    multiplied by ``agent.exploration_decay`` (never dropping below
    ``agent.exploration_min``), and the target network is synchronised on
    every episode whose index is a multiple of
    ``agent.target_update_frequency``.

    Args:
        episodes: Number of episodes to run.
        seed: If given, seeds ``random``, NumPy and PyTorch before the
            environment and agent are created so a run can be repeated.
            ``None`` leaves all generators untouched.
        max_steps_per_episode: If given, an episode is cut off after this many
            steps even when the goal was not reached. ``None`` means an
            episode only ends at the goal.

    Returns:
        The trained agent.
    """
    if seed is not None:
        # The agent draws from both `random` and NumPy, and PyTorch initialises
        # the network weights, so all three need seeding.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    env = GridWorld()
    agent = QLearningAgent(
        state_size=env.size**2,
        action_size=4,
        learning_rate=0.001,
        exploration_decay=0.995,
        exploration_min=0.01
    )

    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0
        steps_taken = 0

        while not done:
            if max_steps_per_episode is not None and steps_taken >= max_steps_per_episode:
                break
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            agent.train(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            steps_taken += 1

        # Decay exploration rate after each episode
        agent.exploration_rate = max(
            agent.exploration_min,
            agent.exploration_rate * agent.exploration_decay
        )

        # Update target network periodically
        if episode % agent.target_update_frequency == 0:
            agent.update_target_network()

        if episode % 100 == 0:
            print(f"Episode {episode}: Total Reward = {total_reward}, Exploration Rate = {agent.exploration_rate:.3f}")

    return agent

# Run the training with the default arguments (1000 episodes). The result is
# kept in a notebook-level name because the following cells read
# `trained_agent` directly.
trained_agent = train_agent()

cuda
Episode 0: Total Reward = -1462, Exploration Rate = 0.995
Episode 100: Total Reward = -73, Exploration Rate = 0.603
Episode 200: Total Reward = -27, Exploration Rate = 0.365
Episode 300: Total Reward = -23, Exploration Rate = 0.221
Episode 400: Total Reward = -21, Exploration Rate = 0.134
Episode 500: Total Reward = -17, Exploration Rate = 0.081
Episode 600: Total Reward = -17, Exploration Rate = 0.049
Episode 700: Total Reward = -22, Exploration Rate = 0.030
Episode 800: Total Reward = -17, Exploration Rate = 0.018
Episode 900: Total Reward = -18, Exploration Rate = 0.011


In [85]:
def test_agent(agent, env):
    state = env.reset()
    done = False
    trajectory = []
    
    # Disable exploration for testing
    original_exploration = agent.exploration_rate
    agent.exploration_rate = 0
    
    while not done:
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)
        trajectory.append((state, action))
        state = next_state
    
    # Restore exploration rate (if you plan to keep training)
    agent.exploration_rate = original_exploration
    
    return trajectory

# Test the agent: roll out the trained agent on a fresh environment.
# `path` is the list of (state, action) pairs returned by test_agent; both
# `env` and `path` are reused by the visualisation cells below.
env = GridWorld()
path = test_agent(trained_agent, env)
print("Path taken:", path)

Path taken: [(0, 3), (15, 1), (16, 3), (31, 3), (46, 3), (61, 3), (76, 3), (91, 1), (92, 3), (107, 3), (122, 1), (123, 1), (124, 3), (139, 1), (140, 1), (141, 1), (142, 1), (143, 1), (144, 1), (145, 3), (160, 1), (161, 3), (176, 3), (191, 3), (206, 3), (221, 1), (222, 1), (223, 1)]


In [86]:
def print_grid_path(size, path):
    """Print a ``size`` x ``size`` grid showing the action taken in each visited cell.

    Args:
        size: Side length of the grid.
        path: Iterable of ``(state, action)`` pairs, where ``state`` is a
            row-major cell index and ``action`` is 0-3.

    Unvisited cells are drawn as "·". The bottom-right cell is always drawn
    as "G", even if the path passes through it.
    """
    arrows = {0: "←", 1: "→", 2: "↑", 3: "↓"}
    cells = [["·"] * size for _ in range(size)]

    for visited, chosen in path:
        r, c = divmod(visited, size)
        cells[r][c] = arrows[chosen]

    # Mark goal (bottom-right corner)
    cells[size - 1][size - 1] = "G"

    print("\n".join(" ".join(line) for line in cells))

# Visualize the path: draw the greedy rollout stored in `path` on a grid of
# the same side length as the environment it was recorded in.
print_grid_path(env.size, path)

↓ · · · · · · · · · · · · · ·
→ ↓ · · · · · · · · · · · · ·
· ↓ · · · · · · · · · · · · ·
· ↓ · · · · · · · · · · · · ·
· ↓ · · · · · · · · · · · · ·
· ↓ · · · · · · · · · · · · ·
· → ↓ · · · · · · · · · · · ·
· · ↓ · · · · · · · · · · · ·
· · → → ↓ · · · · · · · · · ·
· · · · → → → → → → ↓ · · · ·
· · · · · · · · · · → ↓ · · ·
· · · · · · · · · · · ↓ · · ·
· · · · · · · · · · · ↓ · · ·
· · · · · · · · · · · ↓ · · ·
· · · · · · · · · · · → → → G


In [90]:
def print_all_path(size, agent=None):
    """Print the greedy action of ``agent`` for every cell of a ``size`` x ``size`` grid.

    Args:
        size: Side length of the grid; ``size * size`` one-hot states are
            evaluated in row-major order.
        agent: Object exposing a ``q_network`` module. Defaults to the
            notebook-level ``trained_agent`` when omitted.

    One arrow per cell is printed, each followed by a space, with a line
    break after every ``size`` cells.
    """
    if agent is None:
        agent = trained_agent

    # Build the inputs on whatever device the network actually lives on rather
    # than guessing from CUDA availability.
    device = next(agent.q_network.parameters()).device

    # Row i of the identity matrix is the one-hot encoding of state i, so one
    # batched forward pass scores every state. No gradients are needed here.
    with torch.no_grad():
        q_values = agent.q_network(torch.eye(size * size, device=device))
        best_actions = q_values.argmax(dim=1).tolist()

    action_symbols = {0: "←", 1: "→", 2: "↑", 3: "↓"}
    for index, action in enumerate(best_actions):
        print(action_symbols[action], end=' ')
        if (index + 1) % size == 0:
            print()

# Show the action with the highest predicted Q-value for every cell of the grid.
print_all_path(env.size)

↓ ↓ ↓ ↓ ↓ ↓ ↓ ↓ ↓ → → → ↓ ↓ ↓ 
→ ↓ ↓ → → ↓ ↓ ↓ → → → → ↓ ↓ ↓ 
→ ↓ ↓ ↓ ↓ ↓ → ↓ ↓ ↓ ↓ ↓ → ↓ ↓ 
→ ↓ ↓ ↓ ↓ → → ↓ ↓ → ↓ ↓ → ↓ ↓ 
→ ↓ ↓ → ↓ ↓ ↓ ↓ ↓ ↓ → → ↓ → ↓ 
→ ↓ ↓ ↓ ↓ → → → ↓ ↓ → → ↓ → ↓ 
→ → ↓ ↓ ↓ → ↓ → ↓ ↓ → ↓ ↓ ↓ ↓ 
↓ → ↓ → → → ↓ ↓ ↓ → → ↓ → → ↓ 
→ → → → ↓ → ↓ ↓ → → → ↓ → ↓ ↓ 
↓ → → → → → → → → → ↓ ↓ → ↓ ↓ 
→ → → → ↓ → → ↓ → → → ↓ → ↓ ↓ 
↓ → → → → → → → → ↓ → ↓ → ↓ ↓ 
→ ↓ ↓ → → → → → → → ↓ ↓ ↓ ↓ ↓ 
→ → → → → → → → ↓ → → ↓ ↓ → ↓ 
→ → → → → → → → → → → → → → ↓ 
