In [136]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
import numpy as np

In [137]:
class CliffWalkingEnv:
    """Cliff-walking grid world on a 6 x 10 board.

    Positions are ``(row, col)`` tuples. The agent starts in the bottom-left
    cell, the goal is the bottom-right cell, and every bottom-row cell between
    them is a cliff. Observations are one-hot vectors over all grid cells.
    """

    def __init__(self):
        self.grid_height = 6
        self.grid_width = 10
        self.start = (5, 0)   # bottom-left corner (0-indexed)
        self.goal = (5, 9)    # bottom-right corner
        self.cliff = [(5, col) for col in range(1, 9)]  # bottom row between start and goal
        self.state = self.start

    def reset(self):
        """Move the agent back to the start cell and return its one-hot observation."""
        self.state = self.start
        return self.state_to_one_hot()

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        Actions index the offsets right, down, left, up (in that order). A move
        that would leave the grid keeps the agent where it is. Landing on a
        cliff cell ends the episode with reward -500, landing on the goal ends
        it with reward 0, and every other step costs -5. ``info`` is always an
        empty dict.
        """
        moves = [(0, 1), (1, 0), (0, -1), (-1, 0)]  # (d_row, d_col) per action
        candidate = tuple(np.add(self.state, moves[action]))

        row, col = candidate
        inside_grid = 0 <= row < self.grid_height and 0 <= col < self.grid_width
        if not inside_grid:
            candidate = self.state

        fell_off = candidate in self.cliff
        reached_goal = candidate == self.goal
        done = reached_goal or fell_off

        if fell_off:
            reward = -500
        elif reached_goal:
            reward = 0
        else:
            reward = -5

        self.state = candidate
        return self.state_to_one_hot(), reward, done, {}

    def state_to_index(self, state):
        """Flatten a ``(row, col)`` position into a row-major cell index."""
        row, col = state[0], state[1]
        return row * self.grid_width + col

    def state_to_one_hot(self):
        """Return the current position as a one-hot vector over all grid cells."""
        encoding = np.zeros(self.grid_height * self.grid_width)
        encoding[self.state_to_index(self.state)] = 1
        return encoding

    def observation_space(self):
        """Return the observation vector length (number of grid cells)."""
        return self.grid_height * self.grid_width

    def action_space(self):
        """Return the number of discrete actions."""
        return 4

In [144]:
class DQNAgent:
    """Epsilon-greedy DQN agent with a uniform experience-replay buffer.

    Attributes:
        state_size: Length of the observation vector fed to the network.
        action_size: Number of discrete actions (network output width).
        memory: Bounded deque of ``(state, action, reward, next_state, done)``.
        gamma: Discount factor used in the bootstrap target.
        epsilon: Current probability of taking a random action.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after each
            training call, until ``epsilon_min`` is reached.
        epsilon_min: Lower bound below which ``epsilon`` is no longer decayed.
        learning_rate: Step size used when the optimizer is created.
        model: The Q-network.
        optimizer: Adam optimizer bound to ``model``; created once so its
            running moment estimates persist across ``replay`` calls.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.7  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        self.model = self._build_model()
        # Build the optimizer once. Re-creating Adam on every replay() call
        # discards its moment estimates and step count, so the adaptive
        # learning rate never gets a chance to warm up.
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def _build_model(self):
        """Return the MLP mapping a state vector to one Q-value per action."""
        model = nn.Sequential(
            nn.Linear(self.state_size, 32),
            nn.ReLU(),
            nn.Linear(32, 8),
            nn.ReLU(),
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, self.action_size)
        )
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer (oldest entries are evicted)."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            action_values = self.model(state)
        return action_values.max(1)[1].item()

    def replay(self, batch_size):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing when ``batch_size`` is not positive or the buffer holds
        fewer than ``batch_size`` transitions. Otherwise samples a minibatch,
        regresses Q(s, a) towards ``r + gamma * max_a' Q(s', a')`` (with the
        bootstrap term zeroed for terminal transitions) using a single batched
        forward/backward pass, and then decays ``epsilon`` once.
        """
        if batch_size < 1 or len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Stack the sampled transitions into batch tensors.
        states = torch.as_tensor(np.array(states), dtype=torch.float32)
        next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)
        actions = torch.as_tensor(np.array(actions), dtype=torch.int64).unsqueeze(1)
        rewards = torch.as_tensor(np.array(rewards, dtype=np.float32))
        dones = torch.as_tensor(np.array(dones, dtype=np.float32))

        # Q-value of the action that was actually taken in each transition.
        current_q = self.model(states).gather(1, actions).squeeze(1)

        # The target must be a constant w.r.t. the parameters: computing it
        # under no_grad keeps loss.backward() from also pushing gradient
        # through the bootstrap term.
        with torch.no_grad():
            max_next_q = torch.max(self.model(next_states), 1)[0]
            target_q = rewards + self.gamma * max_next_q * (1.0 - dones)

        loss = nn.functional.mse_loss(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Exploration decays once per training call (not once per episode).
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay



In [145]:
# Seed every RNG the run draws from so Restart & Run All reproduces the same
# training trajectory: `random` drives replay sampling and random actions,
# `np.random` drives the epsilon test, and torch drives weight initialisation.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = CliffWalkingEnv()
agent = DQNAgent(env.observation_space(), env.action_space())
episodes = 500     # number of training episodes
batch_size = 32    # transitions sampled per replay call

In [None]:
# Upper bound on steps per episode. The environment only terminates on the
# goal or a cliff cell, so without a cap a policy that keeps walking into a
# wall could keep a single episode running indefinitely.
max_steps_per_episode = 1000

for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    for _step in range(max_steps_per_episode):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # Store the environment's own `done` flag: hitting the step cap is a
        # truncation, not a terminal state, so it must not zero the bootstrap.
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        if done:
            break
    print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")


Episode: 1, Total Reward: -510, Epsilon: 1.00
Episode: 2, Total Reward: -500, Epsilon: 1.00
Episode: 3, Total Reward: -505, Epsilon: 1.00
Episode: 4, Total Reward: -515, Epsilon: 1.00
Episode: 5, Total Reward: -505, Epsilon: 1.00
Episode: 6, Total Reward: -515, Epsilon: 1.00
Episode: 7, Total Reward: -500, Epsilon: 1.00
Episode: 8, Total Reward: -500, Epsilon: 1.00
Episode: 9, Total Reward: -1330, Epsilon: 0.46
Episode: 10, Total Reward: -500, Epsilon: 0.46
Episode: 11, Total Reward: -3050, Epsilon: 0.04
Episode: 12, Total Reward: -615, Epsilon: 0.03
Episode: 13, Total Reward: -1410, Epsilon: 0.01
Episode: 14, Total Reward: -535, Epsilon: 0.01
Episode: 15, Total Reward: -530, Epsilon: 0.01
Episode: 16, Total Reward: -500, Epsilon: 0.01
Episode: 17, Total Reward: -500, Epsilon: 0.01
Episode: 18, Total Reward: -1515, Epsilon: 0.01
Episode: 19, Total Reward: -515, Epsilon: 0.01
Episode: 20, Total Reward: -515, Epsilon: 0.01
Episode: 21, Total Reward: -2000, Epsilon: 0.01
Episode: 22, Tota

Episode: 175, Total Reward: -505, Epsilon: 0.01
Episode: 176, Total Reward: -525, Epsilon: 0.01
Episode: 177, Total Reward: -545, Epsilon: 0.01
Episode: 178, Total Reward: -500, Epsilon: 0.01
Episode: 179, Total Reward: -500, Epsilon: 0.01
Episode: 180, Total Reward: -500, Epsilon: 0.01
Episode: 181, Total Reward: -500, Epsilon: 0.01
Episode: 182, Total Reward: -545, Epsilon: 0.01
Episode: 183, Total Reward: -505, Epsilon: 0.01
Episode: 184, Total Reward: -500, Epsilon: 0.01
Episode: 185, Total Reward: -1960, Epsilon: 0.01
Episode: 186, Total Reward: -515, Epsilon: 0.01
Episode: 187, Total Reward: -505, Epsilon: 0.01
Episode: 188, Total Reward: -505, Epsilon: 0.01
Episode: 189, Total Reward: -935, Epsilon: 0.01
Episode: 190, Total Reward: -510, Epsilon: 0.01
Episode: 191, Total Reward: -765, Epsilon: 0.01
Episode: 192, Total Reward: -515, Epsilon: 0.01
Episode: 193, Total Reward: -500, Epsilon: 0.01
Episode: 194, Total Reward: -535, Epsilon: 0.01
Episode: 195, Total Reward: -555, Epsil