# Experiment 1: Solving LunarLander-v2 using Standard DQN

Iplementing a Standard DQN to solve the LunarLander-v2 environment from OpenAI Gym. This involves creating a neural network to approximate the Q-function, implementing an experience replay buffer, and using an ε-greedy strategy for exploration.

### 1. Imports

In [1]:
import gym

# Initialize the LunarLander environment with visualization
env = gym.make('LunarLander-v2', render_mode='human')

# Reset the environment to get the initial state
observation, info = env.reset()

import numpy as np
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

### Define the DQN Network
The DQN will be a simple fully connected neural network that takes the state as input and outputs Q-values for each possible action.

In [2]:
class DQN(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=64):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

### Experience Replay Buffer
A replay buffer to store and sample experiences, which helps in stabilizing the training.

In [3]:
class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*experiences)
        return (torch.tensor(states, dtype=torch.float32), torch.tensor(actions), 
                torch.tensor(rewards, dtype=torch.float32), torch.tensor(next_states, dtype=torch.float32), 
                torch.tensor(dones, dtype=torch.float32))

### Setup Environment and Hyperparameters
Setting up the LunarLander environment and defining hyperparameters for training.

In [4]:
# Environment setup
env = gym.make('LunarLander-v2')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Hyperparameters
BUFFER_SIZE = int(1e5)
BATCH_SIZE = 64
GAMMA = 0.99
LR = 1e-3
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 0.995
TARGET_UPDATE = 10

### Training Loop
Here, we define the training loop, where the agent interacts with the environment, and the network is updated using experiences from the replay buffer.

In [5]:
# Initialize replay buffer, network, optimizer
buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)
policy_net = DQN(state_size, action_size)
optimizer = optim.Adam(policy_net.parameters(), lr=LR)
epsilon = EPSILON_START

# Training loop
num_episodes = 2000
for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0
    done = False
    while not done:
        # Epsilon-greedy action selection
        if random.random() > epsilon:
            with torch.no_grad():
                action = policy_net(torch.tensor(state, dtype=torch.float32)).max(0)[1].item()
        else:
            action = env.action_space.sample()

        # Take action and inspect the output
        step_output = env.step(action)
        print(f"Step output: {step_output}")

        # Check if the output can be unpacked into four variables
        try:
            next_state, reward, done, _ = step_output
        except ValueError as e:
            print(f"Error unpacking: {e}")
            break

        buffer.add(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

        # Train the network if enough samples are available
        if len(buffer.buffer) > BATCH_SIZE:
            experiences = buffer.sample()
            states, actions, rewards, next_states, dones = experiences

            # Compute Q targets
            Q_targets_next = policy_net(next_states).detach().max(1)[0].unsqueeze(1)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

            # Get expected Q values
            Q_expected = policy_net(states).gather(1, actions.unsqueeze(1))

            # Compute loss and update the network
            loss = F.mse_loss(Q_expected, Q_targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Update epsilon
        epsilon = max(EPSILON_END, EPSILON_DECAY * epsilon)

    # Print episode results
    print(f"Episode: {episode}, Total Reward: {total_reward}")

    # Update target network
    if episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

Step output: (array([ 0.0122797 ,  1.4014208 ,  0.61451036, -0.20625067, -0.01496093,
       -0.15537882,  0.        ,  0.        ], dtype=float32), 1.098086014583157, False, False, {})
Error unpacking: too many values to unpack (expected 4)
Episode: 0, Total Reward: 0


NameError: name 'target_net' is not defined

### Evaluation
After training, evaluate the performance of the agent without exploration.

In [None]:
# Evaluation
num_test_episodes = 100
total_rewards = []
for episode in range(num_test_episodes):
    state = env.reset()
    total_reward = 0
    done = False
    while not done:
        action = policy_net(torch.tensor(state, dtype=torch.float32)).max(0)[1].item()
        state, reward, done, _ = env.step(action)
        total_reward += reward
    total_rewards.append(total_reward)

average_reward = sum(total_rewards) / len(total_rewards)
print(f"Average Reward over {num_test_episodes} episodes: {average_reward}")