In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque

# Hyperparameters
EPISODES = 1000          # number of training episodes run by the main loop
GAMMA = 0.99             # discount factor applied to the bootstrapped next-state value
LR = 0.001               # learning rate handed to the Adam optimizer
BATCH_SIZE = 16          # transitions drawn from the replay buffer per gradient step
EPSILON_START = 1.0      # initial exploration probability (fully random actions)
EPSILON_END = 0.01       # floor that epsilon is never decayed below
EPSILON_DECAY = 0.995    # multiplicative decay applied to epsilon once per episode
TARGET_UPDATE = 10       # the target network is re-synced every this many episodes

# Environment
env = gym.make('LunarLander-v3')
# First dimension of the observation shape; used as the Q-network input width.
n_state = env.observation_space.shape[0]
# Size of the discrete action space; used as the Q-network output width.
n_action = env.action_space.n

# Q-Network
class QNetwork(nn.Module):
    """Fully connected network mapping a state vector to one Q-value per action.

    Args:
        n_state: Width of the input state vector.
        n_action: Number of discrete actions (width of the output vector).
    """

    def __init__(self, n_state, n_action):
        super().__init__()
        hidden = 32
        # Layers are built in input-to-output order and kept under ``self.fc``
        # so the parameter names remain ``fc.0``, ``fc.2`` and ``fc.4``.
        layers = [
            nn.Linear(n_state, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_action),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-value estimates the layer stack produces for ``x``."""
        q_values = self.fc(x)
        return q_values

# Replay Buffer
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling.

    Once ``capacity`` transitions are held, every new one evicts the oldest.
    """

    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` transitions drawn without replacement.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored items.
        """
        batch = random.sample(self.buffer, batch_size)
        return batch

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

# Initialize networks and replay buffer
# policy_net is the network being trained; target_net supplies the next-state
# values used to build the training targets.
policy_net = QNetwork(n_state, n_action)
target_net = QNetwork(n_state, n_action)
# Start the target network as an exact copy of the policy network's weights.
target_net.load_state_dict(policy_net.state_dict())
# Only the policy network's parameters are optimized.
optimizer = optim.Adam(policy_net.parameters(), lr=LR)
# Uses the ReplayBuffer default capacity.
replay_buffer = ReplayBuffer()

# Training loop
# Runs EPISODES episodes of epsilon-greedy interaction with ``env``. Every
# step stores the transition in ``replay_buffer`` and, once the buffer holds
# more than BATCH_SIZE transitions, performs one gradient step on
# ``policy_net`` against targets bootstrapped from ``target_net``.
loss_fn = nn.MSELoss()  # built once instead of on every gradient step
epsilon = EPSILON_START
for episode in range(EPISODES):
    state, _ = env.reset()
    total_reward = 0
    done = False

    while not done:
        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            # No gradient is needed to pick an action.
            with torch.no_grad():
                q_values = policy_net(torch.as_tensor(state, dtype=torch.float32))
            action = int(q_values.argmax().item())

        # env.step returns (obs, reward, terminated, truncated, info). The
        # episode must stop on either flag; otherwise a time-limit truncation
        # would never leave this loop.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # Store ``terminated`` only: a truncated episode did not reach a real
        # terminal state, so its next-state value should still be bootstrapped.
        replay_buffer.push(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward

        # Update policy
        if len(replay_buffer) > BATCH_SIZE:
            batch = replay_buffer.sample(BATCH_SIZE)
            states, actions, rewards, next_states, terminals = zip(*batch)
            # Stack into a single ndarray first: building a tensor straight
            # from a tuple of ndarrays is the slow path PyTorch warns about.
            states = torch.as_tensor(np.asarray(states, dtype=np.float32))
            actions = torch.as_tensor(np.asarray(actions, dtype=np.int64))
            rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
            next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
            terminals = torch.as_tensor(np.asarray(terminals, dtype=np.float32))

            # Q(s, a) for the actions that were actually taken.
            q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
            # Targets are constants with respect to the policy parameters.
            with torch.no_grad():
                next_q_values = target_net(next_states).max(1)[0]
                targets = rewards + GAMMA * next_q_values * (1 - terminals)

            loss = loss_fn(q_values, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Update target network
    if episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

    # Epsilon decay
    epsilon = max(EPSILON_END, epsilon * EPSILON_DECAY)
    print(f"Episode: {episode}, Reward: {total_reward}, Epsilon: {epsilon:.2f}")

env.close()


  states = torch.FloatTensor(states)


Episode: 0, Reward: -96.79945066352593, Epsilon: 0.99
Episode: 1, Reward: -103.27350002208057, Epsilon: 0.99
Episode: 2, Reward: -262.36922462623875, Epsilon: 0.99
Episode: 3, Reward: -310.22056720728284, Epsilon: 0.98
Episode: 4, Reward: -209.79776022013164, Epsilon: 0.98
Episode: 5, Reward: -347.3763375745901, Epsilon: 0.97
Episode: 6, Reward: -281.9818341131228, Epsilon: 0.97
Episode: 7, Reward: -77.18537028606973, Epsilon: 0.96
Episode: 8, Reward: -121.83518405869063, Epsilon: 0.96
Episode: 9, Reward: -330.763485156463, Epsilon: 0.95
Episode: 10, Reward: -117.05078128570642, Epsilon: 0.95
Episode: 11, Reward: -332.9607124380012, Epsilon: 0.94
Episode: 12, Reward: -209.6830852408756, Epsilon: 0.94
Episode: 13, Reward: -121.91174430071067, Epsilon: 0.93
Episode: 14, Reward: -27.239083293772907, Epsilon: 0.93
Episode: 15, Reward: -102.54991448781844, Epsilon: 0.92
Episode: 16, Reward: -160.00554089320786, Epsilon: 0.92
Episode: 17, Reward: -242.15690045323478, Epsilon: 0.91
Episode: 1