In [None]:
# Double DQN implementation with PyTorch

import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque, namedtuple
import gym

# Hyperparameters
# Discount factor applied to the next-state value when forming the TD target.
GAMMA = 0.99
# Learning rate passed to the Adam optimizer.
LR = 1e-3
# Number of transitions sampled per optimization step; no update is performed
# until the replay buffer holds at least this many entries.
BATCH_SIZE = 64
# Maximum number of transitions kept in the replay buffer.
MEMORY_SIZE = 10000
# The target network is re-synced with the policy network every
# TARGET_UPDATE episodes.
TARGET_UPDATE = 10
# Number of training episodes.
EPISODES = 1000
# Exploration rate: starts at EPSILON_START and is multiplied by EPSILON_DECAY
# after every episode, never dropping below EPSILON_END.
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 0.995

# Experience tuple
# One stored transition; `next_state` is None when the transition ended the
# episode (optimize_model uses that to skip bootstrapping).
Experience = namedtuple('Experience', ('state', 'action', 'reward', 'next_state', 'done'))

# Replay Buffer
class ReplayBuffer:
    """Bounded store of ``Experience`` tuples with uniform random sampling.

    The storage is a ``deque`` created with ``maxlen=capacity``, so appending
    to a full buffer silently drops the oldest entry.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` entries."""
        self.buffer = deque([], maxlen=capacity)

    def __len__(self):
        """Return how many entries are currently stored."""
        return len(self.buffer)

    def push(self, *args):
        """Build an ``Experience`` from the positional fields and append it."""
        transition = Experience(*args)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` entries drawn without replacement.

        ``random.sample`` raises ``ValueError`` when ``batch_size`` is larger
        than the number of stored entries.
        """
        drawn = random.sample(self.buffer, batch_size)
        return drawn

# Q-Network
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one Q-value per action.

    Architecture: two hidden linear layers of 128 units, each followed by a
    ReLU, and a linear output layer with ``action_size`` units.
    """

    def __init__(self, state_size, action_size):
        """Create the three linear layers (``fc1``, ``fc2``, ``fc3``)."""
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the Q-values for input ``x`` (no activation on the output)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        q_values = self.fc3(hidden)
        return q_values

# Epsilon-Greedy Policy
def epsilon_greedy_policy(state, epsilon, action_size, policy_net):
    """Choose an action epsilon-greedily.

    With probability ``epsilon`` a uniformly random action index in
    ``range(action_size)`` is returned; otherwise the index of the largest
    value in ``policy_net(state)`` is returned as a Python int. The network
    is evaluated without gradient tracking.
    """
    explore = random.random() < epsilon
    if not explore:
        with torch.no_grad():
            q_values = policy_net(state)
            greedy_action = q_values.argmax().item()
        return greedy_action
    return random.randrange(action_size)

# Optimize Model
def optimize_model(policy_net, target_net, memory, optimizer):
    """Perform one Double DQN gradient step on a minibatch from ``memory``.

    Does nothing until ``memory`` holds at least ``BATCH_SIZE`` transitions.
    Otherwise it samples ``BATCH_SIZE`` transitions, forms the target
    ``reward + GAMMA * Q_target(s', argmax_a Q_policy(s', a))`` (zero
    bootstrap value for transitions whose ``next_state`` is ``None``) and
    minimises the MSE between that target and ``Q_policy(s, a)``.

    Args:
        policy_net: Network being trained; also selects the next action.
        target_net: Network used to evaluate the selected next action.
        memory: Replay buffer supporting ``len()`` and ``sample(batch_size)``.
        optimizer: Optimizer over ``policy_net``'s parameters.

    Returns:
        None.
    """
    if len(memory) < BATCH_SIZE:
        return

    experiences = memory.sample(BATCH_SIZE)
    # Transpose a list of Experience tuples into one Experience of tuples.
    batch = Experience(*zip(*experiences))

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    device = state_batch.device

    # True where the transition has a successor state to bootstrap from.
    non_final_mask = torch.tensor(
        [s is not None for s in batch.next_state],
        dtype=torch.bool,
        device=device,
    )

    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Terminal transitions keep a bootstrap value of zero.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # torch.cat raises on an empty list, so skip the bootstrap entirely when
    # every sampled transition is terminal.
    if non_final_mask.any():
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None]
        )
        # The target is a constant w.r.t. the loss; no graph is needed.
        with torch.no_grad():
            # Double DQN: the policy net picks the action, the target net
            # supplies its value.
            best_actions = policy_net(non_final_next_states).argmax(
                dim=1, keepdim=True
            )
            next_state_values[non_final_mask] = (
                target_net(non_final_next_states)
                .gather(1, best_actions)
                .squeeze(1)
            )

    expected_state_action_values = reward_batch + (GAMMA * next_state_values)

    loss = F.mse_loss(
        state_action_values, expected_state_action_values.unsqueeze(1)
    )

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Main Training Loop
def _to_state_tensor(observation):
    """Convert one observation to a float32 tensor with a leading batch axis of 1."""
    # Going through a single ndarray avoids PyTorch's slow list-of-ndarrays
    # conversion path (and the UserWarning it emits).
    return torch.tensor(np.asarray(observation, dtype=np.float32)).unsqueeze(0)


def _reset_env(env):
    """Reset ``env`` and return the initial observation under either gym API."""
    result = env.reset()
    # Newer gym versions return (observation, info); older ones return the
    # observation alone. Array observations (as used here) are never tuples.
    if isinstance(result, tuple):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` and return ``(observation, reward, terminated, truncated)``.

    Accepts both the 5-tuple step API and the older 4-tuple API, where a
    time-limit cut-off is reported through ``info["TimeLimit.truncated"]``.
    """
    result = env.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _ = result
    else:
        observation, reward, done, info = result
        truncated = bool(info.get("TimeLimit.truncated", False))
        terminated = bool(done) and not truncated
    return observation, reward, bool(terminated), bool(truncated)


env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

policy_net = QNetwork(state_size, action_size)
target_net = QNetwork(state_size, action_size)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=LR)
memory = ReplayBuffer(MEMORY_SIZE)

epsilon = EPSILON_START
# Total number of environment steps taken across all episodes.
steps_done = 0

try:
    for episode in range(EPISODES):
        state = _to_state_tensor(_reset_env(env))
        total_reward = 0

        for t in range(1, 10000):
            action = epsilon_greedy_policy(state, epsilon, action_size, policy_net)
            observation, reward, terminated, truncated = _step_env(env, action)
            total_reward += reward
            steps_done += 1

            # Only a true terminal state gets next_state=None (no bootstrap).
            # A time-limit truncation keeps its successor state so the target
            # is not biased towards zero.
            next_state = None if terminated else _to_state_tensor(observation)

            memory.push(
                state,
                torch.tensor([[action]], dtype=torch.int64),
                torch.tensor([reward], dtype=torch.float32),
                next_state,
                torch.tensor([terminated], dtype=torch.bool),
            )
            state = next_state

            optimize_model(policy_net, target_net, memory, optimizer)

            if terminated or truncated:
                print(f"Episode {episode}, Total reward: {total_reward}")
                break

        # Decay exploration once per episode, never below EPSILON_END.
        epsilon = max(EPSILON_END, EPSILON_DECAY * epsilon)

        # Periodically sync the target network with the policy network.
        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

    print("Training completed.")
finally:
    # Release the environment even if training is interrupted.
    env.close()


  deprecation(
  deprecation(
  state = torch.tensor([state], dtype=torch.float32)
  if not isinstance(terminated, (bool, np.bool8)):


Episode 0, Total reward: 22.0
Episode 1, Total reward: 29.0
Episode 2, Total reward: 20.0
Episode 3, Total reward: 33.0
Episode 4, Total reward: 43.0
Episode 5, Total reward: 12.0
Episode 6, Total reward: 19.0
Episode 7, Total reward: 25.0
Episode 8, Total reward: 27.0
Episode 9, Total reward: 19.0
Episode 10, Total reward: 55.0
Episode 11, Total reward: 28.0
Episode 12, Total reward: 18.0
Episode 13, Total reward: 15.0
Episode 14, Total reward: 17.0
Episode 15, Total reward: 10.0
Episode 16, Total reward: 32.0
Episode 17, Total reward: 26.0
Episode 18, Total reward: 13.0
Episode 19, Total reward: 14.0
Episode 20, Total reward: 36.0
Episode 21, Total reward: 21.0
Episode 22, Total reward: 23.0
Episode 23, Total reward: 17.0
Episode 24, Total reward: 12.0
Episode 25, Total reward: 15.0
Episode 26, Total reward: 13.0
Episode 27, Total reward: 21.0
Episode 28, Total reward: 32.0
Episode 29, Total reward: 20.0
Episode 30, Total reward: 21.0
Episode 31, Total reward: 18.0
Episode 32, Total 