In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import gym

# Define the Q-network
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=64):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define the DQN agent
class DQNAgent:
    def __init__(self, state_size, action_size, lr=0.001, gamma=0.99, epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=0.995):
        self.state_size = state_size
        self.action_size = action_size
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.q_network = QNetwork(state_size, action_size).to(self.device)
        self.target_network = QNetwork(state_size, action_size).to(self.device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()

        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.loss_function = nn.MSELoss()

    def act(self, state):
        if random.random() > self.epsilon:
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
                q_values = self.q_network(state)
                action = q_values.argmax().item()
        else:
            action = random.randrange(self.action_size)
        return action

    def train(self, state, action, reward, next_state, done, batch_size=32):
        if len(replay_memory) < batch_size:
            return

        batch = random.sample(replay_memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(self.device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)

        q_values = self.q_network(states).gather(1, actions)
        next_q_values = self.target_network(next_states).max(dim=1, keepdim=True)[0].detach()
        target_q_values = rewards + self.gamma * next_q_values * (1 - dones)

        loss = self.loss_function(q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        if done:
            self.target_network.load_state_dict(self.q_network.state_dict())

        # Update epsilon
        self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)

# Initialize environment and agent
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

# Initialize replay memory
replay_memory = []

# Training parameters
n_episodes = 100
batch_size = 32

for episode in range(n_episodes):
    state = env.reset()
    total_reward = 0

    while True:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        replay_memory.append((state, action, reward, next_state, done))
        total_reward += reward

        agent.train(state, action, reward, next_state, done, batch_size)

        state = next_state

        if done:
            break

    print(f"Episode: {episode + 1}, Total Reward: {total_reward}")

env.close()


  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):
  states = torch.FloatTensor(states).to(self.device)


Episode: 1, Total Reward: 33.0
Episode: 2, Total Reward: 16.0
Episode: 3, Total Reward: 11.0
Episode: 4, Total Reward: 24.0
Episode: 5, Total Reward: 14.0
Episode: 6, Total Reward: 17.0
Episode: 7, Total Reward: 45.0
Episode: 8, Total Reward: 17.0
Episode: 9, Total Reward: 12.0
Episode: 10, Total Reward: 13.0
Episode: 11, Total Reward: 10.0
Episode: 12, Total Reward: 12.0
Episode: 13, Total Reward: 10.0
Episode: 14, Total Reward: 17.0
Episode: 15, Total Reward: 16.0
Episode: 16, Total Reward: 8.0
Episode: 17, Total Reward: 11.0
Episode: 18, Total Reward: 13.0
Episode: 19, Total Reward: 9.0
Episode: 20, Total Reward: 9.0
Episode: 21, Total Reward: 11.0
Episode: 22, Total Reward: 9.0
Episode: 23, Total Reward: 10.0
Episode: 24, Total Reward: 9.0
Episode: 25, Total Reward: 9.0
Episode: 26, Total Reward: 15.0
Episode: 27, Total Reward: 11.0
Episode: 28, Total Reward: 10.0
Episode: 29, Total Reward: 9.0
Episode: 30, Total Reward: 11.0
Episode: 31, Total Reward: 13.0
Episode: 32, Total Rewar