<a href="https://colab.research.google.com/github/mani-droid/data-science-projects/blob/master/CartPole_with_Deep_Q_Network_(PyTorch).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Libraries**

In [1]:
import gym
import numpy as np
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim

**Neural Network Structure**

In [2]:
pip install --upgrade gym



In [3]:
!pip install numpy==1.23.5 --quiet


In [4]:
class DQN(nn.Module):
  """Three-layer fully connected network: input_dim -> 128 -> 64 -> output_dim.

  ReLU is applied after each of the two hidden layers; the final layer is
  linear, so the outputs are unbounded real values (one per output unit).
  """

  def __init__(self, input_dim, output_dim):
    super().__init__()
    first_hidden, second_hidden = 128, 64
    # Layers are registered under the names fc1, fc2, fc3 (in this order).
    self.fc1 = nn.Linear(input_dim, first_hidden)
    self.fc2 = nn.Linear(first_hidden, second_hidden)
    self.fc3 = nn.Linear(second_hidden, output_dim)

  def forward(self, x):
    """Pass x through fc1 and fc2 (each followed by ReLU), then through fc3."""
    hidden = x
    for layer in (self.fc1, self.fc2):
      hidden = torch.relu(layer(hidden))
    return self.fc3(hidden)

**Environment and Hyperparameters**

In [5]:
# Seed every random number generator this notebook draws from, so that
# Restart Kernel -> Run All reproduces the same training run.
SEED = 42
random.seed(SEED)        # epsilon-greedy draws and replay-buffer sampling
np.random.seed(SEED)
torch.manual_seed(SEED)  # network weight initialisation

env = gym.make("CartPole-v1", render_mode = None)
# Seed the environment's RNG once; later env.reset() calls that pass no seed
# keep using this already-seeded generator.
env.reset(seed = SEED)
input_dim = env.observation_space.shape[0]  # length of the observation vector
output_dim = env.action_space.n             # number of discrete actions

lr = 0.001             # learning rate passed to the Adam optimizer
gamma = 0.99           # discount factor used in the TD target
epsilon = 1.0          # initial probability of taking a random action
epsilon_decay = 0.995  # multiplicative decay factor applied to epsilon
epsilon_min = 0.01     # floor below which epsilon is never decayed
batch_size = 64        # transitions sampled per gradient update
max_episodes = 500     # number of training episodes
buffer_size = 10000    # maximum number of transitions kept in the replay buffer
target_update = 10     # episode interval used for syncing the target network

**Replay Buffer**

In [6]:
# Experience replay memory. deque(maxlen=...) drops the oldest entry
# automatically once buffer_size entries are stored.
replay_buffer = deque(maxlen = buffer_size)

def sample_experiences():
  """Draw `batch_size` random entries from `replay_buffer`, grouped by field.

  Returns:
    A 5-tuple (states, actions, rewards, next_states, dones). `states` and
    `next_states` are converted to NumPy arrays; the other three fields are
    returned as plain tuples.
  """
  minibatch = random.sample(replay_buffer, batch_size)
  # zip(*...) regroups the per-transition tuples into one tuple per field.
  state_col, action_col, reward_col, next_state_col, done_col = zip(*minibatch)
  stacked_states = np.array(state_col)
  stacked_next_states = np.array(next_state_col)
  return (stacked_states, action_col, reward_col, stacked_next_states, done_col)

**Initialize Networks**

In [7]:
# No device is selected here, so the networks stay on PyTorch's default device.
# To train on a GPU, create
#   torch.device("cuda" if torch.cuda.is_available() else "cpu")
# and move both networks and every batch tensor onto it.

# policy_net is the network being optimised; target_net has the same
# architecture, starts as an exact copy of it, and is kept in eval mode.
policy_net = DQN(input_dim, output_dim)
target_net = DQN(input_dim, output_dim).eval()
target_net.load_state_dict(policy_net.state_dict())

# Mean-squared-error loss; Adam updates only the policy network's parameters.
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params = policy_net.parameters(), lr = lr)

**Training Loop**

In [8]:
# Train policy_net with epsilon-greedy exploration, experience replay and a
# periodically synchronised target network. One line is printed per episode.
for episode in range(max_episodes):
  state, _ = env.reset()
  total_reward = 0

  for t in range(500):
    # Add a leading batch dimension so the network sees a batch of one state.
    state_tensor = torch.tensor(np.array(state), dtype = torch.float32).unsqueeze(0)

    # Epsilon-greedy action selection: random action with probability epsilon,
    # otherwise the action with the highest predicted Q-value.
    if random.random() < epsilon:
      action = random.randrange(output_dim)
    else:
      with torch.no_grad():
        action = torch.argmax(policy_net(state_tensor)).item()

    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    total_reward += reward

    # Store `terminated` (not `done`) as the bootstrap mask: a time-limit
    # truncation ends the episode but the next state still has value, so the
    # TD target must keep bootstrapping from it.
    replay_buffer.append((state, action, reward, next_state, terminated))
    state = next_state

    # One gradient step per environment step once enough transitions exist.
    if len(replay_buffer) >= batch_size:
      states, actions, rewards, next_states, terminals = sample_experiences()

      states = torch.tensor(states, dtype = torch.float32)
      next_states = torch.tensor(next_states, dtype = torch.float32)
      # gather() needs int64 indices with one column per row.
      actions = torch.tensor(actions, dtype = torch.int64).unsqueeze(1)
      rewards = torch.tensor(rewards, dtype = torch.float32).unsqueeze(1)
      terminals = torch.tensor(terminals, dtype = torch.bool).unsqueeze(1)

      # Q(s, a) for the actions that were actually taken.
      q_values = policy_net(states).gather(1, actions)

      # TD target: r + gamma * max_a' Q_target(s', a'), with the bootstrap
      # term zeroed for terminal transitions. No gradient flows through it.
      with torch.no_grad():
        next_q_values = target_net(next_states).max(1)[0].unsqueeze(1)
        target_q = rewards + gamma * next_q_values * (~terminals)

      loss = loss_fn(q_values, target_q)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    if done:
      break

    # NOTE(review): epsilon decays once per environment step (and is skipped on
    # the final step of an episode), so it reaches epsilon_min after roughly
    # 900 steps. Move this to episode level if per-episode decay was intended.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

  # Sync the target network once at the end of every `target_update`-th
  # episode. Doing this inside the step loop would overwrite the target on
  # every step of those episodes and defeat the purpose of a lagged target.
  if episode % target_update == 0:
    target_net.load_state_dict(policy_net.state_dict())
  print(f"Episode {episode} | Reward: {total_reward:.2f} | Epsilon: {epsilon:.3f}")

Episode 0 | Reward: 34.00 | Epsilon: 0.848
Episode 1 | Reward: 15.00 | Epsilon: 0.790
Episode 2 | Reward: 13.00 | Epsilon: 0.744
Episode 3 | Reward: 34.00 | Epsilon: 0.631
Episode 4 | Reward: 10.00 | Epsilon: 0.603
Episode 5 | Reward: 12.00 | Epsilon: 0.570
Episode 6 | Reward: 11.00 | Epsilon: 0.543
Episode 7 | Reward: 22.00 | Epsilon: 0.488
Episode 8 | Reward: 9.00 | Epsilon: 0.469
Episode 9 | Reward: 15.00 | Epsilon: 0.437
Episode 10 | Reward: 10.00 | Epsilon: 0.418
Episode 11 | Reward: 11.00 | Epsilon: 0.398
Episode 12 | Reward: 13.00 | Epsilon: 0.374
Episode 13 | Reward: 10.00 | Epsilon: 0.358
Episode 14 | Reward: 11.00 | Epsilon: 0.340
Episode 15 | Reward: 9.00 | Epsilon: 0.327
Episode 16 | Reward: 12.00 | Epsilon: 0.309
Episode 17 | Reward: 13.00 | Epsilon: 0.291
Episode 18 | Reward: 11.00 | Epsilon: 0.277
Episode 19 | Reward: 12.00 | Epsilon: 0.262
Episode 20 | Reward: 9.00 | Epsilon: 0.252
Episode 21 | Reward: 9.00 | Epsilon: 0.242
Episode 22 | Reward: 13.00 | Epsilon: 0.228
Ep