<a href="https://colab.research.google.com/github/magalaReuben/practicaldeepreinforcementlearning/blob/main/Lecture3/DeepQLearningPytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install swig
!pip install gymnasium[box2d]

Collecting swig
  Using cached swig-4.1.1.post1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
Installing collected packages: swig
Successfully installed swig-4.1.1.post1
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py


In [2]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random


In [3]:
# Build the LunarLander environment; later cells read it through the global `env`.
env = gym.make("LunarLander-v2")

# Report the observation dimensions and the number of discrete actions.
print("observation_space", *env.observation_space.shape)
print(f"action_space {env.action_space.n}")

DependencyNotInstalled: Box2D is not installed, run `pip install gymnasium[box2d]`

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
import numpy as np

MEM_SIZE = 10000   # default number of transitions kept in the replay buffer
BATCH_SIZE = 64    # default number of transitions drawn per sample() call

class ReplayMemory:
    """Fixed-capacity ring buffer of environment transitions.

    Transitions are stored column-wise in pre-allocated NumPy arrays. Once the
    buffer is full, new transitions overwrite the oldest slots.

    Attributes:
        capacity: number of slots in the buffer.
        mem_count: total number of transitions ever added (not capped).
        states, actions, rewards, states_: per-slot transition fields.
        dones: per-slot boolean mask holding ``1 - done``, i.e. ``True`` for a
            NON-terminal transition and ``False`` for a terminal one.
    """

    def __init__(self, state_shape=None, capacity=None):
        """Allocate the storage arrays.

        Args:
            state_shape: shape of a single observation. Defaults to
                ``env.observation_space.shape`` of the notebook-level ``env``.
            capacity: number of slots. Defaults to the global ``MEM_SIZE``.

        Raises:
            ValueError: if ``capacity`` is not positive.
        """
        # Defaults are resolved at call time so the notebook globals stay authoritative.
        if state_shape is None:
            state_shape = env.observation_space.shape
        if capacity is None:
            capacity = MEM_SIZE
        if capacity <= 0:
            raise ValueError("capacity must be a positive integer")

        self.capacity = int(capacity)
        self.mem_count = 0

        self.states = np.zeros((self.capacity, *state_shape), dtype=np.float32)
        self.actions = np.zeros(self.capacity, dtype=np.int64)
        self.rewards = np.zeros(self.capacity, dtype=np.float32)
        self.states_ = np.zeros((self.capacity, *state_shape), dtype=np.float32)
        # `np.bool_` is NumPy's boolean scalar type; the bare `np.bool` alias is
        # missing from NumPy 1.24-1.26 and raises AttributeError there.
        self.dones = np.zeros(self.capacity, dtype=np.bool_)

    def add(self, state, action, reward, state_, done):
        """Store one transition, overwriting the oldest slot when full.

        Args:
            state: observation before the action.
            action: action index that was taken.
            reward: reward received.
            state_: observation after the action.
            done: truthy when the episode ended on this transition.
        """
        slot = self.mem_count % self.capacity

        self.states[slot] = state
        self.actions[slot] = action
        self.rewards[slot] = reward
        self.states_[slot] = state_
        # Stored inverted on purpose: the mask is True while the episode continues.
        self.dones[slot] = 1 - done

        self.mem_count += 1

    def sample(self, batch_size=None):
        """Draw a random batch of stored transitions (with replacement).

        Args:
            batch_size: number of transitions to draw. Defaults to the global
                ``BATCH_SIZE``.

        Returns:
            Tuple ``(states, actions, rewards, states_, dones)`` of NumPy arrays
            whose first dimension is ``batch_size``. ``dones`` is the inverted
            mask written by :meth:`add`.

        Raises:
            ValueError: if no transition has been stored yet.
        """
        if batch_size is None:
            batch_size = BATCH_SIZE

        # Only the slots that have actually been written are eligible.
        filled = min(self.mem_count, self.capacity)
        if filled == 0:
            raise ValueError("cannot sample from an empty ReplayMemory")

        # replace=True: the same transition may appear more than once in a batch.
        batch_indices = np.random.choice(filled, batch_size, replace=True)

        return (
            self.states[batch_indices],
            self.actions[batch_indices],
            self.rewards[batch_indices],
            self.states_[batch_indices],
            self.dones[batch_indices],
        )

In [6]:
LEARNING_RATE = 0.0001  # step size for the Adam optimizer owned by the network

class DQN(nn.Module):
    """Fully connected Q-network: observation -> one Q-value per action.

    The module also owns its optimizer (Adam) and loss (Smooth L1 / Huber) and
    moves itself to the notebook-level ``device`` on construction.
    """

    def __init__(self, input_shape=None, n_actions=None):
        """Build the layers, optimizer and loss.

        Args:
            input_shape: shape of one observation; must be 1-D because it is
                unpacked into ``nn.Linear``. Defaults to
                ``env.observation_space.shape``.
            n_actions: number of discrete actions. Defaults to
                ``env.action_space.n``.
        """
        super().__init__()
        if input_shape is None:
            input_shape = env.observation_space.shape
        if n_actions is None:
            n_actions = env.action_space.n

        self.input_shape = input_shape
        self.action_space = int(n_actions)

        # Hidden widths are named once so each layer's input width always equals
        # the previous layer's output width.
        hidden1, hidden2 = 1024, 512
        self.layer1 = nn.Linear(*self.input_shape, hidden1)
        self.layer2 = nn.Linear(hidden1, hidden2)
        self.layer3 = nn.Linear(hidden2, self.action_space)

        self.optimizer = optim.Adam(self.parameters(), lr=LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()
        self.to(device)

    def forward(self, x):
        """Return raw (unactivated) Q-values for a batch of observations.

        Args:
            x: float tensor whose last dimension matches ``input_shape``.

        Returns:
            Tensor whose last dimension has ``action_space`` entries.
        """
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        # No activation on the output layer: Q-values are unbounded.
        x = self.layer3(x)

        return x

In [None]:
# Exploration schedule and discount factor. Only `max_epsilon` is read by the
# code visible in this cell; the others are presumably consumed by a training
# loop / TD target defined elsewhere -- TODO confirm.
max_epsilon = 1.0
min_epsilon = 0.05
decay_rate = 0.0005

gamma = 0.95

class DqnAgent:
    """Epsilon-greedy agent backed by a replay buffer and a Q-network.

    Attributes:
        memory: ``ReplayMemory`` holding past transitions.
        epsilon: current exploration probability, initialised to ``max_epsilon``.
        network: ``DQN`` used to estimate Q-values.
    """

    def __init__(self):
        """Create the replay buffer and Q-network with their default settings."""
        self.memory = ReplayMemory()
        self.epsilon = max_epsilon
        self.network = DQN()

    def choose_action(self, state):
        """Pick an action for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` (approximately) a random action is drawn
        from the environment's action space; otherwise the action with the
        highest predicted Q-value is returned.

        Args:
            state: a single observation (array-like) from the environment.

        Returns:
            int: index of the chosen action.
        """
        if random.uniform(0, 1) > self.epsilon:
            # Exploit: add a batch dimension and take the arg-max Q-value.
            state = torch.tensor(state).float().detach().to(device).unsqueeze(0)
            # Inference only -- no autograd graph is needed for action selection.
            with torch.no_grad():
                q_values = self.network(state)
            return torch.argmax(q_values).item()

        # Explore: sample from the ACTION space (not the observation space).
        return int(env.action_space.sample())

    def learn(self):
        """Sample a batch from memory and run the network on it.

        Returns early while fewer than ``BATCH_SIZE`` transitions are stored.

        NOTE(review): the visible code stops after the two forward passes -- no
        target, loss or optimizer step is computed here. TODO confirm whether
        the update is still to be written or lives further down.
        """
        if self.memory.mem_count < BATCH_SIZE:
            return

        states, actions, rewards, next_states, dones = self.memory.sample()
        states = torch.tensor(states, dtype=torch.float32).to(device)
        actions = torch.tensor(actions, dtype=torch.long).to(device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        next_states = torch.tensor(next_states, dtype=torch.float32).to(device)
        # As written by ReplayMemory.add this holds `1 - done`: True = episode continues.
        dones = torch.tensor(dones, dtype=torch.bool).to(device)
        # Row indices 0..BATCH_SIZE-1; not used by the visible code.
        batch_indices = np.arange(BATCH_SIZE, dtype=np.int64)

        q_values = self.network(states)
        next_q_values = self.network(next_states)




