# Intro
While this notebook focuses on understanding through clear explanations and visualizations, a professional implementation could involve separate files for better organization:
- **model.py**: Contains the neural network architecture.
- **agent.py**: Contains the DRL algorithm.
- **training.py**: Contains the agent's interaction with the environment, providing the training and testing loops.

This notebook contains all of the above files in one place for better understanding! Consider this notebook the first chapter of our DRL adventure! In the next notebooks, we'll focus more on the implementation of the algorithms.


# Simulation Environment

In [5]:
# Game of Pong Simulation environment
import gymnasium as gym
import gymnasium.utils.seeding as seeding
from gymnasium.wrappers import AtariPreprocessing, RecordVideo
import ale_py



import numpy as np
import random
from collections import namedtuple, deque
import torch
import torch.nn.functional as F
import torch.optim as optim

# --- DQN hyperparameters ---
BUFFER_SIZE = 100_000   # capacity of the experience replay buffer
BATCH_SIZE = 64         # number of transitions sampled per learning step
GAMMA = 0.99            # discount factor
TAU = 1e-3              # interpolation factor for the soft target-network update
LR = 5e-4               # learning rate of the optimizer
UPDATE_EVERY = 4        # number of agent steps between learning updates

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


DefaultRandomSeed = 10
# Create the Pong environment. frameskip=1 turns off the frame skipping done by
# the base environment so that AtariPreprocessing can apply its own frame skip.
env = gym.make("ALE/Pong-v5", frameskip=1)
# AtariPreprocessing: frame skipping, grayscale conversion, resize to 84x84.
# NOTE: this wrapper does NOT stack frames; a frame-stacking wrapper has to be
# added separately if the network expects several frames per observation.
env = AtariPreprocessing(env)
# Reset the fully wrapped environment (not the bare one) so that the wrapper's
# own state is initialised before the first step. reset(seed=...) also seeds the
# environment RNG, so no manual np_random assignment is required.
observation, info = env.reset(seed=DefaultRandomSeed)
# The action space has its own RNG; seed it so that sample() is reproducible.
env.action_space.seed(DefaultRandomSeed)




# Example interaction with the environment: take 1000 uniformly random actions.
for _ in range(1000):
    action = env.action_space.sample()  # Take a random action
    observation, reward, terminated, truncated, info = env.step(action)  # Apply the action

    if terminated or truncated:
        # reset() returns an (observation, info) pair, so unpack it instead of
        # binding the whole tuple to a single name. `info` is left untouched so
        # the summary printed below still describes the last step taken.
        observation, reset_info = env.reset()

# Status of the last step taken in the loop.
print(terminated)
print(info)


A.L.E: Arcade Learning Environment (version 0.9.0+750d7f9)
[Powered by Stella]


False
{'lives': 0, 'episode_frame_number': 295, 'frame_number': 4026}


# DQN Algorithm walk through 
In this part we will review all the steps of DQN in general, and in the next sections we will learn about each step in detail. The following steps are based on the pseudocode provided in the [main DQN paper](https://arxiv.org/abs/1312.5602). The steps are as follows:
1. Initialize Experience Replay Buffer ($\mathcal{D}$)
2. Initialize action-value function network (${Q}$) with random weights ($\theta$)
3. Initialize target action-value function network ($\hat{Q}$)
4. In each episode
    -  In each step (Until episode is done or terminated)
        - Run action selection policy
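            - choose a random action with a probability of $\epsilon$
            - **Otherwise** use the action that maximizes the output of the ${Q}$ network
        - Step function: take action $a_t$, move to $s_{t+1}$, and receive reward $r_{t}$
        - Store the transition $(s_t, a_t, r_t, s_{t+1})$ in $\mathcal{D}$
        - Sample a random mini-batch of transitions from $\mathcal{D}$ and compute the targets $y = r + \gamma \max_{a'} \hat{Q}(s', a')$ (just $y = r$ for terminal transitions)
        - Take a gradient descent step on the loss between $y$ and $Q(s, a)$ with respect to $\theta$
        - Update the target network $\hat{Q}$ towards ${Q}$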
        



# Model 

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class QNetwork(nn.Module):
    """Small convolutional Q-network: image observations in, one Q-value per action out.

    The size of the flattened feature vector (``lsize``) is fixed at 16*9*9,
    which corresponds to 84x84 input images.
    """

    def __init__(self, action_size, in_channels=2):
        """Create the layers.

        Args:
            action_size: Number of actions, i.e. the size of the output layer.
            in_channels: Number of channels of the input images. Defaults to 2,
                the value that used to be hard-coded.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 4, kernel_size=6, stride=2)  # Cx84x84 to 4x40x40
        self.conv2 = nn.Conv2d(4, 16, kernel_size=8, stride=4)           # 4x40x40 to 16x9x9
        self.lsize = 16 * 9 * 9
        self.fc1 = nn.Linear(self.lsize, 256)
        self.fc2 = nn.Linear(256, action_size)

    def forward(self, x):
        """Return the Q-values of all actions for the observations ``x``.

        Args:
            x: Float tensor of observations with ``in_channels`` channels.

        Returns:
            Tensor with ``action_size`` columns, one row per observation.
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        # Flatten the feature maps into rows of length lsize for the linear layers.
        x = x.view(-1, self.lsize)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Agent

##  Experience Replay Buffer 

A tuple of (state, action, reward, next state, done) is saved in the experience replay buffer at each step, and training mini-batches are sampled randomly from it. The Experience Replay Buffer is used to address the following issues:
1. More Diverse Mini-Batches for Training
2. Reduces Overfitting to Recent Experiences
3. Mitigating the Non-i.i.d. Issue (independent and identically distributed samples)



In [12]:

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed, tensor_device=None):
        """Create an empty buffer.

        Args:
            action_size: Number of actions (stored for reference only).
            buffer_size: Maximum number of experiences kept; once full, the
                oldest experience is dropped for every new one added.
            batch_size: Number of experiences returned by ``sample``.
            seed: Seed for Python's ``random`` module, which does the sampling.
            tensor_device: Device the sampled tensors are moved to. Defaults to
                the module-level ``device``.
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so seed first and keep the actual value.
        random.seed(seed)
        self.seed = seed
        self.device = device if tensor_device is None else tensor_device

    def add(self, state, action, reward, next_state, done):
        """Append one experience to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def _as_tensor(self, array):
        """Convert a numpy array to a float tensor on the buffer's device."""
        return torch.from_numpy(array).float().to(self.device)

    def sample(self):
        """Draw ``batch_size`` experiences uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors.
            ``states``/``next_states`` have shape ``(batch_size, *state_shape)``;
            the other three have shape ``(batch_size, 1)``. ``actions`` is a
            long tensor, the rest are float tensors.

        Raises:
            ValueError: If the buffer holds fewer than ``batch_size`` experiences.
        """
        batch = random.sample(self.memory, k=self.batch_size)
        # np.stack adds a new leading batch axis. np.vstack would instead merge
        # the batch into the first axis of multi-dimensional (image) states.
        states = self._as_tensor(np.stack([e.state for e in batch]))
        next_states = self._as_tensor(np.stack([e.next_state for e in batch]))
        # Scalars are stacked into column vectors of shape (batch_size, 1).
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(self.device)
        rewards = self._as_tensor(np.vstack([e.reward for e in batch]))
        dones = self._as_tensor(np.vstack([e.done for e in batch]).astype(np.uint8))
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.memory)


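As can be seen in the next block, we have defined two networks with the same structure, known as the "Local" and "Target" Q-networks. But why?!
Because of the **non-stationary** behavior of DRL methods: the training targets are computed from the network's own estimates, so if a single network were used, every update would also shift the targets it is being trained towards. Computing the targets from a separate, slowly updated target network keeps them more stable.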

In [13]:


class DQNAgent():
    """DQN agent that acts epsilon-greedily and learns from replayed experience.

    Two Q-networks with the same architecture are kept: ``qnetwork_local`` is
    trained by gradient descent, while ``qnetwork_target`` provides the
    bootstrap targets and slowly follows the local network via soft updates.
    """

    def __init__(self, state_size, action_size, seed, random_policy=False):
        """Build the networks, the optimizer and the replay memory.

        Args:
            state_size: Stored on the agent for reference; the network
                architecture does not depend on it.
            action_size: Number of discrete actions.
            seed: Seed for Python's ``random`` module and, when not None, torch.
            random_policy: If True, the agent only takes uniformly random
                actions and never stores experience or learns.
        """
        self.random = random_policy
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the actual value.
        random.seed(seed)
        self.seed = seed
        if seed is not None:
            torch.manual_seed(seed)  # reproducible weight initialisation

        # Q-Network (QNetwork's constructor takes only the number of actions)
        self.qnetwork_local = QNetwork(action_size).to(device)
        self.qnetwork_target = QNetwork(action_size).to(device)
        # Start the target network as an exact copy of the local network.
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def load_weights(self, model_weights):
        """Load network weights from ``models/<model_weights>``.

        The weights are mapped onto the current ``device`` so that a checkpoint
        saved on a GPU can also be loaded on a CPU-only machine. The target
        network is synchronised with the loaded weights.
        """
        state_dict = torch.load('models/{}'.format(model_weights), map_location=device)
        self.qnetwork_local.load_state_dict(state_dict)
        self.qnetwork_target.load_state_dict(state_dict)

    def save_weights(self, model_weights):
        """Save the local network's weights to ``models/<model_weights>``."""
        import os

        path = 'models/{}'.format(model_weights)
        # torch.save does not create missing directories.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        torch.save(self.qnetwork_local.state_dict(), path)

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every ``UPDATE_EVERY`` calls.

        Does nothing when the agent follows the random policy.
        """
        if self.random:
            return
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Numpy array holding the current observation (ignored when
                the agent follows the random policy).
            eps: Probability of taking a uniformly random action.

        Returns:
            The index of the selected action as an int.
        """
        # Explore: no forward pass is needed when the action is random anyway.
        if self.random or random.random() <= eps:
            return random.randrange(self.action_size)

        # Exploit: unsqueeze adds the batch dimension expected by the network.
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        return int(np.argmax(action_values.cpu().numpy()))

    def learn(self, experiences, gamma):
        """Run one gradient step on a batch of experiences.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states, dones)``
                of tensors, as returned by ``ReplayBuffer.sample``.
            gamma: Discount factor used in the bootstrap target.
        """
        states, actions, rewards, next_states, dones = experiences
        # Q(s, a) of the actions that were actually taken
        current = self.qnetwork_local(states).gather(1, actions)
        # max_a' Q_target(s', a'); no gradient flows through the target network
        with torch.no_grad():
            next_qvalues = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
        # Use the gamma argument (not the global constant); (1 - dones) removes
        # the bootstrap term for terminal transitions.
        targets = rewards + gamma * (next_qvalues * (1 - dones))
        loss = F.smooth_l1_loss(current, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Blend the local parameters into the target parameters in place.

        target = tau * local + (1 - tau) * target
        """
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)



# Training

In [1]:
# to be added ! 