# Intro
While this notebook focuses on understanding through clear explanations and visualizations,a professional implementation could involve separate files for better organization:
- **model.py**: Contains the neural network architecture.
- **agent.py**: Contains the DRL algorithm.
- **trainning.py**: Contains the agent interaction with the environment to provide the training and testing loops.

This notebook contains all above files in one place for better understanding! Consider this notebook the first chapter of our DRL adventure! In next notebooks, we'll be more focused on the implementation of the algorithms. 


# Simulation Environment

In [35]:
# Game of Pong Simulation environment
import gymnasium as gym
import gymnasium.utils.seeding as seeding
from gymnasium.wrappers import AtariPreprocessing, RecordVideo
import ale_py



import numpy as np
import random
from collections import namedtuple, deque
import torch
import torch.nn.functional as F
import torch.optim as optim

BUFFER_SIZE = int(1e5)  # replay buffer size
BATCH_SIZE = 64         # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # for soft update of target parameters
LR = 5e-4               # learning rate 
UPDATE_EVERY = 4        # how often to update the network

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


DefaultRandomSeed = 10 
# Create the Pong environment
env = gym.make("ALE/Pong-v5",frameskip=1)
env.np_random, _ = seeding.np_random(DefaultRandomSeed)
env.reset(seed=DefaultRandomSeed)
env = AtariPreprocessing(env) # Frame skipping, Grayscale, Resize (To 84*84), Stack 4 frames




# Example interaction with the environment
for _ in range(1000):
    action = env.action_space.sample()  # Take a random action
    observation, reward , terminated, truncated, info = env.step(action)  # Apply the action
    

    if terminated or truncated:
        state = env.reset()  # Reset the environment if done

print(terminated)
print (truncated)
print(info)
print(observation.shape)


False
False
{'lives': 0, 'episode_frame_number': 4000, 'frame_number': 4000}
(84, 84)


# DQN Algorithm walkthrough 

This section provides a high-level overview of the Deep Q-Network (DQN) algorithm. We will learn about each step in details. The following steps are written based on the pseducode providede in the [main DQN paper](https://arxiv.org/abs/1312.5602).

1. Initialize Experince Replay Buffer ($\mathcal{D} $)
2. Initialize action-value function network (${Q}$) with random weights ($\theta$)
3. Initialize target action-value function network ($\hat{Q}$)
4. In each episode
    -  In each step (Until episode is done or terminated)
        - Run action selection policy
            - chose a random action with a probability of $\epsilon$
            - **Otherwise** use action that maximizes the output of ${Q}$ network
        - step function! : take action $a_t$, shift to $s_{t+1}$, and receive reward $r_{t}$
        - store ($s_{t}$,$a_{t}$,$r_{t}$,$s_{t+1}$) in $\mathcal{D} $
        - Sample mininbatch of transitions from $\mathcal{D} $
        - Calculate target value:
            - if episode is done or terminated:
                $y_{j} = r_{j}$
            - Otherwise:
                $y_{j} = r_{j} + \gamma \text{max}_{a^{\prime}} \hat{Q}(s_{j+1},a^{\prime}, \bar{\theta})$
        - Define loss as  $(y_{j} - Q(s_{j},a_{j}, \theta))$
        - Update ${Q}$ weights given the provided loss 
        - every C steps update $\hat{Q}$ network weights using the following equation
            $\bar{\theta} = \tau*\theta + (1 - \tau)*\bar{\theta}$



# Model 
No matter what DRL algorithm we use, the way the deep neural network structure is super important in decision-making process. We're starting with a basic type of network known as Convolutional Neural Network (CNN), but as we go further, we will try networks with more complex structures!

In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    def __init__(self, input_shape, num_actions,seed):
        super(QNetwork, self).__init__()
        print(input_shape)
        self.conv1 = nn.Conv2d(in_channels=input_shape[0], out_channels=16, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2)
        self.fc1 = nn.Linear(in_features=self._feature_size(input_shape), out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=num_actions)
        self.seed = torch.manual_seed(seed)

    def forward(self, x):
        # input : Observations 
        # Ouput : Q value of different actions
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def _feature_size(self, input_shape):
        return nn.Sequential(
            nn.Conv2d(input_shape[0], 16, 8, 4),
            nn.ReLU(),
            nn.Conv2d(16, 32, 4, 2),
            nn.ReLU()
        ).forward(torch.zeros(1, *input_shape)).view(1, -1).size(1)

net = QNetwork((2, 84, 84), 4,10) 
print(net)


(2, 84, 84)
QNetwork(
  (conv1): Conv2d(2, 16, kernel_size=(8, 8), stride=(4, 4))
  (conv2): Conv2d(16, 32, kernel_size=(4, 4), stride=(2, 2))
  (fc1): Linear(in_features=2592, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=4, bias=True)
)


# Agent

##  Experience Replay Buffer 

A set of (state, action, reward, next state) is saved in the experience replay buffer at each step, and a training sample is selected randomly from it. Experience Replay Buffer is used to solve the following issues 
1. More Diverse Mini-Batches for Training
2. Reduces Overfitting to Recent Experiences
3. Mitigating the Non-i.i.d. Issue (independent and identically distributed samples)



In [37]:

class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)  
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
    
    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))
    
    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)


## Target Network
As you'll see in the next section, we define two networks with the same structure: a "Local" network and a "Target" network. Why do we need both?

The problem lies in the constantly changing behavior (non-stationary) of DRL methods. The targets we use to train our network are calculated using the network itself. This means the target function changes with every update, making the learning process unstable.

A simple yet effective solution is to have a separate network, called the target network, that we fix for multiple training steps. This network remains unchanged while the local network updates. We use the target network to calculate more stable targets for training the local network (step 3 in the walkthrough).



In [38]:


class DQNAgent():
    def __init__(self, state_size, action_size, seed, random_policy=False):
        self.random = random_policy
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def load_weights(self, model_weights):
        self.qnetwork_local.load_state_dict(torch.load('models/{}'.format(model_weights)))
    
    def save_weights(self, model_weights):
        torch.save(self.qnetwork_local.state_dict(), 'models/{}'.format(model_weights))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        if self.random:
            return
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        if self.random:
            return random.choice(np.arange(self.action_size))
        state = torch.from_numpy(state).float().unsqueeze(0).to(device) # unsqueeze adds an extra dimension to the provided tensor
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))


    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        current = self.qnetwork_local(states).gather(1, actions) # return Q(s,a)
        next_qvalues = self.qnetwork_target(next_states).max(1)[0].detach().unsqueeze(1)
        targets = rewards + GAMMA*(next_qvalues*(1 - dones))
        loss = F.smooth_l1_loss(current, targets)

        self.qnetwork_local.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)



# Training

In [1]:
class DQN_Training():
    def __init__(self, n_episodes=2000, eps_start=1.0, eps_end=0.01, eps_decay=0.995, max_t=1000):
        self.n_episodes = n_episodes
        self.max_t = max_t
        self.eps_start = eps_start
        self.eps_decay = eps_decay
        self.eps_end = eps_end
        self.eps = self.eps_start
    
    def train(self,agent,env):

        for ep_number in range(self.n_episodes+1):
            state, _ = env.reset()
            print(state.shape)
            episode_reward, done = 0, False

            episode_step = 0


            while not done:
                episode_step += 1 
                action = agent.act(state, self.eps)
                next_state, reward , terminated, truncated, info = env.step(action)  # Apply the action
                if terminated or truncated :
                    done = True 

                agent.step(state, action, reward, next_state, done)
                
                episode_reward += reward 
                state = next_state
            
            self.eps = max(self.eps_end, self.eps_decay*self.eps)

            if ep_number % 10 == 0 :
                print(episode_reward)

