<a href="https://colab.research.google.com/github/parth-pai/SOC23_Breakout_Genius/blob/main/SOC_Training_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Required Installations
!pip install gym==0.25.2
!pip install gym[atari]
!pip install gym[accept-rom-license]

Collecting autorom[accept-rom-license]~=0.4.2 (from gym[accept-rom-license])
  Using cached AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]~=0.4.2->gym[accept-rom-license])
  Using cached AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: AutoROM.accept-rom-license
  Building wheel for AutoROM.accept-rom-license (pyproject.toml) ... [?25l[?25hdone
  Created wheel for AutoROM.accept-rom-license: filename=AutoROM.accept_rom_license-0.6.1-py3-none-any.whl size=446660 sha256=47aa133c6b3ce220b9a95e11e78543fabd878a5d0f1377c3dd75603c9142ba7a
  Stored in directory: /root/.cache/pip/wheels/6b/1b/ef/a43ff1a2f1736d5711faa1ba4c1f61be1131b8899e6a057811
Successfully built AutoROM.accept-rom-license
Installing collected packages: A

In [None]:
#Importing all necessary libraries
import random
import numpy as np
import torch
import torch.nn as nn
import gym
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import convolve, gaussian

from collections import deque
from torch.nn.functional import mse_loss
import torch.optim as optim

import os
import io
import base64
import time
import glob
from IPython.display import HTML

In [None]:
from gym.wrappers import AtariPreprocessing
from gym.wrappers import FrameStack
from gym.wrappers import TransformReward

#Gives out environment using OpenAI's gym module
def make_env(env_name, clip_rewards = True, seed = None):
    """Build a preprocessed, frame-stacked Atari environment.

    Args:
        env_name: gym environment id passed to ``gym.make``.
        clip_rewards: when True, every reward is replaced by its sign
            (-1, 0 or +1) through ``TransformReward``.
        seed: optional integer. When given, the environment and its action
            space are seeded so rollouts are reproducible. ``None`` leaves
            the environment unseeded.

    Returns:
        The wrapped environment (AtariPreprocessing -> FrameStack(4)
        -> optional reward clipping).
    """
    env = gym.make(env_name)
    env = AtariPreprocessing(env)
    env = FrameStack(env,num_stack = 4)
    if clip_rewards:
        env = TransformReward(env, lambda r: np.sign(r))
    if seed is not None:
        # Seeding goes through reset(seed=...); later plain reset() calls keep
        # drawing from the generator seeded here.
        env.reset(seed=seed)
        env.action_space.seed(seed)
    return env

In [None]:
# Select the compute device: CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
#Our main DQNAgent class
class DQNAgent(nn.Module):
    """Convolutional Q-network with epsilon-greedy action selection.

    Architecture:
        Conv2d(C, 16, kernel 8, stride 4) -> ReLU
        -> Conv2d(16, 32, kernel 4, stride 2) -> ReLU
        -> Flatten -> Linear(., 256) -> ReLU -> Linear(256, n_actions)

    Args:
        state_shape: observation shape as (channels, height, width).
        n_actions: number of discrete actions (size of the Q-value output).
        epsilon: probability of taking a uniformly random action in
            ``sample_actions``.
    """

    def __init__(self, state_shape, n_actions, epsilon):
        super(DQNAgent, self).__init__()
        self.n_actions = n_actions
        self.epsilon = epsilon

        def conv_out(size, kernel, stride):
            # Output length of an unpadded convolution along one axis.
            return (size - kernel) // stride + 1

        # Number of stacked frames comes from the observation shape rather
        # than being fixed, so other frame-stack sizes work as well.
        in_channels = state_shape[0]
        out_h = conv_out(conv_out(state_shape[1], 8, 4), 4, 2)
        out_w = conv_out(conv_out(state_shape[2], 8, 4), 4, 2)

        # 1) Convolutional layer followed by ReLU
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=8, stride=4)
        self.relu1 = nn.ReLU()
        # 2) Convolutional layer followed by ReLU
        self.conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2)
        self.relu2 = nn.ReLU()
        # 3) Flatten the feature maps and project them to 256 hidden units
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(32 * out_h * out_w, 256)
        self.relu3 = nn.ReLU()
        # 4) Output layer: one Q-value per action
        self.qvalues = nn.Linear(256, n_actions)

    def forward(self, state_t):
        """Compute Q-values for a batch of states.

        Args:
            state_t: batch of observations, either a torch tensor or anything
                ``np.array`` can convert (e.g. a list of frame stacks).

        Returns:
            Tensor of shape (batch, n_actions) on the same device as the
            network's parameters.
        """
        # Run on whatever device the parameters live on, so the network works
        # after .to(device) as well as on the CPU.
        device = self.qvalues.weight.device
        if torch.is_tensor(state_t):
            state_t = state_t.to(device=device, dtype=torch.float32)
        else:
            state_t = torch.as_tensor(np.array(state_t), dtype=torch.float32, device=device)
        x = self.relu1(self.conv1(state_t))
        x = self.relu2(self.conv2(x))
        x = self.relu3(self.linear(self.flatten(x)))
        return self.qvalues(x)

    def get_qvalues(self, state_t):
        """Return Q-values for ``state_t`` as a NumPy array of shape (batch, n_actions)."""
        # No autograd graph is needed for acting; .cpu() makes the conversion
        # valid when the network lives on a GPU.
        with torch.no_grad():
            q = self.forward(state_t)
        return q.cpu().numpy()

    def sample_actions(self, qvalues):
        """Pick one action per row of ``qvalues`` with an epsilon-greedy rule.

        Args:
            qvalues: array of shape (batch, n_actions).

        Returns:
            int64 NumPy array of shape (batch,) holding the chosen actions.
        """
        batch_size = qvalues.shape[0]
        actions = np.zeros(batch_size, dtype=np.int64)
        for i in range(batch_size):
            if np.random.rand() < self.epsilon:
                # Explore: uniformly random action
                actions[i] = np.random.randint(low=0, high=self.n_actions)
            else:
                # Exploit: action with the highest Q-value
                actions[i] = np.argmax(qvalues[i])
        return actions

In [None]:
#Evaluating the training agent
def evaluate(env, agent, n_games = 1, greedy = False, t_max = 10000):
    """Play ``n_games`` episodes and return the mean undiscounted return.

    Args:
        env: environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(obs, reward, done, info)``.
        agent: object providing ``get_qvalues`` and ``sample_actions``.
        n_games: number of episodes to play.
        greedy: if True always take the arg-max action; otherwise use the
            agent's own ``sample_actions``.
        t_max: maximum number of steps per episode.

    Returns:
        ``np.mean`` of the per-episode reward sums.
    """
    episode_returns = []
    for _game in range(n_games):
        observation = env.reset()
        total = 0.0
        steps_taken = 0
        while steps_taken < t_max:
            q = agent.get_qvalues([observation])
            # Single-observation batch, so element 0 is the action to play
            chosen = q.argmax(axis=-1)[0] if greedy else agent.sample_actions(q)[0]
            observation, reward, finished, _info = env.step(chosen)
            total += reward
            steps_taken += 1
            if finished:
                break
        episode_returns.append(total)
    return np.mean(episode_returns)

In [None]:
#ReplayBuffer class
class ReplayBuffer:
    """Fixed-capacity FIFO store of (state, action, reward, next_state, done) tuples.

    Transitions are kept in a plain list used as a ring buffer: once the
    capacity is reached, each new transition overwrites the oldest one.
    A list is used instead of a deque because uniform sampling indexes into
    the middle of the container, which is O(1) on a list but O(n) on a deque.
    """

    def __init__(self, size):
        # size: maximum number of transitions held; None means unbounded,
        # 0 means every added transition is dropped.
        if size is not None and size < 0:
            raise ValueError("size must be non-negative")
        self.size = size
        self.buffer = []
        # Slot that the next overwrite goes to once the buffer is full;
        # it always points at the oldest stored transition.
        self._next_idx = 0

    def __len__(self):
        return len(self.buffer)

    def add(self, state, action ,reward, next_state, done):
        """Store one transition, evicting the oldest one when at capacity."""
        transition = (state, action, reward, next_state, done)
        if self.size is None or len(self.buffer) < self.size:
            self.buffer.append(transition)
        elif self.size > 0:
            self.buffer[self._next_idx] = transition
            self._next_idx = (self._next_idx + 1) % self.size

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions drawn uniformly at random.

        Returns:
            Five NumPy arrays (states, actions, rewards, next_states, dones),
            each with ``batch_size`` entries along the first axis.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored transitions.
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return np.array(states),np.array(actions),np.array(rewards),np.array(next_states),np.array(dones)

In [None]:
#Make the agent play on the env and store the information in exp_replay
def play_and_record(start_state, agent, env, exp_replay, n_steps = 1):
    """Let the agent act for ``n_steps`` steps and store every transition.

    Args:
        start_state: observation the environment is currently at.
        agent: object providing ``get_qvalues`` and ``sample_actions``.
        env: environment whose ``step`` returns ``(obs, reward, done, info)``.
        exp_replay: buffer exposing ``add(state, action, reward, next_state, done)``.
        n_steps: number of environment steps to play in this call.

    Returns:
        The observation to continue from on the next call: the last
        ``next_state``, or a freshly reset observation if the final step
        ended the episode. Callers should feed it back in as ``start_state``
        so that stored transitions stay consistent with the environment.
    """
    s = start_state
    for _ in range(n_steps):
        qvalues = agent.get_qvalues([s])
        a = agent.sample_actions(qvalues)[0]
        next_s, r, done, _ = env.step(a)
        exp_replay.add(s, a, r, next_s, done)
        # Start a new episode as soon as the current one ends
        s = env.reset() if done else next_s
    return s

In [None]:
#Computing TD Loss
def compute_td_loss(agent, target_network,device, batch_size, exp_replay, gamma = 0.99):
    """Mean-squared one-step TD error on a batch sampled from ``exp_replay``.

    The target for each transition is
    ``reward + gamma * max_a target_network(next_state)[a] * (1 - done)``,
    computed without gradient tracking; the prediction is the ``agent``
    Q-value of the action that was actually taken.

    Args:
        agent: network being trained; called on the batch of states.
        target_network: network used to evaluate the next states.
        device: torch device the batch tensors are created on.
        batch_size: number of transitions to sample.
        exp_replay: buffer whose ``sample(batch_size)`` returns NumPy arrays
            (states, actions, rewards, next_states, dones).
        gamma: discount factor.

    Returns:
        Scalar loss tensor.
    """
    s_np, a_np, r_np, s2_np, done_np = exp_replay.sample(batch_size)

    # Move the sampled NumPy batch onto the requested device
    states = torch.tensor(s_np, device=device, dtype=torch.float)
    next_states = torch.tensor(s2_np, device=device, dtype=torch.float)
    rewards = torch.tensor(r_np, device=device, dtype=torch.float)
    actions = torch.tensor(a_np, device=device, dtype=torch.long)
    dones = torch.tensor(done_np.astype('float32'), device=device, dtype=torch.float)

    # Q(s, a) for the actions that were actually played
    all_q = agent(states)
    row_index = torch.arange(batch_size, device=device)
    q_taken = all_q[row_index, actions]

    with torch.no_grad():
        best_next_q = target_network(next_states).max(dim=-1).values
        # (1 - dones) removes the bootstrap term for terminal transitions
        td_target = rewards + (gamma * best_next_q * (1 - dones))

    return mse_loss(q_taken, td_target)

In [None]:
#### MAIN LOOP ####

#More libraries
from tqdm import trange
from IPython.display import clear_output
import matplotlib.pyplot as plt

In [None]:
# Seed every random number generator this notebook draws from (Python, NumPy, PyTorch)
# with one shared value so runs are repeatable.
seed = 108
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed);  # trailing ';' keeps the returned Generator repr out of the cell output

<torch._C.Generator at 0x7f11aa3d20f0>

In [None]:
#Setting up environment
# Gym id of the Atari game the agent is trained on
env_name = "BreakoutNoFrameskip-v4"
# Build the wrapped environment and reset it to obtain the first observation.
# The environment also has to be reset every time a game ends (env.step reports this through its boolean done flag).
env = make_env(env_name)
state = env.reset()
# Observation shape and number of discrete actions; both are used to size the networks
state_shape = env.observation_space.shape
n_actions = env.action_space.n
# Exploration probability the agents are constructed with
epsilon = 0.5

  deprecation(
  deprecation(
  logger.deprecation(


In [None]:
# Online network: the one that acts in the environment and is passed to the optimizer
agent = DQNAgent(state_shape,n_actions,epsilon).to(device)
# Target network: same architecture, used to compute the TD targets and
# refreshed from the agent's weights at a fixed step interval
target_network = DQNAgent(state_shape,n_actions,epsilon).to(device)
# Start the target network from exactly the same weights as the agent
target_network.load_state_dict(agent.state_dict())

<All keys matched successfully>

In [None]:
# Create the replay buffer and pre-fill it by letting the (untrained) agent play
REPLAY_CAPACITY = 10**6
PREFILL_CALLS = 4000
PREFILL_STEPS_PER_CALL = 10**2

exp_replay = ReplayBuffer(REPLAY_CAPACITY)
# A progress bar replaces the per-iteration counter print, which flooded the output
for i in trange(PREFILL_CALLS, desc="Filling replay buffer"):
    # Carry the environment's current observation forward whenever play_and_record
    # reports it; a None return leaves `state` as it was.
    latest_state = play_and_record(state, agent, env, exp_replay, n_steps=PREFILL_STEPS_PER_CALL)
    if latest_state is not None:
        state = latest_state
    # '>=' rather than '==' so the loop still stops if the length check is ever overshot
    if len(exp_replay) >= REPLAY_CAPACITY:
        break
print(len(exp_replay))

Replay Buffer : i :  0
Replay Buffer : i :  1
Replay Buffer : i :  2
Replay Buffer : i :  3
Replay Buffer : i :  4
Replay Buffer : i :  5
Replay Buffer : i :  6
Replay Buffer : i :  7
Replay Buffer : i :  8
Replay Buffer : i :  9
Replay Buffer : i :  10
Replay Buffer : i :  11
Replay Buffer : i :  12
Replay Buffer : i :  13
Replay Buffer : i :  14
Replay Buffer : i :  15
Replay Buffer : i :  16
Replay Buffer : i :  17
Replay Buffer : i :  18
Replay Buffer : i :  19
Replay Buffer : i :  20
Replay Buffer : i :  21
Replay Buffer : i :  22
Replay Buffer : i :  23
Replay Buffer : i :  24
Replay Buffer : i :  25
Replay Buffer : i :  26
Replay Buffer : i :  27
Replay Buffer : i :  28
Replay Buffer : i :  29
Replay Buffer : i :  30
Replay Buffer : i :  31
Replay Buffer : i :  32
Replay Buffer : i :  33
Replay Buffer : i :  34
Replay Buffer : i :  35
Replay Buffer : i :  36
Replay Buffer : i :  37
Replay Buffer : i :  38
Replay Buffer : i :  39
Replay Buffer : i :  40
Replay Buffer : i :  41
Re

In [None]:
#Setup some parameters for training
# Environment steps played (and recorded) before each gradient update
timesteps_per_epoch = 2
# Number of transitions sampled from the replay buffer per update
batch_size = 16
# Index of the last training step; the main loop iterates over range(total_steps + 1)
total_steps = 2 * 10**2

In [None]:
# Adam optimizer over the online agent's parameters (learning rate 2e-5);
# the target network is not passed to the optimizer.
optimizer = optim.Adam(agent.parameters(),lr=2*1e-5)

In [None]:
#Setting exploration epsilon
# Exploration probability at step 0
start_epsilon = 0.1
# Exploration probability once the decay has finished
end_epsilon = 0.05
# Training step at which epsilon reaches end_epsilon and stops changing
eps_decay_final_step = 1 * 10**1

In [None]:
#Setup some frequency for logging and updating target network (all measured in training steps)
# Record the TD loss every loss_freq steps
loss_freq = 20
# Copy the agent's weights into the target network every refresh_target_network_freq steps
refresh_target_network_freq = 100
# Run a greedy evaluation every eval_freq steps
eval_freq = 10000

In [None]:
# Upper bound on the gradient norm, intended for gradient clipping
max_grad_norm = 5000

# Mean evaluation reward, one entry per evaluation
mean_rw_history = []
# TD loss values, one entry per logging step
td_loss_history = []

# Save the model weights and the reward history every SAVE_INTERVAL training steps
SAVE_INTERVAL = 50000

In [None]:
from numpy import asarray
from numpy import savetxt

#Defines epsilon schedule
def epsilon_schedule(start_eps, end_eps, step, final_step):
    """Linearly interpolate epsilon from ``start_eps`` to ``end_eps``.

    Args:
        start_eps: value returned at step 0.
        end_eps: value returned from ``final_step`` onwards.
        step: current training step.
        final_step: step at which the decay is complete.

    Returns:
        The exploration probability for ``step``. A non-positive
        ``final_step`` means "no decay phase", so ``end_eps`` is returned
        directly instead of dividing by zero.
    """
    if final_step <= 0:
        return end_eps
    # min() clamps the progress so epsilon stays at end_eps after final_step
    return start_eps + (end_eps-start_eps)*min(step, final_step)/final_step

# Reset the environment before training and keep the observation it returns:
# `state` must describe the frame the environment is actually at, otherwise the
# first recorded transitions pair an old observation with a new episode.
state = env.reset()

In [None]:
### STARTING MAIN LOOP ###
for step in range(total_steps + 1):
    # Anneal the exploration probability for this step
    agent.epsilon = epsilon_schedule(start_epsilon, end_epsilon, step, eps_decay_final_step)

    # Play timesteps_per_epoch environment steps and store them in the replay buffer.
    # Carry the environment's current observation forward whenever play_and_record
    # reports it; a None return leaves `state` as it was.
    latest_state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)
    if latest_state is not None:
        state = latest_state

    # One gradient step on the TD loss of a sampled batch
    loss = compute_td_loss(agent, target_network, device, batch_size, exp_replay, gamma=0.99)
    optimizer.zero_grad()
    loss.backward()
    # Apply the configured gradient-norm bound before the optimizer step
    nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
    optimizer.step()

    if step % loss_freq == 0:
        td_loss_history.append(loss.item())

    # Load agent weights into target_network
    if step % refresh_target_network_freq == 0:
        target_network.load_state_dict(agent.state_dict())

    if step % eval_freq == 0:
        # Evaluate on a separate environment so the training episode is not disturbed,
        # and close it afterwards so emulator instances do not pile up.
        eval_env = make_env(env_name, seed=step)
        try:
            mean_reward = evaluate(eval_env, agent, n_games=3, greedy=True, t_max=6000)
        finally:
            eval_env.close()
        mean_rw_history.append(mean_reward)

        # Clear first, then print, so the reward line is not wiped by the clear
        clear_output(True)
        print("mean_reward : ", mean_reward)
        print("buffer size = %i, epsilon = %.5f" %
              (len(exp_replay), agent.epsilon))

    # Periodic checkpoint of the weights and the evaluation history
    if step % SAVE_INTERVAL == 0 and step != 0:
        print('Saving...')
        torch.save(agent.state_dict(), f'model_{step}.pth')
        savetxt(f'reward_{step}.csv', np.array(mean_rw_history))


buffer size = 404332, epsilon = 0.10000


In [None]:
# Final Score: one greedy game on a fresh environment
final_env = make_env(env_name)
try:
    final_score = evaluate(final_env, agent, n_games=1, greedy=True, t_max=10000)
finally:
    # Release the emulator even if the evaluation raises
    final_env.close()
print('final score:', final_score)

final score: 0.0
