# Deep Learning - Proyecto 1

In [1]:
from __future__ import division
from __future__ import print_function
from vizdoom import *
import itertools as it
from random import sample, randint, random
from time import time, sleep
import numpy as np
import skimage.color, skimage.transform
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from tqdm import trange
from torch.utils.data import IterableDataset, DataLoader
from torch.distributions import Categorical
# ⚡ PyTorch Lightning
import pytorch_lightning as pl

# 🏋️‍♀️ Weights & Biases
import wandb

# ⚡ 🤝 🏋️‍♀️
from pytorch_lightning.loggers import WandbLogger

## Configuración

In [2]:
# Q-learning settings
learning_rate = 0.00025           # step size handed to the optimizer
discount_factor = 0.99            # gamma used when discounting rewards
epochs = 20
learning_steps_per_epoch = 2000   # environment steps collected at the start of each training epoch
replay_memory_size = 10000        # capacity of the replay buffer

# NN learning settings
batch_size = 64                   # default sample size of RLDataset

# Training regime
test_episodes_per_epoch = 100

# Other parameters
frame_repeat = 12                 # repeat count passed to game.make_action for every chosen action
resolution = (30, 45)             # target shape of the down-sampled frame
episodes_to_watch = 10            # episodes played with a visible window once training ends

model_savefile = "./model-doom.pth"
save_model = True
load_model = False
skip_learning = False

# Configuration file path
#config_file_path = "../scenarios/simpler_basic.cfg"
config_file_path = "../../scenarios/rocket_basic.cfg"
# config_file_path = "../../scenarios/basic.cfg"

# Use a literal seed. The previous `hash("...") % 2**32 - 1` expression was not
# reproducible: str hashes are salted per interpreter process unless
# PYTHONHASHSEED is set, and the expression parses as `(h % 2**32) - 1`, which
# can evaluate to -1. The literal below is the value recorded in this cell's output.
pl.seed_everything(3568881279)

Global seed set to 3568881279


3568881279

## Preprocesamiento

In [3]:
# Converts and down-samples the input image
def preprocess(img):
    """Resize a frame to ``resolution`` and return it as a float32 tensor.

    Args:
        img: image array accepted by ``skimage.transform.resize``
            (presumably the grayscale screen buffer of the game - confirm
            against the caller).

    Returns:
        A ``torch.Tensor`` of dtype float32 and shape
        ``(1, 1, resolution[0], resolution[1])``.
    """
    height, width = resolution
    resized = skimage.transform.resize(img, (height, width)).astype(np.float32)
    # torch.autograd.Variable is a deprecated no-op wrapper: tensors carry
    # autograd state themselves, so the tensor is returned directly.
    return torch.from_numpy(resized.reshape(1, 1, height, width))

## Memoria de repetición (Replay Memory)

In [4]:
class ReplayMemory:
    """Fixed-capacity circular buffer of environment transitions.

    Each slot stores the state before the action (``s1``), the action index
    (``a``), the state after the action (``s2``), the reward (``r``) and a
    terminal flag (``isterminal``). Once ``capacity`` transitions have been
    written, new transitions overwrite the oldest slots.
    """

    def __init__(self, capacity):
        """Allocate zero-filled storage for ``capacity`` transitions.

        Frames are stored with one channel and the spatial size given by the
        module-level ``resolution``.
        """
        channels = 1
        state_shape = (capacity, channels, resolution[0], resolution[1])
        self.s1 = np.zeros(state_shape, dtype=np.float32)
        self.s2 = np.zeros(state_shape, dtype=np.float32)
        self.a = np.zeros(capacity, dtype=np.int32)
        self.r = np.zeros(capacity, dtype=np.float32)
        self.isterminal = np.zeros(capacity, dtype=np.float32)

        self.capacity = capacity
        self.size = 0   # number of valid slots
        self.pos = 0    # index of the next slot to write

    def add_transition(self, s1, action, s2, isterminal, reward):
        """Write one transition at the current position and advance it.

        Args:
            s1: frame observed before acting; must be assignable to a
                ``(resolution[0], resolution[1])`` slice.
            action: integer index of the action taken.
            s2: frame observed after acting; ignored when ``isterminal`` is true
                (callers may pass ``None`` in that case).
            isterminal: whether the episode ended with this transition.
            reward: reward obtained for the action.
        """
        slot = self.pos
        self.s1[slot, 0, :, :] = s1
        self.a[slot] = action
        if isterminal:
            # A terminal transition has no successor frame. Clear the slot so
            # that, after wrap-around, it does not keep exposing the s2 frame
            # of the transition that previously occupied it.
            self.s2[slot, 0, :, :] = 0.0
        else:
            self.s2[slot, 0, :, :] = s2
        self.isterminal[slot] = isterminal
        self.r[slot] = reward

        self.pos = (slot + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def get_sample(self, sample_size):
        """Return ``sample_size`` distinct stored transitions chosen at random.

        Returns:
            Tuple ``(s1, a, r, isterminal, s2)`` of arrays indexed by the same
            randomly drawn slot indices.

        Raises:
            ValueError: propagated from ``random.sample`` when ``sample_size``
                exceeds the number of stored transitions.
        """
        i = sample(range(0, self.size), sample_size)
        return self.s1[i], self.a[i], self.r[i], self.isterminal[i], self.s2[i]

    def clean_memory(self):
        """Mark the buffer as empty; the underlying arrays are not cleared."""
        self.pos = 0
        self.size = 0



In [5]:
class RLDataset(IterableDataset):
    """Iterable view over a ``ReplayMemory``.

    Every pass over the dataset draws one fresh sample from the buffer and
    yields its transitions one at a time, so experiences added to the buffer
    during training are visible to later passes.

    Args:
        buffer: replay buffer to sample from
        sample_size: number of experiences drawn per pass
    """

    def __init__(self, buffer: ReplayMemory, sample_size: int = batch_size) -> None:
        self.sample_size = sample_size
        self.buffer = buffer

    def __iter__(self):
        # One draw per pass; the tuple order is the one returned by get_sample.
        drawn = self.buffer.get_sample(self.sample_size)
        states, actions, rewards, dones, new_states = drawn
        yield from zip(states, actions, rewards, dones, new_states)

## Modelo de Policy

In [6]:
class Net(nn.Module):
    """Small convolutional network mapping a 1-channel frame to action logits.

    Two strided convolutions feed a 192 -> 128 -> ``available_actions_count``
    fully connected head. The flatten size of 192 corresponds to the
    8 x 4 x 6 feature map produced for a 1 x 30 x 45 input.
    """

    def __init__(self, available_actions_count):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=6, stride=3)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=3, stride=2)
        self.fc1 = nn.Linear(in_features=192, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=available_actions_count)

    def forward(self, x):
        """Return un-normalised action scores of shape (N, available_actions_count)."""
        features = F.relu(self.conv2(F.relu(self.conv1(x))))
        # Flatten to the width expected by fc1 (192).
        flat = features.view(-1, self.fc1.in_features)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

In [7]:
class PolicyModel(pl.LightningModule):
    """Policy-gradient agent for a ViZDoom scenario, driven by Lightning.

    The module owns the game instance, a ``ReplayMemory`` and the policy
    network ``Net``. At the start of every training epoch it plays
    ``learning_steps_per_epoch`` steps with the current policy to refill the
    memory; training batches are then sampled from that memory and optimised
    with the loss ``-(log_prob(action) * normalised discounted reward).mean()``.
    """

    def __init__(self):
        super().__init__()
        self.lr = learning_rate

        # Create Doom instance
        self.game = self.initialize_vizdoom(config_file_path)

        # Action = which buttons are pressed: every 0/1 combination of the
        # available buttons is one discrete action.
        n = self.game.get_available_buttons_size()
        self.actions = [list(a) for a in it.product([0, 1], repeat=n)]

        # Create replay memory which will store the transitions
        self.memory = ReplayMemory(capacity=replay_memory_size)

        if load_model:
            print("Loading model from: ", model_savefile)
            # NOTE(review): torch.load unpickles the file; only load model
            # files from a trusted source.
            self.net = torch.load(model_savefile)
        else:
            self.net = Net(len(self.actions))

    def initialize_vizdoom(self, config_file_path):
        """Create and initialise a headless, grayscale ``DoomGame`` from a config file."""
        game = DoomGame()
        game.load_config(config_file_path)
        game.set_window_visible(False)
        game.set_mode(Mode.PLAYER)
        game.set_screen_format(ScreenFormat.GRAY8)
        game.set_screen_resolution(ScreenResolution.RES_640X480)
        game.init()
        return game

    def forward(self, x):
        """Return the categorical action distribution for a batch of states."""
        logits = self.net(x)
        return Categorical(logits=logits)

    def get_action(self, state):
        """Sample one action index (Python int) for a single preprocessed state."""
        # Sampling during rollouts never needs gradients; skipping autograd
        # avoids building a graph for every environment step.
        with torch.no_grad():
            return self(state).sample().item()

    def loss(self, state, action, reward):
        """Policy-gradient loss: negative mean of log-probability times reward."""
        logp = self(state).log_prob(action)
        return -(logp * reward).mean()

    def on_train_epoch_start(self):
        """Empty the replay memory and refill it by playing with the current policy."""
        # Fill the memory
        self.memory.clean_memory()
        self.game.new_episode()
        for learning_step in trange(learning_steps_per_epoch, leave=False):
            state = preprocess(self.game.get_state().screen_buffer)
            action = self.get_action(state)
            reward = self.game.make_action(self.actions[action], frame_repeat)
            done = self.game.is_episode_finished()
            # No state is read after a terminal step, so there is no next frame.
            next_state = preprocess(self.game.get_state().screen_buffer) if not done else None
            # Remember the transition that was just experienced.
            self.memory.add_transition(state, action, next_state, done, reward)

            if done:
                score = self.game.get_total_reward()
                self.log('train/score', score, on_epoch=True)
                self.game.new_episode()

    def discount_reward(self, reward):
        """Return normalised discounted returns for a 1-D tensor of rewards.

        The return at index ``i`` is ``reward[i] + discount_factor * return[i + 1]``,
        accumulated from the last element backwards. The result is standardised
        to zero mean and unit standard deviation.

        NOTE(review): batches come from ``ReplayMemory.get_sample``, which draws
        indices at random, so neighbouring rewards in a batch are not
        consecutive time steps, and episode boundaries are not taken into
        account here - confirm this is intended.
        """
        # Build the result on the same device as the input so the product with
        # the log-probabilities in `loss` does not mix devices.
        returns = torch.empty_like(reward, dtype=torch.float32)
        running = 0.0
        for idx in range(reward.shape[0] - 1, -1, -1):
            running = reward[idx] + discount_factor * running
            returns[idx] = running

        # Scale rewards
        centered = returns - returns.mean()
        if returns.numel() < 2:
            # std() of fewer than two samples is NaN, which would poison the
            # loss; only centre in that case.
            return centered
        return centered / (returns.std() + torch.finfo(torch.float32).eps)

    def training_step(self, batch, batch_idx):
        """Compute and log the policy-gradient loss for one sampled batch."""
        state, action, reward, _, _ = batch

        # Discount future rewards back to the present
        reward = self.discount_reward(reward)

        # Calculate loss
        loss = self.loss(state, action, reward)

        # logging metrics we calculated by hand
        self.log('train/loss', loss, on_epoch=True)

        return loss

    def configure_optimizers(self):
        """Return plain SGD over this module's parameters."""
        # Use self rather than the notebook-level `model` variable, so the
        # module also works when instantiated under another name.
        return torch.optim.SGD(self.parameters(), self.lr)

    # Test step that would run once all training has finished (placeholder).
    def test_step(self, batch, batch_idx):
        print("Hola")

    def on_validation_start(self):
        """Play ``test_episodes_per_epoch`` episodes and log each total reward.

        NOTE(review): this class defines no validation dataloader, so Lightning
        presumably never invokes this hook - confirm.
        """
        for _ in trange(test_episodes_per_epoch, leave=False):
            self.game.new_episode()
            while not self.game.is_episode_finished():
                state = preprocess(self.game.get_state().screen_buffer)
                action = self.get_action(state)
                self.game.make_action(self.actions[action], frame_repeat)

            r = self.game.get_total_reward()
            self.log("val/total_score", r, on_step=True, on_epoch=True)

    def on_train_end(self):
        """Replay ``episodes_to_watch`` episodes in a visible window, then save the network."""
        self.game.close()

        # Reinitialize the game with window visible
        self.game.set_window_visible(True)
        self.game.set_mode(Mode.ASYNC_PLAYER)
        self.game.init()

        for _ in range(episodes_to_watch):
            self.game.new_episode()
            while not self.game.is_episode_finished():
                state = preprocess(self.game.get_state().screen_buffer)
                action = self.get_action(state)

                # Instead of make_action(a, frame_repeat) in order to make the animation smooth
                self.game.set_action(self.actions[action])
                for _ in range(frame_repeat):
                    self.game.advance_action()

            # Sleep between episodes
            sleep(1.0)
            score = self.game.get_total_reward()
            print("test/total_score", score)

        # Honour the `save_model` switch from the configuration cell.
        if save_model:
            torch.save(self.net, model_savefile)

    def train_dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        # Each pass over the dataset yields `episode_length` sampled
        # transitions, grouped into batches of `batch_size`.
        self.hparams.episode_length = 200
        self.hparams.batch_size = 16
        dataset = RLDataset(self.memory, self.hparams.episode_length)
        dataloader = DataLoader(dataset=dataset,
                                batch_size=self.hparams.batch_size,
                                )
        return dataloader
            

In [8]:
# Authenticate with Weights & Biases without embedding the API key in the
# notebook: wandb.login() picks up existing credentials (e.g. the
# WANDB_API_KEY environment variable) or prompts for the key interactively.
# NOTE: the key that was previously hard-coded in this cell should be revoked.
wandb.login()
wandb_logger = WandbLogger(project="VDoomLearning")

wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\ruben/.netrc
wandb: Network error (ConnectionError), entering retry loop.


In [9]:
trainer = pl.Trainer(
    logger = wandb_logger,    # W&B integration
    log_every_n_steps = 10,   # set the logging frequency
    gpus = 0,                 # 0 GPUs requested: training runs on the CPU
    max_epochs = 10,          # number of epochs
    deterministic = True,     # keep it deterministic
    default_root_dir = "./"
)

model = PolicyModel()
try:
    trainer.fit(model)
finally:
    # Close the W&B run even when training raises, so it is not left open.
    wandb.finish()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mteccr[0m (use `wandb login --relogin` to force relogin)



  | Name | Type | Params
------------------------------
0 | net  | Net  | 26.6 K
------------------------------
26.6 K    Trainable params
0         Non-trainable params
26.6 K    Total params
0.106     Total estimated model params size (MB)
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Epoch 0: : 0it [00:00, ?it/s]



Epoch 1: : 0it [00:00, ?it/s, loss=-0.00498, v_num=5u0p]



Epoch 1: : 0it [00:10, ?it/s, loss=-0.00498, v_num=5u0p]



Epoch 2: : 0it [00:00, ?it/s, loss=-0.00584, v_num=5u0p]



Epoch 2: : 0it [00:16, ?it/s, loss=-0.00584, v_num=5u0p]



Epoch 3: : 0it [00:00, ?it/s, loss=0.00165, v_num=5u0p]



Epoch 4: : 0it [00:00, ?it/s, loss=0.00117, v_num=5u0p]



Epoch 5: : 0it [00:00, ?it/s, loss=-1.81e-05, v_num=5u0p]



Epoch 6: : 0it [00:00, ?it/s, loss=-0.00543, v_num=5u0p]



Epoch 7: : 0it [00:00, ?it/s, loss=-0.00528, v_num=5u0p]



Epoch 8: : 0it [00:00, ?it/s, loss=-0.00743, v_num=5u0p]



Epoch 9: : 0it [00:00, ?it/s, loss=-0.00715, v_num=5u0p]



Epoch 9: : 13it [03:03, 14.11s/it, loss=-0.000214, v_num=5u0p]
test/total_score -104.0
test/total_score 58.0
test/total_score -330.0
test/total_score 70.0
test/total_score 36.0
test/total_score -187.0
test/total_score -213.0
test/total_score 52.0
test/total_score 94.0
test/total_score -111.0


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▃▃▃▃▃▄▄▅▅▆▆▆▆▆▇▇███
train/loss_epoch,▃▅▅▄█▁▄▂▁▇
train/loss_step,▂▅▁▅▄▆▃▆▄▄▁█▁
train/score,█▁▇▂▃▆▂▃▃▆
trainer/global_step,▁▁▂▂▂▃▃▃▃▄▄▅▅▅▆▆▆▆▇▇▇██

0,1
epoch,9.0
train/loss_epoch,0.0062
train/loss_step,-0.0323
train/score,-65.98959
trainer/global_step,129.0
