# Deep Learning - Proyecto 1

In [1]:
from __future__ import division
from __future__ import print_function
from vizdoom import *
import itertools as it
from random import sample, randint, random
from time import time, sleep
import numpy as np
import skimage.color, skimage.transform
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm import trange
from torch.utils.data import IterableDataset, DataLoader
from torch.distributions import Categorical
# ⚡ PyTorch Lightning
import pytorch_lightning as pl

# 🏋️‍♀️ Weights & Biases
import wandb

# ⚡ 🤝 🏋️‍♀️
from pytorch_lightning.loggers import WandbLogger

import cv2

## Configuración

In [2]:
# Learning settings
learning_rate = 0.00025            # step size handed to the optimizer
discount_factor = 0.99             # gamma used when discounting rewards
epochs = 20
learning_steps_per_epoch = 2000    # environment steps collected at the start of each epoch
replay_memory_size = 10000         # capacity of the transition buffer

# NN learning settings
batch_size = 64

# Training regime
test_episodes_per_epoch = 100

# Other parameters
frame_repeat = 12                  # tics each chosen action is repeated for
resolution = (30, 45)              # size frames are resized to before entering the network
episodes_to_watch = 10             # episodes replayed with a visible window after training

model_savefile = "./model-doom.pth"
save_model = True
load_model = False
skip_learning = False

# Configuration file path
#config_file_path = "../scenarios/simpler_basic.cfg"
config_file_path = "../scenarios/deadly_corridor.cfg"
#config_file_path = "../../scenarios/basic.cfg"

# Fixed literal seed. The previous expression, hash("...") % 2**32 - 1, was not
# reproducible: str hashes are randomised per interpreter process (PYTHONHASHSEED),
# and operator precedence made it (hash % 2**32) - 1, which can be -1.
# This is the value recorded in this cell's saved output.
pl.seed_everything(1982895900)

Global seed set to 1982895900


1982895900

## Preprocesamiento

In [3]:
# Converts and down-samples the input image
def preprocess(img):
    """Down-sample a frame to `resolution` and return it as a (1, 1, H, W) float32 tensor."""
    shrunk = skimage.transform.resize(img, resolution).astype(np.float32)
    # Add leading batch and channel axes of size 1.
    batched = shrunk.reshape([1, 1, resolution[0], resolution[1]])
    return Variable(torch.from_numpy(batched))

In [4]:
import math
def distance(x1,y1,x2,y2):
    """Return the Euclidean distance between the points (x1, y1) and (x2, y2)."""
    dx_squared = math.pow(x1 - x2, 2)
    dy_squared = math.pow(y1 - y2, 2)
    return math.sqrt(dx_squared + dy_squared)

## Memoria de repetición (replay memory), dataset y recompensas

In [5]:
class ReplayMemory:
    """Fixed-capacity circular buffer of transitions backed by NumPy arrays.

    Each slot holds a state `s1`, the action index `a`, the reward `r`, a
    terminal flag `isterminal` and the following state `s2`. Once `capacity`
    transitions have been stored, new ones overwrite the oldest slots.
    """

    def __init__(self, capacity):
        # One single-channel frame of `resolution` per slot.
        frames_shape = (capacity, 1, resolution[0], resolution[1])
        self.s1 = np.zeros(frames_shape, dtype=np.float32)
        self.s2 = np.zeros(frames_shape, dtype=np.float32)
        self.a = np.zeros(capacity, dtype=np.int32)
        self.r = np.zeros(capacity, dtype=np.float32)
        self.isterminal = np.zeros(capacity, dtype=np.float32)

        self.capacity = capacity
        self.size = 0   # number of valid slots
        self.pos = 0    # slot the next transition is written to

    def add_transition(self, s1, action, s2, isterminal, reward):
        """Write one transition at the current position and advance the ring."""
        slot = self.pos
        self.s1[slot, 0] = s1
        self.a[slot] = action
        # A terminal transition has no successor; the slot's s2 is left untouched.
        if not isterminal:
            self.s2[slot, 0] = s2
        self.isterminal[slot] = isterminal
        self.r[slot] = reward

        self.pos = (slot + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def get_sample(self, sample_size):
        """Return `(s1, a, r, isterminal, s2)` for `sample_size` distinct random slots."""
        chosen = sample(range(self.size), sample_size)
        columns = (self.s1, self.a, self.r, self.isterminal, self.s2)
        return tuple(column[chosen] for column in columns)

    def clean_memory(self):
        """Mark the buffer as empty; the stored arrays are not zeroed."""
        self.size = 0
        self.pos = 0

In [6]:
class RLDataset(IterableDataset):
    """
    Iterable view over a ReplayMemory.

    Every pass over the dataset draws a fresh random sample from the buffer,
    so it reflects whatever transitions the buffer holds at that moment.

    Args:
        buffer: replay buffer to sample from
        sample_size: number of transitions drawn per pass
    """

    def __init__(self, buffer: ReplayMemory, sample_size: int = batch_size) -> None:
        self.sample_size = sample_size
        self.buffer = buffer

    def __iter__(self):
        # get_sample returns five parallel arrays; emit them one transition at a time.
        states, actions, rewards, dones, new_states = self.buffer.get_sample(self.sample_size)
        yield from zip(states, actions, rewards, dones, new_states)

In [7]:
class RewardVariables():
    """Per-episode tracker that turns changes in game variables into a shaped reward.

    The instance remembers the previous reading of each tracked variable so
    that `update` can reward or penalise the change since the last call.
    """

    def __init__(self):
        # Goal coordinates used for the distance term
        # (presumably the end of the deadly_corridor map; confirm against the scenario).
        self.goal_X = 1312
        self.goal_Y = 0
        # Readings assumed at the start of an episode.
        self.distance = 1312
        self.ammo = 55
        self.hits = 0
        self.health = 100
        # Sum of every reward returned by update().
        self.total_reward = 0

    def update(self, game):
        """Read the current game variables, return the reward for this step and
        add it to `total_reward`."""
        read = game.get_game_variable
        step_reward = 0

        # Progress: 10 points per unit moved closer to the goal.
        pos_x = read(GameVariable.POSITION_X)
        pos_y = read(GameVariable.POSITION_Y)
        distance_now = distance(self.goal_X, self.goal_Y, pos_x, pos_y)
        step_reward += (self.distance - distance_now) * 10
        self.distance = distance_now

        # Kills: 100 points for each new kill.
        kills_now = read(GameVariable.KILLCOUNT)
        new_kills = kills_now - self.hits
        step_reward += new_kills * 100
        self.hits = kills_now

        # Ammo: its change (x10) only counts on steps without a new kill.
        ammo_now = read(GameVariable.AMMO4)
        if new_kills == 0:
            step_reward += (ammo_now - self.ammo) * 10
        self.ammo = ammo_now

        # Health: the change in health is added as-is.
        health_now = read(GameVariable.HEALTH)
        step_reward += health_now - self.health
        self.health = health_now

        # Orientation: penalty while the angle lies strictly between 90 and 270,
        # largest (-180) at exactly 180.
        facing = read(GameVariable.ANGLE)
        if 90 < facing < 270:
            step_reward += abs(facing - 180) - 180

        self.total_reward += step_reward
        return step_reward

## Modelo de Policy

In [8]:
class Net(nn.Module):
    """Two-convolution network producing one output per available action.

    The hard-coded 192 is the flattened size of the second convolution's
    output; it ties the network to the input size it was designed for.
    """

    def __init__(self, available_actions_count):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=6, stride=3)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=3, stride=2)
        self.fc1 = nn.Linear(in_features=192, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=available_actions_count)

    def forward(self, x):
        """Map a batch of single-channel frames to per-action outputs."""
        feature_maps = F.relu(self.conv2(F.relu(self.conv1(x))))
        hidden = F.relu(self.fc1(feature_maps.view(-1, 192)))
        return self.fc2(hidden)

In [9]:
from vit_pytorch import ViT

class VisionTransformer(nn.Module):
    """Thin wrapper around a vit_pytorch `ViT` that outputs one value per action.

    The embedding width and the MLP width are both derived from the number of
    available actions.
    """

    def __init__(self, available_actions_count):
        super().__init__()
        print("Available actions count: ",available_actions_count)
        vit_settings = dict(
            image_size=45,
            patch_size=5,
            num_classes=available_actions_count,
            dim=available_actions_count,
            depth=2,
            heads=2,
            mlp_dim=available_actions_count*2,
            dropout=0.1,
            emb_dropout=0.1,
            channels=1,
        )
        self.vit = ViT(**vit_settings)

    def forward(self,x):
        """Run the wrapped transformer on `x`."""
        return self.vit(x)

In [10]:
class PolicyModel(pl.LightningModule):
    """Policy-gradient agent for a ViZDoom scenario, packaged as a LightningModule.

    The module owns the DoomGame instance, the replay memory that is refilled
    at the start of every training epoch, and the network producing action
    logits.

    Args:
        preprocess: when True, states are built from the sum of the depth and
            labels buffers; when False, from the screen buffer.
        use_original_model: when True the network is a `VisionTransformer`,
            otherwise the convolutional `Net`.
    """

    def __init__(self, preprocess = True, use_original_model=False):
        super().__init__()
        self.lr = learning_rate
        self.preprocess = preprocess
        # Create Doom instance
        self.game = self.initialize_vizdoom(config_file_path)

        # Action = which buttons are pressed: every on/off combination of the
        # available buttons is one discrete action.
        n = self.game.get_available_buttons_size()
        self.actions = [list(a) for a in it.product([0, 1], repeat=n)]

        # Create replay memory which will store the transitions
        self.memory = ReplayMemory(capacity=replay_memory_size)

        if use_original_model:
            self.net = VisionTransformer(len(self.actions))
        else:
            self.net = Net(len(self.actions))

    def initialize_vizdoom(self, config_file_path):
        """Create, configure and start a headless DoomGame from `config_file_path`."""
        game = DoomGame()
        game.load_config(config_file_path)
        game.set_window_visible(False)
        game.set_mode(Mode.PLAYER)
        game.set_screen_format(ScreenFormat.GRAY8)
        # Depth and labels buffers are needed by get_state() when preprocessing is on.
        game.set_depth_buffer_enabled(True)
        game.set_labels_buffer_enabled(True)
        game.set_screen_resolution(ScreenResolution.RES_640X480)
        game.set_render_weapon(False)
        game.set_render_hud(False)
        game.init()
        return game

    def forward(self, x):
        """Return a Categorical distribution over actions for state(s) `x`."""
        logits = self.net(x)
        return Categorical(logits=logits)

    def get_action(self,state):
        """Sample one action index from the policy for `state`."""
        return self(state).sample().item()

    def get_state(self):
        """Return the current game state as a preprocessed tensor."""
        state = self.game.get_state()
        if self.preprocess:
            # Combine depth and labels buffers into a single frame.
            frame = cv2.add(state.depth_buffer,state.labels_buffer)
            state = preprocess(frame)
        else:
            state = preprocess(state.screen_buffer)
        return state

    def loss(self,state, action, reward):
        """Policy-gradient loss: negative mean of log-probability times reward."""
        logp = self(state).log_prob(action)
        return -(logp * reward).mean()

    def on_train_epoch_start(self):
        """Empty the replay memory and refill it with `learning_steps_per_epoch`
        transitions played with the current policy."""
        # Fill the memory
        self.memory.clean_memory()
        self.game.new_episode()
        variables = RewardVariables()
        for learning_step in trange(learning_steps_per_epoch, leave=False):
            state = self.get_state()
            action= self.get_action(state)
            self.game.make_action(self.actions[action], frame_repeat)
            reward = variables.update(self.game)
            done = self.game.is_episode_finished()
            next_state = self.get_state() if not done else None
            # Remember the transition that was just experienced.
            self.memory.add_transition(state, action, next_state, done, reward)

            if self.game.is_episode_finished():
                score = self.game.get_total_reward()
                self.log('train/score', score, on_epoch=True)
                self.log('train/reward', variables.total_reward, on_epoch=True)
                self.game.new_episode()
                variables = RewardVariables()

    def discount_reward(self,reward):
        """Return the discounted cumulative sum of `reward`, standardised.

        Element i of the result is sum_k discount_factor**k * reward[i + k],
        after which the vector is shifted to zero mean and scaled by its
        standard deviation.

        Args:
            reward: 1-D tensor of rewards.
        Returns:
            1-D float32 tensor of the same length, on the same device as `reward`.
        """
        R = 0
        rewards_with_discount = []
        # Walk backwards so each step can reuse the running discounted sum.
        for r in torch.flip(reward, [0]):
            R = r + discount_factor * R
            rewards_with_discount.append(R)
        rewards_with_discount.reverse()

        # Scale rewards. torch.stack keeps the result on the input's device
        # (torch.FloatTensor always allocated on the CPU).
        rewards_with_discount = torch.stack(rewards_with_discount).float()
        rewards_with_discount = (rewards_with_discount - rewards_with_discount.mean()) / (rewards_with_discount.std() + np.finfo(np.float32).eps)
        return rewards_with_discount

    def training_step(self, batch, batch_idx):
        """Compute the policy-gradient loss for one batch of transitions."""
        state, action, reward, _, _ = batch

        # Discount future rewards back to the present
        # NOTE(review): batches come from ReplayMemory.get_sample, which draws
        # transitions at random, so the discounting runs over an arbitrary
        # ordering rather than over consecutive steps; confirm this is intended.
        reward = self.discount_reward(reward)

        # Calculate loss
        loss = self.loss(state,action,reward)

        # logging metrics we calculated by hand
        self.log('train/loss', loss, on_epoch=True)

        return loss

    def configure_optimizers(self):
        """Return an SGD optimizer over this module's own parameters."""
        # Use self, not the notebook-level `model` global: that name may be
        # undefined or may refer to a different instance than the one being fit.
        return torch.optim.SGD(self.parameters(), self.lr)

    # test step to be run once all training has finished
    def test_step(self, batch, batch_idx):
        print("Testing...")

    def on_validation_start(self):
        """Play `test_episodes_per_epoch` episodes with the current policy and log
        the game score and the shaped reward of each one."""
        for _ in trange(test_episodes_per_epoch, leave=False):
            self.game.new_episode()
            variables = RewardVariables()
            while not self.game.is_episode_finished():
                state = self.get_state()
                action= self.get_action(state)
                self.game.make_action(self.actions[action], frame_repeat)
                variables.update(self.game)

            r = self.game.get_total_reward()
            self.log("val/total_score", r, on_step=True, on_epoch=True)
            self.log("val/total_reward", variables.total_reward, on_step=True, on_epoch=True)

    def on_train_end(self):
        """Replay `episodes_to_watch` episodes in a visible window, print their
        scores, save the network to `model_savefile` and close the game."""
        self.game.close()

        # Reinitialize the game with window visible
        self.game.set_window_visible(True)
        self.game.set_mode(Mode.PLAYER )
        self.game.set_screen_format(ScreenFormat.GRAY8)
        self.game.init()

        btns = self.game.get_available_buttons()
        print(btns)
        for _ in range(episodes_to_watch):
            self.game.new_episode()
            variables = RewardVariables()
            while not self.game.is_episode_finished():
                state = self.get_state()
                state2 = self.game.get_state()
                action = self.get_action(state)

                # Show the combined depth + labels frame.
                depth = state2.depth_buffer
                label = state2.labels_buffer

                combination = cv2.add(depth,label)
                cv2.imshow('ViZDoom combination Buffer', combination)

                cv2.waitKey(int(0.028 * 1000))

                # Advance tic by tic instead of make_action so the window animates smoothly.
                self.game.set_action(self.actions[action])
                for _tic in range(frame_repeat):
                    self.game.advance_action()
                variables.update(self.game)
            # Sleep between episodes
            sleep(1.0)
            score = self.game.get_total_reward()
            print("test/total_score", score)
            print("test/total_reward",variables.total_reward)

        torch.save(self.net, model_savefile)
        cv2.destroyAllWindows()
        self.game.close()

    def train_dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        # Each epoch draws `episode_length` transitions, served in batches of `batch_size`.
        self.hparams.episode_length = 200
        self.hparams.batch_size = 16
        dataset = RLDataset(self.memory, self.hparams.episode_length)
        dataloader = DataLoader(dataset=dataset,
                                batch_size=self.hparams.batch_size,
                                )
        return dataloader

In [18]:
#model = PolicyModel(preprocess= False)
# Replays the policy by calling the Lightning hook directly. This depends on a
# global `model` created by a training/loading cell that appears further down
# the notebook (hidden state): on a fresh kernel it raises NameError unless one
# of those cells has been run first.
model.on_train_end()

[Button.MOVE_LEFT, Button.MOVE_RIGHT, Button.ATTACK, Button.MOVE_FORWARD, Button.MOVE_BACKWARD, Button.TURN_LEFT, Button.TURN_RIGHT]
test/total_score 293.29005432128906
test/total_reward -670.8547524766074
test/total_score 81.60090637207031
test/total_reward 121.48779903341142
test/total_score -65.11970520019531
test/total_reward -280.2041334912258
test/total_score -37.21434020996094
test/total_reward -441.1290409570588
test/total_score -109.65455627441406
test/total_reward -531.0714446753675
test/total_score 161.2923126220703
test/total_reward -1082.9309006201406
test/total_score -5.74847412109375
test/total_reward -11.80517498740619
test/total_score 179.3506622314453
test/total_reward -522.054243565593
test/total_score 232.18496704101562
test/total_reward 172.0718525887611
test/total_score -115.99314880371094
test/total_reward -215.9935650530249


In [15]:
import os

wandb.finish()
# Never commit an API key in a notebook. The key is read from the
# WANDB_API_KEY environment variable; when it is unset, wandb.login falls back
# to previously stored credentials or an interactive prompt.
# The key that used to be hard-coded on this line must be treated as leaked
# and revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb_logger = WandbLogger(project="VDoomLearning")

wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\ruben/.netrc


In [16]:
# Trainer shared by every experiment cell below.
trainer = pl.Trainer(
    logger = wandb_logger,    # W&B integration
    log_every_n_steps = 10,   # set the logging frequency
    gpus = 0,                 # 0 = no GPU, train on the CPU
    max_epochs = epochs,      # number of epochs, taken from the configuration cell
    deterministic = True,     # keep it deterministic
    default_root_dir = "./"
)

# Train the convolutional model on depth+labels frames, then close the W&B run.
# Constructing PolicyModel starts a ViZDoom instance as a side effect.
model = PolicyModel(preprocess=True,use_original_model=False)
trainer.fit(model)
wandb.finish()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade



  | Name | Type | Params
------------------------------
0 | net  | Net  | 42.1 K
------------------------------
42.1 K    Trainable params
0         Non-trainable params
42.1 K    Total params
0.168     Total estimated model params size (MB)
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Epoch 0: : 0it [00:00, ?it/s]



Epoch 1: : 0it [00:00, ?it/s, loss=-0.00748, v_num=9hn9]



Epoch 2: : 0it [00:00, ?it/s, loss=-0.00105, v_num=9hn9]



Epoch 3: : 0it [00:00, ?it/s, loss=0.00116, v_num=9hn9]



Epoch 3: : 0it [00:11, ?it/s, loss=0.00116, v_num=9hn9]



Epoch 4: : 0it [00:00, ?it/s, loss=0.00174, v_num=9hn9]



Epoch 4: : 0it [00:14, ?it/s, loss=0.00174, v_num=9hn9]



Epoch 5: : 0it [00:00, ?it/s, loss=-0.00244, v_num=9hn9]



Epoch 5: : 0it [00:12, ?it/s, loss=-0.00244, v_num=9hn9]



Epoch 6: : 0it [00:00, ?it/s, loss=-0.00673, v_num=9hn9]



Epoch 6: : 0it [00:11, ?it/s, loss=-0.00673, v_num=9hn9]



Epoch 7: : 0it [00:00, ?it/s, loss=-0.00512, v_num=9hn9]



Epoch 7: : 0it [00:11, ?it/s, loss=-0.00512, v_num=9hn9]



Epoch 8: : 0it [00:00, ?it/s, loss=-0.000386, v_num=9hn9]



Epoch 8: : 0it [00:19, ?it/s, loss=-0.000386, v_num=9hn9]



Epoch 9: : 0it [00:00, ?it/s, loss=5.44e-05, v_num=9hn9]



Epoch 9: : 0it [00:10, ?it/s, loss=5.44e-05, v_num=9hn9]



Epoch 10: : 0it [00:00, ?it/s, loss=-0.00216, v_num=9hn9]



Epoch 11: : 0it [00:00, ?it/s, loss=0.00156, v_num=9hn9]



Epoch 11: : 0it [00:13, ?it/s, loss=0.00156, v_num=9hn9]



Epoch 12: : 0it [00:00, ?it/s, loss=0.000786, v_num=9hn9]



Epoch 12: : 0it [00:13, ?it/s, loss=0.000786, v_num=9hn9]



Epoch 13: : 0it [00:00, ?it/s, loss=-0.00183, v_num=9hn9]



Epoch 13: : 0it [00:17, ?it/s, loss=-0.00183, v_num=9hn9]



Epoch 14: : 0it [00:00, ?it/s, loss=-0.00161, v_num=9hn9]



Epoch 14: : 0it [00:10, ?it/s, loss=-0.00161, v_num=9hn9]



Epoch 15: : 0it [00:00, ?it/s, loss=-0.00215, v_num=9hn9]



Epoch 15: : 0it [00:13, ?it/s, loss=-0.00215, v_num=9hn9]



Epoch 16: : 0it [00:00, ?it/s, loss=-0.00289, v_num=9hn9]



Epoch 16: : 0it [00:10, ?it/s, loss=-0.00289, v_num=9hn9]



Epoch 17: : 0it [00:00, ?it/s, loss=-0.0042, v_num=9hn9]



Epoch 17: : 0it [00:14, ?it/s, loss=-0.0042, v_num=9hn9]



Epoch 18: : 0it [00:00, ?it/s, loss=0.00193, v_num=9hn9]



Epoch 18: : 0it [00:17, ?it/s, loss=0.00193, v_num=9hn9]



Epoch 19: : 0it [00:00, ?it/s, loss=-0.000308, v_num=9hn9]



Epoch 19: : 0it [00:13, ?it/s, loss=-0.000308, v_num=9hn9]



Epoch 19: : 13it [04:09, 19.16s/it, loss=0.00438, v_num=9hn9]
[Button.MOVE_LEFT, Button.MOVE_RIGHT, Button.ATTACK, Button.MOVE_FORWARD, Button.MOVE_BACKWARD, Button.TURN_LEFT, Button.TURN_RIGHT]
test/total_score 21.379608154296875
test/total_reward 1000.9480308344368
test/total_score 404.69969177246094
test/total_reward 3348.458272518291
test/total_score -0.4805450439453125
test/total_reward -621.0221596449987
test/total_score -23.177780151367188
test/total_reward -642.1544120383805
test/total_score 321.6117706298828
test/total_reward -6213.700967073447
test/total_score 238.41354370117188
test/total_reward -7453.206711159541
test/total_score 392.94752502441406
test/total_reward -120.97170546284406
test/total_score -43.489715576171875
test/total_reward -1783.4022773512952
test/total_score 249.03634643554688
test/total_reward 22.714089784852376
test/total_score 589.9055786132812
test/total_reward 5819.2179035993395


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/loss_epoch,▁▅▆▄▃▄▂▅▅▄▆▆▄▄▂▄▃▆▄█
train/loss_step,▅▃▃▂▆▁▂▃▁▂▄▃▁▃▄▃▂▇▃▂▃▄▃▄▃█
train/reward,▅█▆█▆▁▇█▄▄▆▆▆▆▆▄▇▆▃▂
train/score,▄▆▃▅▃▄▇▃▁▂▂▄▄▆▃▆█▃▂▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███

0,1
epoch,19.0
train/loss_epoch,0.00739
train/loss_step,0.04326
train/reward,-2856.33667
train/score,164.5106
trainer/global_step,259.0


In [45]:
# Experiment 1 - B: base model with preprocessing
# 10 epochs

# Experiment A: 1xiwwsve

# Checkpoint paths use forward slashes. In a normal (non-raw) string literal
# the Windows form "\2x..." is parsed as the octal escape \2 (chr(2)), which
# silently corrupts the path; forward slashes work on Windows and POSIX alike.

#model = PolicyModel.load_from_checkpoint("./VDoomLearning/8okrjet9/checkpoints/epoch=9-step=129.ckpt")
#trainer.fit(model)

# 20 epochs
model = PolicyModel.load_from_checkpoint("./VDoomLearning/2xw64o7u/checkpoints/epoch=19-step=259.ckpt")
trainer.fit(model)

# Experiment 2: new ViT model
# 20 epochs
# NOTE(review): PolicyModel does not record its constructor arguments, so the
# model is presumably rebuilt with the defaults (use_original_model=False, i.e.
# Net). If this checkpoint holds VisionTransformer weights it likely needs
# load_from_checkpoint(..., use_original_model=True); confirm.
model = PolicyModel.load_from_checkpoint("./VDoomLearning/2xukrzpg/checkpoints/epoch=19-step=259.ckpt")
trainer.fit(model)

In [54]:
#model = PolicyModel.load_from_checkpoint(".\VDoomLearning\2xukrzpg\checkpoints\epoch=19-step=259.ckpt")
# Replays whichever `model` the most recently executed cell left in the kernel;
# the output below therefore depends on execution order, not on this cell alone.
model.on_train_end()

[Button.MOVE_LEFT, Button.MOVE_RIGHT, Button.ATTACK, Button.MOVE_FORWARD, Button.MOVE_BACKWARD, Button.TURN_LEFT, Button.TURN_RIGHT]
test/total_score -32.399810791015625
test/total_reward -702.0763478560989
test/total_score -52.77935791015625
test/total_reward -10.88044285881847
test/total_score -79.49752807617188
test/total_reward -267.97785675105047
test/total_score -29.0035400390625
test/total_reward -25.106505051959402
test/total_score -56.83161926269531
test/total_reward -48.93244871513866
test/total_score 127.52180480957031
test/total_reward -5498.480270982866
test/total_score -115.35690307617188
test/total_reward -197.45322167940708
test/total_score 36.95648193359375
test/total_reward -1618.5486809257545
test/total_score 116.38494873046875
test/total_reward 158.32152770620746
test/total_score 57.40484619140625
test/total_reward -1278.2596981869883
