In [1]:
!cd github & git clone https://github.com/Farama-Foundation/ViZDoom.git

fatal: destination path 'ViZDoom' already exists and is not an empty directory.


In [3]:
!pip install gym
!pip install opencv-python
!pip install matplotlib
!pip install stable-baselines3[extra]
!pip install torch



In [50]:
from vizdoom import *
import random
import time
import numpy as np
from gymnasium import Env
from gymnasium.spaces import Discrete, Box
import cv2
from matplotlib import pyplot as plt
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from collections import deque

In [52]:
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common import env_checker
from stable_baselines3 import PPO

In [54]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [56]:
class VizDoomGym(Env):
    """Gymnasium environment wrapping a ViZDoom game loaded from a scenario config.

    Observations are single-channel frames of shape ``(1, 100, 160)`` returned
    as ``float32`` scaled to ``[0, 1]`` (see :meth:`preprocess`); the declared
    ``observation_space`` matches that. Consumers must not rescale the frames
    again.

    The action space has three discrete actions; action ``i`` is sent to the
    game as the ``i``-th row of a 3x3 identity matrix (a one-hot button vector).

    Args:
        render: Show the game window when truthy, run without a window otherwise.
        config_path: Scenario ``.cfg`` file handed to ``DoomGame.load_config``.
        frame_skip: Second argument of ``DoomGame.make_action`` (how long each
            chosen action is held).
    """

    def __init__(self, render=False,
                 config_path='github/ViZDoom/scenarios/defend_the_center.cfg',
                 frame_skip=4):
        super().__init__()
        self.frame_skip = frame_skip

        self.game = DoomGame()
        # The repository is cloned as "ViZDoom"; the exact case matters on
        # case-sensitive file systems.
        self.game.load_config(config_path)
        self.game.set_window_visible(bool(render))
        self.game.init()

        # Must describe what step()/reset() really return: preprocess() yields
        # float32 values in [0, 1], not raw uint8 pixels.
        self.observation_space = Box(low=0.0, high=1.0, shape=(1, 100, 160), dtype=np.float32)
        self.action_space = Discrete(3)

        # One-hot button vectors, built once instead of on every step.
        self._actions = np.identity(self.action_space.n, dtype=np.uint8)

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, terminated, truncated, info)``.

        When the episode has finished there is no game state to read, so an
        all-zero frame with the observation dtype/shape is returned instead.
        ``truncated`` is always False and ``info`` is always empty.
        """
        reward = self.game.make_action(self._actions[action].tolist(), self.frame_skip)
        terminated = self.game.is_episode_finished()
        truncated = False
        info = {}

        if terminated:
            # Same dtype as regular observations so batches stay homogeneous.
            state = np.zeros(self.observation_space.shape, dtype=np.float32)
        else:
            state = self.preprocess(self.game.get_state().screen_buffer)

        return state, reward, terminated, truncated, info

    def render(self):
        """No-op; window visibility is decided by the ``render`` constructor flag."""
        pass

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        A non-None ``seed`` seeds both the Gymnasium base class RNG and the
        ViZDoom game. ``options`` is accepted for API compatibility and unused.
        """
        # Gymnasium convention: let the base class seed self.np_random.
        super().reset(seed=seed)
        if seed is not None:
            self.game.set_seed(seed)

        self.game.new_episode()
        observation = self.preprocess(self.game.get_state().screen_buffer)
        info = {}
        return observation, info

    def preprocess(self, observation):
        """Convert a raw screen buffer to a ``(1, 100, 160)`` float32 frame in [0, 1].

        The first axis is moved to the end, the image is converted to grayscale,
        area-resampled to 160x100 (width x height), given a leading channel
        axis and divided by 255.
        """
        # assumes the screen buffer is channel-first (C, H, W) -- TODO confirm
        # against the scenario's screen_format.
        # NOTE(review): COLOR_BGR2GRAY treats the channels as B, G, R; if the
        # buffer is actually RGB the red/blue weights are swapped. Left as-is
        # so existing checkpoints keep seeing the same inputs.
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        resized = cv2.resize(gray, (160, 100), interpolation=cv2.INTER_AREA)
        state = np.expand_dims(resized, axis=0)
        return state.astype(np.float32) / 255.0

    def close(self):
        """Close the underlying ViZDoom game."""
        self.game.close()
    

In [58]:
def evaluate_policy(qlearning_model, env, trials=5):
    """Run greedy evaluation episodes and return the mean episode reward.

    For every step the observation is given a leading batch axis, passed
    through ``qlearning_model.q_network`` without gradient tracking, and the
    action with the largest Q-value is played. Observations are fed to the
    network exactly as the environment returns them.

    Args:
        qlearning_model: Object exposing a ``q_network`` torch module.
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(a)`` returns ``(obs, reward, terminated, truncated, info)``.
        trials: Number of episodes to play.

    Returns:
        Mean of the per-episode total rewards (also printed).
    """
    q_network = qlearning_model.q_network
    # Build inputs on whatever device the network lives on rather than relying
    # on a notebook-level global; a parameter-free network falls back to CPU.
    first_param = next(q_network.parameters(), None)
    net_device = first_param.device if first_param is not None else torch.device('cpu')

    total_rewards = []
    for episode in range(trials):
        state, _ = env.reset()
        total_reward = 0
        done = False

        while not done:
            # Get action
            state_tensor = torch.as_tensor(
                np.expand_dims(state, axis=0), dtype=torch.float32, device=net_device
            )
            with torch.no_grad():
                q_values = q_network(state_tensor)
            action = torch.argmax(q_values).item()

            # Take step in env
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            # An episode ends on termination OR truncation; checking only
            # `terminated` would loop forever on a time-limited environment.
            done = terminated or truncated

        total_rewards.append(total_reward)
        print(f"Episode {episode + 1} Reward: {total_reward}")

    avg_reward = np.mean(total_rewards)
    print(f"Average Reward over {trials} evaluation episodes: {avg_reward}")
    return avg_reward


In [60]:
class QNetwork(nn.Module):
    """Convolutional network mapping a batch of frames to one Q-value per action.

    Three Conv2d+ReLU layers feed a flatten -> 512-unit hidden layer -> linear
    output of size ``action_space.n``.

    Args:
        action_space: Object exposing ``n``, the number of discrete actions.
        input_shape: ``(channels, height, width)`` of a single observation.
            The default ``(1, 100, 160)`` gives the ``64 * 9 * 16`` flattened
            features that used to be hard-coded, so existing checkpoints load
            unchanged.
    """

    def __init__(self, action_space, input_shape=(1, 100, 160)):
        super(QNetwork, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        # Derive the flattened feature count from a dummy forward pass so the
        # head always matches the conv stack for the given input size.
        with torch.no_grad():
            n_features = self.conv(torch.zeros(1, *input_shape)).flatten(1).shape[1]

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, action_space.n)
        )

    def forward(self, x):
        """Return Q-values of shape ``(batch, action_space.n)`` for input ``x``."""
        return self.fc(self.conv(x))

In [134]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import numpy as np
import random
import os
from torch.utils.tensorboard import SummaryWriter

# NOTE(review): this re-definition shadows the QNetwork defined in an earlier
# cell; keep a single definition once the notebook is cleaned up.
class QNetwork(nn.Module):
    """Convolutional network mapping a batch of frames to one Q-value per action.

    Three Conv2d+ReLU layers feed a flatten -> 512-unit hidden layer -> linear
    output of size ``action_space.n``.

    Args:
        action_space: Object exposing ``n``, the number of discrete actions.
        input_shape: ``(channels, height, width)`` of a single observation.
            The default ``(1, 100, 160)`` gives the ``64 * 9 * 16`` flattened
            features that used to be hard-coded, so existing checkpoints load
            unchanged.
    """

    def __init__(self, action_space, input_shape=(1, 100, 160)):
        super(QNetwork, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        # Derive the flattened feature count from a dummy forward pass so the
        # head always matches the conv stack for the given input size.
        with torch.no_grad():
            n_features = self.conv(torch.zeros(1, *input_shape)).flatten(1).shape[1]

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, action_space.n)
        )

    def forward(self, x):
        """Return Q-values of shape ``(batch, action_space.n)`` for input ``x``."""
        return self.fc(self.conv(x))

class QLearning:
    """Deep Q-learning agent with a replay buffer and a periodically synced target network.

    Observations are used exactly as the environment returns them (only cast
    to float32 and given a channel axis if they are bare 2-D images). The
    environment in this notebook already scales frames to [0, 1]; dividing by
    255 again here would squash every input into [0, 1/255].

    Args:
        env: Environment with ``reset()``, ``step()`` and ``action_space.n``.
        run_name: Sub-directory of ``qlearning_logs`` used for TensorBoard logs.
        lr: Adam learning rate for the online network.
        gamma: Discount factor in the TD target.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Factor applied to epsilon at the end of every episode.
        epsilon_min: Lower bound for epsilon.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        batch_size: Number of transitions sampled per training step.
        target_update_freq: Copy the online weights into the target network
            every this many environment steps.
    """

    def __init__(self, env, run_name, lr=1e-5, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.1, buffer_size=50000, batch_size=32, target_update_freq=500):
        self.env = env
        self.q_network = QNetwork(env.action_space).to(device)
        self.target_network = QNetwork(env.action_space).to(device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)

        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size
        self.target_update_freq = target_update_freq

        self.replay_buffer = deque(maxlen=buffer_size)
        self.buffer_size = buffer_size

        log_dir = os.path.join("qlearning_logs", run_name)
        self.losses = deque(maxlen=100)  # window for the running-average loss
        self.writer = SummaryWriter(log_dir)
        self.global_step = 0

    @staticmethod
    def _as_frame(observation):
        """Return ``observation`` as a float32 array with a leading channel axis.

        Values are NOT rescaled. The float32 cast keeps the replay buffer
        homogeneous even if the environment hands back an integer-typed frame
        (e.g. an all-zero terminal observation).
        """
        frame = np.asarray(observation, dtype=np.float32)
        if frame.ndim == 2:
            frame = np.expand_dims(frame, axis=0)  # add channel dimension
        return frame

    def select_action(self, state):
        """Return the greedy action (argmax of the online network's Q-values)."""
        state = self._as_frame(state)
        state = torch.FloatTensor(state).unsqueeze(0).to(device)  # add batch dimension
        with torch.no_grad():
            q_values = self.q_network(state)
        return torch.argmax(q_values).item()

    def store_transition(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple to the buffer."""
        self.replay_buffer.append(
            (self._as_frame(state), action, reward, self._as_frame(next_state), done)
        )

    def sample_batch(self):
        """Sample ``batch_size`` transitions uniformly and return them as tensors.

        ``actions``, ``rewards`` and ``dones`` come back with shape
        ``(batch_size, 1)`` so they line up with ``gather``/broadcasting in
        :meth:`train_step`.
        """
        batch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.FloatTensor(np.array(states)).to(device)
        next_states = torch.FloatTensor(np.array(next_states)).to(device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(device)
        return states, actions, rewards, next_states, dones

    def train_step(self):
        """Run one gradient step on a sampled batch; no-op until the buffer holds a batch."""
        if len(self.replay_buffer) < self.batch_size:
            return

        states, actions, rewards, next_states, dones = self.sample_batch()
        current_q_values = self.q_network(states).gather(1, actions)
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(1)[0].unsqueeze(1)

        # (1 - dones) removes the bootstrap term for terminal transitions.
        target_q_values = rewards + self.gamma * next_q_values * (1 - dones)
        loss = nn.MSELoss()(current_q_values, target_q_values)

        self.losses.append(loss.item())
        avg_loss = np.mean(self.losses)
        self.writer.add_scalar("Average Loss", avg_loss, self.global_step)

        self.optimizer.zero_grad()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), max_norm=1.0)
        self.optimizer.step()

    def train(self, total_timesteps, callback=None):
        """Train for ``total_timesteps`` environment steps with epsilon-greedy exploration.

        The replay buffer is first filled with ``batch_size`` random-action
        transitions (not counted as timesteps). Epsilon decays once per
        finished episode. The TensorBoard writer is closed when training ends,
        including when it ends because of an exception or interrupt.

        Args:
            total_timesteps: Number of environment steps in the main loop.
            callback: Optional object with ``_init_callback()`` and
                ``_on_step(model, timesteps)``; the former is called once before
                training, the latter after every step. If the callback
                evaluates the policy it should use its own environment, not
                ``self.env``, otherwise it disturbs the running episode.
        """
        if callback is not None:
            callback._init_callback()

        state, _ = self.env.reset()
        state = self._as_frame(state)

        timesteps = 0
        episode_reward = 0
        episode_rewards = []
        episode = 0

        try:
            # Pre-fill replay buffer
            print("Pre-filling replay buffer...")
            while len(self.replay_buffer) < self.batch_size:
                action = random.randint(0, self.env.action_space.n - 1)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                next_state = self._as_frame(next_state)
                self.store_transition(state, action, reward, next_state, terminated)

                if terminated or truncated:
                    state, _ = self.env.reset()
                    state = self._as_frame(state)
                else:
                    state = next_state

            print("Replay buffer pre-filled. Starting training...")

            # Main training loop
            while timesteps < total_timesteps:
                if random.random() > self.epsilon:
                    action = self.select_action(state)
                else:
                    action = random.randint(0, self.env.action_space.n - 1)

                next_state, reward, terminated, truncated, _ = self.env.step(action)
                next_state = self._as_frame(next_state)

                # Only a true termination removes the bootstrap term; a
                # truncated episode still ends below but keeps done=False here.
                self.store_transition(state, action, reward, next_state, terminated)
                self.train_step()

                episode_reward += reward
                timesteps += 1
                self.global_step += 1

                if terminated or truncated:
                    episode_rewards.append(episode_reward)
                    print(f"Episode {episode} Reward: {episode_reward}")
                    # Mean over the last (up to) 10 episodes.
                    avg_reward = np.mean(episode_rewards[-10:])
                    self.writer.add_scalar("Average Reward", avg_reward, self.global_step)
                    self.writer.add_scalar("Epsilon", self.epsilon, self.global_step)
                    self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
                    episode_reward = 0
                    episode += 1
                    state, _ = self.env.reset()
                    state = self._as_frame(state)
                else:
                    state = next_state

                if timesteps % self.target_update_freq == 0:
                    self.target_network.load_state_dict(self.q_network.state_dict())

                if callback is not None:
                    callback._on_step(self, timesteps)
        finally:
            self.writer.close()


In [136]:
class QLearningCallback:
    """Periodic checkpoint-and-evaluate hook for a Q-learning agent.

    Every ``check_freq`` timesteps :meth:`_on_step` saves the agent's online
    network weights and runs ``evaluate_policy`` on ``eval_env``.

    Args:
        check_freq: Act whenever ``timesteps % check_freq == 0``.
        save_path: Directory for checkpoints; ``None`` disables saving.
        eval_env: Environment used for evaluation; ``None`` disables evaluation.
        eval_trials: Number of evaluation episodes per check.
        verbose: Print progress messages when truthy.
    """

    def __init__(self, check_freq, save_path, eval_env, eval_trials=5, verbose=1):
        self.check_freq = check_freq
        self.save_path = save_path
        self.eval_env = eval_env
        self.eval_trials = eval_trials
        self.verbose = verbose

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self, qlearning_model, timesteps):
        """Checkpoint and evaluate ``qlearning_model`` when ``timesteps`` hits the check frequency."""
        if timesteps % self.check_freq != 0:
            return

        if self.save_path is not None:
            # Nothing guarantees _init_callback() was called beforehand, so
            # make sure the directory exists right before writing into it.
            self._init_callback()

            # Save model checkpoint
            model_path = os.path.join(self.save_path, f'best_model_{timesteps}.pth')
            torch.save(qlearning_model.q_network.state_dict(), model_path)
            if self.verbose:
                print(f"Model saved at step {timesteps}")

        if self.eval_env is not None:
            # Evaluate policy
            avg_reward = evaluate_policy(qlearning_model, self.eval_env, trials=self.eval_trials)
            if self.verbose:
                print(f"Average Reward after {timesteps} timesteps: {avg_reward}")


In [None]:
    # Training run "qlearning_defend_8" on a headless environment.
    env = VizDoomGym(render=False)
    run_name = "qlearning_defend_8"
    qlearning = QLearning(env, run_name)

    # NOTE(review): this callback is constructed but never handed to train()
    # in this cell, and it shares the training env for evaluation.
    callback = QLearningCallback(check_freq=1000, save_path="qlearning_checkpoints", eval_env = env, verbose=1)

    try:
        qlearning.train(total_timesteps=100000)
    finally:
        # Release the game even when a long run is interrupted; a later cell
        # rebinds `env`, which would otherwise leave this instance open.
        env.close()


Pre-filling replay buffer...
Replay buffer pre-filled. Starting training...
Episode 0 Reward: 0.0
Episode 1 Reward: 0.0
Episode 2 Reward: 0.0
Episode 3 Reward: 0.0
Episode 4 Reward: 0.0
Episode 5 Reward: -1.0
Episode 6 Reward: -1.0
Episode 7 Reward: 2.0
Episode 8 Reward: 0.0
Episode 9 Reward: -1.0
Episode 10 Reward: 0.0
Episode 11 Reward: 0.0
Episode 12 Reward: 0.0
Episode 13 Reward: 0.0
Episode 14 Reward: 2.0
Episode 15 Reward: -1.0
Episode 16 Reward: 0.0
Episode 17 Reward: 0.0
Episode 18 Reward: 0.0
Episode 19 Reward: -1.0
Episode 20 Reward: 2.0
Episode 21 Reward: 0.0
Episode 22 Reward: -1.0
Episode 23 Reward: 0.0
Episode 24 Reward: 0.0
Episode 25 Reward: 1.0
Episode 26 Reward: 2.0
Episode 27 Reward: 2.0
Episode 28 Reward: 0.0
Episode 29 Reward: -1.0
Episode 30 Reward: 0.0
Episode 31 Reward: 0.0
Episode 32 Reward: -1.0
Episode 33 Reward: 1.0
Episode 34 Reward: 1.0
Episode 35 Reward: 0.0
Episode 36 Reward: 0.0
Episode 37 Reward: 0.0
Episode 38 Reward: 0.0
Episode 39 Reward: 1.0
Episod

In [25]:
if __name__ == "__main__":
    # Training run "qlearning_defend_2": train, save the final weights, and
    # always release the game, even if training or saving fails.
    env = VizDoomGym(render=False)
    run_name = "qlearning_defend_2"
    try:
        qlearning = QLearning(env, run_name)

        # NOTE(review): this callback is constructed but never handed to
        # train() in this cell, and it shares the training env for evaluation.
        callback = QLearningCallback(check_freq=1000, save_path="qlearning_checkpoints", eval_env = env, verbose=1)

        qlearning.train(total_timesteps=100000)

        # Save final model
        final_model_path = f'final_qlearning_model_{run_name}.pth'
        torch.save(qlearning.q_network.state_dict(), final_model_path)
    finally:
        env.close()

Episode 0 Reward: 62.0
Episode 1 Reward: 64.0
Episode 2 Reward: 95.0
Episode 3 Reward: -380.0


In [25]:
# Separate headless environment used only for evaluation.
eval_env = VizDoomGym(render=False)

# Checkpoint/evaluation hook: every 5000 steps, 5 evaluation episodes.
callback = QLearningCallback(
    5000,
    "qlearning_checkpoints",
    eval_env,
    eval_trials=5,
    verbose=1,
)

In [26]:
# Greedy evaluation of the trained agent over 5 episodes; the returned mean
# reward is the cell's displayed value.
evaluate_policy(qlearning, eval_env, trials=5)

Episode 1 Reward: 67.0
Episode 2 Reward: 91.0
Episode 3 Reward: 67.0
Episode 4 Reward: 67.0
Episode 5 Reward: 75.0
Average Reward over 5 evaluation episodes: 73.4


73.4

In [28]:
# Shut down both game instances (training and evaluation environments).
env.close()
eval_env.close()