In [13]:
!cd github & git clone https://github.com/Farama-Foundation/ViZDoom.git

/bin/bash: line 0: cd: github: No such file or directory
fatal: destination path 'ViZDoom' already exists and is not an empty directory.


In [None]:
!pip install gym

In [None]:
!pip install opencv-python

In [None]:
!pip install matplotlib

In [None]:
!pip install stable-baselines3[extra]

In [None]:
!pip install torch

In [14]:
from vizdoom import *
import random
import time
import numpy as np
from gymnasium import Env
from gymnasium.spaces import Discrete, Box
import cv2
from matplotlib import pyplot as plt
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

In [15]:
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common import env_checker
from stable_baselines3 import PPO

In [16]:
# Use the GPU when PyTorch reports one as available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [22]:
class VizDoomGym(Env):
    """Gymnasium wrapper around a ViZDoom game instance.

    Every observation is a single-channel 100x160 frame stored as ``float32``
    and scaled to ``[0, 1]``. An action is an index into three one-hot button
    vectors that are passed to ``DoomGame.make_action``.

    Args:
        render: When truthy, the ViZDoom window is made visible.
        config_path: Scenario configuration handed to ``DoomGame.load_config``.
            The default points into the ``ViZDoom`` checkout directory.
        frame_skip: Second argument of ``DoomGame.make_action`` for each step.
    """

    # (channels, height, width) of every observation produced by this env.
    FRAME_SHAPE = (1, 100, 160)

    def __init__(self, render=False,
                 config_path='ViZDoom/scenarios/defend_the_center.cfg',
                 frame_skip=2):
        super().__init__()
        self.frame_skip = frame_skip

        self.game = DoomGame()
        self.game.load_config(config_path)
        self.game.set_window_visible(bool(render))
        self.game.init()

        # The declared space must match what `preprocess` actually returns:
        # float32 frames in [0, 1], not raw uint8 pixels.
        self.observation_space = Box(low=0.0, high=1.0, shape=self.FRAME_SHAPE, dtype=np.float32)
        self.action_space = Discrete(3)

        # One-hot button vectors, one row per discrete action (built once).
        self._actions = np.identity(3, dtype=np.uint8)

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, terminated, truncated, info)``."""
        reward = self.game.make_action(self._actions[action], self.frame_skip)
        terminated = self.game.is_episode_finished()
        truncated = False
        info = {}

        if terminated:
            # No game state is read once the episode is finished; hand back a
            # blank frame with the same shape and dtype as a regular observation.
            state = np.zeros(self.observation_space.shape, dtype=self.observation_space.dtype)
        else:
            state = self.preprocess(self.game.get_state().screen_buffer)

        return state, reward, terminated, truncated, info

    def render(self):
        """No-op; window visibility is chosen through the constructor's ``render`` flag."""
        pass

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        ``seed`` (when given) is forwarded both to the Gymnasium base class and
        to the ViZDoom game. ``options`` is accepted but not used.
        """
        super().reset(seed=seed)
        if seed is not None:
            self.game.set_seed(seed)

        self.game.new_episode()
        observation = self.preprocess(self.game.get_state().screen_buffer)
        info = {}
        return observation, info

    def preprocess(self, observation):
        """Convert a raw screen buffer to a ``FRAME_SHAPE`` float32 frame in ``[0, 1]``."""
        _, height, width = self.FRAME_SHAPE
        # Move the channel axis last, as cv2 expects.
        # NOTE(review): COLOR_BGR2GRAY assumes a BGR channel order; confirm it
        # against the screen_format set in the scenario config.
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        # cv2.resize takes the target size as (width, height).
        resized = cv2.resize(gray, (width, height), interpolation=cv2.INTER_AREA)
        state = np.reshape(resized, self.FRAME_SHAPE)
        return state.astype(np.float32) / 255.0

    def close(self):
        """Shut down the underlying ViZDoom game."""
        self.game.close()
    

In [18]:
def evaluate_policy(model, env, trials=5):
    """Play ``trials`` greedy episodes with ``model`` in ``env`` and return the mean reward.

    Args:
        model: Object exposing ``get_action(state)`` that returns a
            ``(logits, value)`` pair for a batched state tensor.
        env: Environment whose ``reset()`` returns ``(observation, info)`` and
            whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        trials: Number of episodes to play.

    Returns:
        The mean total reward per episode as a float, or ``nan`` when no
        episode was played (``trials <= 0``).
    """
    def _as_batch(observation):
        # Cast to float32 first so any numeric observation dtype is accepted,
        # then add the leading batch dimension the model expects.
        return torch.tensor(np.asarray(observation, dtype=np.float32)).unsqueeze(0)

    total_rewards = []
    for episode in range(trials):
        state = _as_batch(env.reset()[0])
        done = False
        total_reward = 0
        while not done:
            with torch.no_grad():
                # Evaluate the model that was passed in, not a notebook global.
                logits, _ = model.get_action(state)
            # Softmax is monotonic, so the greedy action is the argmax of the logits.
            action = torch.argmax(logits, dim=-1).item()
            next_state, reward, terminated, truncated, _info = env.step(action)
            state = _as_batch(next_state)
            total_reward += reward
            # Either flag ends the episode.
            done = terminated or truncated
        total_rewards.append(total_reward)
        print(f"Episode {episode + 1} Reward: {total_reward}")

    # Summarize once, after every episode has finished.
    average_reward = float(np.mean(total_rewards)) if total_rewards else float("nan")
    print(f"Average Reward over {trials} episodes: {average_reward}")
    return average_reward
        
                

In [19]:
class PPOAgent(nn.Module):
    """Convolutional actor-critic network with a shared feature trunk.

    The trunk is three conv layers followed by a 512-unit fully connected
    layer. Two linear heads sit on top: one producing ``action_space.n``
    policy logits and one producing a scalar state value.
    """

    def __init__(self, action_space):
        super().__init__()

        # Shape comments assume a 1x100x160 input frame.
        conv_layers = [
            nn.Conv2d(1, 32, kernel_size=8, stride=4),   # -> [32, 24, 39]
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),  # -> [64, 11, 18]
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),  # -> [64, 9, 16]
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*conv_layers)

        flattened_size = 64 * 9 * 16
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened_size, 512),
            nn.ReLU(),
        )

        n_actions = action_space.n
        self.policy_head = nn.Linear(512, n_actions)
        self.value_head = nn.Linear(512, 1)

    def forward(self, x):
        """Return the 512-dimensional shared features for a batch of frames."""
        return self.fc(self.conv(x))

    def get_action(self, state):
        """Return ``(logits, value)`` for a batch of states."""
        features = self.forward(state)
        return self.policy_head(features), self.value_head(features)

In [31]:
from torch.utils.tensorboard import SummaryWriter
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

class PPO:
    """Minimal single-environment PPO trainer with TensorBoard logging.

    Note: defining this class rebinds the name ``PPO``; a ``PPO`` imported
    earlier into the same namespace is shadowed once this cell has run.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns ``(obs, reward, terminated, truncated, info)``.
        agent: ``nn.Module`` exposing ``get_action(state) -> (logits, value)``.
        run_name: Sub-directory of ``ppo_logs`` used for TensorBoard events.
        epochs: Optimization passes over each collected rollout.
        clip_eps: PPO clipping range for the probability ratio.
        gamma: Discount factor.
        lr: Adam learning rate.
        batch_size: Environment steps collected per rollout (must be >= 1).
        policy_coef: Weight of the clipped policy loss in the total loss.
        value_coef: Weight of the value loss in the total loss.
        entropy_coef: Weight of the entropy bonus subtracted from the total loss.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    def __init__(self, env, agent, run_name, epochs=10, clip_eps=0.2, gamma=0.99, lr=1e-4,
                 batch_size=3000, policy_coef=100.0, value_coef=0.5, entropy_coef=0.03):
        if batch_size < 1:
            # An empty rollout could never advance the timestep counter.
            raise ValueError("batch_size must be at least 1")

        self.env = env
        self.agent = agent
        self.optimizer = optim.Adam(self.agent.parameters(), lr=lr)
        self.epochs = epochs
        self.clip_eps = clip_eps
        self.gamma = gamma
        self.batch_size = batch_size
        self.policy_coef = policy_coef
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef

        log_dir = os.path.join("ppo_logs", run_name)
        self.writer = SummaryWriter(log_dir)
        self.global_step = 0

    @staticmethod
    def _to_tensor(observation):
        """Convert one observation to a float32 tensor with a leading batch axis."""
        return torch.tensor(np.asarray(observation, dtype=np.float32)).unsqueeze(0)

    def compute_returns(self, rewards, dones, gamma, last_value=0.0):
        """Compute discounted returns for one rollout.

        Args:
            rewards: Per-step rewards, in time order.
            dones: Per-step episode-end flags aligned with ``rewards``.
            gamma: Discount factor.
            last_value: Estimated value of the state following the final step.
                It is only used when the rollout stops mid-episode; the default
                of ``0.0`` reproduces a plain, non-bootstrapped return.

        Returns:
            A list of discounted returns, one per step, in time order.
        """
        returns = []
        running = last_value
        for step in range(len(rewards) - 1, -1, -1):
            if dones[step]:
                # Nothing carries over from the episode that follows.
                running = 0
            running = rewards[step] + gamma * running
            returns.append(running)
        # Built back-to-front with O(1) appends; flip once at the end.
        returns.reverse()
        return returns

    def learn(self, total_timesteps):
        """Alternate rollout collection and PPO updates until ``total_timesteps`` is reached.

        Rollouts are collected in chunks of ``batch_size`` steps, so the number
        of steps actually taken is rounded up to a multiple of ``batch_size``.
        The TensorBoard writer is closed when this method exits, including when
        training is interrupted.
        """
        state = self._to_tensor(self.env.reset()[0])
        timesteps = 0
        episode_rewards = []
        episode_reward = 0
        episode = 0
        value_criterion = nn.SmoothL1Loss()

        try:
            while timesteps < total_timesteps:
                states, actions, log_probs = [], [], []
                rewards, values, dones = [], [], []

                for _ in range(self.batch_size):
                    # Rollout needs no gradients; they are recomputed in the update.
                    with torch.no_grad():
                        logits, value = self.agent.get_action(state)
                        dist = torch.distributions.Categorical(logits=logits)
                        action = dist.sample()
                        log_prob = dist.log_prob(action)

                    next_state, reward, terminated, truncated, _ = self.env.step(action.item())
                    # Either flag ends the episode and requires a reset.
                    done = bool(terminated or truncated)

                    states.append(state)
                    actions.append(action)
                    log_probs.append(log_prob)
                    rewards.append(reward)
                    values.append(value.item())
                    dones.append(done)

                    episode_reward += reward
                    self.global_step += 1
                    timesteps += 1

                    if done:
                        episode_rewards.append(episode_reward)
                        print(f"Episode {episode} Reward: {episode_reward}")

                        # Running mean over (at most) the last 100 episodes.
                        average_reward = np.mean(episode_rewards[-100:])
                        self.writer.add_scalar("Average Reward", average_reward, self.global_step)

                        episode_reward = 0
                        episode += 1
                        state = self._to_tensor(self.env.reset()[0])
                    else:
                        state = self._to_tensor(next_state)

                # If the rollout was cut off mid-episode, bootstrap the tail of
                # the return from the critic instead of assuming zero future reward.
                if dones[-1]:
                    last_value = 0.0
                else:
                    with torch.no_grad():
                        last_value = self.agent.get_action(state)[1].item()
                returns = self.compute_returns(rewards, dones, self.gamma, last_value)

                states_t = torch.cat(states)
                actions_t = torch.stack(actions).reshape(-1)
                old_log_probs = torch.stack(log_probs).reshape(-1, 1)
                returns_t = torch.tensor(returns, dtype=torch.float32).unsqueeze(-1)
                values_t = torch.tensor(values, dtype=torch.float32).unsqueeze(-1)

                advantages = returns_t - values_t
                if advantages.numel() > 1:
                    # std() of a single element is NaN, so only normalize real batches.
                    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

                policy_losses = []
                value_losses = []
                losses = []
                entropies = []

                for _ in range(self.epochs):
                    logits, value = self.agent.get_action(states_t)
                    dist = torch.distributions.Categorical(logits=logits)
                    new_log_probs = dist.log_prob(actions_t).unsqueeze(-1)
                    entropy = dist.entropy().mean()

                    ratio = (new_log_probs - old_log_probs).exp()
                    surr1 = ratio * advantages
                    surr2 = torch.clamp(ratio, 1 - self.clip_eps, 1 + self.clip_eps) * advantages

                    policy_loss = -torch.min(surr1, surr2).mean()
                    value_loss = value_criterion(value, returns_t)
                    loss = (self.policy_coef * policy_loss
                            + self.value_coef * value_loss
                            - self.entropy_coef * entropy)

                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    losses.append(loss.item())
                    policy_losses.append(policy_loss.item())
                    value_losses.append(value_loss.item())
                    entropies.append(entropy.item())

                avg_policy_loss = np.mean(policy_losses)
                avg_value_loss = np.mean(value_losses)
                avg_entropy = np.mean(entropies)
                avg_loss = np.mean(losses)

                self.writer.add_scalar("Loss", avg_loss, self.global_step)
                self.writer.add_scalar("Policy Loss", avg_policy_loss, self.global_step)
                self.writer.add_scalar("Value Loss", avg_value_loss, self.global_step)
                self.writer.add_scalar("Entropy", avg_entropy, self.global_step)

                print(f"Timesteps: {timesteps}, Policy Loss: {avg_policy_loss}, "
                      f"Value Loss: {avg_value_loss}, Entropy: {avg_entropy}, "
                      f"Combined Loss: {avg_loss}")
        finally:
            # Flush and close the event file even when training is interrupted.
            self.writer.close()

In [32]:
class PPOCallback:
    """Periodic checkpoint-and-evaluate hook.

    Args:
        check_freq: Run the hook whenever the timestep count is a multiple of
            this value; a value of zero or less disables the hook.
        save_path: Directory for checkpoints, or ``None`` to skip saving.
        eval_env: Environment handed to ``evaluate_policy``.
        eval_trials: Number of evaluation episodes per check.
        verbose: When truthy, print a line after saving and after evaluating.
    """

    def __init__(self, check_freq, save_path, eval_env, eval_trials=5, verbose=1):
        self.check_freq = check_freq
        self.save_path = save_path
        self.eval_env = eval_env
        self.eval_trials = eval_trials
        self.verbose = verbose

    def _init_callback(self):
        """Create the checkpoint directory when a ``save_path`` is configured."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self, ppo_agent, timesteps):
        """Checkpoint and evaluate ``ppo_agent.agent`` when ``timesteps`` hits the check frequency."""
        # A non-positive frequency means "never"; it also avoids a modulo by zero.
        if self.check_freq <= 0 or timesteps % self.check_freq != 0:
            return

        # Saving is optional, mirroring `_init_callback`'s handling of `None`.
        if self.save_path is not None:
            # Do not rely on `_init_callback` having been called beforehand.
            os.makedirs(self.save_path, exist_ok=True)
            model_path = os.path.join(self.save_path, f'ppo_model_{timesteps}.pth')
            torch.save(ppo_agent.agent.state_dict(), model_path)
            if self.verbose:
                print(f"Model saved at step {timesteps}")

        # Evaluate policy
        avg_reward = evaluate_policy(ppo_agent.agent, self.eval_env, trials=self.eval_trials)
        if self.verbose:
            print(f"Average Reward after {timesteps} timesteps: {avg_reward}")

In [34]:
# Train the custom PPO agent on a headless environment, then save its weights.
run_name = "defend_9"
total_timesteps = 100000

env = VizDoomGym(render=False)
agent = PPOAgent(env.action_space)

# The callback is constructed here but is not passed to `ppo.learn` below.
callback = PPOCallback(check_freq=1000, save_path="ppo_checkpoints", eval_env=env, verbose=1)

ppo = PPO(env=env, agent=agent, run_name=run_name)
ppo.learn(total_timesteps)

# Persist the trained weights under a file name tagged with the run name.
filename = f'ppo_vizdoom_agent_{run_name}.pth'
torch.save(agent.state_dict(), filename)

Episode 0 Reward: 0.0
Episode 1 Reward: 0.0
Episode 2 Reward: 0.0
Episode 3 Reward: 1.0
Episode 4 Reward: 1.0
Episode 5 Reward: 0.0
Episode 6 Reward: 1.0
Episode 7 Reward: 1.0
Episode 8 Reward: 1.0
Episode 9 Reward: 0.0
Episode 10 Reward: -1.0
Episode 11 Reward: 1.0
Episode 12 Reward: 1.0
Episode 13 Reward: 0.0
Episode 14 Reward: -1.0
Episode 15 Reward: -1.0
Episode 16 Reward: 0.0
Episode 17 Reward: 0.0
Episode 18 Reward: 0.0


KeyboardInterrupt: 

In [None]:
# `basic_file_list` is not created in any cell visible above, so start it here
# when it is missing; the membership test keeps re-running this cell from
# recording the same checkpoint file twice.
if "basic_file_list" not in globals():
    basic_file_list = []
if filename not in basic_file_list:
    basic_file_list.append(filename)

In [None]:
# Evaluate the agent on its own headless environment, separate from `env`.
eval_env = VizDoomGym(render=False)

# The callback is rebuilt with the dedicated evaluation environment; it is not
# invoked in this cell.
callback = PPOCallback(check_freq=5000, save_path="ppo_checkpoints",
                       eval_env=eval_env, eval_trials=5, verbose=1)

evaluate_policy(agent, eval_env, trials=5)

In [None]:
# Shut down both ViZDoom game instances (training and evaluation).
env.close()
eval_env.close()