In [1]:
import os
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import random
import torch.nn.functional as F
import vizdoom as vzd

In [25]:
# Set up ViZDoom environment
def setup_vizdoom():
    """Create, configure and initialise a DoomGame running the "basic" scenario.

    The game renders 160x120 RGB24 frames in a visible window, exposes three
    buttons (move left, move right, attack), runs in PLAYER mode and applies a
    living reward of -1.

    Returns:
        The initialised ``vzd.DoomGame`` instance.
    """
    scenario_file = os.path.join(vzd.scenarios_path, "basic.wad")
    buttons = [vzd.Button.MOVE_LEFT, vzd.Button.MOVE_RIGHT, vzd.Button.ATTACK]

    doom = vzd.DoomGame()

    # Scenario / map selection.
    doom.set_doom_scenario_path(scenario_file)
    doom.set_doom_map("map01")

    # Rendering options.
    doom.set_screen_resolution(vzd.ScreenResolution.RES_160X120)
    doom.set_screen_format(vzd.ScreenFormat.RGB24)
    doom.set_window_visible(True)

    # Controls and reward shaping.
    doom.set_available_buttons(buttons)
    doom.set_mode(vzd.Mode.PLAYER)
    doom.set_living_reward(-1)
    # game.set_reward_for_kill(10)

    doom.init()
    return doom

In [26]:
# class QNetwork(nn.Module):
#     def __init__(self, image_height: int, image_width: int, num_actions: int):
#         super(QNetwork, self).__init__()
#         h = image_height
#         w = image_width

#         self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
#         self.pool1 = nn.MaxPool2d(kernel_size=4)
#         h //= 4
#         w //= 4

#         self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
#         self.pool2 = nn.MaxPool2d(kernel_size=4)
#         h //= 4
#         w //= 4

#         self.fc = nn.Sequential(
#             nn.Linear(h * w * 32, 128),
#             nn.ReLU(),
#             nn.Linear(128, num_actions)
#         )

#     def forward(self, x):
#         x = F.relu(self.conv1(x))
#         x = self.pool1(x)
#         x = F.relu(self.conv2(x))
#         x = self.pool2(x)
#         x = x.view(x.size(0), -1)  # Flatten
#         return self.fc(x)

# Neural network for approximating Q-values
class QNetwork(nn.Module):
    """Small convolutional network mapping an image batch to per-action Q-values.

    Architecture: two 3x3, stride-2, unpadded convolutions (16 then 32
    channels) followed by a 128-unit hidden layer and a linear output layer.

    Args:
        input_shape: ``(height, width, channels)`` of a single frame.
        num_actions: Number of Q-values (one per action) to output.

    Raises:
        ValueError: If the frame is too small to survive both convolutions.
    """

    def __init__(self, input_shape, num_actions):
        super(QNetwork, self).__init__()
        h, w, c = input_shape

        self.conv1 = nn.Conv2d(in_channels=c, out_channels=16, kernel_size=3, stride=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2)

        # Derive the flattened feature size from the actual conv arithmetic
        # instead of an h // 4 - 1 shortcut, which is only correct for some
        # sizes (e.g. it is off by one when h % 4 == 3).
        conv_h = self._conv_out(self._conv_out(h))
        conv_w = self._conv_out(self._conv_out(w))
        if conv_h < 1 or conv_w < 1:
            raise ValueError(
                f"input_shape {tuple(input_shape)} is too small for two 3x3 stride-2 convolutions"
            )

        self.fc1 = nn.Linear(32 * conv_h * conv_w, 128)
        self.fc2 = nn.Linear(128, num_actions)

    @staticmethod
    def _conv_out(size, kernel_size=3, stride=2):
        """Output length of an unpadded, undilated convolution along one axis."""
        return (size - kernel_size) // stride + 1

    def forward(self, x):
        """Return Q-values of shape ``(batch, num_actions)`` for a channels-first batch."""
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # Flatten
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

In [35]:
class QLearningAgent:
    """Deep Q-learning agent with experience replay and a target network.

    Args:
        env: Initialised game object exposing ``get_state``, ``new_episode``,
            ``get_available_buttons``, ``make_action`` and
            ``is_episode_finished``.
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor applied to bootstrapped future value.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied to epsilon after each episode.
        epsilon_min: Lower bound for epsilon.
    """

    def __init__(self, env, lr=0.0001, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.1):
        self.env = env

        state = env.get_state()
        if state is None:
            # No frame is available (e.g. the current episode already ended);
            # start a fresh episode so the observation shape can be read.
            env.new_episode()
            state = env.get_state()
        self.input_shape = state.screen_buffer.shape  # (H, W, C)
        self.num_actions = len(env.get_available_buttons())

        # One action per available button: action i presses only button i.
        # Built from the button count so the table always matches num_actions.
        self.actions = [
            [button == pressed for button in range(self.num_actions)]
            for pressed in range(self.num_actions)
        ]

        self.q_network = QNetwork(self.input_shape, self.num_actions)
        self.target_network = QNetwork(self.input_shape, self.num_actions)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()

        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.replay_buffer = []
        self.buffer_size = 5000
        self.batch_size = 32
        self.target_update_freq = 10  # in episodes

    def preprocess_observation(self, obs):
        """Convert an (H, W, C) frame into a float tensor of shape (1, C, H, W) in [0, 1]."""
        obs = obs.transpose(2, 0, 1)  # Channels-first for PyTorch
        obs = obs / 255.0  # Normalize pixel values to [0, 1]
        return torch.tensor(obs, dtype=torch.float).unsqueeze(0)

    def select_action(self, obs):
        """Pick an action index epsilon-greedily for a preprocessed observation."""
        if random.random() < self.epsilon:
            return random.randint(0, self.num_actions - 1)  # Explore
        with torch.no_grad():
            q_values = self.q_network(obs)
        return torch.argmax(q_values).item()  # Exploit

    def train_step(self):
        """Run one optimisation step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        batch = random.sample(self.replay_buffer, self.batch_size)
        obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = zip(*batch)

        obs_batch = torch.cat(obs_batch)
        next_obs_batch = torch.cat(next_obs_batch)
        action_batch = torch.tensor(action_batch, dtype=torch.long)
        reward_batch = torch.tensor(reward_batch, dtype=torch.float)
        done_batch = torch.tensor(done_batch, dtype=torch.float)

        q_values = self.q_network(obs_batch)

        # The bootstrap target is a constant with respect to the optimisation:
        # compute it without autograd so backward() neither traverses the
        # target network nor accumulates gradients on its parameters.
        with torch.no_grad():
            next_q_values = self.target_network(next_obs_batch)
            max_next_q_values = torch.max(next_q_values, dim=1)[0]
            # (1 - done) zeroes the bootstrap term for terminal transitions.
            target_q_values = reward_batch + self.gamma * max_next_q_values * (1 - done_batch)

        # squeeze(1) removes only the gather axis, so the batch axis is kept
        # even when batch_size == 1.
        current_q_values = q_values.gather(1, action_batch.unsqueeze(1)).squeeze(1)

        loss = self.loss_fn(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self, num_episodes):
        """Play ``num_episodes`` episodes, learning online from replayed transitions."""
        for episode in range(num_episodes):
            self.env.new_episode()
            obs = self.preprocess_observation(self.env.get_state().screen_buffer)
            episode_reward = 0

            while not self.env.is_episode_finished():
                action = self.select_action(obs)
                # The environment expects a per-button list, not the action index.
                reward = self.env.make_action(self.actions[action])
                episode_reward += reward

                done = self.env.is_episode_finished()
                # No frame is read once the episode is over; store a zero
                # placeholder, which train_step masks out via the done flag.
                next_obs = (
                    self.preprocess_observation(self.env.get_state().screen_buffer)
                    if not done
                    else torch.zeros_like(obs)
                )

                self.replay_buffer.append((obs, action, reward, next_obs, done))
                if len(self.replay_buffer) > self.buffer_size:
                    self.replay_buffer.pop(0)  # Drop the oldest transition

                obs = next_obs
                self.train_step()

            if episode % self.target_update_freq == 0:
                self.target_network.load_state_dict(self.q_network.state_dict())

            self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
            if episode % 50 == 0:
                self.epsilon = 1.0  # Explore more often

            print(f"Episode {episode + 1}/{num_episodes}, Reward: {episode_reward}, Epsilon: {self.epsilon:.2f}")

In [21]:
# class QLearningAgent:
#     def __init__(self, env, lr=0.0003, gamma=0.95, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
#         self.env = env
#         screen_shape = self.env.get_state().screen_buffer.shape
#         self.image_height, self.image_width, _ = screen_shape
#         self.num_actions = len(env.get_available_buttons())

#         self.q_network = QNetwork(self.image_height, self.image_width, self.num_actions)
#         self.target_network = QNetwork(self.image_height, self.image_width, self.num_actions)
#         self.target_network.load_state_dict(self.q_network.state_dict())
#         self.target_network.eval()

#         self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
#         self.gamma = gamma
#         self.epsilon = epsilon
#         self.epsilon_decay = epsilon_decay
#         self.epsilon_min = epsilon_min
#         self.actions = [
#             [True, False, False],  # MOVE_LEFT
#             [False, True, False],  # MOVE_RIGHT
#             [False, False, True],  # ATTACK
#         ]

#     def select_action(self, obs):
#         if random.random() < self.epsilon:
#             return random.randint(0, self.num_actions - 1)
#         obs = torch.tensor(obs, dtype=torch.float).unsqueeze(0)
#         with torch.no_grad():
#             q_values = self.q_network(obs)
#         return torch.argmax(q_values).item()

#     def train(self, num_episodes, batch_size=32, target_update=10, replay_buffer_size=5000):
#         replay_buffer = []
#         for episode in range(num_episodes):
#             self.env.new_episode()
#             obs = self._process_obs(self.env.get_state().screen_buffer)
#             episode_reward = 0
#             previous_killcount = 0
#             episode_rewards = []

#             while not self.env.is_episode_finished():
#                 action_idx = self.select_action(obs)
#                 action = self.actions[action_idx]
#                 reward = self.env.make_action(action)
#                 print(f"Reward received: {reward}")

#                 current_killcount = self.env.get_game_variable(vzd.GameVariable.KILLCOUNT)
#                 if current_killcount > previous_killcount:
#                     reward += 10
#                     previous_killcount = current_killcount

#                 next_obs = (
#                     self._process_obs(self.env.get_state().screen_buffer)
#                     if not self.env.is_episode_finished()
#                     else None
#                 )
#                 replay_buffer.append((obs, action_idx, reward, next_obs))
#                 if len(replay_buffer) > replay_buffer_size:
#                     replay_buffer.pop(0)
                
#                 if len(episode_rewards) % 100 == 0:
#                     print(f"Intermediate reward (last 100 timesteps): {sum(episode_rewards[-100:])}")

#                 obs = next_obs
#                 episode_reward += reward
#                 episode_rewards.append(episode_reward)

#                 if len(replay_buffer) >= batch_size:
#                     self._train_step(replay_buffer, batch_size)

#             self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

#             if episode % target_update == 0:
#                 self.target_network.load_state_dict(self.q_network.state_dict())

#             print(f"Episode {episode + 1}/{num_episodes}, Reward: {episode_reward}, Epsilon: {self.epsilon:.2f}")

#     def _train_step(self, replay_buffer, batch_size):
#         batch = random.sample(replay_buffer, batch_size)
#         obs_batch, action_batch, reward_batch, next_obs_batch = zip(*batch)

#         obs_batch = torch.tensor(np.array(obs_batch), dtype=torch.float)
#         action_batch = torch.tensor(action_batch, dtype=torch.long)
#         reward_batch = torch.tensor(reward_batch, dtype=torch.float)
#         next_obs_batch = torch.tensor(
#             [x for x in next_obs_batch if x is not None], dtype=torch.float
#         )

#         q_values = self.q_network(obs_batch)
#         next_q_values = torch.zeros(len(batch), self.num_actions)
#         if len(next_obs_batch) > 0:
#             next_q_values[: len(next_obs_batch)] = self.target_network(next_obs_batch)

#         max_next_q_values = torch.max(next_q_values, dim=1)[0]
#         target_q_values = reward_batch + (self.gamma * max_next_q_values)

#         current_q_values = q_values.gather(1, action_batch.unsqueeze(1)).squeeze()

#         loss = F.mse_loss(current_q_values, target_q_values)
#         self.optimizer.zero_grad()
#         loss.backward()
#         self.optimizer.step()

#     def _process_obs(self, obs):
#         obs = obs.transpose(2, 0, 1)
#         obs = obs / 255.0  
#         return obs

In [36]:
if __name__ == "__main__":
    env = setup_vizdoom()
    try:
        agent = QLearningAgent(env)
        agent.train(num_episodes=500)
    finally:
        # Always shut the game down, including when training is interrupted
        # (e.g. KeyboardInterrupt) or raises, so the Doom instance is not leaked.
        env.close()

Episode 1/500, Reward: 54.0, Epsilon: 1.00
Episode 2/500, Reward: -245.0, Epsilon: 0.99
Episode 3/500, Reward: -150.0, Epsilon: 0.99
Episode 4/500, Reward: 82.0, Epsilon: 0.99


KeyboardInterrupt: 