In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import gymnasium as gym
import ale_py

import torch
from core.environments.gym.env import PolicyEnvironment
from core.models.resnet import ResNet

gym.register_envs(ale_py)

In [5]:
env = gym.make("ALE/SpaceInvaders-v5", render_mode="rgb_array")
model = ResNet(3 * 3, 6, 2, 2)
env_wrapper = PolicyEnvironment(env, 3, model)

A.L.E: Arcade Learning Environment (version 0.11.2+ecc1138)
[Powered by Stella]


In [6]:
trajectory = env_wrapper.sample_trajectory()

In [7]:
from implementations.reinforce.src.buffer import ReinforceReplayBuffer

In [8]:
buffer = ReinforceReplayBuffer(100, 3, 0.99)
buffer.add(trajectory)

In [9]:
from typing import Optional
from gymnasium.core import Env
from torch.optim import Optimizer, AdamW
import torch.nn as nn
from core.environments.gym.trajectory import Trajectory
from matplotlib.animation import FuncAnimation

class REINFORCETrainer:
    def __init__(
        self,
        model: nn.Module,
        env: Env,
        input_buffer_size: int = 3,
        replay_buffer_size: int = 100,
        gamma: float = 0.99,
        optimizer: Optional[Optimizer] = None
    ):
        self.env = PolicyEnvironment(env, input_buffer_size, model)
        self.model = model
        self.gamma = gamma
        self.replay = ReinforceReplayBuffer(replay_buffer_size, input_buffer_size, gamma)
        if optimizer is None:
            self.optimizer = AdamW(model.parameters())
        else:
            self.optimizer = optimizer
        self._rewards = []

    def sample(self) -> Trajectory:
        return self.env.sample_trajectory()
    
    def display(self) -> FuncAnimation:
        return self.env.display()

    def fetch_output(self):
        inputs = self.replay.inputs()
        actions = self.replay.actions()
        output = torch.log_softmax(self.model.forward(inputs), dim=1)
        indices = torch.arange(output.size(0))
        future_rewards = self.replay.future_rewards()
        values = output[indices, actions] * future_rewards
        return values.mean()

    def add_reward(self, trajectory: Trajectory):
        reward = 0
        for node in trajectory.nodes[::-1]:
            reward = self.gamma * reward + node.reward
        self._rewards.append(reward)
        if len(self._rewards) > 10:
            print(sum(self._rewards[-10:]) / len(self._rewards[-10:]))

    def iterate(self) -> None:
        self.replay.reset()
        self.optimizer.zero_grad()
        while not self.replay.is_full():
            trajectory = self.sample()
            self.replay.add(trajectory)
            self.add_reward(trajectory)
        output = self.fetch_output()
        output.backward()
        self.optimizer.step()


In [10]:
trainer = REINFORCETrainer(model, env)

In [None]:
while True:
    trainer.iterate()

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


20.143938251872036
19.70607964766092
18.461568655162022
18.32754740674404
17.976089282537078
18.523516767917606
18.783318967100247
18.111516374672732
18.303514671604916
19.04943741484066
17.05243305326066
16.42314476107671
16.596279961519624
15.37887702607416
15.828223098631724
16.537486199494495
16.19257633994147
15.392324887564907


In [24]:
import torch.nn as nn 

nn.Module

torch.nn.modules.module.Module

In [25]:
model.parameters()

<generator object Module.parameters at 0x796481006260>

In [26]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters())
optimizer.zero_grad()

In [27]:
inputs = buffer.inputs()
actions = buffer.actions()
output = torch.log_softmax(model.forward(inputs), dim=1)
indices = torch.arange(output.size(0))
future_rewards = buffer.future_rewards()
values = output[indices, actions] * future_rewards
batch_value = values.mean()

In [28]:
batch_value.backward()

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
