# Pipeline draft

In [None]:
import torch
import numpy as np
from omegaconf import OmegaConf
from functools import partial
import gymnasium as gym
import matplotlib.pyplot as plt

import bbrl_utils
from bbrl_utils.notebook import setup_tensorboard
from bbrl.stats import WelchTTest
from bbrl.agents import Agent, Agents, TemporalAgent
from bbrl.agents.gymnasium import ParallelGymAgent, make_env
from bbrl.workspace import Workspace
from bbrl.utils.replay_buffer import ReplayBuffer

import bbrl_gymnasium

from pmind.algorithms import DQN, DDPG, TD3
from pmind.losses import dqn_compute_critic_loss, ddqn_compute_critic_loss
from pmind.training import run_dqn, run_ddpg, run_td3, run_td3_offline
from pmind.replay import (
    collect_policy_transitions,
    collect_uniform_transitions,
    collect_uniform_transitions_2,
    mix_transitions,
    test_rb_compositions,
)

from pmind.config.loader import load_config

# Call bbrl_utils' setup hook (what it configures is not visible from this
# notebook).
bbrl_utils.setup()

# Load the project configuration named "bbrl_example"; later cells read the
# `cfg.td3` section from it.
cfg = load_config("bbrl_example")

# IPython magics: load the autoreload extension and switch it to mode 2 so
# that edited modules are picked up again without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# setup_tensorboard("./outputs/tblogs")

# Obtain transitions

In [None]:
# Candidate environments; change the index below to switch the environment
# used by the rest of the notebook.
_ENV_CHOICES = (
    "CartPoleContinuous-v1",
    "Pendulum-v1",
    "MountainCarContinuous-v0",
    "LunarLander-v3",
)
ENV_NAME = _ENV_CHOICES[0]

# Capacity handed to the replay-buffer helpers in later cells.
BUFFER_SIZE = 10**5

# Ten evenly spaced values covering [0, 1], both endpoints included.
# The spacing is therefore 1/9, not 0.1.
# NOTE(review): use num=11 if steps of 0.1 were intended — confirm.
PROPORTIONS = np.linspace(start=0, stop=1, num=10)

## Best policy

Train TD3 to obtain the best policy, then collect transitions with that policy.

In [None]:
# Config object for this training run, created from the TD3 section of `cfg`.
# Presumably a separate copy, so the overrides below leave `cfg` untouched —
# confirm against OmegaConf.create semantics.
cfg_td3_best = OmegaConf.create(cfg.td3)

# accelerate for the sake of test:
cfg_td3_best.algorithm.max_epochs = 11_000
cfg_td3_best.algorithm.learning_starts = 1000

# Build the TD3 object from the config, run its training routine, show the
# best result, and keep the best policy for the transition-collection cell.
# NOTE(review): the training environment comes from `cfg.td3`, while later
# cells collect transitions in ENV_NAME — confirm the two match.
td3 = TD3(cfg_td3_best)
run_td3(td3)
td3.visualize_best()
best_policy_agent = td3.best_policy

In [None]:
# Collect transitions with the best policy in ENV_NAME, asking the helper for
# a buffer of BUFFER_SIZE; `rb_best` is consumed by the offline-learning cell.
rb_best = collect_policy_transitions(
    best_policy_agent, ENV_NAME, buffer_size=BUFFER_SIZE
)
# Report how many transitions the buffer holds.
print(f"Replay buffer with {rb_best.size()} transitions")

## Uniform exploration

In [None]:
# Collect transitions from uniform exploration in ENV_NAME.
# The earlier collector is kept commented out for reference; only `rb_unif2`
# is actually built in this cell.
# NOTE(review): unlike the best-policy buffer, no buffer_size is passed here —
# confirm the helper's default matches BUFFER_SIZE.
# rb_unif = collect_uniform_transitions(ENV_NAME, buffer_size=BUFFER_SIZE)
rb_unif2 = collect_uniform_transitions_2(ENV_NAME)
# Report the buffer that was just built. `rb_unif` is not defined while the
# line above stays commented out, so printing it would raise NameError.
print(f"Replay buffer with {rb_unif2.size()} transitions")

Milos: dict_keys(['env/env_obs', 'env/terminated', 'env/truncated', 'env/done', 'env/reward', 'env/cumulated_reward', 'env/timestep', 'action'])

Vlad: dict_keys(['env/env_obs', 'action', 'env/timestep', 'env/terminated', 'env/truncated', 'env/done', 'env/reward', 'env/cumulated_reward'])

# Offline learning on mixed replay buffer

In [None]:
# Offline TD3 configuration, created from the shared TD3 section of `cfg`.
cfg_td3_offline = OmegaConf.create(cfg.td3)

# Shortened schedule so the experiment finishes quickly while testing.
cfg_td3_offline.algorithm.n_steps = 100
cfg_td3_offline.algorithm.max_epochs = 1000

# Offline training has no warm-up phase to wait for, so the start threshold
# is cleared.
cfg_td3_offline.algorithm.learning_starts = None

# TODO: apply no noise to actions during learning or it's independent?
# cfg_td3_offline.action_noise = 0
# cfg_td3_offline.target_policy_noise = 0

# Run the offline TD3 experiment over the uniform and best-policy buffers;
# presumably one mix per entry of PROPORTIONS — confirm in the helper.
perfs = test_rb_compositions(
    agent_constructor=TD3,
    offline_run=run_td3_offline,
    cfg=cfg_td3_offline,
    rb_best=rb_best,
    rb_unif=rb_unif2,
    proportions=PROPORTIONS,
    buffer_size=BUFFER_SIZE,
)