In [1]:
# Cell 1: Общее конфигурирование
from agentslab.utils.device import resolve_device
from agentslab.utils.seeding import set_global_seed
from pathlib import Path

device = resolve_device("cpu")
print('Device:', device)
 
seed = 42
set_global_seed(seed, deterministic=True)

ROOT = Path('..').resolve()
ALGO_NAME, ENV_NAME = "ppo", "pendulum"
ENV_ID = "InvertedDoublePendulum-v4"
# ENV_ID = "CartPole-v1"

Device: cpu


# Создание среды

In [2]:
from agentslab.envs.gym_factory import GymEnvConfig, make_gym_env
from torchrl.envs.utils import check_env_specs

env_cfg = GymEnvConfig(env_id=ENV_ID, render_mode=None, device=device, seed=seed)
env = make_gym_env(env_cfg)
check_env_specs(env)

# from agentslab.utils.specs import print_specs
# print_specs(env)

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


[92m2025-08-22 19:21:44,105 [torchrl][INFO][0m    check_env_specs succeeded![92m [END][0m


# Создание актора и критика

In [3]:
from agentslab.modules.networks import MLPConfig, build_mlp
from agentslab.modules.policy import build_stochastic_actor
from torchrl.modules import ValueOperator

# Достаём размерности
obs_dim = env.observation_spec["observation"].shape[-1]
act_dim = env.action_spec.shape[-1]

mlp_cfg = MLPConfig(
        in_dim = obs_dim, 
        out_dim = 2*act_dim,
        hidden_sizes = (256, 256),
        activation = "tanh",
        layer_norm = False
)

actor_network = build_mlp(mlp_cfg)
actor = build_stochastic_actor(actor_network, env.action_spec)

mlp_cfg.out_dim = act_dim
critic_network = build_mlp(mlp_cfg)
critic = ValueOperator(module=critic_network, in_keys=["observation"])
critic

ValueOperator(
    module=Sequential(
      (0): Linear(in_features=11, out_features=256, bias=True)
      (1): Tanh()
      (2): Linear(in_features=256, out_features=256, bias=True)
      (3): Tanh()
      (4): Linear(in_features=256, out_features=1, bias=True)
    ),
    device=cpu,
    in_keys=['observation'],
    out_keys=['state_value'])

# Collector

In [4]:
from torchrl.collectors import SyncDataCollector
from torchrl.data.replay_buffers import ReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from torchrl.data.replay_buffers.storages import LazyTensorStorage

frames_per_batch = 1000
# For a complete training, bring the number of frames up to 1M
total_frames = 10_000

collector = SyncDataCollector(
    create_env_fn=env,
    policy=actor,
    frames_per_batch=frames_per_batch,
    total_frames=total_frames,
    split_trajs=False,
    device=device,
)
replay_buffer = ReplayBuffer(
    storage=LazyTensorStorage(max_size=frames_per_batch),
    sampler=SamplerWithoutReplacement(),
)

# Loss Function

In [5]:
from torchrl.objectives import ClipPPOLoss
from torchrl.objectives.value import GAE
import torch

gamma = 0.99
lmbda = 0.95

advantage_module = GAE(
    gamma=gamma, lmbda=lmbda, value_network=critic, average_gae=True
)

clip_epsilon = (
    0.2  # clip value for PPO loss: see the equation in the intro for more context.
)
entropy_eps = 1e-4

loss_module = ClipPPOLoss(
    actor_network=actor,
    critic_network=critic,
    clip_epsilon=clip_epsilon,
    entropy_bonus=bool(entropy_eps),
    entropy_coeff=entropy_eps,
    # these keys match by default but we set this for completeness
    critic_coeff=1.0,
    loss_critic_type="smooth_l1",
)

lr = 3e-4

optim = torch.optim.Adam(loss_module.parameters(), lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer=optim, 
    T_max=total_frames // frames_per_batch, 
    eta_min=0.0
)

In [6]:
import logging

# Make Dirs

In [7]:
# === infra.py (можно держать в той же ячейке, а потом вынести по модулям) ===
from __future__ import annotations
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, Any, Optional, Tuple
from datetime import datetime
import yaml
import torch
from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger
from tqdm.auto import tqdm
import logging
import shutil
import re

# ---- Конфиги ----

@dataclass
class RunConfig:
    root: Path
    algo_name: str
    env_name: str
    run_time_fmt: str = "%Y%m%d-%H%M%S"
    eval_every_batches: int = 5
    eval_episodes: int = 5
    keep_last_k_ckpts: int = 3

@dataclass
class TrainConfig:
    num_epochs: int = 10
    sub_batch_size: int = 64
    max_grad_norm: float = 1.0

@dataclass
class RunPaths:
    root: Path
    runs_dir: Path
    run_dir: Path
    csv_train_dir: Path
    csv_eval_dir: Path
    tb_train_dir: Path
    tb_eval_dir: Path
    txt_train_dir: Path
    txt_eval_dir: Path
    ckpt_dir: Path
    meta_yaml: Path

@dataclass
class LoggerHandles:
    train_csv: CSVLogger
    eval_csv: CSVLogger
    train_tb: TensorBoardLogger
    eval_tb: TensorBoardLogger
    train_txt_logger: logging.Logger
    eval_txt_logger: logging.Logger

# ---- FS helpers ----

def _mkdir(p: Path) -> Path:
    p.mkdir(parents=True, exist_ok=True)
    return p

def _make_run_dirs(cfg: RunConfig) -> RunPaths:
    runs_dir = _mkdir(cfg.root / "runs")
    run_name = f"{cfg.algo_name}_{cfg.env_name}_{datetime.now().strftime(cfg.run_time_fmt)}"
    run_dir = _mkdir(runs_dir / run_name)

    csv_train_dir = _mkdir(run_dir / "csv_logs" / "train")
    csv_eval_dir  = _mkdir(run_dir / "csv_logs" / "eval")
    tb_train_dir  = _mkdir(run_dir / "tb_logs"  / "train")
    tb_eval_dir   = _mkdir(run_dir / "tb_logs"  / "eval")
    txt_train_dir = _mkdir(run_dir / "txt_logs" / "train")
    txt_eval_dir  = _mkdir(run_dir / "txt_logs" / "eval")
    ckpt_dir      = _mkdir(run_dir / "checkpoints")
    meta_yaml     = run_dir / "meta.yaml"

    return RunPaths(
        root=cfg.root, runs_dir=runs_dir, run_dir=run_dir,
        csv_train_dir=csv_train_dir, csv_eval_dir=csv_eval_dir,
        tb_train_dir=tb_train_dir, tb_eval_dir=tb_eval_dir,
        txt_train_dir=txt_train_dir, txt_eval_dir=txt_eval_dir,
        ckpt_dir=ckpt_dir, meta_yaml=meta_yaml
    )

def _safe_logger(name: str, file_path: Path) -> logging.Logger:
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    logger.propagate = False
    # чтобы не дублировать хендлеры при повторных запусках ячейки
    if not any(isinstance(h, logging.FileHandler) and getattr(h, "_file_path", None) == str(file_path) 
               for h in logger.handlers):
        fh = logging.FileHandler(file_path, encoding="utf-8")
        fh._file_path = str(file_path)
        fmt = logging.Formatter("%(asctime)s | %(message)s")
        fh.setFormatter(fmt)
        logger.addHandler(fh)
    return logger

def _get_loggers(paths: RunPaths) -> LoggerHandles:
    # CSV/TB логгеры (раздельно train/eval)
    train_csv = CSVLogger(save_dir=str(paths.csv_train_dir), name="metrics")
    eval_csv  = CSVLogger(save_dir=str(paths.csv_eval_dir),  name="metrics")

    train_tb  = TensorBoardLogger(save_dir=str(paths.tb_train_dir), name="tb")
    eval_tb   = TensorBoardLogger(save_dir=str(paths.tb_eval_dir),  name="tb")

    # Текстовые логи
    train_txt = _safe_logger(f"train_txt_{paths.run_dir.name}", paths.txt_train_dir / "train.log")
    eval_txt  = _safe_logger(f"eval_txt_{paths.run_dir.name}",  paths.txt_eval_dir / "eval.log")

    return LoggerHandles(
        train_csv=train_csv, eval_csv=eval_csv,
        train_tb=train_tb,   eval_tb=eval_tb,
        train_txt_logger=train_txt, eval_txt_logger=eval_txt
    )

def _dump_meta_yaml(paths: RunPaths, run_cfg: RunConfig, train_cfg: TrainConfig, extra: Optional[Dict[str, Any]] = None):
    meta = {
        "run_name": paths.run_dir.name,
        "algo": run_cfg.algo_name,
        "env": run_cfg.env_name,
        "started_at": datetime.now().isoformat(timespec="seconds"),
        "train_config": asdict(train_cfg),
    }
    if extra:
        meta["extra"] = extra
    with open(paths.meta_yaml, "w", encoding="utf-8") as f:
        yaml.safe_dump(meta, f, allow_unicode=True, sort_keys=False)

# ---- Прогресс-бар ----

def pbar_create(total_frames: int) -> tqdm:
    return tqdm(total=total_frames, desc="initializing...", leave=True, dynamic_ncols=True)

def pbar_update(pbar: tqdm, batch_frames: int, desc_parts: Dict[str, Any]):
    # аккуратное обновление, чтобы не переполнить total
    inc = min(batch_frames, pbar.total - pbar.n)
    if inc > 0:
        pbar.update(inc)
    # человекочитаемое описание
    formatted = []
    for k, v in desc_parts.items():
        if isinstance(v, float):
            formatted.append(f"{k}={v: .4f}")
        else:
            formatted.append(f"{k}: {v}")
    pbar.set_description(", ".join(formatted))

def pbar_write(pbar: tqdm, text: str):
    pbar.write(text)

def pbar_close(pbar: tqdm):
    pbar.close()

# ---- Логирование ----

def _current_lr(optim: torch.optim.Optimizer, scheduler=None) -> float:
    if scheduler is not None:
        try:
            return float(scheduler.get_last_lr()[0])
        except Exception:
            pass
    # берем lr первой группы
    return float(optim.param_groups[0]["lr"])

def log_train_metrics(
    logs: LoggerHandles, 
    metrics: Dict[str, float], 
    step: int
):
    # CSV/TensorBoard
    logs.train_csv.log_metrics(metrics, step=step)
    logs.train_tb.log_metrics(metrics, step=step)
    # TXT
    logs.train_txt_logger.info(" | ".join([f"{k}={v}" for k, v in metrics.items()]))

def log_eval_metrics(
    logs: LoggerHandles, 
    metrics: Dict[str, float], 
    step: int
):
    logs.eval_csv.log_metrics(metrics, step=step)
    logs.eval_tb.log_metrics(metrics, step=step)
    logs.eval_txt_logger.info(" | ".join([f"{k}={v}" for k, v in metrics.items()]))

# ---- Чекпоинты ----

def save_checkpoint(
    paths: RunPaths,
    *,
    actor: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    scheduler: Optional[Any],
    step: int,
    eval_metrics: Dict[str, Any],
    algo_name: str,
    env_name: str,
    keep_last_k: int = 3
) -> Path:
    avg_ret = eval_metrics.get("return_mean", None)
    tag = f"step{step}"
    if avg_ret is not None:
        # безопасное имя файла
        safe_ret = re.sub(r"[^0-9eE\-\.+]", "", f"{avg_ret:.3f}")
        tag += f"_ret{safe_ret}"
    ckpt_path = paths.ckpt_dir / f"{algo_name}_{env_name}_{tag}.pt"

    payload = {
        "step": step,
        "algo": algo_name,
        "env": env_name,
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "model_state_dict": actor.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict() if scheduler is not None else None,
        "eval_metrics": eval_metrics,
        "run_dir": str(paths.run_dir),
    }
    torch.save(payload, ckpt_path)

    # Retention policy: оставить только N последних
    ckpts = sorted(paths.ckpt_dir.glob("*.pt"), key=lambda p: p.stat().st_mtime)
    if len(ckpts) > keep_last_k:
        for old in ckpts[:-keep_last_k]:
            try:
                old.unlink()
            except Exception:
                pass

    return ckpt_path

# ---- Инициализация всего набора ----

def setup_run(
    root: Path, algo_name: str, env_name: str,
    train_cfg: TrainConfig,
    run_cfg: Optional[RunConfig] = None,
    meta_extra: Optional[Dict[str, Any]] = None
) -> Tuple[RunPaths, LoggerHandles, RunConfig]:
    run_cfg = run_cfg or RunConfig(root=root, algo_name=algo_name, env_name=env_name)
    paths = _make_run_dirs(run_cfg)
    _dump_meta_yaml(paths, run_cfg, train_cfg, extra=meta_extra)
    loggers = _get_loggers(paths)
    return paths, loggers, run_cfg



# Logging

In [8]:

def _safe_logger(name: str, file_path: Path) -> logging.Logger:
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    logger.propagate = False
    # чтобы не дублировать хендлеры при повторных запусках ячейки
    if not any(isinstance(h, logging.FileHandler) and getattr(h, "_file_path", None) == str(file_path) 
               for h in logger.handlers):
        fh = logging.FileHandler(file_path, encoding="utf-8")
        fh._file_path = str(file_path)
        fmt = logging.Formatter("%(asctime)s | %(message)s")
        fh.setFormatter(fmt)
        logger.addHandler(fh)
    return logger

def _get_loggers(paths: RunPaths) -> LoggerHandles:
    # CSV/TB логгеры (раздельно train/eval)
    train_csv = CSVLogger(save_dir=str(paths.csv_train_dir), name="metrics")
    eval_csv  = CSVLogger(save_dir=str(paths.csv_eval_dir),  name="metrics")

    train_tb  = TensorBoardLogger(save_dir=str(paths.tb_train_dir), name="tb")
    eval_tb   = TensorBoardLogger(save_dir=str(paths.tb_eval_dir),  name="tb")

    # Текстовые логи
    train_txt = _safe_logger(f"train_txt_{paths.run_dir.name}", paths.txt_train_dir / "train.log")
    eval_txt  = _safe_logger(f"eval_txt_{paths.run_dir.name}",  paths.txt_eval_dir / "eval.log")

    return LoggerHandles(
        train_csv=train_csv, eval_csv=eval_csv,
        train_tb=train_tb,   eval_tb=eval_tb,
        train_txt_logger=train_txt, eval_txt_logger=eval_txt
    )

def _dump_meta_yaml(paths: RunPaths, run_cfg: RunConfig, train_cfg: TrainConfig, extra: Optional[Dict[str, Any]] = None):
    meta = {
        "run_name": paths.run_dir.name,
        "algo": run_cfg.algo_name,
        "env": run_cfg.env_name,
        "started_at": datetime.now().isoformat(timespec="seconds"),
        "train_config": asdict(train_cfg),
    }
    if extra:
        meta["extra"] = extra
    with open(paths.meta_yaml, "w", encoding="utf-8") as f:
        yaml.safe_dump(meta, f, allow_unicode=True, sort_keys=False)


# Progress Bar

In [9]:

def pbar_create(total_frames: int) -> tqdm:
    return tqdm(total=total_frames, desc="initializing...", leave=True, dynamic_ncols=True)

def pbar_update(pbar: tqdm, batch_frames: int, desc_parts: Dict[str, Any]):
    # аккуратное обновление, чтобы не переполнить total
    inc = min(batch_frames, pbar.total - pbar.n)
    if inc > 0:
        pbar.update(inc)
    # человекочитаемое описание
    formatted = []
    for k, v in desc_parts.items():
        if isinstance(v, float):
            formatted.append(f"{k}={v: .4f}")
        else:
            formatted.append(f"{k}: {v}")
    pbar.set_description(", ".join(formatted))

def pbar_write(pbar: tqdm, text: str):
    pbar.write(text)

def pbar_close(pbar: tqdm):
    pbar.close()

# ---- Логирование ----

def _current_lr(optim: torch.optim.Optimizer, scheduler=None) -> float:
    if scheduler is not None:
        try:
            return float(scheduler.get_last_lr()[0])
        except Exception:
            pass
    # берем lr первой группы
    return float(optim.param_groups[0]["lr"])

def log_train_metrics(
    logs: LoggerHandles, 
    metrics: Dict[str, float], 
    step: int
):
    # CSV/TensorBoard
    logs.train_csv.log_metrics(metrics, step=step)
    logs.train_tb.log_metrics(metrics, step=step)
    # TXT
    logs.train_txt_logger.info(" | ".join([f"{k}={v}" for k, v in metrics.items()]))

def log_eval_metrics(
    logs: LoggerHandles, 
    metrics: Dict[str, float], 
    step: int
):
    logs.eval_csv.log_metrics(metrics, step=step)
    logs.eval_tb.log_metrics(metrics, step=step)
    logs.eval_txt_logger.info(" | ".join([f"{k}={v}" for k, v in metrics.items()]))


# Checkpointing

In [10]:

def save_checkpoint(
    paths: RunPaths,
    *,
    actor: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    scheduler: Optional[Any],
    step: int,
    eval_metrics: Dict[str, Any],
    algo_name: str,
    env_name: str,
    keep_last_k: int = 3
) -> Path:
    avg_ret = eval_metrics.get("return_mean", None)
    tag = f"step{step}"
    if avg_ret is not None:
        # безопасное имя файла
        safe_ret = re.sub(r"[^0-9eE\-\.+]", "", f"{avg_ret:.3f}")
        tag += f"_ret{safe_ret}"
    ckpt_path = paths.ckpt_dir / f"{algo_name}_{env_name}_{tag}.pt"

    payload = {
        "step": step,
        "algo": algo_name,
        "env": env_name,
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "model_state_dict": actor.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict() if scheduler is not None else None,
        "eval_metrics": eval_metrics,
        "run_dir": str(paths.run_dir),
    }
    torch.save(payload, ckpt_path)

    # Retention policy: оставить только N последних
    ckpts = sorted(paths.ckpt_dir.glob("*.pt"), key=lambda p: p.stat().st_mtime)
    if len(ckpts) > keep_last_k:
        for old in ckpts[:-keep_last_k]:
            try:
                old.unlink()
            except Exception:
                pass

    return ckpt_path

# Training Loop

In [11]:

def setup_run(
    root: Path, algo_name: str, env_name: str,
    train_cfg: TrainConfig,
    run_cfg: Optional[RunConfig] = None,
    meta_extra: Optional[Dict[str, Any]] = None
) -> Tuple[RunPaths, LoggerHandles, RunConfig]:
    run_cfg = run_cfg or RunConfig(root=root, algo_name=algo_name, env_name=env_name)
    paths = _make_run_dirs(run_cfg)
    _dump_meta_yaml(paths, run_cfg, train_cfg, extra=meta_extra)
    loggers = _get_loggers(paths)
    return paths, loggers, run_cfg

In [12]:
# === training.py (пример использования) ===
from pathlib import Path
import torch
from agentslab.runners.evals import eval_policy

# Ваши заранее определённые объекты/переменные:
# env, actor, collector, advantage_module, replay_buffer, loss_module, optim, scheduler, frames_per_batch, device
# а также общее число фреймов для тренировки (например, total_frames)
# здесь считаем, что total_frames известен извне (например, как параметр эксперимента)

# Корневой каталог и названия алгоритма и среды
from pathlib import Path
ROOT = Path("..").resolve()
ALGO_NAME, ENV_NAME = "ppo", "pendulum"

# Конфиги
train_cfg = TrainConfig(num_epochs=10, sub_batch_size=64, max_grad_norm=1.0)
# eval каждые 5 батчей — совпадает с исходным кодом
run_cfg   = RunConfig(root=ROOT, algo_name=ALGO_NAME, env_name=ENV_NAME, eval_every_batches=5, eval_episodes=5, keep_last_k_ckpts=3)

# Инициализация run-директории, логгеров и метаинформации
meta_extra = {
    "frames_per_batch": int(frames_per_batch),
    "device": str(device),
}
paths, logs, run_cfg = setup_run(ROOT, ALGO_NAME, ENV_NAME, train_cfg, run_cfg, meta_extra=meta_extra)

# Подготовка прогресс-бара
# total_frames — количество фреймов, которое вы планируете собрать/обучить за весь ран
pbar = pbar_create(total_frames=total_frames)

global_frames = 0  # будем логировать шаги в терминах фреймов (удобно для RL)
batch_index = 0

try:
    for i, tensordict_data in enumerate(collector):
        batch_index += 1

        # === Обучение на партии ===
        for _ in range(train_cfg.num_epochs):
            # Advantage пересчитываем на каждом проходе
            advantage_module(tensordict_data)
            data_view = tensordict_data.reshape(-1)
            replay_buffer.extend(data_view.cpu())
            # Разбиваем на саббатчи
            iters = int(frames_per_batch) // int(train_cfg.sub_batch_size)
            for _ in range(iters):
                subdata = replay_buffer.sample(train_cfg.sub_batch_size)
                loss_vals = loss_module(subdata.to(device))
                loss_total = (
                    loss_vals["loss_objective"] 
                    + loss_vals["loss_critic"] 
                    + loss_vals["loss_entropy"]
                )

                # Оптимизация
                loss_total.backward()
                torch.nn.utils.clip_grad_norm_(loss_module.parameters(), train_cfg.max_grad_norm)
                optim.step()
                optim.zero_grad()

        # шаг планировщика
        if scheduler is not None:
            scheduler.step()

        # === Подсчёт метрик тренировки ===
        # корректное число фреймов в текущей пачке
        batch_frames = int(tensordict_data.get(("next", "reward")).numel())
        global_frames += batch_frames

        # базовые метрики: средняя награда по партии, лоссы, lr
        with torch.no_grad():
            avg_reward = float(tensordict_data.get(("next", "reward")).float().mean().cpu().item())
        lr_val = _current_lr(optim, scheduler=scheduler)

        train_metrics = {
            "reward": avg_reward,
            "loss_objective": float(loss_vals["loss_objective"].detach().cpu().item()),
            "loss_critic": float(loss_vals["loss_critic"].detach().cpu().item()),
            "loss_entropy": float(loss_vals["loss_entropy"].detach().cpu().item()),
            "loss_total": float(loss_total.detach().cpu().item()),
            "lr": lr_val,
            "batch_frames": float(batch_frames),   # полезно иметь и в csv
            "global_frames": float(global_frames)  # для графиков
        }
        log_train_metrics(logs, train_metrics, step=global_frames)

        # === Обновление прогресс-бара ===
        pbar_update(
            pbar,
            batch_frames=batch_frames,
            desc_parts={
                "avg reward": avg_reward,
                "frames": global_frames,
                "lr": lr_val
            }
        )

        # === Периодическая оценка ===
        if (batch_index % run_cfg.eval_every_batches) == 0:
            eval_results = eval_policy(env, actor, episodes=run_cfg.eval_episodes, progress=False)
            # ожидаем, что eval_policy вернёт хотя бы 'return_mean' и 'max_episode_lengh'
            eval_metrics = {
                "return_mean": float(eval_results.get("return_mean", float("nan"))),
                "max_episode_length": float(eval_results.get("max_episode_lengh", float("nan"))),
                "global_frames": float(global_frames)
            }
            # логируем отдельно в eval-логи
            log_eval_metrics(logs, eval_metrics, step=global_frames)

            # пишем в прогресс-бар (не ломая его)
            pbar_write(
                pbar,
                f"eval: avg reward = {eval_metrics['return_mean']:.3f}, "
                f"max episode length = {eval_metrics['max_episode_length']:.0f}"
            )

            # сохраняем чекпоинт по результатам оценки
            ckpt_path = save_checkpoint(
                paths,
                actor=actor,
                optimizer=optim,
                scheduler=scheduler,
                step=global_frames,
                eval_metrics=eval_metrics,
                algo_name=ALGO_NAME,
                env_name=ENV_NAME,
                keep_last_k=run_cfg.keep_last_k_ckpts
            )
            logs.eval_txt_logger.info(f"checkpoint saved: {ckpt_path.name}")

        # опционально — условие выхода, если собрали достаточно фреймов
        if global_frames >= pbar.total:
            break

finally:
    # Гарантированно закрываем бар (важно для корректного вывода в ноутбуках)
    pbar_close(pbar)


initializing...:   0%|          | 0/10000 [00:00<?, ?it/s]

eval: avg reward = 118.453, max episode length = 15
eval: avg reward = 153.850, max episode length = 22


# Graphs

In [13]:
# === plotting_example.py (запускать в новой ячейке после обучения) ===
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Укажите путь к последнему ран-каналу, либо найдите автоматически:
ROOT = Path("..").resolve()
runs_dir = ROOT / "runs"

# Найдём последний ран (по времени модификации папки)
run_dirs = sorted([p for p in runs_dir.glob("*") if p.is_dir()], key=lambda p: p.stat().st_mtime, reverse=True)
assert len(run_dirs) > 0, "Не найдено ни одного запуска в runs/"
run_dir = run_dirs[0]
print("Using run_dir:", run_dir)

def _load_pl_csv(csv_root: Path) -> pd.DataFrame:
    """
    CSVLogger из Lightning создаёт иерархию: <csv_root>/<name>/version_x/metrics.csv
    и может писать в «длинном» формате (name, step, value) либо «широком» (step + метрики).
    Эта функция аккуратно приводить к широкому формату: столбцы — метрики, индекс — step.
    """
    # ищем все варианты metrics.csv
    metrics_files = list(csv_root.glob("**/metrics.csv"))
    if not metrics_files:
        return pd.DataFrame()
    # берём последний по времени
    metrics_path = sorted(metrics_files, key=lambda p: p.stat().st_mtime)[-1]
    df = pd.read_csv(metrics_path)

    if {"name", "step", "value"}.issubset(df.columns):
        # длинный формат -> pivot
        wide = df.pivot_table(index="step", columns="name", values="value", aggfunc="last")
        wide.sort_index(inplace=True)
        wide.reset_index(inplace=True)
        return wide
    else:
        # уже широкий формат
        if "step" not in df.columns:
            # если step отсутствует — добавим монотонный по индексу
            df.insert(0, "step", range(len(df)))
        return df

train_csv_root = run_dir / "csv_logs" / "train"
eval_csv_root  = run_dir / "csv_logs" / "eval"

df_train = _load_pl_csv(train_csv_root)
df_eval  = _load_pl_csv(eval_csv_root)

print("Train columns:", df_train.columns.tolist())
print("Eval columns:", df_eval.columns.tolist())

# === Примеры графиков ===
plt.figure()
plt.plot(df_train["step"], df_train.get("reward", pd.Series([float("nan")]*len(df_train))))
plt.title("Train: average reward")
plt.xlabel("global frames (step)")
plt.ylabel("reward")
plt.grid(True)
plt.show()

if "loss_total" in df_train.columns:
    plt.figure()
    plt.plot(df_train["step"], df_train["loss_total"])
    plt.title("Train: total loss")
    plt.xlabel("global frames (step)")
    plt.ylabel("loss_total")
    plt.grid(True)
    plt.show()

if not df_eval.empty and "return_mean" in df_eval.columns:
    plt.figure()
    plt.plot(df_eval["step"], df_eval["return_mean"], marker="o")
    plt.title("Eval: return_mean")
    plt.xlabel("global frames (step)")
    plt.ylabel("return_mean")
    plt.grid(True)
    plt.show()


Using run_dir: C:\Users\werna\Documents\GitHub\AgentsLab\runs\ppo_pendulum_20250822-192147
Train columns: []
Eval columns: []


KeyError: 'step'

<Figure size 640x480 with 0 Axes>

# Evaluating

In [None]:
from agentslab.runners.evals import eval_policy

eval_policy(env, actor, episodes=50)

eval:   0%|          | 0/50 [00:00<?, ?it/s]

{'return_mean': 179.49685134887696,
 'return_sum': 8974.842567443848,
 'max_episode_lengh': 33,
 'num_episodes': 50}