Treina os vários agentes base nos vários ambientes que criarmos, com várias seeds, para cada episódio predefinido.

A ideia é recolher o máximo de dados possível para poder entender quão eficaz está a ser cada ambiente, agente ou episódio.

Desta forma, no futuro pode-se escolher a combinação (episódio, ambiente, agente) com maior probabilidade de sucesso na transferibilidade de um timeframe para outro.

---

### ✅ **Meta-Features (Input for prediction)**

These are all continuous, float64:

* `mean_return`, `median_return`, `std_return`, `skew_return`, `kurtosis_return`
* `return_trend`, `ewm_mean_return`, `hurst`, `adf_stat`, `adf_pval`, `entropy`
* `volatility`, `max_drawdown`, `sharpe`, `sortino`, `calmar`

### ✅ **Experiment Identity Columns**

These identify the agent, environment, and run setup:

* `config_hash`, `env_version`, `agent_name`, `ticker`, `config`, `seed`, `train_idx`, `test_idx`, `config_dict`

### ✅ **Outcome Scores**

Direct outputs of agent evaluation:

* `score_train`, `score_test`
* `advantage_train`, `advantage_test`
* `transfer_delta` *(`score_test - score_train`)*, `transfer_success` *(int: 1 if `transfer_delta > 0`, i.e. the test score exceeds the train score)*

### ✅ **Diagnostics / Behavior Metrics**

Agent behavior across episode:

* `success_trades`, `action_hold_ratio`, `action_long_ratio`

### ✅ **Control Parameters**

* `timesteps`, `episode_steps` *(both int64)*

---

**This dataframe is ready** for:

* Scoring (learnability, transferability, difficulty)
* Meta-model training
* Autoencoder or representation learning
* Filtering based on any metric

Let’s proceed to the next module you want to implement or improve. Should we work on the **meta-model enhancer** (autoencoder + predictive model) in PyTorch next?


In [3]:
import jupyter

In [2]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import gymnasium as gym
import matplotlib.pyplot as plt


from src.utils.system import boot, Notify
from src.defaults import RANDOM_SEEDS
from src.data.feature_pipeline import load_base_dataframe
from experiments import check_if_experiment_exists, register_experiment ,experiment_hash
from environments import PositionTradingEnv,PositionTradingEnvV1,PositionTradingEnvV2

# ========== SYSTEM BOOT ==========
# boot() is called exactly once; the original cell invoked it a second time
# further down, which only repeated the same initialisation.
DEVICE = boot()
EXPERIMENT_NAME = "trading_environment_development"
DEFAULT_PATH = "data/experiments/" + EXPERIMENT_NAME

# ========== CONFIG ==========
TICKER = "AAPL"
TIMESTEPS = 100_000      # total training timesteps passed to model.learn()
EVAL_EPISODES = 5        # number of benchmark episodes to sample
N_TIMESTEPS = 120        # length (in rows) of one train/test episode window
LOOKBACK = 0
# sorted() builds a new list, so the imported RANDOM_SEEDS is not reordered
# in place as a side effect of running this cell.
SEEDS = sorted(RANDOM_SEEDS)
# Kept sorted so that the experiment config hash (built from a JSON dump that
# embeds this list) does not depend on the order the features were typed in.
MARKET_FEATURES = sorted(['close'])
BENCHMARK_PATH = DEFAULT_PATH+"/benchmark_episodes.json"
CHECKPOINT_DIR = DEFAULT_PATH+"/checkpoints"
SCORES_DIR = DEFAULT_PATH+"/scores"
META_PATH = DEFAULT_PATH+"/meta_df.csv"

OHLCV_DF = load_base_dataframe()

NOTIFICATION = Notify(EXPERIMENT_NAME)

In [None]:
import os
import json
import hashlib
import numpy as np
import pandas as pd
from typing import Callable
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.monitor import Monitor
from environments import PositionTradingEnv, PositionTradingEnvV1, PositionTradingEnvV2
from data import extract_meta_features
from scipy.stats import ttest_ind
import seaborn as sns
import matplotlib.pyplot as plt

class TradingEnvironmentLearningQuality:
    """Train an agent on benchmark episodes and measure transfer to the next window.

    For every (seed, benchmark episode) pair an agent is trained on the window
    starting at ``train_idx`` and then scored on that window and on the window
    that starts ``n_timesteps`` rows later (``test_idx``). One row per run is
    appended to the CSV at ``result_path``; rows already present there
    (same config hash, agent name and seed) are skipped.
    """

    def __init__(self, df, benchmark_path=BENCHMARK_PATH, result_path=DEFAULT_PATH+"/meta_df_transfer.csv", checkpoint_dir=CHECKPOINT_DIR):
        """Store paths, create the output directories and load the benchmark episodes.

        Args:
            df: DataFrame holding the market data; must contain a ``symbol`` column.
            benchmark_path: JSON file caching the sampled episode start indices.
            result_path: CSV file that accumulates one row per evaluated run.
            checkpoint_dir: Directory receiving the saved models and their configs.
        """
        self.df = df
        self.benchmark_path = benchmark_path
        self.result_path = result_path
        self.checkpoint_dir = checkpoint_dir
        # Directories are created BEFORE loading the benchmark episodes because
        # load_benchmark_episodes() may need to write the benchmark file.
        for directory in (os.path.dirname(self.result_path), self.checkpoint_dir):
            if directory:
                os.makedirs(directory, exist_ok=True)
        self.benchmark_episodes = self.load_benchmark_episodes()

    def load_benchmark_episodes(self, n_timesteps=N_TIMESTEPS, lookback=LOOKBACK, n_episodes=EVAL_EPISODES):
        """Return the benchmark episode start indices, sampling them on first use.

        If ``benchmark_path`` exists its JSON content is returned as-is.
        Otherwise episodes are sampled (with ``np.random.seed(0)``) for the
        first symbol found in ``self.df``, written to ``benchmark_path`` and
        returned as a plain list, so both code paths yield the same type.

        Args:
            n_timesteps: Episode length forwarded to ``sample_valid_episodes``.
            lookback: Lookback forwarded to ``sample_valid_episodes``.
            n_episodes: Number of episodes forwarded to ``sample_valid_episodes``.
        """
        if os.path.exists(self.benchmark_path):
            with open(self.benchmark_path) as f:
                return json.load(f)

        print("[INFO] Sampling benchmark episodes...")
        np.random.seed(0)
        from data import sample_valid_episodes
        # NOTE: episodes are always sampled from the first symbol in the frame,
        # independently of the ticker later passed to evaluate().
        ticker = self.df['symbol'].iloc[0]
        sampled = sample_valid_episodes(self.df[self.df['symbol'] == ticker], ticker, n_timesteps, lookback, n_episodes)
        # Convert to built-in types so the value is JSON-serialisable and
        # identical in type to what a later json.load() will return.
        benchmark_episodes = np.asarray(sampled).tolist()
        benchmark_dir = os.path.dirname(self.benchmark_path)
        if benchmark_dir:
            os.makedirs(benchmark_dir, exist_ok=True)
        with open(self.benchmark_path, "w") as f:
            json.dump(benchmark_episodes, f)
        return benchmark_episodes

    def _load_existing_results(self):
        """Return the previously saved results, or None if there are none yet."""
        if not os.path.exists(self.result_path):
            return None
        try:
            return pd.read_csv(self.result_path)
        except pd.errors.EmptyDataError:
            # A zero-byte / header-less file left by an earlier run.
            return None

    def evaluate(self, ticker, env_cls, agent_cls, timesteps=TIMESTEPS, n_timesteps=N_TIMESTEPS, lookback=LOOKBACK, seeds=SEEDS,  env_config={"market_features":MARKET_FEATURES}, agent_config=None):
        """Train and evaluate ``agent_cls`` on ``env_cls`` for every seed and episode.

        Args:
            ticker: Symbol used to filter ``self.df``.
            env_cls: Environment class; must expose ``__version__`` and accept
                ``(df, ticker=, seed=, start_idx=, **env_config)``.
            agent_cls: Agent class called as
                ``agent_cls("MlpPolicy", env, verbose=0, seed=seed, **agent_config)``.
            timesteps: Total timesteps passed to ``model.learn``.
            n_timesteps: Length of one episode window; the test window starts
                this many rows after the training window.
            lookback: Currently unused by this method.
            seeds: Iterable of seeds to run.
            env_config: Extra keyword arguments for the environment. The default
                dict is only read, never mutated.
            agent_config: Extra keyword arguments for the agent.

        Returns:
            DataFrame with all stored results (previous rows plus the new ones).
        """
        def generate_config_hash(config):
            raw = json.dumps(config, sort_keys=True)
            return hashlib.sha256(raw.encode()).hexdigest()

        def save_model(model, config_full, config_hash):
            # NOTE(review): config_hash does not include the agent name or the
            # seed, so runs that differ only in those overwrite the same
            # checkpoint file. Path kept unchanged because external loaders may
            # depend on it - confirm and consider adding agent/seed to the name.
            path = os.path.join(self.checkpoint_dir, f"agent_{config_hash}.zip")
            model.save(path)
            with open(path.replace(".zip", "_config.json"), "w") as f:
                json.dump(config_full, f, indent=2)

        agent_name = agent_cls.__name__
        env_version = f"v{env_cls.__version__}"
        df_ticker = self.df[self.df['symbol'] == ticker].reset_index(drop=True)
        env_kwargs = env_config or {}
        agent_kwargs = agent_config or {}

        existing = self._load_existing_results()
        if existing is None:
            seen_hashes = set()
        else:
            seen_hashes = set(zip(existing['config_hash'], existing['agent_name'], existing['seed']))

        meta_records = []
        for seed in seeds:
            for start_idx in self.benchmark_episodes:
                test_idx = start_idx + n_timesteps
                # Skip episodes whose test window would run past the data.
                if test_idx + n_timesteps >= len(df_ticker):
                    continue

                config = {
                    "ticker": ticker, "train_idx": int(start_idx), "test_idx": int(test_idx),
                    "timesteps": timesteps, "episode_steps": n_timesteps,
                    "env_version": env_version, "env_config": env_config,
                    "agent_config": agent_config
                }
                config_hash = generate_config_hash(config)
                run_key = (config_hash, agent_name, seed)
                if run_key in seen_hashes:
                    continue
                # Also protects against duplicated episodes within this call.
                seen_hashes.add(run_key)

                env_train = Monitor(env_cls(df_ticker, ticker=ticker, seed=seed, start_idx=start_idx, **env_kwargs))
                model = agent_cls("MlpPolicy", env_train, verbose=0, seed=seed, **agent_kwargs)
                model.learn(total_timesteps=timesteps)

                score_train = self._evaluate_model(model, env_train)
                rand_train = self._evaluate_random(env_train)

                env_test = Monitor(env_cls(df_ticker, ticker=ticker, seed=seed, start_idx=test_idx, **env_kwargs))
                score_test = self._evaluate_model(model, env_test)
                # Diagnostics must be read right after the agent's rollout:
                # the random baseline below re-runs the same env and would
                # otherwise be what the diagnostics describe.
                diagnostics = self._compute_additional_metrics(env_test)
                rand_test = self._evaluate_random(env_test)

                advantage_train = score_train - rand_train
                advantage_test = score_test - rand_test
                transfer_delta = score_test - score_train

                save_model(model, config, config_hash)

                meta = extract_meta_features(df_ticker.iloc[start_idx:start_idx + n_timesteps])
                # NOTE(review): if extract_meta_features() returns keys such as
                # 'volatility' or 'sharpe', the diagnostics below overwrite
                # them - confirm that this is intended.
                meta.update({
                    "config_hash": config_hash, "env_version": env_version,
                    "agent_name": agent_name, "score_train": score_train,
                    "score_test": score_test, "advantage_train": advantage_train,
                    "advantage_test": advantage_test, "transfer_delta": transfer_delta,
                    "transfer_success": int(transfer_delta > 0), "ticker": ticker,
                    "seed": seed, "train_idx": int(start_idx), "test_idx": int(test_idx),
                    "timesteps": timesteps, "episode_steps": n_timesteps,
                    **diagnostics
                })
                meta_records.append(meta)

        new_df = pd.DataFrame(meta_records)
        if existing is None:
            result_df = new_df
        elif new_df.empty:
            result_df = existing
        else:
            result_df = pd.concat([existing, new_df], ignore_index=True)
        # Never write an empty file: pd.read_csv() raises EmptyDataError on it,
        # which would break the next call.
        if not result_df.empty:
            result_df.to_csv(self.result_path, index=False)
        return result_df

    @staticmethod
    def _rollout(env, choose_action):
        """Play one episode and return the summed reward.

        The episode ends when the environment reports either ``terminated`` or
        ``truncated``; looking at ``terminated`` alone would loop forever on an
        episode that ends through truncation.
        """
        obs, _ = env.reset()
        score = 0
        finished = False
        while not finished:
            obs, reward, terminated, truncated, _ = env.step(choose_action(obs))
            score += reward
            finished = terminated or truncated
        return score

    def _evaluate_model(self, model, env):
        """Return the episode score of ``model`` acting deterministically on ``env``."""
        return self._rollout(env, lambda obs: model.predict(obs, deterministic=True)[0])

    def _evaluate_random(self, env):
        """Return the episode score of uniformly sampled actions on ``env``."""
        return self._rollout(env, lambda obs: env.action_space.sample())

    def _compute_additional_metrics(self, env):
        """Compute behaviour and risk diagnostics from the env's last recorded episode.

        Reads ``env.values`` and ``env.actions`` (after unwrapping one wrapper
        level). Assumes both sequences have the same length and that actions
        are non-negative integers with 0 = hold and 1 = long - TODO confirm
        against the environment implementation.
        """
        if hasattr(env, "env"):  # unwrap Monitor
            env = env.env
        values = np.array(env.values)
        actions = np.asarray(env.actions).ravel()
        returns = pd.Series(values).pct_change().dropna()

        mean_return = returns.mean()
        volatility = returns.std()
        action_probs = np.bincount(actions, minlength=2) / len(actions)
        entropy = -np.sum(action_probs * np.log2(action_probs + 1e-9))
        # Non-positive by construction: 0 means the value never fell below its running peak.
        max_drawdown = (values / np.maximum.accumulate(values)).min() - 1
        sharpe = mean_return / (volatility + 1e-9) * np.sqrt(252)
        # NaN when fewer than two negative returns exist (sample std undefined).
        sortino = mean_return / (returns[returns < 0].std() + 1e-9) * np.sqrt(252)
        # Epsilon is added outside abs() so it can only enlarge the denominator.
        calmar = mean_return / (abs(max_drawdown) + 1e-9)
        value_changes = np.diff(values)
        next_actions = actions[1:]
        success_trades = (
            np.sum((value_changes > 0) & (next_actions == 1))
            + np.sum((value_changes < 0) & (next_actions == 0))
        )
        return {
            "volatility": volatility, "entropy": entropy, "max_drawdown": max_drawdown,
            "sharpe": sharpe, "sortino": sortino, "calmar": calmar,
            "success_trades": success_trades,
            "action_hold_ratio": np.mean(actions == 0),
            "action_long_ratio": np.mean(actions == 1)
        }

    @staticmethod
    def _composite_score(result_df):
        """Combine advantage, transfer, Sharpe and drawdown into a single score.

        ``max_drawdown`` is stored as a non-positive number, so its magnitude is
        subtracted; subtracting the raw value would reward deeper drawdowns.
        """
        return (
            result_df['advantage_test'] +
            result_df['transfer_delta'] +
            result_df['sharpe'] * 5 -
            result_df['max_drawdown'].abs() * 10
        )

    def compare_environments(self, result_df, env_version_a="v0", env_version_b="v1"):
        """Plot and test metric differences between two environment versions.

        Draws a bar chart of per-version metric means (sorted by absolute
        difference), prints a two-sample t-test per key metric together with a
        box plot, and adds a ``composite_score`` column to ``result_df`` in place.

        Returns:
            Tuple ``(result_df, mean composite score per env_version)``.
        """
        summary_columns = [
            "score_train", "score_test", "advantage_train", "advantage_test",
            "transfer_delta", "success_trades", "sharpe", "sortino", "calmar",
            "max_drawdown", "volatility", "action_hold_ratio", "action_long_ratio"
        ]
        summary = result_df.groupby("env_version")[summary_columns].agg(["mean", "std", "median"]).T

        mean_df = summary.xs('mean', level=1)
        diffs = (mean_df[env_version_a] - mean_df[env_version_b]).abs().sort_values(ascending=False)
        mean_df.loc[diffs.index].plot.bar(
            figsize=(14, 6),
            title=f"Env {env_version_a} vs {env_version_b} – Mean metric comparison (sorted by difference)",
            ylabel="Mean Value"
        )

        metrics = ["score_test", "advantage_test", "transfer_delta", "sharpe", "sortino"]
        for metric in metrics:
            v0 = result_df[result_df.env_version == env_version_a][metric]
            v1 = result_df[result_df.env_version == env_version_b][metric]
            # NaNs (e.g. undefined sortino) are dropped so they do not turn the
            # p-value into NaN; Series.mean() below already ignores them.
            _, pval = ttest_ind(v0.dropna(), v1.dropna())
            print(f"{metric}: p={pval:.4f} | {env_version_a}_mean={v0.mean():.3f}, {env_version_b}_mean={v1.mean():.3f}")
            sns.boxplot(data=result_df, x="env_version", y=metric)
            plt.title(f"{metric} by Environment Version")
            plt.show()

        result_df['composite_score'] = self._composite_score(result_df)
        return result_df, result_df.groupby("env_version")["composite_score"].mean()


In [None]:
import os
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Callable
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.monitor import Monitor
from environments import PositionTradingEnv, PositionTradingEnvV1, PositionTradingEnvV2
from data import extract_meta_features
from scipy.stats import ttest_ind

class TradingEnvironmentBattleground:
    """Run every (environment, agent, seed, benchmark episode) combination.

    Each run trains an agent on the window starting at ``train_idx`` and scores
    it on that window and on the window starting ``n_timesteps`` rows later.
    One row per run is appended to the CSV at ``result_path``; rows already
    stored there (same config hash, agent name and seed) are skipped.
    ``checkpoint_dir`` is created by ``evaluate`` but nothing in this class
    writes to it.
    """

    def __init__(self, 
                 df: pd.DataFrame,
                 benchmark_path: str,
                 result_path: str,
                 checkpoint_dir: str,
                 seeds: list,
                 timesteps: int,
                 n_timesteps: int,
                 lookback: int,
                 agents: list,
                 envs: list,
                 market_features: list):
        """Store the experiment configuration; no I/O happens here.

        Args:
            df: Market data; must contain a ``symbol`` column.
            benchmark_path: JSON file caching the sampled episode start indices.
            result_path: CSV file that accumulates one row per evaluated run.
            checkpoint_dir: Directory created by ``evaluate``.
            seeds: Seeds to run for every combination.
            timesteps: Total timesteps passed to ``model.learn``.
            n_timesteps: Length of one episode window.
            lookback: Lookback forwarded to the episode sampler.
            agents: Agent classes to evaluate.
            envs: Environment classes to evaluate (each exposing ``__version__``).
            market_features: Feature names passed to every environment.
        """
        self.df = df
        self.benchmark_path = benchmark_path
        self.result_path = result_path
        self.checkpoint_dir = checkpoint_dir
        self.seeds = seeds
        self.timesteps = timesteps
        self.n_timesteps = n_timesteps
        self.lookback = lookback
        self.agents = agents
        self.envs = envs
        self.market_features = market_features
        # Filled by load_benchmark_episodes(); evaluate() refuses to run while None.
        self.benchmark_episodes = None

    def load_benchmark_episodes(self, ticker, eval_episodes):
        """Load the benchmark episodes into ``self.benchmark_episodes``.

        Reads ``benchmark_path`` when it exists; otherwise samples
        ``eval_episodes`` episodes for ``ticker`` (with ``np.random.seed(0)``),
        stores them as a plain list and writes them to ``benchmark_path``.
        """
        if os.path.exists(self.benchmark_path):
            with open(self.benchmark_path) as f:
                self.benchmark_episodes = json.load(f)
        else:
            print("[INFO] Sampling benchmark episodes...")
            np.random.seed(0)
            from src.utils.episode_sampling import sample_valid_episodes
            sampled = sample_valid_episodes(
                self.df[self.df['symbol'] == ticker], ticker, self.n_timesteps, self.lookback, eval_episodes)
            # Built-in types only: json.dump() cannot serialise numpy arrays
            # or numpy integers, and this matches what json.load() returns.
            self.benchmark_episodes = np.asarray(sampled).tolist()
            benchmark_dir = os.path.dirname(self.benchmark_path)
            if benchmark_dir:
                os.makedirs(benchmark_dir, exist_ok=True)
            with open(self.benchmark_path, "w") as f:
                json.dump(self.benchmark_episodes, f)
        print("[INFO] Episódios de benchmark carregados.")

    def _generate_config_hash(self, config):
        """Return the SHA-256 hex digest of ``config`` dumped as key-sorted JSON."""
        # Imported locally because this notebook cell does not import hashlib itself.
        import hashlib
        return hashlib.sha256(json.dumps(config, sort_keys=True).encode()).hexdigest()

    @staticmethod
    def _run_episode(env, choose_action):
        """Play one episode and return the summed reward.

        The episode ends when the environment reports either ``terminated`` or
        ``truncated``; looking at ``terminated`` alone would loop forever on an
        episode that ends through truncation.
        """
        obs, _ = env.reset()
        score = 0
        finished = False
        while not finished:
            obs, reward, terminated, truncated, _ = env.step(choose_action(obs))
            score += reward
            finished = terminated or truncated
        return score

    @staticmethod
    def _composite_score(result_df):
        """Combine advantage, transfer, Sharpe and drawdown into a single score.

        ``max_drawdown`` is stored as a non-positive number, so its magnitude is
        subtracted; subtracting the raw value would reward deeper drawdowns.
        """
        return (
            result_df['advantage_test'] +
            result_df['transfer_delta'] +
            result_df['sharpe'] * 5 -
            result_df['max_drawdown'].abs() * 10
        )

    def _load_existing_results(self):
        """Return the previously saved results, or None if there are none yet."""
        if not os.path.exists(self.result_path):
            return None
        try:
            return pd.read_csv(self.result_path)
        except pd.errors.EmptyDataError:
            # A zero-byte / header-less file left by an earlier run.
            return None

    def _compute_additional_metrics(self, env):
        """Compute behaviour and risk diagnostics from the env's last recorded episode.

        Reads ``env.values`` and ``env.actions`` (after unwrapping one wrapper
        level). Assumes both sequences have the same length and that actions
        are non-negative integers with 0 = hold and 1 = long - TODO confirm
        against the environment implementation.
        """
        if hasattr(env, "env"):  # unwrap Monitor
            env = env.env
        values = np.array(env.values)
        actions = np.asarray(env.actions).ravel()
        returns = pd.Series(values).pct_change().dropna()

        mean_return = returns.mean()
        volatility = returns.std()
        action_probs = np.bincount(actions, minlength=2) / len(actions)
        # Non-positive by construction: 0 means the value never fell below its running peak.
        max_drawdown = (values / np.maximum.accumulate(values)).min() - 1
        value_changes = np.diff(values)
        next_actions = actions[1:]
        return {
            "volatility": volatility,
            "entropy": -np.sum(action_probs * np.log2(action_probs + 1e-9)),
            "max_drawdown": max_drawdown,
            "sharpe": mean_return / (volatility + 1e-9) * np.sqrt(252),
            # NaN when fewer than two negative returns exist (sample std undefined).
            "sortino": mean_return / (returns[returns < 0].std() + 1e-9) * np.sqrt(252),
            # Epsilon is added outside abs() so it can only enlarge the denominator.
            "calmar": mean_return / (abs(max_drawdown) + 1e-9),
            "success_trades": np.sum((value_changes > 0) & (next_actions == 1)) + np.sum((value_changes < 0) & (next_actions == 0)),
            "action_hold_ratio": np.mean(actions == 0),
            "action_long_ratio": np.mean(actions == 1)
        }

    def evaluate(self, ticker: str):
        """Train and score every env/agent/seed/episode combination for ``ticker``.

        Returns:
            DataFrame with all stored results (previous rows plus the new ones).

        Raises:
            RuntimeError: If ``load_benchmark_episodes`` has not been called yet.
        """
        episodes = getattr(self, "benchmark_episodes", None)
        if episodes is None:
            raise RuntimeError(
                "Benchmark episodes are not loaded; call load_benchmark_episodes(ticker, eval_episodes) first."
            )

        df_ticker = self.df[self.df['symbol'] == ticker].reset_index(drop=True)
        os.makedirs(self.checkpoint_dir, exist_ok=True)
        result_dir = os.path.dirname(self.result_path)
        if result_dir:
            os.makedirs(result_dir, exist_ok=True)

        existing = self._load_existing_results()
        if existing is None:
            seen_hashes = set()
        else:
            seen_hashes = set(zip(existing['config_hash'], existing['agent_name'], existing['seed']))

        meta_records = []
        for env_cls in self.envs:
            env_version = f"v{env_cls.__version__}"
            for agent_cls in self.agents:
                agent_name = agent_cls.__name__
                for seed in self.seeds:
                    for start_idx in episodes:
                        test_idx = start_idx + self.n_timesteps
                        # Skip episodes whose test window would run past the data.
                        if test_idx + self.n_timesteps >= len(df_ticker):
                            continue

                        config = {
                            "ticker": ticker,
                            "train_idx": int(start_idx),
                            "test_idx": int(test_idx),
                            "timesteps": self.timesteps,
                            "episode_steps": self.n_timesteps,
                            "env_version": env_version,
                            "env_config": {"market_features": self.market_features},
                            "agent_config": {},
                        }
                        config_hash = self._generate_config_hash(config)
                        run_key = (config_hash, agent_name, seed)
                        if run_key in seen_hashes:
                            continue
                        # Also protects against duplicated episodes within this call.
                        seen_hashes.add(run_key)

                        env_train = Monitor(env_cls(df_ticker, ticker=ticker, seed=seed, start_idx=start_idx, market_features=self.market_features))
                        model = agent_cls("MlpPolicy", env_train, verbose=0, seed=seed)
                        model.learn(total_timesteps=self.timesteps)

                        def agent_policy(obs, _model=model):
                            return _model.predict(obs, deterministic=True)[0]

                        score_train = self._run_episode(env_train, agent_policy)
                        rand_train = self._run_episode(env_train, lambda obs: env_train.action_space.sample())

                        env_test = Monitor(env_cls(df_ticker, ticker=ticker, seed=seed, start_idx=test_idx, market_features=self.market_features))
                        score_test = self._run_episode(env_test, agent_policy)
                        # Diagnostics must be read right after the agent's
                        # rollout: the random baseline below re-runs the same
                        # env and would otherwise be what they describe.
                        diagnostics = self._compute_additional_metrics(env_test)
                        rand_test = self._run_episode(env_test, lambda obs: env_test.action_space.sample())

                        advantage_train = score_train - rand_train
                        advantage_test = score_test - rand_test
                        transfer_delta = score_test - score_train

                        meta = extract_meta_features(df_ticker.iloc[start_idx:start_idx + self.n_timesteps])
                        # NOTE(review): if extract_meta_features() returns keys
                        # such as 'volatility' or 'sharpe', the diagnostics
                        # below overwrite them - confirm that this is intended.
                        meta.update({
                            "config_hash": config_hash,
                            "env_version": env_version,
                            "agent_name": agent_name,
                            "score_train": score_train,
                            "score_test": score_test,
                            "advantage_train": advantage_train,
                            "advantage_test": advantage_test,
                            "transfer_delta": transfer_delta,
                            "transfer_success": int(transfer_delta > 0),
                            "ticker": ticker,
                            "train_idx": int(start_idx),
                            "test_idx": int(test_idx),
                            "timesteps": self.timesteps,
                            "episode_steps": self.n_timesteps,
                            "seed": seed,
                            **diagnostics
                        })

                        meta_records.append(meta)

        new_df = pd.DataFrame(meta_records)
        if existing is None:
            result_df = new_df
        elif new_df.empty:
            result_df = existing
        else:
            result_df = pd.concat([existing, new_df], ignore_index=True)
        # Never write an empty file: pd.read_csv() raises EmptyDataError on it,
        # which would break the next call.
        if not result_df.empty:
            result_df.to_csv(self.result_path, index=False)
        print("[INFO] Evaluation complete.")
        return result_df

    def compare_environments(self, result_df, env_version_a="v0", env_version_b="v1"):
        """Plot and test metric differences between two environment versions.

        Draws a bar chart of per-version metric means (sorted by absolute
        difference), prints a two-sample t-test per key metric, shows one box
        plot per metric and adds a ``composite_score`` column to ``result_df``
        in place.

        Returns:
            Tuple ``(result_df, mean composite score per env_version)``.
        """
        summary_columns = [
            "score_train", "score_test", "advantage_train", "advantage_test",
            "transfer_delta", "success_trades", "sharpe", "sortino", "calmar",
            "max_drawdown", "volatility", "action_hold_ratio", "action_long_ratio"
        ]
        summary = result_df.groupby("env_version")[summary_columns].agg(["mean", "std", "median"]).T

        mean_df = summary.xs('mean', level=1)
        diffs = (mean_df[env_version_a] - mean_df[env_version_b]).abs().sort_values(ascending=False)

        mean_df.loc[diffs.index].plot.bar(
            figsize=(14, 6),
            title=f"Env {env_version_a} vs {env_version_b} – Mean metric comparison",
            ylabel="Mean Value"
        )

        metrics = ["score_test", "advantage_test", "transfer_delta", "sharpe", "sortino"]
        for metric in metrics:
            v0 = result_df[result_df.env_version == env_version_a][metric]
            v1 = result_df[result_df.env_version == env_version_b][metric]
            # NaNs (e.g. undefined sortino) are dropped so they do not turn the
            # p-value into NaN; Series.mean() below already ignores them.
            _, pval = ttest_ind(v0.dropna(), v1.dropna())
            print(f"{metric}: p={pval:.4f} | {env_version_a}_mean={v0.mean():.3f}, {env_version_b}_mean={v1.mean():.3f}")

        for metric in metrics:
            sns.boxplot(data=result_df, x="env_version", y=metric)
            plt.title(f"{metric} by Environment Version")
            plt.show()

        result_df['composite_score'] = self._composite_score(result_df)

        return result_df, result_df.groupby("env_version")["composite_score"].mean()
