In [17]:
# Install required packages into the kernel's environment (%pip, not !pip).
# NOTE(review): no versions are pinned here, so a fresh install may resolve to
# different releases than the ones this notebook was last run with.
%pip install numpy pandas matplotlib seaborn plotly scikit-learn stable-baselines3 gymnasium pyarrow ta


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:

# ===================== Paper-Based Signal Generation =====================
import os
import numpy as np
import pandas as pd
from enum import Enum

# Paper-Based Configuration (from research paper)
# PATH is the parquet file read below; TS_COL is the column the rows are sorted by
# and PRICE_COL is the column used as the price series.
PATH = "../ETHUSDT_1m_with_indicators.parquet"
TS_COL = "ts"
PRICE_COL = "close"

# Signal parameters (from research paper)
MA_PERIOD = 60                 # Rolling window (in rows) of the moving average behind the pseudo-spread
WINDOW_SIZE = 120              # Rolling window (in rows) for the z-score mean/std of the spread
OPEN_THRESHOLD = 2.0           # |z| above this falls in the LONG/SHORT zones
CLOSE_THRESHOLD = 0.5          # |z| at or below this falls in the CLOSE zone

# Trading parameters (optimized from methodology)
# NOTE(review): the environment cell shown further down defines its own ENHANCED_*
# constants; the values in this group are not referenced by the code visible here.
SEED_MONEY = 10000.0
FEE_RATE = 0.0001              # Reduced from 0.0005 to enable trading
SLIPPAGE = 0.0001              # Reduced from 0.0002
TRANSACTION_PENALTY_RATE = 0.00005  # Reduced from 0.001
ACTION_REWARD_SCALE = 0.1      # Increased from 0.01

# Directory that receives train_paper.csv / test_paper.csv / combined_paper.csv
OUTPUT_DIR = "./processed_data_paper"

# Banner summarising the configuration for the run log
print("🔬 PAPER-BASED DATA PREPROCESSING")
print("================================")
print("Following 'Reinforcement Learning Pair Trading: A Dynamic Scaling Approach'")
print(f"   MA Period: {MA_PERIOD}, Window: {WINDOW_SIZE}")
print(f"   Thresholds: Open={OPEN_THRESHOLD}, Close={CLOSE_THRESHOLD}")
# NOTE(review): this says "3 values" while the export section lists 2 paper features
# (z_score, zone) — confirm which count is intended.
print(f"   Simplified approach: 3 values vs 32+ complex features")

class TradingZone(Enum):
    """Five z-score bands, ordered from the most negative z-score to the most positive.

    The integer values (0..4) are what gets stored in the ``zone`` column, so they
    must stay stable.
    """

    # z < -open_threshold: price well below its moving average -> buy signal
    LONG_ZONE = 0

    # -open_threshold <= z < -close_threshold
    NEUTRAL_LONG = 1

    # -close_threshold <= z <= close_threshold: price near its moving average
    CLOSE_ZONE = 2

    # close_threshold < z <= open_threshold
    NEUTRAL_SHORT = 3

    # z > open_threshold: price well above its moving average -> sell signal
    SHORT_ZONE = 4

def calculate_pseudo_spread(prices: pd.Series, ma_period: int, window_size: int) -> pd.DataFrame:
    """Derive the pseudo-spread and its rolling z-score from one price series.

    The pseudo-spread is the gap between each price and its trailing moving
    average. That gap is then standardised with a trailing rolling mean and
    standard deviation.

    Args:
        prices: Price series; its index is carried over to the result.
        ma_period: Rolling window length of the moving average.
        window_size: Rolling window length of the z-score statistics.

    Returns:
        DataFrame with columns ``price``, ``ma``, ``spread`` and ``z_score``.
    """
    print(f"   📊 Calculating pseudo-spread signals...")

    # Trailing average; min_periods=1 lets the first rows use the bars available so far.
    moving_avg = prices.rolling(window=ma_period, min_periods=1).mean()

    # Deviation of the price from that baseline.
    deviation = prices - moving_avg

    # Both normalisation statistics come from the same rolling view of the deviation.
    rolling_dev = deviation.rolling(window=window_size, min_periods=1)
    centre = rolling_dev.mean()
    scale = rolling_dev.std()

    # The epsilon keeps a zero std from dividing by zero; undefined values
    # (e.g. the std of a single observation) are replaced by the neutral 0.
    standardized = ((deviation - centre) / (scale + 1e-8)).fillna(0)

    print(f"      Z-score range: [{standardized.min():.2f}, {standardized.max():.2f}]")

    return pd.DataFrame({
        'price': prices,
        'ma': moving_avg,
        'spread': deviation,
        'z_score': standardized,
    })

def calculate_trading_zone(z_score: float, open_threshold: float, close_threshold: float) -> int:
    """Map a z-score onto one of the five ``TradingZone`` integer values.

    Args:
        z_score: Standardised spread value for one bar.
        open_threshold: Magnitude beyond which the LONG/SHORT zones start.
        close_threshold: Magnitude up to which the CLOSE zone extends.

    Returns:
        The ``.value`` of the matching ``TradingZone`` member. A missing
        z-score (NaN/None) is reported as ``CLOSE_ZONE``, mirroring the
        neutral ``0`` the signal pipeline substitutes for undefined z-scores.
    """
    # NaN fails every comparison below and would otherwise fall through to the
    # final branch, i.e. be reported as LONG_ZONE (a buy signal).
    if pd.isna(z_score):
        return TradingZone.CLOSE_ZONE.value

    if z_score > open_threshold:
        return TradingZone.SHORT_ZONE.value  # Price above MA - sell signal
    elif z_score > close_threshold:
        return TradingZone.NEUTRAL_SHORT.value
    elif z_score >= -close_threshold:
        return TradingZone.CLOSE_ZONE.value  # Near MA - close positions
    elif z_score >= -open_threshold:
        return TradingZone.NEUTRAL_LONG.value
    else:
        return TradingZone.LONG_ZONE.value   # Price below MA - buy signal

# ===================== Load and Process Data =====================
print("\n📁 Loading data...")
df = pd.read_parquet(PATH)
df = df.reset_index()                      # turn whatever index the file carries into a column
df.columns = df.columns.str.strip()
df = df.sort_values(TS_COL).reset_index(drop=True)

# Clean price data: non-numeric entries become NaN and are dropped together
# with non-positive prices.
df[PRICE_COL] = pd.to_numeric(df[PRICE_COL], errors='coerce')
df = df.dropna(subset=[PRICE_COL])
df = df[df[PRICE_COL] > 0]

print(f"   Data loaded: {len(df)} rows")
# Convert the timestamp column once and reuse it for both ends of the range.
_ts_for_range = pd.to_datetime(df[TS_COL], unit='s')
print(f"   Date range: {_ts_for_range.min()} to {_ts_for_range.max()}")
del _ts_for_range  # only needed for the log line; do not keep a full-length copy around

# ===================== Calculate Paper-Based Signals =====================
signals_df = calculate_pseudo_spread(df[PRICE_COL], MA_PERIOD, WINDOW_SIZE)

# Add zones
print(f"   📍 Calculating trading zones...")
signals_df['zone'] = signals_df['z_score'].apply(
    lambda x: calculate_trading_zone(x, OPEN_THRESHOLD, CLOSE_THRESHOLD)
)

# Merge signals back to dataframe (both sides are re-indexed 0..n-1 so rows line up
# positionally after the filtering above)
df = pd.concat([df.reset_index(drop=True), signals_df.reset_index(drop=True)], axis=1)

# Show zone distribution
zone_names = {
    TradingZone.LONG_ZONE.value: 'LONG',
    TradingZone.NEUTRAL_LONG.value: 'NEUTRAL_LONG',
    TradingZone.CLOSE_ZONE.value: 'CLOSE',
    TradingZone.NEUTRAL_SHORT.value: 'NEUTRAL_SHORT',
    TradingZone.SHORT_ZONE.value: 'SHORT'
}
df['zone_name'] = df['zone'].map(zone_names)
zone_counts = df['zone_name'].value_counts()
print(f"\n   🎯 Zone distribution:")
for zone, count in zone_counts.items():
    print(f"     {zone}: {count} ({count/len(df)*100:.1f}%)")

# ===================== Split Data (Paper-Based Approach) =====================
# Chronological 80/20 split: df is sorted by TS_COL, so the test set is the latest 20%.
print(f"\n✂️  Splitting data for paper-based training...")
split_idx = int(len(df) * 0.8)
train_df = df.iloc[:split_idx].copy()
test_df  = df.iloc[split_idx:].copy()

print(f"   Train data: {len(train_df)} timesteps")
print(f"   Test data: {len(test_df)} timesteps")

# Paper-based features: only the essential signals
paper_features = ['z_score', 'zone']  # Simplified from 32+ features to 2!
export_cols = [TS_COL, PRICE_COL] + paper_features + ['zone_name']

# Store paper-based features for environment.
# This is assigned BEFORE the CSV export on purpose: the export is the slow part of
# this cell, and interrupting it must not leave feat_cols_paper undefined for the
# training cells that read it.
feat_cols_paper = paper_features

# ===================== Export Paper-Based Data =====================
os.makedirs(OUTPUT_DIR, exist_ok=True)

train_path = os.path.join(OUTPUT_DIR, "train_paper.csv")
test_path  = os.path.join(OUTPUT_DIR, "test_paper.csv")
combo_path = os.path.join(OUTPUT_DIR, "combined_paper.csv")

train_df[export_cols].to_csv(train_path, index=False, float_format="%.6f")
test_df[export_cols].to_csv(test_path,   index=False, float_format="%.6f")
pd.concat([train_df[export_cols], test_df[export_cols]], ignore_index=True)\
  .to_csv(combo_path, index=False, float_format="%.6f")

print(f"\n✅ PAPER-BASED PREPROCESSING COMPLETE!")
print(f"   Exported paper-based signals to: {OUTPUT_DIR}")
print(f"   Features: {paper_features} (vs 32+ in original)")
print(f"   Files: train_paper.csv, test_paper.csv, combined_paper.csv")
print(f"   Key improvement: Dramatically simplified signal processing")


🔬 PAPER-BASED DATA PREPROCESSING
Following 'Reinforcement Learning Pair Trading: A Dynamic Scaling Approach'
   MA Period: 60, Window: 120
   Thresholds: Open=2.0, Close=0.5
   Simplified approach: 3 values vs 32+ complex features

📁 Loading data...
   Data loaded: 1928080 rows
   Date range: 2022-01-01 00:00:00+00:00 to 2025-08-31 23:59:00+00:00
   📊 Calculating pseudo-spread signals...
      Z-score range: [-10.32, 10.36]
   📍 Calculating trading zones...

   🎯 Zone distribution:
     NEUTRAL_LONG: 582263 (30.2%)
     CLOSE: 582227 (30.2%)
     NEUTRAL_SHORT: 579862 (30.1%)
     LONG: 95065 (4.9%)
     SHORT: 88663 (4.6%)

✂️  Splitting data for paper-based training...
   Train data: 1542464 timesteps
   Test data: 385616 timesteps


KeyboardInterrupt: 

In [None]:
# ===================== ENHANCED TRADING ENVIRONMENT - FUNDAMENTAL FIXES =====================
from typing import Dict, Optional, Tuple, Any
import gymnasium as gym
from gymnasium import spaces
import numpy as np

# Banner describing the reward/penalty changes this cell's environment applies
print("\n🚀 IMPLEMENTING COMPREHENSIVE FIXES FOR STATIC POSITION PROBLEM")
print("================================================================")
print("🔧 FUNDAMENTAL FIXES BEING APPLIED:")
print("   1. ⚖️  REBALANCED REWARDS: Action rewards 50x larger vs portfolio rewards")
print("   2. 🚫 ANTI-STATIC PENALTIES: Penalize unchanged positions over time")
print("   3. 🎯 IMPROVED ZONE INCENTIVES: Balanced, compelling zone-based rewards")
print("   4. 💸 REDUCED TRANSACTION COSTS: Make trading economically viable")
print("   5. 📊 ENHANCED OBSERVATIONS: Add momentum/change indicators")

# ENHANCED PARAMETERS - MAJOR REBALANCING
# ("original" in the comments below refers to FEE_RATE / SLIPPAGE /
# TRANSACTION_PENALTY_RATE from the preprocessing cell.)
ENHANCED_SEED_MONEY = 10000.0
ENHANCED_FEE_RATE = 0.00005           # REDUCED: 50% lower than original  
ENHANCED_SLIPPAGE = 0.00005           # REDUCED: 50% lower than original
# NOTE(review): ENHANCED_TRANSACTION_PENALTY is not referenced by the environment
# class defined in this cell — confirm whether it is still needed.
ENHANCED_TRANSACTION_PENALTY = 0.00001  # REDUCED: 80% lower than original

# REBALANCED REWARD SCALES  
# These are the defaults copied onto each environment instance in __init__.
ENHANCED_ACTION_REWARD_SCALE = 50.0   # INCREASED: 500x vs original (was 0.1)
STATIC_POSITION_PENALTY = 10.0        # NEW: Penalty for not changing position
ZONE_REWARD_MULTIPLIER = 25.0         # NEW: Extra multiplier for good zone actions
MOMENTUM_REWARD_SCALE = 15.0           # NEW: Reward for trading with momentum

print(f"📈 ENHANCED PARAMETERS:")
print(f"   Action Reward Scale: {ENHANCED_ACTION_REWARD_SCALE} (vs 0.1 original - 500x increase!)")
print(f"   Static Penalty: {STATIC_POSITION_PENALTY} (NEW - penalize unchanging positions)")
print(f"   Zone Multiplier: {ZONE_REWARD_MULTIPLIER} (NEW - extra rewards for good trades)")
print(f"   Transaction Costs: {ENHANCED_FEE_RATE + ENHANCED_SLIPPAGE} (vs 0.0002 - 50% reduction)")

# Tunable scales (these can also be supplied through the JSON training config)
PNL_REWARD_SCALE = 100.0      # Multiplier applied to the relative NAV change (ΔNAV / NAV) so the PnL signal is easier to learn from
INCREMENT_STEP_SIZE = 0.5     # Action step size: action in [-1, 1] -> position increment in [-0.5, 0.5]

class EnhancedTradingEnv(gym.Env):
    """
    Enhanced Trading Environment - incremental actions + PnL reward + zone-shaped incentives.

    - The action is an incremental position adjustment: a value in [-1, 1] is scaled by
      ``increment_step_size`` and added to the current position (kept within [-1, 1]).
    - The step reward is the relative change in portfolio value (net of trading cost)
      scaled by ``pnl_reward_scale``, plus zone / static / momentum shaping terms.
    - In the CLOSE and NEUTRAL zones the shaping rewards reducing exposure instead of
      rewarding small actions, and opening from a flat position is not penalised.
    - The zone index is normalised to [-1, 1] before it enters the observation.

    Observation layout (6 float32 values):
        [position, z_score, zone_norm, price_momentum, z_momentum, position_change]
    """

    metadata = {"render_modes": []}

    # Bound declared for the z-score component of the observation space. The raw
    # z-score is clipped to this range when the observation is built so that every
    # observation stays inside ``observation_space``.
    Z_SCORE_OBS_LIMIT = 5.0

    def __init__(self, df: pd.DataFrame, feat_cols, episode_length: int = 1000, randomize_start: bool = True):
        """
        Args:
            df: Frame holding the price column (``PRICE_COL``) plus ``z_score`` and ``zone``.
            feat_cols: Feature column names; stored on the instance.
            episode_length: Number of steps per episode.
            randomize_start: If True, each reset draws a random episode start index.
        """
        super().__init__()

        self.df = df.copy()
        self.feat_cols = feat_cols
        self.episode_length = int(episode_length)
        self.randomize_start = bool(randomize_start)

        # Base arrays, extracted once so step() works on plain numpy data
        self.prices = self.df[PRICE_COL].astype(float).values
        self.z_scores = self.df['z_score'].astype(float).values
        self.zones = self.df['zone'].astype(int).values

        # Start/end bounds: skip the indicator warm-up rows and leave room for a
        # full episode plus the "next price" lookup at the end.
        self.min_start = int(max(MA_PERIOD, WINDOW_SIZE))
        self.max_start = int(len(self.df) - self.episode_length - 2)
        if self.max_start < self.min_start:
            # Not enough rows for the requested episode: shrink it and start deterministically
            self.episode_length = max(10, len(self.df) - self.min_start - 2)
            self.max_start = max(self.min_start, len(self.df) - self.episode_length - 2)
            self.randomize_start = False
            print(f"      ⚠️ Data short. Adjusted episode_length={self.episode_length}, randomize_start=False")

        # Action: a single value in [-1, 1], interpreted as an incremental adjustment
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)

        # Observation: 6 values (zone_norm is used instead of the raw zone index)
        # [position, z_score, zone_norm, price_mom, z_mom, position_change]
        z_lim = self.Z_SCORE_OBS_LIMIT
        self.observation_space = spaces.Box(
            low=np.array([-1.0, -z_lim, -1.0, -0.1, -2.0, -1.0], dtype=np.float32),
            high=np.array([ 1.0,  z_lim,  1.0,  0.1,  2.0,  1.0], dtype=np.float32),
            dtype=np.float32
        )

        # Per-episode bookkeeping
        self.position_history = []   # last (up to 10) positions held before each step
        self.static_steps = 0        # consecutive steps with a near-zero action

        self.current_step = None
        self.episode_start = None
        self.position = None
        self.portfolio_value = None
        self.cash = None
        self.shares = None

        # Tunable reward attributes (a configuring subclass may overwrite these)
        self.action_reward_scale    = ENHANCED_ACTION_REWARD_SCALE
        self.zone_reward_multiplier = ZONE_REWARD_MULTIPLIER
        self.static_penalty_scale   = STATIC_POSITION_PENALTY
        self.momentum_reward_scale  = MOMENTUM_REWARD_SCALE
        self.static_delta_thresh    = 0.01

        # Instance-level step size and PnL scale, so changing them for one model
        # does not leak into other environments through module globals
        self.increment_step_size = float(INCREMENT_STEP_SIZE)
        self.pnl_reward_scale    = float(PNL_REWARD_SCALE)

    # ------------------------ Helpers ------------------------
    def _base_info(self) -> Dict[str, float]:
        """Return the info entries shared by reset() and step()."""
        return {
            "portfolio_value": float(self.portfolio_value),
            "position": float(self.position),
            "nav": float(self.portfolio_value / ENHANCED_SEED_MONEY)
        }

    # ------------------------ Observations ------------------------
    def _get_observation(self) -> np.ndarray:
        """Build the 6-element observation for ``self.current_step``.

        Every component is kept inside the bounds declared in ``observation_space``.
        """
        position = float(self.position)
        # Raw z-scores can exceed the declared bound, so clip them like the other
        # momentum-style components below.
        z_score = float(np.clip(self.z_scores[self.current_step],
                                -self.Z_SCORE_OBS_LIMIT, self.Z_SCORE_OBS_LIMIT))
        zone = int(self.zones[self.current_step])
        zone_norm = float((zone - 2.0) / 2.0)  # zone 0..4 -> [-1, 1]

        price_momentum = 0.0
        z_momentum = 0.0
        position_change = 0.0

        if self.current_step > 0:
            prev_price = float(self.prices[self.current_step - 1])
            cur_price  = float(self.prices[self.current_step])
            if prev_price > 0:
                # One-step relative price change
                price_momentum = (cur_price - prev_price) / prev_price
                price_momentum = float(np.clip(price_momentum, -0.1, 0.1))

            # One-step change of the (unclipped) z-score
            z_momentum = float(np.clip(self.z_scores[self.current_step] - self.z_scores[self.current_step - 1], -2.0, 2.0))

            if self.position_history:
                # position_history[-1] is the position held before the latest update
                position_change = float(np.clip(position - self.position_history[-1], -1.0, 1.0))

        return np.array([position, z_score, zone_norm, price_momentum, z_momentum, position_change], dtype=np.float32)

    # ------------------------ Shaping Rewards (zone/momentum/static) ------------------------
    def _shaping_rewards(self, action: float) -> Tuple[float, float, float]:
        """
        Compute the non-PnL reward terms for the current step.

        Returns:
            (zone_shaping, static_penalty, momentum_reward). PnL is not included;
            it is computed in step() from the change in portfolio value.

        Side effect: updates ``self.static_steps``.
        """
        current_zone = int(self.zones[self.current_step])
        zone_shaping = 0.0

        # LONG/SHORT zones: an action in the signalled direction is rewarded (times the
        # zone multiplier); an action against it gets a light penalty without the
        # multiplier. Both zones use the same magnitudes, mirrored in sign.
        if current_zone == TradingZone.LONG_ZONE.value:
            if action > 0:
                zone_shaping = self.action_reward_scale * action * self.zone_reward_multiplier
            else:
                # action <= 0 here, so this term is <= 0
                zone_shaping = self.action_reward_scale * 0.5 * action

        elif current_zone == TradingZone.SHORT_ZONE.value:
            if action < 0:
                zone_shaping = self.action_reward_scale * abs(action) * self.zone_reward_multiplier
            else:
                zone_shaping = -self.action_reward_scale * 0.5 * action

        elif current_zone == TradingZone.CLOSE_ZONE.value:
            # Reward shrinking the absolute position towards 0 and lightly penalise
            # holding a non-zero position; opening from flat earns no penalty here.
            position_reduction = max(0.0, abs(self.position) - abs(self.position + self.increment_step_size * action))
            zone_shaping = self.action_reward_scale * position_reduction * (self.zone_reward_multiplier / 25.0)
            zone_shaping += -0.2 * self.action_reward_scale * abs(self.position)

        elif current_zone in [TradingZone.NEUTRAL_LONG.value, TradingZone.NEUTRAL_SHORT.value]:
            # No reward for small actions; only a light penalty on exposure above 0.2
            zone_shaping = -0.1 * self.action_reward_scale * max(0.0, abs(self.position) - 0.2)

        # Anti-static penalty: grows with each consecutive step whose action magnitude
        # is below the threshold; any larger action resets the counter.
        static_penalty = 0.0
        if abs(action) < self.static_delta_thresh:
            self.static_steps += 1
            static_penalty = -self.static_penalty_scale * (1.0 + 0.1 * self.static_steps)
        else:
            self.static_steps = 0

        # Momentum reward: small bonus when the action has the same sign as the
        # latest one-step price change.
        momentum_reward = 0.0
        if self.current_step > 0:
            prev_price = float(self.prices[self.current_step - 1])
            cur_price  = float(self.prices[self.current_step])
            if prev_price > 0:
                pm = (cur_price - prev_price) / prev_price
                if (action > 0 and pm > 0) or (action < 0 and pm < 0):
                    momentum_reward = self.momentum_reward_scale * abs(action) * abs(pm)

        return float(zone_shaping), float(static_penalty), float(momentum_reward)

    # ------------------------ Portfolio update (incremental action) ------------------------
    def _update_portfolio_incremental(self, action: float):
        """
        Treat ``action`` as a position adjustment executed at the current price, deduct
        the trading cost once, then re-value the portfolio at the next step's price.
        """
        # Record the pre-trade position (bounded history of the last 10)
        self.position_history.append(self.position)
        if len(self.position_history) > 10:
            self.position_history.pop(0)

        # New target position = current position + scaled action, kept within [-1, 1]
        new_position = float(np.clip(self.position + self.increment_step_size * action, -1.0, 1.0))
        position_change = new_position - self.position

        cur_price = float(self.prices[self.current_step])
        if cur_price <= 0:
            cur_price = 1e-8  # guard the division below

        # Position changed -> charge cost -> rebuild holdings at the target exposure
        if abs(position_change) > 1e-6:
            trade_value = abs(position_change) * self.portfolio_value
            trade_cost  = trade_value * (ENHANCED_FEE_RATE + ENHANCED_SLIPPAGE)  # cost is charged exactly once

            self.position = new_position
            target_equity_value = self.position * self.portfolio_value
            self.shares = target_equity_value / cur_price
            self.cash   = self.portfolio_value - target_equity_value - trade_cost
        else:
            self.position = new_position  # effectively unchanged

        # Re-value at the next step's price (skipped when there is no next row)
        if self.current_step + 1 < len(self.prices):
            next_price = float(self.prices[self.current_step + 1])
            equity_value = self.shares * next_price
            self.portfolio_value = float(self.cash + equity_value)

    # ------------------------ Reset/Step ------------------------
    def reset(self, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) -> Tuple[np.ndarray, Dict]:
        """Start a new episode with a flat position and the seed capital in cash.

        Returns:
            (observation, info) for the first step of the episode.
        """
        super().reset(seed=seed)

        if self.randomize_start:
            start_low = self.min_start
            # integers() excludes the upper bound, hence max_start + 1
            start_high = max(self.min_start + 1, self.max_start + 1)
            self.episode_start = int(self.np_random.integers(start_low, start_high))
        else:
            self.episode_start = int(self.min_start)

        self.current_step = int(self.episode_start)
        self.position = 0.0
        self.portfolio_value = float(ENHANCED_SEED_MONEY)
        self.cash = float(ENHANCED_SEED_MONEY)
        self.shares = 0.0

        self.position_history = []
        self.static_steps = 0

        observation = self._get_observation()
        return observation, self._base_info()

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, Dict]:
        """Apply one action and advance the episode by one step.

        Args:
            action: Scalar or array-like; its first element is used, clipped to [-1, 1].

        Returns:
            (observation, reward, terminated, truncated, info).
        """
        # Reduce the action to a single float; reshape(-1) also accepts Python
        # scalars and 0-d arrays, which plain ``action[0]`` would reject.
        a = float(np.clip(np.asarray(action, dtype=np.float64).reshape(-1)[0], -1.0, 1.0))

        # Episode already finished: return a terminal transition with zero reward
        if self.current_step >= self.episode_start + self.episode_length:
            observation = self._get_observation()
            return observation, 0.0, True, False, self._base_info()

        # Shaping rewards are based on the state before the trade
        zone_shaping, static_penalty, momentum_reward = self._shaping_rewards(a)

        # Remember old value -> trade and re-value -> PnL reward from the relative change
        old_value = float(self.portfolio_value)
        self._update_portfolio_incremental(a)
        pnl_reward = 0.0
        if old_value > 0:
            pnl_reward = (self.portfolio_value - old_value) / old_value * self.pnl_reward_scale

        # Combine all reward terms
        total_reward = float(pnl_reward + zone_shaping + static_penalty + momentum_reward)

        # Advance time
        self.current_step += 1
        terminated = bool(self.current_step >= self.episode_start + self.episode_length)
        truncated = False

        observation = self._get_observation()
        info = self._base_info()
        info.update({
            "pnl_reward": float(pnl_reward),
            "zone_shaping": float(zone_shaping),
            "static_penalty": float(static_penalty),
            "momentum_reward": float(momentum_reward),
            "static_steps": int(self.static_steps)
        })
        return observation, float(total_reward), terminated, truncated, info


In [None]:
# ===================== BATCH TRAINING SETUP - LOAD FROM CONFIG FILE =====================
from stable_baselines3.common.vec_env import DummyVecEnv
import os
import time
import json
from datetime import datetime
import pandas as pd  # ✅ 补：你下面用到了 pd

print("\n🎯 BATCH TRAINING SETUP - LOADING FROM CONFIG FILE")
print("=================================================")
print("🔄 Loading training configurations from external JSON file")

# Load paper-based processed data unless BOTH frames are already in the kernel
if 'train_df' not in globals() or 'test_df' not in globals():
    print("   📁 Loading paper-based data...")
    train_df = pd.read_csv(os.path.join(OUTPUT_DIR, "train_paper.csv"))
    test_df = pd.read_csv(os.path.join(OUTPUT_DIR, "test_paper.csv"))
    print(f"      Train: {len(train_df)} rows, Test: {len(test_df)} rows")

# The training cell reads feat_cols_paper; recreate it when the preprocessing cell
# did not get as far as defining it (e.g. it was interrupted, or data came from CSV).
if 'feat_cols_paper' not in globals():
    feat_cols_paper = ['z_score', 'zone']

# Check data quality
print(f"\n   🔍 Data validation:")
required_cols = [TS_COL, PRICE_COL, 'z_score', 'zone']
for col in required_cols:
    if col not in train_df.columns:
        raise ValueError(f"Missing required column: {col}")
    col_min, col_max = train_df[col].min(), train_df[col].max()
    if pd.api.types.is_numeric_dtype(train_df[col]):
        print(f"      ✅ {col}: range [{col_min:.3f}, {col_max:.3f}]")
    else:
        # Timestamp / string columns cannot take a float format spec: strings raise
        # ValueError and datetimes would print the spec itself, so show them as-is.
        print(f"      ✅ {col}: range [{col_min}, {col_max}]")

# ===================== LOAD TRAINING CONFIGURATIONS FROM JSON =====================
config_file = "training_config.json"

def _with_defaults(p: dict) -> dict:
    """Return a copy of ``p`` in which every missing config key has its default.

    Keys already present in ``p`` keep their value (even ``None``) and their
    position; missing keys are appended in the order listed below. ``p`` itself is
    never modified, and a falsy ``p`` (``None`` or empty) yields the full set of
    defaults.
    """
    defaults = (
        # Environment tunables
        ('action_reward_scale', 50.0),
        ('zone_reward_multiplier', 25.0),
        ('static_penalty', 10.0),
        ('momentum_reward_scale', 15.0),
        ('static_delta_thresh', 0.01),
        # Optional scales: None means "use the environment default"
        ('increment_step_size', None),
        ('pnl_reward_scale', None),
        # Algorithm hyper-parameters
        ('learning_rate', 5e-4),
        ('n_steps', 1024),
        ('gamma', 0.99),
        ('gae_lambda', 0.95),
        ('ent_coef', 0.02),
        ('vf_coef', 0.5),
        ('max_grad_norm', 0.5),
        ('total_timesteps', 75000),
    )

    filled = dict(p) if p else {}
    for key, fallback in defaults:
        if key not in filled:
            filled[key] = fallback
    return filled

try:
    print(f"\n📄 Loading configurations from: {config_file}")
    with open(config_file, 'r', encoding='utf-8') as f:
        raw = json.load(f)
    # Two layouts are accepted: {"training_configurations": [...]} or a bare list.
    # Only a dict has .get(), so a bare list must be taken as-is.
    training_configs = raw.get('training_configurations', raw) if isinstance(raw, dict) else raw
    if not isinstance(training_configs, list):
        raise ValueError("training_configurations should be a list of configs")
    # Fill in defaults for every entry
    for cfg in training_configs:
        if not isinstance(cfg, dict):
            raise ValueError("each training configuration should be a JSON object")
        cfg.setdefault('name', 'unnamed_config')
        cfg.setdefault('description', '')
        cfg['params'] = _with_defaults(cfg.get('params', {}))
    print(f"   ✅ Successfully loaded {len(training_configs)} configurations")

except FileNotFoundError:
    # No config file: continue with a single built-in configuration
    print(f"   ❌ Config file '{config_file}' not found!")
    print(f"   💡 Using fallback minimal configuration...")
    training_configs = [
        {
            'name': 'fallback_config',
            'description': 'Minimal fallback configuration',
            'params': _with_defaults({
                'action_reward_scale': 50.0,
                'zone_reward_multiplier': 25.0,
                'static_penalty': 10.0,
                'learning_rate': 0.0005,
                'total_timesteps': 50000
            })
        }
    ]

except json.JSONDecodeError as e:
    # A present-but-broken file is a user error: report it and stop
    print(f"   ❌ Error parsing JSON file: {str(e)}")
    print(f"   💡 Please check the JSON syntax in {config_file}")
    raise

# ===================== DISPLAY LOADED CONFIGS (SUMMARY) =====================
print("\n🧾 CONFIG SUMMARY")
print("-------------------------------------------------")
print(f"Total configs loaded: {len(training_configs)}")
preview_n = min(5, len(training_configs))
if preview_n:
    print(f"\n🔎 Preview first {preview_n}:")
    for i, cfg in enumerate(training_configs[:preview_n], start=1):
        p = cfg['params']
        # The defaults step stores None for "use the environment default", so a
        # .get(key, 'default') fallback never fires; map None to the label explicitly.
        step_size_label = 'default' if p.get('increment_step_size') is None else p['increment_step_size']
        pnl_scale_label = 'default' if p.get('pnl_reward_scale') is None else p['pnl_reward_scale']
        print(f"  {i}. {cfg['name']}")
        print(f"     - desc: {cfg.get('description','')}")
        print(f"     - env: action_reward_scale={p['action_reward_scale']}, "
              f"zone_reward_multiplier={p['zone_reward_multiplier']}, "
              f"static_penalty={p['static_penalty']}, "
              f"momentum_reward_scale={p['momentum_reward_scale']}, "
              f"static_delta_thresh={p['static_delta_thresh']}, "
              f"increment_step_size={step_size_label}, "
              f"pnl_reward_scale={pnl_scale_label}")
        print(f"     - a2c: lr={p['learning_rate']}, n_steps={p['n_steps']}, "
              f"gamma={p['gamma']}, gae_lambda={p['gae_lambda']}, "
              f"ent_coef={p['ent_coef']}, vf_coef={p['vf_coef']}, "
              f"max_grad_norm={p['max_grad_norm']}, total_timesteps={p['total_timesteps']}")

# Optional: write the resolved config list to a small timestamped snapshot file
# so the exact settings of this run can be reviewed later
snapshot_path = f"training_config_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(snapshot_path, "w") as f:
    json.dump(training_configs, f, indent=2)
print(f"\n💾 Snapshot saved to: {snapshot_path}")


In [None]:
import glob
import os
import time
import json
from datetime import datetime

from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

# ===================== EXECUTE BATCH TRAINING WITH CHECKPOINT SUPPORT =====================
print("\n🔥 STARTING BATCH TRAINING - ALL CONFIGURATIONS")
print("===============================================")
print("🔄 Checkpoint support: Will skip existing models")

trained_models = {}
training_results = []
total_start_time = time.time()

# Check for existing models and training summary
existing_models = set()
existing_summary_file = None

# Find most recent training summary.
# NOTE(review): max() compares file names as strings, which picks the newest file
# only if the part matched by '*' sorts chronologically — confirm the naming scheme.
summary_files = glob.glob("batch_training_summary_*.json")
if summary_files:
    existing_summary_file = max(summary_files)
    print(f"📁 Found existing training summary: {existing_summary_file}")

    try:
        with open(existing_summary_file, 'r') as f:
            existing_results = json.load(f)

        # Stage the successful entries first and commit them only once the whole
        # summary has been read, so a malformed entry cannot leave half-loaded
        # results behind while the message below announces a fresh start.
        prior_successes = [r for r in existing_results if r.get('status') == 'success']
        prior_names = {r['name'] for r in prior_successes}

        training_results.extend(prior_successes)  # Keep existing results
        existing_models.update(prior_names)

        print(f"✅ Found {len(existing_models)} existing successful models")
        print(f"🔄 Will resume training from where it left off")

    except Exception as e:
        print(f"⚠️  Could not load existing summary: {str(e)}")
        print(f"🔄 Starting fresh training")

# Check for existing model files (a saved model is "./a2c_<name>.zip")
existing_model_files = set()
for config in training_configs:
    model_path = f"./a2c_{config['name']}"
    if os.path.exists(f"{model_path}.zip"):
        existing_model_files.add(config['name'])

if existing_model_files:
    print(f"📂 Found {len(existing_model_files)} existing model files")
    print(f"🔄 These will be skipped even if not in summary")

# Calculate what needs to be trained
configs_to_train = []
skipped_configs = []

# Names that already have an entry in training_results; kept as a set so the
# membership test below does not rebuild a list for every configuration.
recorded_names = {r['name'] for r in training_results}

for config in training_configs:
    config_name = config['name']

    # Skip if model exists in summary or as file
    if config_name in existing_models or config_name in existing_model_files:
        skipped_configs.append(config_name)

        # Add to training results if not already there
        if config_name not in recorded_names:
            training_results.append({
                'name': config_name,
                'params': config['params'],
                'model_path': f"./a2c_{config_name}",
                'training_time_minutes': 0.0,  # Unknown time
                'status': 'success',
                'note': 'Loaded from existing model file'
            })
            recorded_names.add(config_name)
    else:
        configs_to_train.append(config)

print(f"\n📊 TRAINING PLAN:")
print("=" * 50)
print(f"Total configurations: {len(training_configs)}")
print(f"Already completed: {len(skipped_configs)}")
print(f"Need to train: {len(configs_to_train)}")
print(f"Estimated time: ~{len(configs_to_train) * 0.15:.1f} minutes")

if skipped_configs:
    print(f"\n⏭️  SKIPPING EXISTING MODELS:")
    for name in skipped_configs[:10]:  # Show first 10
        print(f"   ✅ {name}")
    if len(skipped_configs) > 10:
        print(f"   ... and {len(skipped_configs) - 10} more")

if not configs_to_train:
    print(f"\n🎉 ALL MODELS ALREADY TRAINED!")
    print(f"✅ No new training needed")
    successful_models = [r for r in training_results if r.get('status') == 'success']
else:
    print(f"\n🚀 STARTING TRAINING FOR {len(configs_to_train)} NEW CONFIGURATIONS...")

# -------------------- Change 1: inject parameters only; the reward logic is not overridden --------------------
class ConfiguredEnhancedTradingEnv(EnhancedTradingEnv):
    """EnhancedTradingEnv whose tunable attributes are overridden from a params dict.

    Only attributes are changed; every method is inherited unchanged from the
    parent class.
    """

    # (instance attribute, key in the params dict)
    _PARAM_ATTRS = (
        ('action_reward_scale',    'action_reward_scale'),
        ('zone_reward_multiplier', 'zone_reward_multiplier'),
        ('static_penalty_scale',   'static_penalty'),
        ('momentum_reward_scale',  'momentum_reward_scale'),
        ('static_delta_thresh',    'static_delta_thresh'),
        ('increment_step_size',    'increment_step_size'),
        ('pnl_reward_scale',       'pnl_reward_scale'),
    )

    def __init__(self, df, feat_cols, episode_length=800, randomize_start=True, params=None):
        """
        Args:
            df, feat_cols, episode_length, randomize_start: forwarded to the parent.
            params: Optional mapping of config keys to values. A key that is missing
                or set to None leaves the value chosen by the parent untouched.
        """
        super().__init__(df, feat_cols, episode_length, randomize_start)
        p = params or {}
        # Copy the supplied values onto the instance (not onto module globals), so
        # one model's settings cannot leak into another environment. The parent has
        # already set its defaults, so no second copy of them is kept here.
        for attr_name, param_key in self._PARAM_ATTRS:
            value = p.get(param_key)
            if value is not None:
                setattr(self, attr_name, float(value))

# Train only the missing configurations.
# For each one: build a parameterised environment, train an A2C model, save it,
# record the outcome in training_results, then write a JSON checkpoint.
for i, config in enumerate(configs_to_train):
    config_name = config['name']
    params = config['params']

    print(f"\n🎯 TRAINING CONFIG {i+1}/{len(configs_to_train)}: {config_name.upper()}")
    print(f"   📍 Overall progress: {len(skipped_configs) + i + 1}/{len(training_configs)}")
    print("=" * 60)

    start_time = time.time()

    # Only the training itself lives inside the try block. Checkpoint I/O is done
    # afterwards so that a failed write can neither relabel a saved model as
    # 'failed' nor raise from inside the exception handler and abort the batch.
    try:
        # Change 2: hand this configuration's params to the environment on creation.
        def make_env():
            env = ConfiguredEnhancedTradingEnv(
                df=train_df,
                feat_cols=feat_cols_paper,
                episode_length=800,
                randomize_start=True,
                params=params
            )
            return Monitor(env)

        train_env = DummyVecEnv([make_env])

        print(f"   🔧 Environment configured with custom parameters:")
        print(f"      Action reward scale: {params.get('action_reward_scale', 50.0)}")
        print(f"      Zone reward multiplier: {params.get('zone_reward_multiplier', 25.0)}")
        print(f"      Static penalty: {params.get('static_penalty', 10.0)}")
        print(f"      Momentum reward scale: {params.get('momentum_reward_scale', 15.0)}")
        if 'increment_step_size' in params or 'pnl_reward_scale' in params:
            print(f"      step_size={params.get('increment_step_size', 'default')}, "
                  f"pnl_scale={params.get('pnl_reward_scale', 'default')}")

        # Create A2C model; hyperparameters missing from params use these fallbacks.
        model = A2C(
            policy="MlpPolicy",
            env=train_env,
            learning_rate=params.get('learning_rate', 0.0005),
            n_steps=params.get('n_steps', 1024),
            gamma=params.get('gamma', 0.99),
            gae_lambda=params.get('gae_lambda', 0.95),
            ent_coef=params.get('ent_coef', 0.02),
            vf_coef=params.get('vf_coef', 0.5),
            max_grad_norm=params.get('max_grad_norm', 0.5),
            verbose=0,
            seed=42,
            device='auto'
        )

        # Train the model
        total_timesteps = int(params.get('total_timesteps', 75000))
        print(f"   ⏱️  Training for {total_timesteps:,} timesteps...")
        model.learn(total_timesteps=total_timesteps)

        # Save the model and keep the trained instance available in memory.
        model_path = f"./a2c_{config_name}"
        model.save(model_path)
        trained_models[config_name] = model

        training_time = (time.time() - start_time) / 60.0

        print(f"   ✅ Training complete! Time: {training_time:.1f} minutes")
        print(f"   💾 Model saved to: {model_path}")

        run_record = {
            'name': config_name,
            'params': params,
            'model_path': model_path,
            'training_time_minutes': training_time,
            'status': 'success'
        }

    except Exception as e:
        # Best effort: one failing configuration must not stop the rest of the batch.
        error_time = (time.time() - start_time) / 60.0
        print(f"   ❌ Training failed: {str(e)}")
        print(f"   ⏱️  Time before failure: {error_time:.1f} minutes")

        run_record = {
            'name': config_name,
            'params': params,
            'error': str(e),
            'training_time_minutes': error_time,
            'status': 'failed'
        }

    # Exactly one record per attempted configuration.
    training_results.append(run_record)

    # Save progress after every attempt (success or failure). The file name has
    # minute resolution, so attempts finishing within the same minute overwrite
    # the same checkpoint file.
    checkpoint_file = f"batch_training_summary_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
    try:
        with open(checkpoint_file, 'w') as f:
            json.dump(training_results, f, indent=2)
    except (OSError, TypeError, ValueError) as checkpoint_error:
        # The model itself is already handled above; just report the lost checkpoint.
        print(f"   ⚠️  Could not write checkpoint {checkpoint_file}: {checkpoint_error}")
    else:
        if run_record['status'] == 'success':
            print(f"   💾 Progress saved to: {checkpoint_file}")

# Wrap-up of the batch run: timing, success/failure counts and the final summary file.
total_time = (time.time() - total_start_time) / 60.0
successful_models = [r for r in training_results if r.get('status') == 'success']
failed_models = [r for r in training_results if r.get('status') == 'failed']

# len(configs_to_train) is the number of *attempts*; failed attempts must not be
# reported as trained models, so count the successful records whose name was
# scheduled for training in this run.
attempted_names = {c['name'] for c in configs_to_train}
newly_trained_count = sum(1 for r in successful_models if r.get('name') in attempted_names)

print(f"\n🎉 BATCH TRAINING COMPLETE!")
print("=" * 50)
print(f"⏱️  Total time: {total_time:.1f} minutes")
print(f"✅ Successful models: {len(successful_models)}/{len(training_configs)}")
print(f"❌ Failed models: {len(failed_models)}")
print(f"⏭️  Skipped existing: {len(skipped_configs)}")
print(f"🆕 New models trained: {newly_trained_count}")
print(f"📁 Models saved in current directory")

# Save final training summary (same naming scheme as the per-model checkpoints).
final_summary_file = f"batch_training_summary_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
with open(final_summary_file, 'w') as f:
    json.dump(training_results, f, indent=2)

print(f"💾 Final training summary saved to: {final_summary_file}")

if successful_models:
    print(f"\n🚀 READY FOR EVALUATION OF ALL MODELS!")
    print(f"📊 {len(successful_models)} models available for evaluation")
else:
    print(f"\n⚠️  No successful models to evaluate")


In [None]:
# === 用训练时的 params 还原评估环境（与训练保持一致） ===
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

def build_eval_env_from_params(params, df, feat_cols, episode_length=None, randomize_start=False):
    """
    Build a one-environment DummyVecEnv for evaluation, configured from training params.

    Args:
        params: dict of overrides used at training time (may be None or empty).
        df, feat_cols: forwarded to EnhancedTradingEnv.
        episode_length: episode length to use; when falsy, it defaults to
            max(100, len(df) - max(MA_PERIOD, WINDOW_SIZE) - 1).
        randomize_start: forwarded to EnhancedTradingEnv.

    Returns:
        DummyVecEnv holding a single Monitor-wrapped environment.
    """
    if episode_length:
        ep_len = episode_length
    else:
        ep_len = max(100, len(df) - max(MA_PERIOD, WINDOW_SIZE) - 1)
    overrides = params if params else {}

    class EvalConfiguredEnv(EnhancedTradingEnv):
        def __init__(self):
            super().__init__(
                df=df,
                feat_cols=feat_cols,
                episode_length=ep_len,
                randomize_start=randomize_start
            )
            # Same injection as in training: only tunable attributes are set
            # (per the original author's note, the parent's step() reads them).
            # Each row is (attribute on self, key in params, fallback).
            always_assigned = (
                ('action_reward_scale',    'action_reward_scale',    50.0),
                ('zone_reward_multiplier', 'zone_reward_multiplier', 25.0),
                ('static_penalty_scale',   'static_penalty',         10.0),
                ('momentum_reward_scale',  'momentum_reward_scale',  15.0),
                ('static_delta_thresh',    'static_delta_thresh',    0.01),
            )
            for attr_name, param_key, fallback in always_assigned:
                setattr(self, attr_name, float(overrides.get(param_key, fallback)))

            # Instance-level injection of the action step size and the PnL reward
            # scale (module globals are no longer modified, which avoids
            # cross-model side effects). Only set when a non-None value is given.
            for optional_key in ('increment_step_size', 'pnl_reward_scale'):
                supplied = overrides.get(optional_key)
                if supplied is not None:
                    setattr(self, optional_key, float(supplied))

    def _make_monitored_env():
        # Monitor wrapper makes it easy to record evaluation statistics (optional).
        return Monitor(EvalConfiguredEnv())

    return DummyVecEnv([_make_monitored_env])

def restore_model_and_env(model_entry, df, feat_cols):
    """
    Restore a trained model together with an evaluation environment for it.

    Args:
        model_entry: one training_results record; 'model_path' is required,
            'params' is optional (defaults to an empty dict).
        df, feat_cols: data and feature columns handed to the evaluation environment.

    Returns:
        (model, env): the loaded A2C model and an environment built by
        build_eval_env_from_params from the record's params, with
        randomize_start=False.
    """
    train_params = model_entry.get("params", {})
    saved_path = model_entry["model_path"]

    eval_env = build_eval_env_from_params(
        train_params,
        df=df,
        feat_cols=feat_cols,
        randomize_start=False,
    )
    loaded_model = A2C.load(saved_path, device='auto')
    return loaded_model, eval_env


In [None]:
# ===================== Paper-Based Evaluation and Visualization (FIXED) =====================
import os
import matplotlib.pyplot as plt
import numpy as np
import json
from datetime import datetime

# Section banner for the evaluation cell.
for banner_line in (
    "\n📊 PAPER-BASED MODEL EVALUATION",
    "==============================",
    "This will capture all trading steps like the original but with paper-based methodology",
):
    print(banner_line)

def evaluate_detailed_trading(model, env, num_episodes=1, evalulation_in_minutes=40320):
    """
    Roll a trained model through a vectorized environment and record every step.

    Args:
        model: object exposing ``predict(obs, deterministic=True)`` that returns
            ``(action, state)``.
        env: vectorized environment. Only its first sub-environment is recorded
            (``env.envs[0]``, ``rewards[0]``, ``infos[0]``, ``dones[0]``). Each
            info dict must provide 'portfolio_value', 'position' and 'nav'; the
            underlying env must expose ``prices``, ``z_scores`` and ``zones``.
        num_episodes: number of rollouts to run.
        evalulation_in_minutes: maximum number of steps per rollout. The
            misspelled name is kept because callers may pass it by keyword.

    Returns:
        A list with one dict per episode. Each dict holds parallel lists under
        'timestamps' (env step index), 'prices', 'portfolio_values', 'positions',
        'z_scores', 'zones', 'actions', 'rewards' and 'navs'.
    """
    import numpy as np
    print(f"📈 Running detailed trading analysis...")

    def _unwrap_base_env(vecenv):
        # Peel the wrappers off the first sub-environment: take `.unwrapped`
        # when available, then follow the `.env` chain down to the bottom.
        base_env = vecenv.envs[0]
        base_env = getattr(base_env, "unwrapped", base_env)
        while hasattr(base_env, "env") and getattr(base_env, "env") is not None:
            base_env = base_env.env
        return base_env

    all_episodes = []

    for episode in range(num_episodes):
        print(f"   Episode {episode + 1}/{num_episodes}")

        obs = env.reset()  # VecEnv.reset() -> obs
        # The wrapper chain does not change between steps: resolve it once.
        base_env = _unwrap_base_env(env)
        tracks_step = hasattr(base_env, "current_step")
        done = False
        step_count = 0
        stride = 1  # last observed advance of current_step per env.step()

        episode_data = {
            'timestamps': [], 'prices': [], 'portfolio_values': [], 'positions': [],
            'z_scores': [], 'zones': [], 'actions': [], 'rewards': [], 'navs': []
        }

        while not done and step_count < evalulation_in_minutes:
            step_before = int(getattr(base_env, "current_step", 0))

            action, _ = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)
            done = bool(dones[0]) if isinstance(dones, (list, tuple, np.ndarray)) else bool(dones)

            r0 = float(rewards[0]) if np.ndim(rewards) else float(rewards)
            info0 = infos[0] if isinstance(infos, (list, tuple)) else infos

            # action -> scalar
            try:
                a0 = float(np.array(action).reshape(-1)[0])
            except Exception:
                a0 = float(action[0]) if isinstance(action, (list, tuple)) else float(action)

            if done and tracks_step:
                # stable-baselines3 VecEnvs reset a sub-environment inside step()
                # as soon as it reports done, so base_env.current_step may already
                # belong to the next episode. Infer the terminal index from the
                # index seen before the step (assumes the env keeps advancing by
                # the stride observed so far) and clamp it to the price series so
                # the final portfolio value is never dropped.
                cur_step = max(0, min(step_before + stride, len(base_env.prices) - 1))
            else:
                cur_step = int(getattr(base_env, "current_step", 0))
                if cur_step > step_before:
                    stride = cur_step - step_before

            if cur_step < len(base_env.prices):
                episode_data['timestamps'].append(cur_step)
                episode_data['prices'].append(float(base_env.prices[cur_step]))
                episode_data['portfolio_values'].append(float(info0['portfolio_value']))
                episode_data['positions'].append(float(info0['position']))
                # Fall back to 0.0 / zone 2 when the index is past those arrays.
                episode_data['z_scores'].append(float(base_env.z_scores[cur_step]) if cur_step < len(base_env.z_scores) else 0.0)
                episode_data['zones'].append(int(base_env.zones[cur_step]) if cur_step < len(base_env.zones) else 2)
                episode_data['actions'].append(a0)
                episode_data['rewards'].append(r0)
                episode_data['navs'].append(float(info0['nav']))

            step_count += 1

        all_episodes.append(episode_data)

        # Per-episode summary relative to the starting capital.
        base_cash = ENHANCED_SEED_MONEY
        final_value = episode_data['portfolio_values'][-1] if episode_data['portfolio_values'] else base_cash
        total_return = (final_value - base_cash) / base_cash
        total_steps = len(episode_data['portfolio_values'])
        print(f"      Steps: {total_steps}, Final Value: ${final_value:,.2f}, Return: {total_return:.2%}")

    return all_episodes

def create_enhanced_trading_plots(episode_data, tag="model"):
    """
    Draw a four-panel summary figure for one evaluated episode and save it as a PNG.

    Panels, top to bottom: portfolio value against a buy-and-hold benchmark,
    drawdown in percent, position over time, and the price series.

    Args:
        episode_data: dict with 'timestamps', 'prices', 'portfolio_values' and
            'positions'. All series are plotted against the same step axis, so
            they are expected to have the same length.
        tag: label embedded in the figure title and, sanitised, in the output
            file name so that charts of different models do not overwrite each other.

    Returns:
        None. Prints a message and returns early when 'timestamps' is empty.
    """
    if not episode_data['timestamps']:
        print("❌ No data to plot")
        return

    # The x axis is the sample index, not the environment's own step counter.
    n_points = len(episode_data['timestamps'])
    steps_axis = list(range(n_points))

    # Buy & hold benchmark: the seed money scaled by the price relative to the first sample.
    price_series = np.array(episode_data['prices'], dtype=float)
    first_price = price_series[0]
    benchmark_values = ENHANCED_SEED_MONEY * (price_series / first_price)

    # ---- figure ----
    fig, (value_ax, drawdown_ax, position_ax, price_ax) = plt.subplots(4, 1, figsize=(16, 12))
    fig.suptitle(f'Enhanced A2C Trading Model Performance - {tag}', fontsize=16, fontweight='bold')

    # 1) Portfolio value vs. buy & hold
    model_values = episode_data['portfolio_values']
    value_ax.plot(steps_axis, model_values, label='Paper-Based A2C Model', linewidth=2)
    value_ax.plot(steps_axis, benchmark_values, label='Buy & Hold', linewidth=2)
    value_ax.axhline(y=ENHANCED_SEED_MONEY, linestyle='--', alpha=0.7, label='Initial Investment')
    value_ax.set_title('Portfolio Value Comparison')
    value_ax.set_ylabel('Portfolio Value ($)')
    value_ax.legend()
    value_ax.grid(True, alpha=0.3)

    # 2) Drawdown: distance below the running maximum, in percent
    value_arr = np.array(model_values, dtype=float)
    running_peak = np.maximum.accumulate(value_arr)
    drawdown_pct = (value_arr - running_peak) / running_peak * 100
    worst_drawdown = np.min(drawdown_pct)
    drawdown_ax.fill_between(steps_axis, drawdown_pct, 0, alpha=0.3)
    drawdown_ax.plot(steps_axis, drawdown_pct, linewidth=1)
    drawdown_ax.set_title(f'Drawdown Analysis (Max: {worst_drawdown:.1f}%)')
    drawdown_ax.set_ylabel('Drawdown (%)')
    drawdown_ax.grid(True, alpha=0.3)

    # 3) Position, with reference lines at flat, fully long and fully short
    position_ax.plot(steps_axis, episode_data['positions'], linewidth=1.5, label="Position")
    position_ax.axhline(y=0, linestyle='-', alpha=0.5)
    position_ax.axhline(y=1, linestyle='--', alpha=0.5, label='Max Long')
    position_ax.axhline(y=-1, linestyle='--', alpha=0.5, label='Max Short')
    position_ax.set_title('Position over Time')
    position_ax.set_ylabel('Position [-1, 1]')
    position_ax.legend()
    position_ax.grid(True, alpha=0.3)

    # 4) Price
    price_ax.plot(steps_axis, price_series, linewidth=1, alpha=0.8)
    price_ax.set_title('ETH Price Movement During Evaluation')
    price_ax.set_ylabel('ETH Price ($)')
    price_ax.set_xlabel('Time Steps')
    price_ax.grid(True, alpha=0.3)

    plt.tight_layout()

    # --- unique file name: every character of the tag that is not alphanumeric
    # or one of "-_." becomes an underscore ---
    keep_as_is = "-_."
    safe_tag = "".join(
        [ch if (ch.isalnum() or ch in keep_as_is) else "_" for ch in str(tag)]
    )
    out_dir = "eval_charts"
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'paper_based_performance_{safe_tag}.png')

    plt.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.show()
    print(f"📁 Saved chart: {out_path}")

def calculate_detailed_metrics(episode_data):
    """
    Compute summary trading metrics from one recorded episode.

    Args:
        episode_data: dict with parallel 'portfolio_values' and 'positions' sequences.

    Returns:
        {"error": ...} when 'portfolio_values' is empty, otherwise a dict with
        FinalNAV, FinalValue, TotalReturn, CAGR, Sharpe, MaxDD (positive
        fraction), Trades, WinRate, AvgWin, AvgLoss, AvgPos and TotalSteps.
        Returns are measured against ENHANCED_SEED_MONEY. CAGR is float('inf')
        when the annualised growth overflows a float, and -1.0 when the final
        value is not positive.
    """
    if not episode_data['portfolio_values']:
        return {"error": "No data for metrics calculation"}

    portfolio_values = np.array(episode_data['portfolio_values'], dtype=float)
    positions = np.array(episode_data['positions'], dtype=float)

    # Basic metrics, relative to the starting capital of the environment.
    initial_value = ENHANCED_SEED_MONEY
    final_value = float(portfolio_values[-1])
    total_return = (final_value - initial_value) / initial_value

    # Trades: step-to-step position changes larger than 0.01.
    position_changes = np.abs(np.diff(positions))
    trades = int(np.sum(position_changes > 0.01))

    # Win rate over step-by-step portfolio value changes (flat steps are ignored).
    returns = np.diff(portfolio_values) / portfolio_values[:-1]
    wins = int(np.sum(returns > 0))
    losses = int(np.sum(returns < 0))
    win_rate = wins / (wins + losses) if (wins + losses) > 0 else 0.0

    # Average win/loss
    avg_win = float(np.mean(returns[returns > 0])) if np.any(returns > 0) else 0.0
    avg_loss = float(np.mean(returns[returns < 0])) if np.any(returns < 0) else 0.0

    # Sharpe ratio, annualised assuming one sample per minute (~525600 per year).
    if len(returns) > 1 and np.std(returns) > 0:
        sharpe = float(np.mean(returns) / np.std(returns) * np.sqrt(525600))
    else:
        sharpe = 0.0

    # Maximum drawdown: largest relative drop below the running peak.
    peak = np.maximum.accumulate(portfolio_values)
    drawdown = (portfolio_values - peak) / peak
    max_drawdown = float(np.min(drawdown))

    # Average position
    avg_position = float(np.mean(np.abs(positions)))

    # CAGR (only indicative on short samples).
    periods = len(portfolio_values)
    if periods > 0:
        growth = final_value / initial_value
        if growth <= 0:
            # A non-positive base with a fractional exponent yields a complex
            # number; treat a wiped-out (or negative) portfolio as a -100% rate.
            cagr = -1.0
        else:
            try:
                cagr = float(growth ** (525600 / periods) - 1)
            except OverflowError:
                # Short samples give a huge exponent; Python float power raises
                # instead of returning infinity.
                cagr = float('inf')
    else:
        cagr = 0.0

    metrics = {
        "FinalNAV": final_value / initial_value,
        "FinalValue": final_value,
        "TotalReturn": total_return,
        "CAGR": cagr,
        "Sharpe": sharpe,
        "MaxDD": abs(max_drawdown),
        "Trades": trades,
        "WinRate": win_rate,
        "AvgWin": avg_win,
        "AvgLoss": avg_loss,
        "AvgPos": avg_position,
        "TotalSteps": periods
    }
    return metrics

# Confirmation that the evaluation helpers of this cell are now defined.
print(
    "✅ Evaluation functions defined",
    "   - evaluate_detailed_trading(): Captures all trading steps",
    "   - create_enhanced_trading_plots(): Shows price, positions, portfolio",
    "   - calculate_detailed_metrics(): Comprehensive performance metrics",
    sep="\n",
)


In [None]:
# ===================== EVALUATE ALL TRAINED MODELS (FIXED) =====================
# Section banner for the batch evaluation cell.
print("\n🎯 EVALUATING ALL TRAINED MODELS", "===============================", sep="\n")

import os
import json
import glob
import numpy as np
from datetime import datetime

# Pick the training summary to evaluate. The fixed-name file (if present) is
# listed first, followed by the timestamped ones; the candidate with the most
# recent modification time is the one that gets loaded.
summary_files = [name for name in ("batch_training_summary.json",) if os.path.exists(name)]
summary_files.extend(glob.glob("batch_training_summary_*.json"))

if not summary_files:
    print("❌ No training summary found")
    successful_models = []
else:
    latest_summary = max(summary_files, key=os.path.getmtime)
    print(f"📁 Loading training results from: {latest_summary}")
    with open(latest_summary, 'r') as f:
        training_results = json.load(f)
    # Only records flagged as successful are evaluated.
    successful_models = [r for r in training_results if r.get('status') == 'success']

print(f"📊 Evaluating {len(successful_models)} successfully trained models")

all_evaluation_results = []

# Evaluate every successful model on the test data; a failure in one model is
# reported and the loop carries on with the next one.
for i, result in enumerate(successful_models):
    config_name = result['name']
    model_path = result['model_path']
    print(f"\n📈 EVALUATING {i+1}/{len(successful_models)}: {config_name.upper()}")
    print("=" * 60)

    try:
        # Key step: rebuild the environment from this model's own params and load the model.
        model, test_env = restore_model_and_env(result, df=test_df, feat_cols=feat_cols_paper)

        # Run the detailed, step-by-step evaluation.
        detailed_episodes = evaluate_detailed_trading(model, test_env, num_episodes=1)

        if not (detailed_episodes and detailed_episodes[0].get('portfolio_values')):
            print("   ❌ Evaluation failed - no trading data")
            continue

        episode_data = detailed_episodes[0]
        detailed_metrics = calculate_detailed_metrics(episode_data)

        # Compute the average absolute position once and store it with the
        # result, so the comparison further down does not have to recompute it.
        if episode_data.get('positions'):
            avg_pos = float(np.mean(np.abs(episode_data['positions'])))
        else:
            avg_pos = float('inf')

        all_evaluation_results.append({
            'config_name': config_name,
            'model_path': model_path,
            'metrics': detailed_metrics,
            'episode_data': episode_data,
            'avg_abs_position': avg_pos
        })

        print(f"   📊 METRICS SUMMARY:")
        print(f"      Final NAV: {detailed_metrics['FinalNAV']:.3f}")
        print(f"      Avg |Position|: {avg_pos:.3f}")
        print(f"      Trades: {detailed_metrics['Trades']}")
        print(f"      Sharpe: {detailed_metrics['Sharpe']:.3f}")

        # Classification by average absolute position: < 0.5, < 0.7, otherwise.
        if avg_pos < 0.5:
            status = "🟢 BALANCED - Static position problem SOLVED!"
        elif avg_pos < 0.7:
            status = "🟡 MODERATE - Partially solved"
        else:
            status = "🔴 STATIC - Still has position sticking"
        print(f"      Status: {status}")

    except Exception as e:
        print(f"   ❌ Evaluation error: {e}")

# Comparison table, ordered from the lowest to the highest average |position|.
print(f"\n🏆 FINAL COMPARISON - ALL MODELS")
print("=" * 80)
print(f"{'Model':<20} {'NAV':<8} {'AvgPos':<8} {'Trades':<8} {'Sharpe':<8} {'Status':<15}")
print("-" * 80)

if not all_evaluation_results:
    print("（no results）")
else:
    for r in sorted(all_evaluation_results, key=lambda x: x['avg_abs_position']):
        name = r['config_name'][:18]  # truncated to fit the 20-character column
        nav = format(r['metrics']['FinalNAV'], ".3f")
        avg_pos = format(r['avg_abs_position'], ".3f")
        trades = str(r['metrics']['Trades'])
        sharpe = format(r['metrics']['Sharpe'], ".2f")
        if r['avg_abs_position'] < 0.5:
            status = "🟢 BALANCED"
        elif r['avg_abs_position'] < 0.7:
            status = "🟡 MODERATE"
        else:
            status = "🔴 STATIC"
        print(f"{name:<20} {nav:<8} {avg_pos:<8} {trades:<8} {sharpe:<8} {status:<15}")

# Save the evaluation results; values json cannot encode natively are written via str().
eval_file = "batch_evaluation_results_" + datetime.now().strftime('%Y%m%d_%H%M%S') + ".json"
with open(eval_file, 'w') as f:
    json.dump(all_evaluation_results, f, indent=2, default=str)
print(f"\n💾 Evaluation results saved to: {eval_file}")

# Recommendations and charts (charts only when create_enhanced_trading_plots is defined).
if all_evaluation_results:
    # "Most balanced" = lowest average |position|; "best performance" = highest final NAV.
    best_balanced = min(all_evaluation_results, key=lambda x: x['avg_abs_position'])
    best_performance = max(all_evaluation_results, key=lambda x: x['metrics']['FinalNAV'])

    print(f"\n🎯 RECOMMENDATIONS:")
    print(f"   Most Balanced: {best_balanced['config_name']} "
          f"(AvgPos: {best_balanced['avg_abs_position']:.3f}, NAV: {best_balanced['metrics']['FinalNAV']:.3f})")
    print(f"   Best Performance: {best_performance['config_name']} "
          f"(NAV: {best_performance['metrics']['FinalNAV']:.3f}, AvgPos: {best_performance['avg_abs_position']:.3f})")

    # Look the plotting helper up explicitly instead of catching NameError around
    # the calls: a NameError raised *inside* the helper (for example a missing
    # constant) is then no longer misreported as "not defined" and hidden.
    plot_fn = globals().get('create_enhanced_trading_plots')
    if callable(plot_fn):
        plotted = set()
        for chosen in [best_balanced, best_performance]:
            cname = chosen['config_name']
            if cname in plotted:
                # Same model won both categories: draw its chart only once.
                continue
            plot_fn(chosen['episode_data'], tag=cname)
            plotted.add(cname)
        print(f"📊 Performance charts saved for {len(plotted)} model(s)")
    else:
        print("ℹ️  Skipping plots (create_enhanced_trading_plots not defined)")
else:
    print(f"\n⚠️  No successful evaluations to analyze")
