# Deep Reinforcement Learning for Ethereum Trading — 15D Methodology

This notebook implements the paper-summary methodology for ETH trading with deep reinforcement learning (DRL), adapted to 15-day episodes and a simplified signal design intended to reduce overfitting.

## Structure
1. Imports
2. Configuration (defaults + JSON override support)
3. Utilities (signals, metrics, helpers)
4. Data Loading & Feature Preparation
5. Trading Environment (Enhanced incremental actions + PnL-focused rewards)
6. Training Helpers (A2C)
7. Bulk Config Testing (reads external JSON of runs)
8. Evaluation & Visualization

Notes:
- Signals use a moving-average pseudo-spread and rolling z-score to define trading zones.
- The environment uses incremental position changes and PnL-focused rewards to reduce static positions.
- Bulk testing reads a separate JSON file to sweep hyperparameters and environment knobs.


In [None]:
# ===================== IMPORTS =====================
import os
import json
import time
import numpy as np
import pandas as pd
from datetime import datetime
from typing import Dict, Any, Optional, Tuple, List

# RL / Env
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement

# Viz
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide matplotlib defaults; individual plots may still pass their own figsize.
plt.style.use('default')
plt.rcParams['figure.figsize'] = (15, 8)  # default (width, height) in inches
plt.rcParams['figure.dpi'] = 110

print("✅ Imports ready")


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


✅ Imports ready


In [None]:
# ===================== CONFIGURATION (defaults + JSON override) =====================

# Data
DATA_PATH = "../ETHUSDT_1m_with_indicators.parquet"
TIMESTAMP_COL = "ts"  # canonical timestamp column; load_and_prepare renames common alternatives to it
PRICE_COL = "close"  # price column used for signals and for portfolio valuation in the env
OUTPUT_DIR = "./processed_data_15d"  # models, eval logs and bulk result CSVs are written here

# Signal generation (pseudo-spread + z-score)
MA_PERIOD = 60  # rolling window (rows) of the moving average subtracted from price
WINDOW_SIZE = 120  # rolling window (rows) for the spread mean/std behind the z-score
OPEN_THRESHOLD = 2.0  # |z| above this maps to the LONG / SHORT zones
CLOSE_THRESHOLD = 0.5  # |z| at or below this maps to the CLOSE zone

# Trading
SEED_MONEY = 10000.0  # starting portfolio value of every episode
FEE_RATE = 0.00005  # summed with SLIPPAGE and charged on the traded value each step
SLIPPAGE = 0.00005

# Episode sizing for 15 days
# NOTE(review): lengths are counted in rows; this presumably assumes one row per minute — confirm against the data.
MINUTES_PER_DAY = 24 * 60
EPISODE_LENGTH = 15 * MINUTES_PER_DAY
EPISODE_LENGTH_EVAL = 15 * MINUTES_PER_DAY

# Temporal split
TRAIN_DURATION_MINUTES = 365 * MINUTES_PER_DAY  # rows in the training window
TEST_DURATION_MINUTES = 28 * MINUTES_PER_DAY  # rows in the test window that directly follows it
MIN_DATA_BUFFER = 200  # earliest row index at which the training window may start
RANDOM_SEED = 42  # seed for choosing the split start

# Enhanced rewards / actions
INCREMENTAL_STEP_SIZE = 0.3  # an action in [-1, 1] moves the position by at most this much per step
PNL_REWARD_SCALE = 80.0  # multiplier applied to the per-step PnL reward term
MOMENTUM_REWARD_SCALE = 0.5  # not referenced by the code visible in this notebook
ACTIVITY_REWARD_SCALE = 0.1  # bonus per unit of position change above STATIC_DELTA_THRESH
STATIC_DELTA_THRESH = 0.015  # position changes at or below this count as a "static" step
STATIC_PENALTY_BASE = 0.05  # base penalty applied on a static step
STATIC_PENALTY_ESCALATION = 0.2  # penalty growth per consecutive static step

# A2C defaults
TOTAL_TIMESTEPS = 100_000
LEARNING_RATE = 1e-4
BATCH_SIZE = 1024  # used as the default A2C `n_steps` in train_a2c
GAMMA = 0.99
GAE_LAMBDA = 0.95
ENT_COEF = 0.02
VF_COEF = 0.5
MAX_GRAD_NORM = 0.5

# Bulk config file path (for sweeps)
BULK_CONFIG_PATH = "../Test_drl_training/training_config_15d.json"  # 15D-specific sweep file

os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"🔧 Defaults loaded. 15D episode length: {EPISODE_LENGTH} minutes")


🔧 Defaults loaded. 15D episode length: 21600 minutes


In [None]:
# ===================== UTILITIES =====================
from enum import Enum

class TradingZone(Enum):
    """Discrete trading zone assigned to a z-score by ``z_to_zone``.

    The integer values are stored in the ``zone`` feature column and fed to
    the agent as part of its observation, so they must stay within the
    environment's declared observation bounds (0 to 4).
    """
    LONG = 0  # z below -open threshold
    NEUTRAL_LONG = 1  # z between -open and -close thresholds
    CLOSE = 2  # z within +/- close threshold
    NEUTRAL_SHORT = 3  # z between close and open thresholds
    SHORT = 4  # z above open threshold

def calculate_pseudo_spread(prices: pd.Series, ma_period: int, window_size: int) -> pd.DataFrame:
    """Build the moving-average pseudo-spread and its rolling z-score.

    Args:
        prices: Price series.
        ma_period: Rolling window of the moving average subtracted from price.
        window_size: Rolling window for the spread mean and standard deviation.

    Returns:
        DataFrame with columns ``price``, ``ma``, ``spread`` and ``z_score``,
        indexed like ``prices``. Undefined z-scores (e.g. the first row, where
        the rolling std is NaN) are replaced by 0.
    """
    moving_avg = prices.rolling(window=ma_period, min_periods=1).mean()
    deviation = prices.sub(moving_avg)

    # One rolling view serves both statistics of the spread.
    rolling_dev = deviation.rolling(window=window_size, min_periods=1)
    centered = deviation.sub(rolling_dev.mean())
    scale = rolling_dev.std().add(1e-8)  # epsilon keeps a flat window from dividing by zero

    return pd.DataFrame({
        'price': prices,
        'ma': moving_avg,
        'spread': deviation,
        'z_score': centered.div(scale).fillna(0),
    })

def z_to_zone(z: float, open_thresh: float, close_thresh: float) -> int:
    """Map a z-score to the integer value of its ``TradingZone``.

    Args:
        z: Z-score of the pseudo-spread.
        open_thresh: Magnitude beyond which a position should be opened.
        close_thresh: Magnitude within which positions should be closed.

    Returns:
        ``TradingZone`` value: SHORT for ``z > open_thresh``, NEUTRAL_SHORT for
        ``close_thresh < z <= open_thresh``, CLOSE for ``|z| <= close_thresh``,
        NEUTRAL_LONG for ``-open_thresh <= z < -close_thresh``, LONG below that.
        A NaN z-score yields CLOSE.
    """
    # NaN fails every comparison below and would otherwise fall through to
    # LONG, the strongest entry signal; treat "no information" as neutral.
    if z != z:
        return TradingZone.CLOSE.value
    if z > open_thresh:
        return TradingZone.SHORT.value
    if z > close_thresh:
        return TradingZone.NEUTRAL_SHORT.value
    if z >= -close_thresh:
        return TradingZone.CLOSE.value
    if z >= -open_thresh:
        return TradingZone.NEUTRAL_LONG.value
    return TradingZone.LONG.value

def create_temporal_data_split(df: pd.DataFrame, train_minutes: int, test_minutes: int,
                               min_buffer: int, seed: Optional[int]) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Any]]:
    """Cut a random training window and the test window that directly follows it.

    Args:
        df: Chronologically ordered data; windows are taken by row position.
        train_minutes: Number of rows in the training window.
        test_minutes: Number of rows in the test window.
        min_buffer: Smallest row index at which the training window may start.
        seed: Seed for choosing the start; ``None`` draws from NumPy's global
            random state.

    Returns:
        ``(train_df, test_df, info)`` where both frames have a fresh 0-based
        index and ``info`` holds the start/end row indices (end exclusive)
        of each window in ``df``.

    Raises:
        ValueError: If ``df`` has fewer than
            ``train_minutes + test_minutes + min_buffer`` rows.
    """
    total = len(df)
    required = train_minutes + test_minutes + min_buffer
    if total < required:
        raise ValueError(f"Insufficient data: need {required}, have {total}")

    # A private generator avoids reseeding the process-wide NumPy RNG as a
    # side effect; without a seed we keep drawing from the global state.
    rng = np.random.RandomState(seed) if seed is not None else np.random

    min_train_start = min_buffer
    max_train_start = total - train_minutes - test_minutes
    # randint's upper bound is exclusive: +1 makes the last valid start
    # reachable and keeps the call valid when total == required.
    start = int(rng.randint(min_train_start, max_train_start + 1))

    train_end = start + train_minutes
    test_end = train_end + test_minutes
    train_df = df.iloc[start:train_end].reset_index(drop=True)
    test_df = df.iloc[train_end:test_end].reset_index(drop=True)
    info = {
        'train_start_idx': start,
        'train_end_idx': int(train_end),
        'test_start_idx': int(train_end),
        'test_end_idx': int(test_end)
    }
    return train_df, test_df, info

def calculate_metrics(nav_series: np.ndarray, initial: float) -> Dict[str, float]:
    """Summarise a portfolio-value path.

    Args:
        nav_series: Portfolio values, one per step.
        initial: Starting portfolio value used to normalise the results.

    Returns:
        Empty dict for an empty series; otherwise ``final_value``,
        ``final_nav`` (final / initial), ``total_return``, ``sharpe``
        (mean/std of step returns scaled by sqrt(525600)) and
        ``max_drawdown`` (positive fraction below the running peak).
    """
    n_points = len(nav_series)
    if n_points == 0:
        return {}

    if n_points > 1:
        step_returns = np.diff(nav_series) / nav_series[:-1]
    else:
        step_returns = np.array([0.0])

    # A single return has no dispersion, so the ratio is reported as 0.
    sharpe_ratio = 0.0
    if len(step_returns) > 1:
        sharpe_ratio = (np.mean(step_returns) / (np.std(step_returns) + 1e-12)) * np.sqrt(525600)

    running_peak = np.maximum.accumulate(nav_series)
    drawdowns = (nav_series - running_peak) / (running_peak + 1e-12)

    last_value = float(nav_series[-1])
    return {
        'final_value': last_value,
        'final_nav': last_value / initial,
        'total_return': (last_value - initial) / initial,
        'sharpe': float(sharpe_ratio),
        'max_drawdown': float(abs(np.min(drawdowns)))
    }

print("✅ Utilities ready")


✅ Utilities ready


In [None]:
# ===================== DATA LOADING & FEATURES =====================

def load_and_prepare(data_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, List[str], Dict[str, Any]]:
    """Load the price data, attach z-score signals and split into train/test.

    Args:
        data_path: Path to a parquet file containing a timestamp (as a column
            or as the index) and the ``PRICE_COL`` column.

    Returns:
        ``(train_df, test_df, features, info)``: the two temporal splits, the
        list of feature column names, and the split indices reported by
        ``create_temporal_data_split``.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        ValueError: If no timestamp column can be identified, or if there is
            not enough data for the configured split.
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Missing data file: {data_path}")
    df = pd.read_parquet(data_path)

    ts_candidates = ['timestamp', 'time', 'date', 'datetime', 'ts']

    # Promote a timestamp-like index to a column. The previous check only
    # matched names containing "ts", which misses e.g. "timestamp" or "date".
    index_name = df.index.name
    if index_name and index_name not in df.columns:
        index_label = str(index_name)
        if index_label == TIMESTAMP_COL or index_label in ts_candidates or 'ts' in index_label:
            df = df.reset_index()

    if TIMESTAMP_COL not in df.columns:
        for cand in ts_candidates:
            if cand in df.columns:
                df = df.rename(columns={cand: TIMESTAMP_COL})
                break
        else:
            raise ValueError(f"Timestamp column not found. Available: {list(df.columns)}")

    # Chronological order, then keep only rows with a usable positive price.
    df = df.sort_values(TIMESTAMP_COL).reset_index(drop=True)
    df[PRICE_COL] = pd.to_numeric(df[PRICE_COL], errors='coerce')
    df = df.dropna(subset=[PRICE_COL])
    df = df[df[PRICE_COL] > 0].reset_index(drop=True)

    sig = calculate_pseudo_spread(df[PRICE_COL], MA_PERIOD, WINDOW_SIZE).reset_index(drop=True)
    # If the input already carries columns named like the signal columns, the
    # concat would create duplicate labels and df['z_score'] would return a
    # DataFrame instead of a Series. The freshly computed signals win.
    clashing = [col for col in sig.columns if col in df.columns]
    df = pd.concat([df.drop(columns=clashing), sig], axis=1)
    df['zone'] = df['z_score'].apply(lambda z: z_to_zone(z, OPEN_THRESHOLD, CLOSE_THRESHOLD))

    features = ['z_score', 'zone']
    train_df, test_df, info = create_temporal_data_split(
        df, TRAIN_DURATION_MINUTES, TEST_DURATION_MINUTES, MIN_DATA_BUFFER, RANDOM_SEED
    )

    return train_df, test_df, features, info

# Build the splits once when the cell runs; make_train_env / make_eval_env
# read train_df, test_df and feature_cols as module-level names.
train_df, test_df, feature_cols, split_info = load_and_prepare(DATA_PATH)
print(f"📊 Train: {len(train_df):,} rows | Test: {len(test_df):,} rows | Features: {feature_cols}")


📊 Train: 525,600 rows | Test: 40,320 rows | Features: ['z_score', 'zone']


In [None]:
# ===================== ENVIRONMENT (15D Enhanced) =====================

class TradingEnv15D(gym.Env):
    """Single-asset trading environment with incremental position changes.

    Observation: ``[position, z_score, zone]`` as float32, clipped to the
    declared observation space. Action: one float in ``[-1, 1]`` that moves
    the position by ``action * incremental_step``; the position itself is
    kept within ``[-1, 1]`` (fraction of portfolio value, negative = short).

    Each episode starts at a random row and lasts ``episode_length`` steps.
    The portfolio is fully rebalanced to the target position every step and
    marked to the next row's price.
    """

    metadata = {"render_modes": []}

    def __init__(self, df: pd.DataFrame, feature_cols: List[str], episode_length: int,
                 seed_money: float,
                 incremental_step: float,
                 pnl_scale: float,
                 static_delta_thresh: float):
        """Create the environment.

        Args:
            df: Data with ``PRICE_COL``, ``z_score`` and ``zone`` columns.
            feature_cols: Feature column names (stored; the observation is
                built from ``z_score`` and ``zone`` directly).
            episode_length: Number of steps per episode.
            seed_money: Portfolio value at the start of every episode.
            incremental_step: Largest position change a single action can cause.
            pnl_scale: Multiplier applied to the per-step PnL reward term.
            static_delta_thresh: Position changes at or below this value count
                as a static step and are penalised.

        Raises:
            ValueError: If ``df`` is too short to host a single episode.
        """
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.feature_cols = feature_cols
        self.episode_length = int(episode_length)
        self.seed_money = float(seed_money)
        self.incremental_step = float(incremental_step)
        self.pnl_scale = float(pnl_scale)
        self.static_delta_thresh = float(static_delta_thresh)

        self.prices = self.df[PRICE_COL].values
        self.z_scores = self.df['z_score'].values
        self.zones = self.df['zone'].values

        # Episode starts are drawn from [min_start, max_start).
        self.min_start = max(MA_PERIOD, WINDOW_SIZE)
        self.max_start = len(self.df) - self.episode_length - 2
        if self.max_start <= self.min_start:
            # Fail here with a clear message instead of inside reset(), where
            # the random draw would raise an opaque "low >= high" error.
            raise ValueError(
                f"Not enough rows for one episode: have {len(self.df)}, "
                f"need more than {self.min_start + self.episode_length + 2}"
            )

        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(
            low=np.array([-1.0, -5.0, 0.0], dtype=np.float32),
            high=np.array([1.0, 5.0, 4.0], dtype=np.float32),
            dtype=np.float32
        )

        self.position = 0.0
        self.portfolio_value = self.seed_money
        self.cash = self.seed_money
        self.shares = 0.0
        self.current_step = 0
        self.episode_start = 0
        self.static_steps = 0

    def _obs(self) -> np.ndarray:
        """Return the observation for the current step, within the declared bounds."""
        i = min(self.current_step, len(self.z_scores) - 1)
        raw = np.array(
            [float(self.position), float(self.z_scores[i]), float(self.zones[i])],
            dtype=np.float32,
        )
        # The z-score is unbounded in the data; clip so the observation never
        # leaves the declared observation space.
        return np.clip(raw, self.observation_space.low, self.observation_space.high)

    def reset(self, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None):
        """Start a new episode at a random row and return ``(observation, info)``."""
        super().reset(seed=seed)
        self.episode_start = self.np_random.integers(self.min_start, self.max_start)
        self.current_step = self.episode_start
        self.position = 0.0
        self.portfolio_value = self.seed_money
        self.cash = self.seed_money
        self.shares = 0.0
        self.static_steps = 0
        return self._obs(), {"portfolio_value": self.portfolio_value, "nav": self.portfolio_value / self.seed_money}

    def step(self, action: np.ndarray):
        """Apply one action and advance to the next row.

        Args:
            action: Scalar or array whose first element is the requested
                position change, as a fraction of ``incremental_step``.

        Returns:
            ``(observation, reward, terminated, truncated, info)``;
            ``truncated`` is always False and ``terminated`` becomes True once
            ``episode_length`` steps have elapsed.
        """
        # Accept Python scalars, 0-d arrays and 1-d arrays alike.
        raw_action = np.asarray(action, dtype=np.float64).reshape(-1)[0]
        a = float(np.clip(raw_action, -1.0, 1.0))

        # Incremental position change
        new_position = float(np.clip(self.position + a * self.incremental_step, -1.0, 1.0))
        delta = abs(new_position - self.position)
        self.position = new_position

        # Trade cost
        i = min(self.current_step, len(self.prices) - 2)
        price_now = self.prices[i]
        trade_value = delta * self.portfolio_value
        cost = trade_value * (FEE_RATE + SLIPPAGE)

        # Update holdings
        target_equity = self.position * self.portfolio_value
        self.shares = target_equity / price_now
        self.cash = self.portfolio_value - target_equity - cost

        # Portfolio update to next step
        price_next = self.prices[i + 1]
        equity_value = self.shares * price_next
        self.portfolio_value = self.cash + equity_value

        # Rewards
        # NOTE(review): cash cancels out of the numerator, so this term equals
        # shares * (price_next - price_now); the trade cost lowers
        # portfolio_value but is not part of the reward. Confirm this is intended.
        pnl_reward = ((equity_value + self.cash) - (self.cash + self.shares * price_now)) / max(self.portfolio_value, 1e-8)
        pnl_reward *= self.pnl_scale

        # Activity incentive / static penalty
        if delta > self.static_delta_thresh:
            activity_reward = ACTIVITY_REWARD_SCALE * delta
            self.static_steps = 0
        else:
            # The penalty grows with every consecutive static step.
            self.static_steps += 1
            activity_reward = -STATIC_PENALTY_BASE * (1 + STATIC_PENALTY_ESCALATION * self.static_steps)

        reward = float(pnl_reward + activity_reward)

        self.current_step += 1
        terminated = self.current_step >= self.episode_start + self.episode_length

        obs = self._obs()
        info = {"portfolio_value": float(self.portfolio_value), "nav": float(self.portfolio_value / self.seed_money)}
        return obs, reward, terminated, False, info

print("✅ 15D Environment ready")


✅ 15D Environment ready


In [None]:
# ===================== TRAINING HELPERS (A2C) =====================

def make_train_env(incremental_step=INCREMENTAL_STEP_SIZE,
                   pnl_scale=PNL_REWARD_SCALE,
                   static_delta=STATIC_DELTA_THRESH):
    """Return a single-env ``DummyVecEnv`` over the training split.

    Args:
        incremental_step: Largest position change per action.
        pnl_scale: Multiplier of the PnL reward term.
        static_delta: Position-change threshold below which a step is static.
    """
    def _build_monitored_env():
        env_kwargs = dict(
            df=train_df,
            feature_cols=feature_cols,
            episode_length=EPISODE_LENGTH,
            seed_money=SEED_MONEY,
            incremental_step=incremental_step,
            pnl_scale=pnl_scale,
            static_delta_thresh=static_delta,
        )
        base_env = TradingEnv15D(**env_kwargs)
        return Monitor(base_env)

    factories = [_build_monitored_env]
    return DummyVecEnv(factories)

def make_eval_env(incremental_step=INCREMENTAL_STEP_SIZE,
                  pnl_scale=PNL_REWARD_SCALE,
                  static_delta=STATIC_DELTA_THRESH):
    """Return a single-env ``DummyVecEnv`` over the test split.

    Args:
        incremental_step: Largest position change per action.
        pnl_scale: Multiplier of the PnL reward term.
        static_delta: Position-change threshold below which a step is static.
    """
    def _build_monitored_env():
        env_kwargs = dict(
            df=test_df,
            feature_cols=feature_cols,
            episode_length=EPISODE_LENGTH_EVAL,
            seed_money=SEED_MONEY,
            incremental_step=incremental_step,
            pnl_scale=pnl_scale,
            static_delta_thresh=static_delta,
        )
        base_env = TradingEnv15D(**env_kwargs)
        return Monitor(base_env)

    factories = [_build_monitored_env]
    return DummyVecEnv(factories)

def train_a2c(config: Dict[str, Any], run_name: str) -> str:
    """Train one A2C model and save it under ``OUTPUT_DIR``.

    Args:
        config: Hyperparameter overrides. Recognised keys: ``learning_rate``,
            ``n_steps``, ``gamma``, ``gae_lambda``, ``ent_coef``, ``vf_coef``,
            ``max_grad_norm``, ``total_timesteps``, ``seed``,
            ``increment_step_size``, ``pnl_reward_scale``,
            ``static_delta_thresh``, ``eval_freq``, ``n_eval_episodes``.
            Missing keys fall back to the module-level defaults.
        run_name: Identifier used in the saved file and eval directory names.

    Returns:
        Path of the saved model archive (always ending in ``.zip``).
    """
    lr = float(config.get('learning_rate', LEARNING_RATE))
    n_steps = int(config.get('n_steps', BATCH_SIZE))
    gamma = float(config.get('gamma', GAMMA))
    gae_lambda = float(config.get('gae_lambda', GAE_LAMBDA))
    ent_coef = float(config.get('ent_coef', ENT_COEF))
    vf_coef = float(config.get('vf_coef', VF_COEF))
    max_grad_norm = float(config.get('max_grad_norm', MAX_GRAD_NORM))
    total_timesteps = int(config.get('total_timesteps', TOTAL_TIMESTEPS))
    seed = int(config.get('seed', RANDOM_SEED))

    inc = float(config.get('increment_step_size', INCREMENTAL_STEP_SIZE))
    pnl_scale = float(config.get('pnl_reward_scale', PNL_REWARD_SCALE))
    static_delta = float(config.get('static_delta_thresh', STATIC_DELTA_THRESH))

    # Each run gets its own directory for EvalCallback artifacts so a sweep
    # does not overwrite the best model / evaluation log of earlier runs.
    eval_dir = os.path.join(OUTPUT_DIR, f"eval_{run_name}")
    os.makedirs(eval_dir, exist_ok=True)

    # Save with an explicit ".zip": SB3 only appends the suffix when the path
    # has none, so a run name containing a dot would otherwise be written
    # without ".zip" while this function reported a ".zip" path.
    model_path = os.path.join(OUTPUT_DIR, f"a2c_{run_name}.zip")

    env = make_train_env(inc, pnl_scale, static_delta)
    eval_env = make_eval_env(inc, pnl_scale, static_delta)
    try:
        model = A2C(
            policy='MlpPolicy',
            env=env,
            learning_rate=lr,
            n_steps=n_steps,
            gamma=gamma,
            gae_lambda=gae_lambda,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            verbose=0,
            seed=seed,
            device='auto'
        )

        # Stop early once evaluation reward has not improved for 3 evaluations.
        early_stop = StopTrainingOnNoModelImprovement(max_no_improvement_evals=3, min_evals=3, verbose=0)
        eval_cb = EvalCallback(
            eval_env,
            best_model_save_path=eval_dir,
            log_path=eval_dir,
            eval_freq=int(config.get('eval_freq', 25_000)),
            n_eval_episodes=int(config.get('n_eval_episodes', 5)),
            deterministic=True,
            render=False,
            callback_after_eval=early_stop
        )

        start = time.time()
        model.learn(total_timesteps=total_timesteps, callback=eval_cb)
        mins = (time.time() - start) / 60

        model.save(model_path)
    finally:
        # Release both environments even when training fails midway.
        env.close()
        eval_env.close()

    print(f"💾 Saved {run_name} in {OUTPUT_DIR} ({mins:.1f}m)")
    return model_path


In [None]:
# ===================== BULK CONFIG TESTING =====================

import glob

def load_bulk_configs(config_path: str) -> List[Dict[str, Any]]:
    """Read the sweep definitions from a JSON file.

    Accepts either an object with a ``training_configurations`` list or a
    bare top-level list of configuration entries.

    Args:
        config_path: Path to the JSON file.

    Returns:
        The list of configuration entries; empty when the key is missing
        or null.

    Raises:
        ValueError: If the top-level JSON value is neither an object nor a list.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(config_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        return data.get('training_configurations') or []
    raise ValueError(
        f"Unsupported config layout in {config_path}: expected an object or a list, "
        f"got {type(data).__name__}"
    )

# Every row produced by any run_bulk call is also mirrored here so results can
# be inspected after an interrupted sweep.
results_rows: List[Dict[str, Any]] = []

def run_bulk(config_path: str, limit: Optional[int] = None, name_prefix: str = "15d") -> pd.DataFrame:
    """Train and evaluate every configuration listed in a sweep file.

    For each entry a model is trained with ``train_a2c`` and then rolled out
    deterministically for one evaluation episode on the test split.

    Args:
        config_path: JSON sweep file read by ``load_bulk_configs``.
        limit: If given, only the first ``limit`` entries are run.
        name_prefix: Prefix of every run name.

    Returns:
        One row per configuration run in this call (parameters, metrics and
        model path). The same rows are written to ``bulk_results.csv`` in
        ``OUTPUT_DIR``; a partial CSV is refreshed every 10 runs.
    """
    configs = load_bulk_configs(config_path)
    if limit is not None:
        configs = configs[:limit]
    print(f"🧪 Running {len(configs)} configs from {config_path}")

    # Rows of this call only: building the CSV from the module-level list
    # duplicated earlier results whenever run_bulk was called more than once.
    run_rows: List[Dict[str, Any]] = []

    for idx, entry in enumerate(configs, 1):
        cfg_name = entry.get('name', f'cfg_{idx:04d}')
        params = entry.get('params') or {}
        run_name = f"{name_prefix}_{cfg_name}"
        print(f"[{idx}/{len(configs)}] Training {run_name}")
        model_zip = train_a2c(params, run_name)

        # model-driven evaluation episode
        env = make_eval_env(
            params.get('increment_step_size', INCREMENTAL_STEP_SIZE),
            params.get('pnl_reward_scale', PNL_REWARD_SCALE),
            params.get('static_delta_thresh', STATIC_DELTA_THRESH)
        )
        try:
            try:
                model = A2C.load(model_zip, env=env)
            except Exception:
                # Best-effort fallback for archives saved without ".zip".
                model = A2C.load(model_zip.replace('.zip',''), env=env)
            obs = env.reset()
            done = False
            navs = []
            steps = 0
            while not done and steps < EPISODE_LENGTH_EVAL:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                navs.append(info[0].get('portfolio_value', SEED_MONEY))
                done = bool(done[0])
                steps += 1
        finally:
            env.close()
        metrics = calculate_metrics(np.array(navs, dtype=float), SEED_MONEY)

        # Later keys win on collision: metrics override params, which override run_name.
        row = {
            'run_name': run_name,
            **params,
            **metrics,
            'model_path': model_zip
        }
        run_rows.append(row)
        results_rows.append(row)
        if idx % 10 == 0:
            pd.DataFrame(run_rows).to_csv(os.path.join(OUTPUT_DIR, 'bulk_results_partial.csv'), index=False)

    df_res = pd.DataFrame(run_rows)
    out_csv = os.path.join(OUTPUT_DIR, 'bulk_results.csv')
    df_res.to_csv(out_csv, index=False)
    print(f"✅ Bulk results saved: {out_csv}")
    return df_res

# Kick off small dry run by default (you can increase limit)
# bulk_results = run_bulk(BULK_CONFIG_PATH, limit=5, name_prefix="15d")


In [None]:
# ===================== EVALUATION & VISUALIZATION =====================

def evaluate_model_simple(env, model, max_steps: int) -> Tuple[Dict[str, Any], List[float], List[float]]:
    """Roll out one deterministic episode and summarise it.

    Args:
        env: Vectorized env exposing ``envs`` (as built by ``make_eval_env``).
        model: Trained model providing ``predict``.
        max_steps: Upper bound on the number of steps to run.

    Returns:
        ``(metrics, navs, positions)``: the ``calculate_metrics`` summary plus
        ``avg_abs_position``, the portfolio value after each step, and the
        position read from the underlying env after each step.
    """
    # Reach the innermost env regardless of how many wrappers surround it.
    base = env.envs[0].unwrapped

    obs = env.reset()
    done = False
    navs: List[float] = []
    positions: List[float] = []
    steps = 0
    while not done and steps < max_steps:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        navs.append(info[0].get('portfolio_value', SEED_MONEY))
        # NOTE(review): vectorized envs presumably auto-reset when an episode
        # ends, so the position recorded on the final step may already be the
        # reset value — confirm if the last sample matters.
        positions.append(float(base.position))
        done = bool(done[0])
        steps += 1
    metrics = calculate_metrics(np.array(navs, dtype=float), SEED_MONEY)
    metrics['avg_abs_position'] = float(np.mean(np.abs(positions))) if positions else 0.0
    return metrics, navs, positions


def plot_nav(navs: List[float], title: str, save: Optional[str] = None):
    """Plot a portfolio-value path against the starting capital.

    Args:
        navs: Portfolio value per step.
        title: Figure title.
        save: Optional output path; when given the figure is also written there.
    """
    fig, axes = plt.subplots(figsize=(12,5))
    axes.plot(navs, lw=1.5)
    # Dashed reference line at the starting capital.
    axes.axhline(y=SEED_MONEY, color='gray', ls='--', alpha=0.6)
    axes.set_title(title)
    axes.set_xlabel('Minutes')
    axes.set_ylabel('Portfolio Value ($)')
    axes.grid(alpha=0.3)
    if save:
        fig.savefig(save, dpi=200, bbox_inches='tight')
    plt.show()

print("✅ Eval & viz ready")


✅ Eval & viz ready


## How to run

- Recommended: create a 15D-specific JSON config and set `BULK_CONFIG_PATH` to it.
- Quick sanity check: uncomment the last line in the Bulk section to run a small `limit=5` sweep.
- After bulk finishes, load `processed_data_15d/bulk_results.csv` and sort by `final_nav`.
- Train a final model with the best config and then evaluate with `evaluate_model_simple` + `plot_nav`.



In [None]:
# ===================== SELECT BEST AND VISUALIZE =====================

# Load bulk results, pick top by NAV, re-evaluate and plot NAV

def select_and_visualize(results_csv: str, top_k: int = 5):
    """Print the top runs of a sweep, then re-evaluate and plot the best one.

    Args:
        results_csv: CSV written by ``run_bulk``.
        top_k: Number of top rows (by ``final_nav``) to print.

    Returns:
        None. Prints a message and returns early when the CSV is missing,
        empty, or lacks a ``final_nav`` column.
    """
    if not os.path.exists(results_csv):
        print(f"No results found at {results_csv}")
        return
    df = pd.read_csv(results_csv)
    if 'final_nav' not in df.columns:
        print("Results CSV missing 'final_nav'")
        print(df.head())
        return
    if df.empty:
        print(f"Results CSV has no rows: {results_csv}")
        return
    df_sorted = df.sort_values('final_nav', ascending=False).reset_index(drop=True)
    print(df_sorted.head(top_k))

    best = df_sorted.iloc[0].to_dict()

    def _param(key: str, default: float) -> float:
        # A column present for other runs but unset for this one is read back
        # as NaN, which dict.get does not replace with the default.
        value = best.get(key, default)
        return float(default) if pd.isna(value) else float(value)

    inc = _param('increment_step_size', INCREMENTAL_STEP_SIZE)
    pnl = _param('pnl_reward_scale', PNL_REWARD_SCALE)
    sdt = _param('static_delta_thresh', STATIC_DELTA_THRESH)

    env = make_eval_env(inc, pnl, sdt)
    try:
        model_path = str(best['model_path'])
        try:
            model = A2C.load(model_path, env=env)
        except Exception:
            # Best-effort fallback for archives saved without ".zip".
            model = A2C.load(model_path.replace('.zip',''), env=env)

        metrics, navs, positions = evaluate_model_simple(env, model, EPISODE_LENGTH_EVAL)
    finally:
        env.close()

    print("Best metrics:", metrics)
    plot_nav(navs, title=f"Best run: {best.get('run_name','')} NAV={metrics.get('final_nav',0):.3f}",
             save=os.path.join(OUTPUT_DIR, f"best_{best.get('run_name','')}_nav.png"))

# Example usage (uncomment after bulk):
# select_and_visualize(os.path.join(OUTPUT_DIR, 'bulk_results.csv'), top_k=5)
