In [1]:
import jupyter

In [2]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import gymnasium as gym
import matplotlib.pyplot as plt


from src.utils.system import boot
from src.defaults import RANDOM_SEEDS
from src.data.feature_pipeline import load_base_dataframe
from experiments import check_if_experiment_exists, register_experiment ,experiment_hash
from environments import PositionTradingEnv,PositionTradingEnvV1,PositionTradingEnvV2

# ========== SYSTEM BOOT ==========
# Boot exactly once and keep the returned handle; this cell previously invoked
# boot() a second time right before loading the data, which only repeated the work.
DEVICE = boot()
EXPERIMENT_NAME = "qa__trading_environments"
DEFAULT_PATH = "data/experiments/" + EXPERIMENT_NAME

# ========== CONFIG ==========
TICKER = "AAPL"
TIMESTEPS = 10_000
EVAL_EPISODES = 5
N_TIMESTEPS = 60
LOOKBACK = 0
# Both lists are kept sorted so their order does not depend on how they were typed.
SEEDS = sorted([42, 52, 62])
MARKET_FEATURES = sorted(['close'])
BENCHMARK_PATH = DEFAULT_PATH + "/benchmark_episodes.json"
CHECKPOINT_DIR = DEFAULT_PATH + "/checkpoints"
SCORES_DIR = DEFAULT_PATH + "/scores"

# ========== DATA ==========
OHLCV_DF = load_base_dataframe()

  from pandas.core import (


In [3]:
from typing import Tuple
import numpy as np


class PositionTradingEnvV3(PositionTradingEnvV2):
    """
    PositionTradingEnvV3
    ---------------------

    A human-inspired, curriculum-learning environment for reinforcement learning agents in trading.
    This version introduces Kai's "School Reward Curriculum" — a staged reward system designed to mimic
    how we teach children to explore, persist through failure, and progressively master difficult tasks.

    ---------------------
    Why This Matters:
    ---------------------
    Traditional RL assumes agents can survive cold optimization. But we’re building an intelligent,
    self-reflective system. And like all intelligent learners, it must be nurtured.

    So instead of punishing early failure or passivity too harshly, we reward **meaningful attempts to act**.
    This fosters early exploration, builds confidence, and allows the agent to discover structure in the
    market before we tighten expectations.

    ---------------------
    The School Reward Curriculum:
    ---------------------

    ◉ Phase 1: Exploration Over Inaction  (reward_phase="exploration")
        - Foresight bonus: a position switch that turned out to be well-timed earns the full
          `foresight_bonus * abs(price_change)` on top of the base reward.
        - Goal: Reward **trying**, not just winning. Build initiative.

    ◉ Phase 2: Mastery Emerges  (reward_phase="mastery")
        - The foresight bonus is halved.
        - Goal: Build **skill**, not just courage.

    ◉ Phase 3: Graduation  (reward_phase="strict")
        - Strict oracle-relative reward, no bonus.
        - Goal: Become a **professional**.

    ---------------------
    Implementation status:
    ---------------------
    - Only the foresight bonus is implemented. A separate bonus for trying new positions
      regardless of outcome is part of the curriculum idea but is NOT implemented in this class.
    - The phase is whatever `reward_phase` is set to; this class does not switch phases
      automatically after a number of episodes.

    ---------------------
    Parameters:
    ---------------------
    reward_phase : str
        One of "exploration", "mastery", "strict". Any other value raises ValueError.
    foresight_bonus : float
        Multiplier applied to `abs(price_change)` when a well-timed switch is rewarded.
    *args, **kwargs
        Forwarded unchanged to PositionTradingEnvV2.

    ---------------------
    Designed With ❤️ by Pi & Kai
    ---------------------
    """

    # Share of `foresight_bonus` that is paid out in each curriculum phase.
    _PHASE_BONUS_SCALE = {"exploration": 1.0, "mastery": 0.5, "strict": 0.0}

    def __init__(self, *args, reward_phase="exploration", foresight_bonus=0.1, **kwargs):
        # Fail fast on typos: an unknown phase used to fall through every branch
        # and silently behave like "strict".
        if reward_phase not in self._PHASE_BONUS_SCALE:
            raise ValueError(
                f"Unknown reward_phase {reward_phase!r}; "
                f"expected one of {sorted(self._PHASE_BONUS_SCALE)}"
            )
        super().__init__(*args, **kwargs)
        self.reward_phase = reward_phase
        self.foresight_bonus = foresight_bonus
        self.prev_position = 0

    def _step_reward(self, action: int, price_change: float, oracle_action: int) -> float:
        """Combine the oracle-relative base reward with the phase-dependent foresight bonus.

        Base reward: `+abs(price_change)` when `action` equals `oracle_action`,
        `-abs(price_change)` when it differs and `action` is non-zero, and 0 when it
        differs and `action` is 0.

        Bonus: paid only when `action` differs from the previously rewarded action and
        the sign of `price_change` is +1 for action 1, or -1 for any other action.

        Side effect: stores `action` in `self.prev_position` for the next call.
        """
        magnitude = abs(price_change)
        if action == oracle_action:
            base_reward = magnitude
        elif action != 0:
            base_reward = -magnitude
        else:
            base_reward = 0

        bonus = 0
        # `reward_phase` is a plain attribute and may have been reassigned after
        # construction; anything unrecognised keeps the bonus-free fallback.
        scale = self._PHASE_BONUS_SCALE.get(self.reward_phase, 0.0)
        if scale and action != self.prev_position:
            favourable_sign = 1 if action == 1 else -1
            if np.sign(price_change) == favourable_sign:
                bonus = scale * self.foresight_bonus * magnitude

        self.prev_position = action
        return base_reward + bonus

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, dict]:
        """Advance the parent environment by one step and replace its reward with the curriculum reward.

        The second value returned by the parent `step` is used as the signed price change.
        NOTE(review): confirm that PositionTradingEnvV2.step returns a price change in that
        slot rather than an already shaped reward.

        The oracle action is derived from that value: when it is negative the oracle is
        `abs(action - 1)` (which swaps 0 and 1), otherwise the chosen action itself.

        Returns the parent's observation, the curriculum reward, the parent's two
        termination flags, and the parent's info dict (previously dropped in favour of `{}`).
        """
        obs, price_change, done, truncated, info = super().step(action)
        oracle_action = abs(action - 1) if price_change < 0 else action
        reward = self._step_reward(action, price_change, oracle_action)
        # Keep the parent's diagnostics available to callers instead of discarding them.
        return obs, reward, done, truncated, (info if info is not None else {})

    def reset(self, **kwargs):
        """Clear the remembered previous action, then delegate to the parent `reset`."""
        self.prev_position = 0
        return super().reset(**kwargs)


# TESTS 


## Core categories:
1. **Data** 
   * _prepare_ticker_df
2. **Episode** 
   * _resample_episode
   * _precompute_step_weights
3. **Reward**
   * Oracle computation
   * Expected reward for action/step
4. **Money** 
   * market_progress
   * wallet_progress
   * alpha_progress
5. **Position**
   * position updates
6. **Observation**
   * Check that the returned observation matches the expected values
7. **Behaviours**
   * To be added later
8. **Edge cases**


In [55]:
import unittest
import numpy as np
import pandas as pd
#from your_module import PositionTradingEnvV1  # Replace with actual path

class TestPositionTradingEnvSynthetic(unittest.TestCase):
    def __init__(self,env_class):
        self.env_class= env_class
        self.setUp()
        
    def setUp(self):
        # Synthetic constant uptrend (price +1 per step)
        dates = pd.date_range("2024-01-01", periods=10, freq="B")
        self.df = pd.DataFrame({
           "date": pd.date_range("2024-01-01", periods=10, freq="B"),  # 10 business days
            "symbol": ["TEST"] * 10,
            "close": [100, 102, 101, 103, 104, 105, 107, 106, 108, 110],
            "day_of_week": list(range(5)) * 2,                         # fake, for observations
        })
        self.env_kwargs = {
            "full_df": self.df,
            "ticker": "TEST",
            "market_features": ["close"],
            "n_timesteps": 5,
            "lookback": 0,
            "seed": 0,
            "start_idx": 0
        }

    def get_env(self):
        env = self.env_class(**self.env_kwargs)
        env.reset()
        return env
        
    def test_oracle_reward_sum(self):
        env = self.env_class(**self.env_kwargs)
        env.reset()
        oracle_reward = 0
        for i in range(env.n_timesteps - 1):
            price_diff = env.prices[i + 1] - env.prices[i]
            weight = env.step_weights[i]
            reward = abs(weight * np.sign(price_diff)  * 100)
            oracle_reward += reward
        
        self.assertAlmostEqual(oracle_reward, 100.0, places=4)

    def test_episode_prices(self):
        env = self.env_class(**self.env_kwargs)
        obs, _ = env.reset()
       
        expected_prices = [100, 102, 101, 103, 104]
        self.assertTrue((env.prices == expected_prices).all())
        
    def test_position_updates(self):
        env = self.env_class(**self.env_kwargs)
        obs, _ = env.reset()
        print(env.position)
        self.assertTrue(env.position== 0)

        # Buy
        obs, reward, terminated, truncated, _ = env.step(1)
        self.assertTrue(env.position== 1)
        self.assertTrue(env.entry_price > 0)

        # Hold
        obs, reward, terminated, truncated, _ = env.step(1)
        self.assertTrue(env.position== 1)
        self.assertGreater(env.holding_time, 0)

        # Sell
        obs, reward, terminated, truncated, _ = env.step(0)
        self.assertTrue(env.position== 0)
        self.assertTrue(env.entry_price== 0)

    def test_scaled_reward_behavior(self):
        env_kwargs = self.env_kwargs
        env_kwargs['start_idx'] = 2
        env = self.env_class(**env_kwargs)
        
        obs, _ = env.reset()
        rewards = []
        for _ in range(env.n_timesteps - 1):
            obs, reward, terminated, truncated, _ = env.step(1)
            rewards.append(reward)
        
        self.assertTrue(all([r > 0 for r in rewards if r != 0]))  # uptrend

    def test_flat_position_gets_penalized(self):
        env = self.env_class(**self.env_kwargs)
        obs, _ = env.reset()
        rewards = []
        for _ in range(env.n_timesteps - 1):
            obs, reward, terminated, truncated, _ = env.step(0)
            rewards.append(reward)
        self.assertTrue(all([r <= 0 for r in rewards]))

# The test case is driven by hand instead of through unittest.main(): it is
# constructed with the environment class under test, and its test_* methods
# are then called directly on the instance.
t = TestPositionTradingEnvSynthetic(PositionTradingEnvV3)

In [56]:
# Run every synthetic check on `t` in a fixed order; the first failing
# assertion raises and stops the cell.
for check_name in (
    "test_oracle_reward_sum",
    "test_episode_prices",
    "test_position_updates",
    "test_scaled_reward_behavior",
    "test_flat_position_gets_penalized",
):
    getattr(t, check_name)()

0


In [34]:
# Ten consecutive business days beginning on Monday 2024-01-01 (weekends are skipped).
dates = pd.date_range(start="2024-01-01", periods=10, freq="B")
dates

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-08', '2024-01-09', '2024-01-10',
               '2024-01-11', '2024-01-12'],
              dtype='datetime64[ns]', freq='B')