In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler


In [34]:
class BaseMarketDataset(tf.keras.utils.Sequence):
    def __init__(
        self,
        df: pd.DataFrame, feature_cols: list[str],
        window: int = 48, batch_size: int = 32,
        use_images: bool = False, shuffle: bool = True, indices: np.ndarray | None = None, scaler: StandardScaler | None = None,
    ):
        super().__init__()

        self.df         = df
        self.window     = window
        self.batch_size = batch_size
        
        self.use_images = use_images
        self.shuffle    = shuffle

        self.feature_cols = feature_cols
        self.indices = indices if indices is not None else np.arange(len(df) - window)
        
        if scaler is not None: self.scaler = scaler
        else:
            self.scaler = StandardScaler()
            self._fit_scaler()

        if self.shuffle: np.random.shuffle(self.indices)

    # ---------------------------------------------------------------------------------------
    def _fit_scaler(self):
        X = self.df[self.feature_cols].values
        self.scaler.fit(X)

    # ---------------------------------------------------------------------------------------
    def __len__(self):
        return int(np.ceil(len(self.indices) / self.batch_size))

    # ---------------------------------------------------------------------------------------
    def on_epoch_end(self):
        if self.shuffle: np.random.shuffle(self.indices)

    # ---------------------------------------------------------------------------------------
    def candle_image(self, df_window: pd.DataFrame, height: int=64):
        """
        Generate a binary image of candlesticks within a given time window.

        Each candlestick is represented as a column in the image:
        - Wick is drawn as a vertical line
        - Body is drawn as a filled rectangle

        Parameters
        ----------
        df_window : DataFrame -- containing the columns ['open','high','low','close'] for a time window
        height : int, default=64 -- Image height (number of pixels along the y-axis)

        Returns
        -------
        np.ndarray -- Binary array of shape (height, len(df_window)*3) representing the candlestick visualization.
        """
        img = np.zeros((height, len(df_window)*3))
        norm = self._normalize_window(df_window)

        for i, row in norm.iterrows():
            x = (i - df_window.index[0]) * 3 + 1

            o, h, l, c = row
            y_o = int(o * (height-1))
            y_c = int(c * (height-1))
            y_h = int(h * (height-1))
            y_l = int(l * (height-1))

            img[y_l:y_h+1, x] = 1							# wick
            img[min(y_o,y_c):max(y_o,y_c)+1, x-1:x+2] = 1	# body

        return img
    
    # p′ = (p−min(low)) / (max(high)−min(low)) ----------------------------------------------
    @staticmethod
    def _normalize_window(df_window: pd.DataFrame):
        low  = df_window['low' ].min()
        high = df_window['high'].max()
        return (df_window[['open','high','low','close']] - low) / (high - low)



In [None]:
class Stage1MarketDataset(BaseMarketDataset):
    # ---------------------------------------------------------------------------------------
    def __getitem__(self, idx):
        batch_idx = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]

        feature_input, candles_input, outputs = [], [], []

        for i in batch_idx:
            window_df = self.df.iloc[i:i+self.window]
            feature_input.append(self.scaler.transform(window_df[self.feature_cols].values))
            outputs.append(self.df.loc[i+self.window, "market_regime"])

            if self.use_images: candles_input.append(self.candle_image(window_df))

        if self.use_images: inputs = {"tabular": np.array(feature_input, dtype=np.float32), "image": np.array(candles_input)}
        else:               inputs = np.array(feature_input, dtype=np.float32)
        outputs = np.array(outputs, dtype=np.int32)

        return inputs, outputs


In [36]:
class Stage2MarketDataset(BaseMarketDataset):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        valid_1 = self.df["market_regime"].values[self.indices + self.window] != 0
        valid_2 = np.isin(self.df["trade_outcome"].values[self.indices + self.window], [-1, 1])
        self.indices = self.indices[valid_1 & valid_2]

        self._targets = (
            self.df.loc[self.indices + self.window, "trade_outcome"].map({-1: 0, 1: 1}).values.astype(np.int32)
        )

    # ---------------------------------------------------------------------------------------
    def __getitem__(self, idx):
        batch_idx = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]

        feature_input, candles_input, outputs = [], [], []

        for j, i in enumerate(batch_idx):
            window_df = self.df.iloc[i:i+self.window]
            feature_input.append(self.scaler.transform(window_df[self.feature_cols].values))
            outputs.append(self._targets[j + idx*self.batch_size])

            if self.use_images: candles_input.append(self.candle_image(window_df))

        if self.use_images: inputs = {"tabular": np.array(feature_input, dtype=np.float32), "image": np.array(candles_input)}
        else:               inputs = np.array(feature_input, dtype=np.float32)
        outputs = np.array(outputs, dtype=np.int32)

        return inputs, outputs


In [37]:
class DatasetManager:
    def __init__(self, df: pd.DataFrame, window: int = 48, split_ratio: tuple=(0.7, 0.15, 0.15)):
        self.df = df.reset_index(drop=True)
        self.window = window
        self.split_ratio = split_ratio
        self._split()

    def _split(self):
        n_samples = len(self.df) - self.window

        n_train = int(self.split_ratio[0] * n_samples)
        n_val   = int(self.split_ratio[1] * n_samples)

        self.train_idx = np.arange(0, n_train)
        self.val_idx   = np.arange(n_train, n_train + n_val)
        self.test_idx  = np.arange(n_train + n_val, n_samples)


In [None]:
data = pd.read_csv('./Data/processed/EURUSD -- 5 (2024-09-01 - 2025-12-01).csv')

feature_cols = [
    "time_sin", "time_cos",
    "return", "body", "range", "upper_wick", "lower_wick",
    "ema_20", "price_ema_ratio", "volatility_20", "volume_norm",
    "dist_round", "trend_144", "dist_shadow_prev_time", "dist_shadow_prev_week"
]

manager  = DatasetManager(data, window=48)
train_ds = Stage1MarketDataset(data, feature_cols, indices=manager.train_idx, shuffle=True)
valid_ds = Stage1MarketDataset(data, feature_cols, scaler=train_ds.scaler, indices=manager.val_idx , shuffle=False)
test_ds  = Stage1MarketDataset(data, feature_cols, scaler=train_ds.scaler, indices=manager.test_idx, shuffle=False, batch_size=1)



In [44]:
train_ds[0]

(array([[[ 6.49999976e-02,  9.98000026e-01, -3.60840990e-04,
           2.99999992e-05,  5.30000019e-04,  2.30000005e-04,
           2.69999990e-04,  1.10853744e+00,  9.99803901e-01,
           1.90217994e-04,  5.18498540e-01,  1.44999996e-01,
           9.99046087e-01,  4.93525171e+00,  1.81294956e+01],
         [ 8.69999975e-02,  9.95999992e-01,  0.00000000e+00,
           0.00000000e+00,  3.99999990e-05,  1.99999995e-05,
           1.99999995e-05,  1.10851669e+00,  9.99822557e-01,
           1.89893995e-04,  2.13996530e-01,  1.65999994e-01,
           9.99059260e-01,  6.29820061e+00,  2.04421597e+01],
         [ 1.08999997e-01,  9.94000018e-01, -5.41000009e-05,
          -5.99999985e-05,  1.50000007e-04,  9.00000014e-05,
           0.00000000e+00,  1.10849226e+00,  9.99790490e-01,
           1.74487999e-04,  1.96202531e-01,  1.58999994e-01,
           9.99018908e-01,  6.35013247e+00,  2.09442978e+01],
         [ 1.30999997e-01,  9.90999997e-01, -3.60999984e-05,
          -2.99999992