In [1]:
import os
import json
import pandas as pd
import numpy as np
from finta import TA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import logging


# Configure logging
logger = logging.getLogger(__name__)

class EnvConfig():
    """environment configuration from json file
       tgym requires you configure your own parameters in json file.
        Args:
            config_file path/file.json

    """
    def __init__(self,config_file):
        self.config = {}
        with open(config_file) as j: 
            self.config = json.load(j)

    def env_parameters(self,item=''):   
        """environment variables 
        """ 
        if item:
            return self.config["env"][item]
        else:
            return self.config["env"]
        
    def symbol(self, asset="GBPUSD", item='') :
        """get trading pair (symbol) information

        Args:
            asset (str, optional): symbol in config. Defaults to "GBPUSD".
            item (str, optional): name of item, if '' return dict, else return item value. Defaults to ''.

        Returns:
            [type]: [description]
        """
        if item:
            return self.config["symbol"][asset][item]
        else:
            return self.config["symbol"][asset]
        
    def data_processing_parameters(self, item=''):
        """Get data processing config"""
        if item:
            return self.config["data_processing"][item]
        return self.config["data_processing"]
            
    def trading_hour(self,place="New York"):
        """forex trading hour from different markets

        Args:
            place (str, optional): [Sydney,Tokyo,London] Defaults to "New York".

        Returns:
            [dict]: from time, to time
        """
        if place:
            return self.config["trading_hour"][place]
        else:
            return self.config["trading_hour"]

In [2]:
def patch_missing_data(df, dt_col_name='time', cf=None):
    # ["time","open", "high", "low", "close"]
    required_cols = cf.data_processing_parameters("required_cols")    
    
    # df မှာ 6 columns ရှိရင် vol ပါထည့်မယ် 
    if df.shape[1] == 6:
        df.columns = required_cols + ['vol']  
    elif df.shape[1] == 5:
        df.columns = required_cols
    else:
        raise ValueError(f"Invalid number of columns: {df.shape[1]} =>{required_cols}")
    
    logger.warning(f"shape of  column: {df.shape[1]}")
    # 1. Column validation
    if missing := set(required_cols) - set(df.columns):
        raise ValueError(f"Missing columns: {missing}")

    # 2. Auto-detect datetime column
    dt_candidates = {'time', 'timestamp', 'date', 'datetime'}
    if dt_col_name not in df.columns:
        found = list(dt_candidates & set(df.columns))
        if not found:
            raise KeyError(f"No datetime column found. Tried: {dt_candidates}")
        dt_col_name = found[0]
        logger.info(f"Using datetime column: {dt_col_name}")

    # 3. Convert to datetime index
    df[dt_col_name] = pd.to_datetime(df[dt_col_name], utc=True)
    df = df.set_index(dt_col_name).sort_index()

    # 4. Create complete 5-min grid (Mon 00:00 - Fri 23:55 UTC)
    new_index = pd.date_range(
        start=df.index.min().floor('D'),
        end=df.index.max().ceil('D'),
        freq='5T',
        tz='UTC'
    )
    
    # 5. Forward-fill OHLC prices
    df = df.reindex(new_index)
    # 6. Apply Limited Forward-Fill (FFILL) - Internal Gaps ကိုသာ ဖြည့်ရန်
    # fill_limit = 12 # ဥပမာ: 1 နာရီ (12 bars) ထက်ပိုတဲ့ ကွက်လပ်ကို မဖြည့်ပါ
    # df[['open', 'high', 'low', 'close', 'vol']] = df[['open', 'high', 'low', 'close', 'vol']].ffill(fill_limit)
    # 6. Apply Limited Forward-Fill (FFILL) - Internal Gaps ကိုသာ ဖြည့်ရန်
    fill_limit = 12 # ဥပမာ: 1 နာရီ (12 bars) ထက်ပိုတဲ့ ကွက်လပ်ကို မဖြည့်ပါ
    fill_cols = ['open', 'high', 'low', 'close', 'vol'] if 'vol' in df.columns else ['open', 'high', 'low', 'close']
    
    # FFill: ရှေ့က data ဖြင့် ဖြည့်ပါ
    df[fill_cols] = df[fill_cols].ffill(limit=fill_limit)

    # # 6. Filter weekends (keep Friday 22:00-23:55 as "pseudo Sunday")
    # df = df[(df.index.weekday < 5) | (
    #     (df.index.weekday == 4) & (df.index.hour >= 22)
    # )]

    # 7. Validate bars per week
    # min bar per week က timeframe အလိုက် အပြောင်းလဲရှိမယ်
    # 5-min timeframe (repo မှာ သုံး ထားတဲ့ အတိုင်း): 
    # တစ်ရက် ၂၄ နာရီ x ၆၀ မိနစ် / ၅ = ၂၈၈ bars/ရက်။ တစ်ပတ် (၅ ရက်) ဆို ၁၄၄၀ bars လောက် ရှိ ရမယ် 
    # (min_bars_per_week=1440 လောက်)။
    # 5-min: min_bars_per_week = 1440 (၂၈၈ bars/ရက် x ၅ ရက်)
    # 15-min: 480 (၉၆ bars/ရက် x ၅)
    # 1-hour: 120 (၂၄ bars/ရက် x ၅)
    min_bars = cf.data_processing_parameters("min_bars_per_week")
    for week, group in df.groupby(pd.Grouper(freq='W-MON')):
        if len(group) != min_bars:
            logger.warning(f"Week {week} has {len(group)}/{min_bars} bars")
            
    # 7. ❗️ NaN အဖြစ် ကျန်ရှိနေသော Rows များကို ဖယ်ရှားခြင်း ❗️
    # Open price မရှိတဲ့ rows တွေကို စစ်ဆေးပြီး ဖယ်ရှားပါ (Weekend Gap ကြောင့် NaN ကျန်ခဲ့သော bars များ)
    df = df.dropna(subset=['open'])
    logger.warning(f"Total rows dropped due to NaN (likely weekend gap): {len(new_index) - len(df)}")

    return df.reset_index().rename(columns={'index': dt_col_name})


In [3]:
def add_time_feature(df, symbol):
    """Add temporal features with proper index handling"""
    
    if 'time' not in df.columns:
        raise KeyError("'time' column missing after patch_missing_data")
        
    df = df.set_index('time')
    df.index = pd.to_datetime(df.index, utc=True)
    
    # Cyclical time features
    df['weekday'] = df.index.dayofweek  # 0=Monday
    df['day'] = df.index.day
    df['week'] = df.index.isocalendar().week
    df['month'] = df.index.month
    df['year'] = df.index.year
    df['hour'] = df.index.hour
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24).round(6)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24).round(6)
    df['minute_block'] = df.index.minute // 5  # 0-11
    df['minute_sin'] = np.sin(2 * np.pi * df['minute_block']/12).round(6)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute_block']/12).round(6)
    
    # Market sessions (GMT)
    df['london_session'] = ((df['hour'] >= 8) & (df['hour'] < 16)).astype(int)
    df['ny_session'] = ((df['hour'] >= 13) & (df['hour'] < 21)).astype(int)
    df['overlap_session'] = ((df['hour'] >= 13) & (df['hour'] < 16)).astype(int)
    
    df['symbol'] = symbol
    return df.reset_index()


In [4]:
def tech_indicators(df, cf=None):  # 288 = 24hrs in 5-min bars
    """Calculate technical indicators with proper NaN handling"""
    period = cf.data_processing_parameters("indicator_period")
    # 1. Preserve raw prices before normalization
    raw_cols = ['mean_std_open','mean_std_high','mean_std_low','mean_std_close']
    df[raw_cols] = df[['open','high','low','close']].copy()
    # Calculate indicators
    df['macd'] = TA.MACD(df).SIGNAL.ffill().round(6)
    bb = TA.BBANDS(df)
    df['boll_ub'] = bb['BB_UPPER'].ffill()
    df['boll_lb'] = bb['BB_LOWER'].ffill()
    
    df['rsi_30'] = TA.RSI(df, period=period).ffill()
    df['dx_30'] = TA.ADX(df, period=period).ffill()
    df['close_30_sma'] = TA.SMA(df, period=period).ffill()
    df['close_60_sma'] = TA.SMA(df, period=period*2).ffill()
    df['atr'] = TA.ATR(df, period=period).ffill()
     # Add returns and volatility ratio
    df['returns_5'] = df['close'].pct_change(5).round(6)
    df['returns_24'] = df['close'].pct_change(24).round(6)
    df['volatility_ratio'] = (df['high'] - df['low']) / df['close'].round(6)
        
    # Normalize
    scaler = StandardScaler()
    scale_cols = cf.data_processing_parameters("scale_cols")  

    df[scale_cols] = scaler.fit_transform(df[scale_cols])
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)
    # 1. Identify numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    # 2. Apply clipping only to numeric features
    df[numeric_cols] = df[numeric_cols].clip(lower=-1e5, upper=1e5)
    # 3. Round decimal values
    df[numeric_cols] = df[numeric_cols].round(6).clip(-1e5, 1e5)
    return df


In [5]:
class TimeSeriesScaler:
    """
    Manages the MinMax Scaling process for time series features.
    It fits the scaler on the first chunk of data (expected to be the training start)
    and uses that fitted scaler to transform all subsequent data chunks (including eval).
    """
    def __init__(self):
        # MinMaxScaler ကို အသုံးပြုပြီး 0 နဲ့ 1 ကြားကို ပြောင်းပါ
        self.scaler = MinMaxScaler()
        self.is_fitted = False
        self.price_cols = ['mean_std_open', 'mean_std_high', 'mean_std_low', 'mean_std_close']
        
    def fit_and_transform(self, df):
        """Fit the scaler on the data and transform it."""
        logger.info("Fitting Scaler on current week data (TRAIN set base)")
        # .copy() လုပ်ပြီးမှ transform လုပ်ပါ
        df_copy = df.copy() 
        df_copy[self.price_cols] = self.scaler.fit_transform(df_copy[self.price_cols])
        self.is_fitted = True
        return df_copy
    
    def transform(self, df):
        """Transform data using the previously fitted scaler."""
        if not self.is_fitted:
            raise ValueError("Scaler must be fitted on the training data first!")
            
        logger.info("Transforming current week data using fitted scaler.")
        # .copy() လုပ်ပြီးမှ transform လုပ်ပါ
        df_copy = df.copy() 
        df_copy[self.price_cols] = self.scaler.transform(df_copy[self.price_cols])
        return df_copy

In [11]:
def split_time_series_v2(df, freq='W-FRI', symbol='EURUSD', cf=None, scaler_manager=None):
    """
    Split data with weekly alignment and performs MinMax scaling using 
    Fit-Transform for the first non-NaN chunk and then Transform for others.
    """
    if scaler_manager is None:
        raise ValueError("scaler_manager (TimeSeriesScaler instance) must be provided.")
        
    split_cfg = cf.data_processing_parameters("train_eval_split")
    base_path = split_cfg["base_path"].format(symbol=symbol)
        
    # Align with Forex week (Monday-Sunday)
    df['time'] = pd.to_datetime(df['time'], utc=True)
    df = df.set_index('time')
    
    # W-MON သည် တနင်္လာနေ့တွင် စတင်သော အပတ်ကို ကိုယ်စားပြုသည်။
    groups = df.groupby(pd.Grouper(freq=freq))
    
    # Indicators columns
    indicator_cols = ['macd', 'boll_ub', 'boll_lb', 'rsi_30', 'dx_30', 'close_30_sma', 'close_60_sma', 'atr']
    
    for week_start, week_df in groups:
        if week_df.empty:
            continue
        
        # 1. Check raw indicators to determine Eval set (Data Leakage မဖြစ်စေရန်)
        first_row = week_df[indicator_cols].iloc[0]
        has_nan = first_row.isna().any()
        has_zero = (first_row == 0).any()
        is_eval = has_nan or has_zero # Indicator များ မပြည့်စုံသေးသော အပတ်ကို Eval အဖြစ် သတ်မှတ်
        
        # Data အရေအတွက် စစ်ဆေးခြင်း
        if len(week_df) < 1440:
            logger.warning(f"Skipping {week_start}: {len(week_df)}/1440 bars")
            continue
        
        # 2. Normalize and validate (Fit-Transform Logic)
        if not scaler_manager.is_fitted and not is_eval:
            # Scaler ကို ပထမဆုံးသော၊ Indicators ပြည့်စုံသော (is_eval=False) Training Set တွင် Fit လုပ်ပါ
            week_df = scaler_manager.fit_and_transform(week_df)
            dir_type = 'train'
        elif scaler_manager.is_fitted:
            # Scaler Fit ပြီးပါက၊ Train နှင့် Eval နှစ်ခုလုံးကို Transform လုပ်ပါ
            week_df = scaler_manager.transform(week_df)
            dir_type = 'eval' if is_eval else 'train'
        else:
            # Fit မလုပ်ရသေးဘဲ is_eval ဖြစ်နေရင် ကျော်သွားပါ (Indicators မပြည့်သေးလို့ Fit မလုပ်သင့်)
            logger.warning(f"Skipping {week_start}: Indicators not ready for fitting and not fitted yet.")
            continue


        # 3. Save to appropriate directory
        path = os.path.join(base_path, split_cfg[f"{dir_type}_dir"])
        os.makedirs(path, exist_ok=True)
        
        iso_year, iso_week, _ = week_start.isocalendar()
        fname = f"{symbol}_{iso_year}_{iso_week:02d}.csv"
        week_df.reset_index().to_csv(f"{path}/{fname}", index=False)
        logger.critical(f"Saved {dir_type} file: {fname}")

In [7]:
symbol = 'EURUSD'
file = f'./drive/MyDrive/data/raw/{symbol}_M5.csv'
# 1. Load & clean
raw = pd.read_csv(file)
raw.tail(3)


Unnamed: 0,2022-03-16 15:25,1.10006,1.10027,1.09966,1.1002,679
199996,2024-11-15 02:45,1.05378,1.05426,1.05378,1.05398,1085
199997,2024-11-15 02:50,1.05399,1.05416,1.05398,1.05416,964
199998,2024-11-15 02:55,1.05416,1.05422,1.05405,1.0541,1170


In [8]:
cf = EnvConfig('./drive/MyDrive/configure.json')  
df = patch_missing_data(raw,cf=cf)
df.tail(3)

shape of  column: 6
  new_index = pd.date_range(
Week 2022-03-21 00:00:00+00:00 has 1728/1440 bars
Week 2022-03-28 00:00:00+00:00 has 2016/1440 bars
Week 2022-04-04 00:00:00+00:00 has 2016/1440 bars
Week 2022-04-11 00:00:00+00:00 has 2016/1440 bars
Week 2022-04-18 00:00:00+00:00 has 2016/1440 bars
Week 2022-04-25 00:00:00+00:00 has 2016/1440 bars
Week 2022-05-02 00:00:00+00:00 has 2016/1440 bars
Week 2022-05-09 00:00:00+00:00 has 2016/1440 bars
Week 2022-05-16 00:00:00+00:00 has 2016/1440 bars
Week 2022-05-23 00:00:00+00:00 has 2016/1440 bars
Week 2022-05-30 00:00:00+00:00 has 2016/1440 bars
Week 2022-06-06 00:00:00+00:00 has 2016/1440 bars
Week 2022-06-13 00:00:00+00:00 has 2016/1440 bars
Week 2022-06-20 00:00:00+00:00 has 2016/1440 bars
Week 2022-06-27 00:00:00+00:00 has 2016/1440 bars
Week 2022-07-04 00:00:00+00:00 has 2016/1440 bars
Week 2022-07-11 00:00:00+00:00 has 2016/1440 bars
Week 2022-07-18 00:00:00+00:00 has 2016/1440 bars
Week 2022-07-25 00:00:00+00:00 has 2016/1440 bars
W

Unnamed: 0,time,open,high,low,close,vol
201787,2024-11-15 03:45:00+00:00,1.05416,1.05422,1.05405,1.0541,1170.0
201788,2024-11-15 03:50:00+00:00,1.05416,1.05422,1.05405,1.0541,1170.0
201789,2024-11-15 03:55:00+00:00,1.05416,1.05422,1.05405,1.0541,1170.0


In [9]:
# 2. Feature engineering
df = add_time_feature(df, symbol=symbol)
df.tail(3)

Unnamed: 0,time,open,high,low,close,vol,weekday,day,week,month,...,hour,hour_sin,hour_cos,minute_block,minute_sin,minute_cos,london_session,ny_session,overlap_session,symbol
201787,2024-11-15 03:45:00+00:00,1.05416,1.05422,1.05405,1.0541,1170.0,4,15,46,11,...,3,0.707107,0.707107,9,-1.0,-0.0,0,0,0,EURUSD
201788,2024-11-15 03:50:00+00:00,1.05416,1.05422,1.05405,1.0541,1170.0,4,15,46,11,...,3,0.707107,0.707107,10,-0.866025,0.5,0,0,0,EURUSD
201789,2024-11-15 03:55:00+00:00,1.05416,1.05422,1.05405,1.0541,1170.0,4,15,46,11,...,3,0.707107,0.707107,11,-0.5,0.866025,0,0,0,EURUSD


In [10]:
df = tech_indicators(df, cf=cf) 
df.tail(3)

Unnamed: 0,time,open,high,low,close,vol,weekday,day,week,month,...,boll_ub,boll_lb,rsi_30,dx_30,close_30_sma,close_60_sma,atr,returns_5,returns_24,volatility_ratio
201787,2024-11-15 03:45:00+00:00,1.05416,1.05422,1.05405,1.0541,1170.0,4,15,46,11,...,1.054244,1.053825,-0.575375,1.088574,1.053937,1.056404,0.567566,0.0,0.000798,0.000161
201788,2024-11-15 03:50:00+00:00,1.05416,1.05422,1.05405,1.0541,1170.0,4,15,46,11,...,1.054244,1.053847,-0.575375,1.083338,1.053935,1.056391,0.566534,0.0,0.000703,0.000161
201789,2024-11-15 03:55:00+00:00,1.05416,1.05422,1.05405,1.0541,1170.0,4,15,46,11,...,1.054241,1.053871,-0.575375,1.078119,1.053933,1.056377,0.566792,0.0,0.000874,0.000161


In [12]:
# Scaler instance ကို တည်ဆောက်ပါ
scaler_manager = TimeSeriesScaler()

# Function ကို ခေါ်ပါ
split_time_series_v2(df, freq='W-FRI', symbol=symbol, cf=cf, scaler_manager=scaler_manager)

Skipping 2022-03-18 00:00:00+00:00: 654/1440 bars
Saved train file: EURUSD_2022_12.csv
Saved train file: EURUSD_2022_13.csv
Saved train file: EURUSD_2022_14.csv
Saved train file: EURUSD_2022_15.csv
Saved train file: EURUSD_2022_16.csv
Saved train file: EURUSD_2022_17.csv
Saved train file: EURUSD_2022_18.csv
Saved train file: EURUSD_2022_19.csv
Saved train file: EURUSD_2022_20.csv
Saved train file: EURUSD_2022_21.csv
Saved train file: EURUSD_2022_22.csv
Saved train file: EURUSD_2022_23.csv
Saved train file: EURUSD_2022_24.csv
Saved train file: EURUSD_2022_25.csv
Saved train file: EURUSD_2022_26.csv
Saved train file: EURUSD_2022_27.csv
Saved train file: EURUSD_2022_28.csv
Saved train file: EURUSD_2022_29.csv
Saved train file: EURUSD_2022_30.csv
Saved train file: EURUSD_2022_31.csv
Saved train file: EURUSD_2022_32.csv
Saved train file: EURUSD_2022_33.csv
Saved train file: EURUSD_2022_34.csv
Saved train file: EURUSD_2022_35.csv
Saved train file: EURUSD_2022_36.csv
Saved train file: EURUSD_