In [97]:
import pandas as pd
import numpy as np
import os 
from typing import Dict, Any


def convert_1m_to_5m_df(file_path: str) -> pd.DataFrame:
    """Load a headerless 1-minute OHLCV CSV and aggregate it into 5-minute candles.

    The file is expected to contain, in this order and without a header row,
    the columns Date, Time, Open, High, Low, Close, Volume, where Date and Time
    together match ``'%Y.%m.%d %H:%M'``.

    Args:
        file_path: path of the 1-minute CSV file.

    Returns:
        A frame with columns Datetime, Open, High, Low, Close, Volume holding
        one row per 5-minute bucket that had data and a positive volume.
        An empty DataFrame is returned (after printing a message) when the file
        does not exist or cannot be parsed.
    """
    COLUMN_NAMES = ['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']
    DATETIME_FORMAT = '%Y.%m.%d %H:%M'

    if not os.path.exists(file_path):
        print(f"üö® Error: File not found at path: {file_path}")
        return pd.DataFrame()

    try:
        df = pd.read_csv(file_path, sep=',', header=None, names=COLUMN_NAMES,
                         dtype={'Open': np.float64, 'High': np.float64, 'Low': np.float64, 'Close': np.float64})

        # Unparseable volume values become 0 so the column can be stored as integers.
        df['Volume'] = pd.to_numeric(df['Volume'], errors='coerce').fillna(0).astype(np.int64)

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty frame.
        print(f"üö® Error loading CSV file: {e}")
        return pd.DataFrame()

    # Build the timestamp from the Date and Time text columns; bad values become NaT.
    df['Datetime'] = pd.to_datetime(
        df['Date'].astype(str) + ' ' + df['Time'].astype(str),
        format=DATETIME_FORMAT,
        errors='coerce',
    )
    # Rows whose timestamp could not be parsed cannot be placed in any bucket,
    # so they are removed before resampling.
    df = (
        df.dropna(subset=['Datetime'])
          .set_index('Datetime')
          .drop(columns=['Date', 'Time'])
    )

    ohlcv_aggregation_rules: Dict[str, Any] = {
        'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum'
    }
    df = df.resample('5Min').agg(ohlcv_aggregation_rules)
    # Buckets without any 1-minute rows have NaN prices; drop them, then drop
    # buckets whose summed volume is zero.
    df = df.dropna()
    df = df[df['Volume'] > 0]

    print(f"‚úÖ Conversion successful! 5-Min rows: {len(df)}")
    # The index is already named 'Datetime', so reset_index yields that column.
    return df.reset_index()

In [99]:
# Source CSV handed to convert_1m_to_5m_df and destination CSV for its 5-minute result.
file_path_from_data = "./content/drive/MyDrive/data/raw/EURUSD_2020_all.csv"
file_path_to_data = "./content/drive/MyDrive/data/raw/EURUSD_2020_all_5.csv"

# Aggregate to 5-minute candles and save them without the positional index column.
raw = convert_1m_to_5m_df(file_path_from_data)
raw.to_csv(file_path_to_data, index=False)

‚úÖ Conversion successful! 5-Min rows: 149715


In [100]:
import json

class EnvConfig():
    """Accessor object over a JSON configuration file.

    The file is parsed once in the constructor and stored in ``self.config``.
    Every accessor returns either a whole section of that dictionary or a
    single entry from it; a missing key raises ``KeyError``.

    Args:
        config_file: path of the JSON file to load.
    """
    def __init__(self, config_file):
        self.config = {}
        with open(config_file) as handle:
            self.config = json.load(handle)

    def env_parameters(self, item=''):
        """Return ``config["env"][item]``, or the whole ``"env"`` section when ``item`` is falsy."""
        section = self.config["env"]
        return section[item] if item else section

    def symbol(self, asset="GBPUSD", item=''):
        """Return the settings of one trading pair.

        Args:
            asset (str, optional): key under ``config["symbol"]``. Defaults to "GBPUSD".
            item (str, optional): entry to read from that pair's settings; when
                falsy the pair's whole dictionary is returned. Defaults to ''.

        Returns:
            The requested entry, or the pair's full settings dictionary.
        """
        pair_settings = self.config["symbol"][asset]
        return pair_settings[item] if item else pair_settings

    def data_processing_parameters(self, item=''):
        """Return ``config["data_processing"][item]``, or the whole section when ``item`` is falsy."""
        section = self.config["data_processing"]
        if not item:
            return section
        return section[item]

    def trading_hour(self, place="NewYork"):
        """Return the trading-hour entry of one market.

        Args:
            place (str, optional): key under ``config["trading_hour"]``; when
                falsy the whole section is returned. Defaults to "NewYork".

        Returns:
            The entry stored for ``place``, or the full ``"trading_hour"`` section.
        """
        hours = self.config["trading_hour"]
        return hours[place] if place else hours

    def indicator(self, place="sma_fast_period"):
        """Return one indicator setting.

        Args:
            place (str, optional): key under ``config["data_processing"]["indicator"]``;
                when falsy the whole indicator section is returned.
                Defaults to "sma_fast_period".

        Returns:
            The value stored for ``place``, or the full indicator section.
        """
        settings = self.config["data_processing"]["indicator"]
        return settings[place] if place else settings



In [101]:
import pandas as pd
import logging

import logging
# Configure logging
logger = logging.getLogger(__name__)

def patch_missing_data(df, dt_col_name='time', cf=None, fill_limit=12):
    """Fill short gaps in 5-minute OHLC(V) bars, one week at a time.

    The frame's columns are renamed positionally to the configured
    ``required_cols`` (plus ``'Volume'`` when a sixth column is present), the
    datetime column is parsed as UTC, and every week (``W-SUN`` buckets) is
    re-indexed onto a regular 5-minute grid spanning that week's first and last
    bar. Rows created by the re-index are forward-filled, at most
    ``fill_limit`` consecutive bars; longer gaps stay NaN. The grid never
    extends across week boundaries.

    The caller's frame is not modified.

    Args:
        df: frame with 5 columns (datetime + OHLC) or 6 columns (+ volume), in
            the same positional order as ``required_cols``.
        dt_col_name: name of the datetime column after renaming. If it is not
            present, the first of 'time', 'timestamp', 'date', 'datetime',
            'Datetime' found in the columns is used.
        cf: configuration object whose ``data_processing_parameters(key)``
            returns ``"min_bars_per_week"`` and ``"required_cols"``.
        fill_limit: maximum number of consecutive missing bars to forward-fill
            (12 bars = 1 hour of 5-minute data).

    Returns:
        A frame with a tz-naive ``'Datetime'`` column followed by the data columns.

    Raises:
        ValueError: if ``cf`` is None, the column count is not 5 or 6, or a
            required column is missing.
        KeyError: if no datetime column can be identified.
    """
    if cf is None:
        raise ValueError("cf is required: it supplies 'min_bars_per_week' and 'required_cols'")

    min_bars = cf.data_processing_parameters("min_bars_per_week")
    required_cols = list(cf.data_processing_parameters("required_cols"))

    # Work on a copy so renaming columns and converting the datetime column
    # does not leak into the caller's frame.
    df = df.copy()

    # A sixth column is taken to be the volume.
    if df.shape[1] == 6:
        df.columns = required_cols + ['Volume']
    elif df.shape[1] == 5:
        df.columns = required_cols
    else:
        raise ValueError(f"Invalid number of columns: {df.shape[1]} =>{required_cols}")

    logger.warning(f"shape of  column: {df.shape[1]}")
    # 1. Column validation
    if missing := set(required_cols) - set(df.columns):
        raise ValueError(f"Missing columns: {missing}")

    # 2. Auto-detect the datetime column; an ordered tuple keeps the choice
    #    deterministic when several candidates are present.
    dt_candidates = ('time', 'timestamp', 'date', 'datetime', 'Datetime')
    if dt_col_name not in df.columns:
        found = [name for name in dt_candidates if name in df.columns]
        if not found:
            raise KeyError(f"No datetime column found. Tried: {dt_candidates}")
        dt_col_name = found[0]
        logger.info(f"Using datetime column: {dt_col_name}")

    # 3. Convert to a sorted UTC datetime index
    df[dt_col_name] = pd.to_datetime(df[dt_col_name], utc=True)
    df = df.set_index(dt_col_name).sort_index()

    # Columns to forward-fill. The volume column is called 'Volume' here, so it
    # has to be looked up under that name as well; otherwise patched rows keep
    # a NaN volume and are removed by any later dropna().
    fill_cols = ['open', 'high', 'low', 'close']
    fill_cols += [name for name in ('vol', 'Volume') if name in df.columns]

    patched_weeks = []  # one patched frame per non-empty week

    for w, week_df in df.groupby(pd.Grouper(freq='W-SUN')):
        if week_df.empty:
            continue

        if len(week_df) != min_bars:
            logger.warning(f"Week {w} has {len(week_df)}/{min_bars} bars")

        # Regular 5-minute grid between this week's first and last bar
        new_index = pd.date_range(
            start=week_df.index.min(),
            end=week_df.index.max(),
            freq='5min',
            tz='UTC'
        )

        # Re-index onto the grid, drop the timezone, then forward-fill the gaps
        week_df = week_df.reindex(new_index)
        week_df.index = week_df.index.tz_localize(None)
        week_df[fill_cols] = week_df[fill_cols].ffill(limit=fill_limit)
        patched_weeks.append(week_df)

    # Merge all weeks back together
    if patched_weeks:
        all_df = pd.concat(patched_weeks)
    else:
        # Nothing to patch: return the same layout as the patched path
        # (tz-naive timestamps in a 'Datetime' column).
        all_df = df.copy()
        all_df.index = all_df.index.tz_localize(None)

    all_df.index.name = 'Datetime'
    return all_df.reset_index()


In [82]:
# Load the JSON configuration that the processing functions read their settings from.
cf = EnvConfig('./content/drive/MyDrive/configure.json')
# Re-read the saved 5-minute bars and patch missing bars week by week.
raw = pd.read_csv(file_path_to_data)
df = patch_missing_data(raw,cf=cf)

shape of  column: 6
Week 2020-01-05 00:00:00+00:00 has 576/1440 bars
Week 2020-01-12 00:00:00+00:00 has 1439/1440 bars
Week 2020-03-29 00:00:00+00:00 has 1433/1440 bars
Week 2020-09-20 00:00:00+00:00 has 1439/1440 bars
Week 2020-10-04 00:00:00+00:00 has 1436/1440 bars
Week 2020-12-27 00:00:00+00:00 has 1152/1440 bars
Week 2021-01-03 00:00:00+00:00 has 1142/1440 bars
Week 2021-05-23 00:00:00+00:00 has 1439/1440 bars
Week 2021-05-30 00:00:00+00:00 has 1437/1440 bars
Week 2021-06-06 00:00:00+00:00 has 1439/1440 bars
Week 2021-09-19 00:00:00+00:00 has 1439/1440 bars
Week 2021-10-10 00:00:00+00:00 has 1439/1440 bars
Week 2021-12-12 00:00:00+00:00 has 1439/1440 bars
Week 2021-12-26 00:00:00+00:00 has 1438/1440 bars
Week 2022-01-02 00:00:00+00:00 has 1428/1440 bars


In [102]:
# Inspect the last rows of the patched frame.
df.tail(3)

Unnamed: 0,Datetime,open,high,low,close,Volume
149734,2021-12-31 23:45:00,1.1378,1.13801,1.13764,1.13795,172.0
149735,2021-12-31 23:50:00,1.13796,1.13823,1.13778,1.1378,216.0
149736,2021-12-31 23:55:00,1.1378,1.13781,1.1365,1.1366,210.0


In [103]:
def add_time_feature(df_5m: pd.DataFrame, cf=None, source_tz='UTC') -> pd.DataFrame:
    """Add hour-of-day and market-session features to a 5-minute bar frame.

    Timestamps are taken from the DatetimeIndex, or from a ``'Datetime'``
    column when the index is not a DatetimeIndex. Tz-naive timestamps are
    interpreted as wall-clock time in ``source_tz``; tz-aware timestamps keep
    their own zone. Either way they are converted to UTC before the features
    are computed. Naive timestamps that are ambiguous or nonexistent in
    ``source_tz`` (DST transitions) cannot be mapped to UTC and their rows are
    dropped.

    Added columns:
        hour_sin, hour_cos: cyclical encoding of the UTC hour, rounded to 6 decimals.
        london_session: 1 when the Europe/London local hour is within
            ``[from, to)`` of ``cf.trading_hour('London')``, else 0.
        ny_session: same for America/New_York and ``cf.trading_hour('NewYork')``.
        overlap_session: 1 when both sessions are open.

    The caller's frame is not modified.

    Args:
        df_5m: bar data with a DatetimeIndex or a 'Datetime' column.
        cf: configuration object whose ``trading_hour(place)`` returns a mapping
            with local-time ``'from'`` and ``'to'`` hours.
        source_tz: timezone used to interpret tz-naive timestamps.

    Returns:
        A new frame with the UTC timestamps in a leading column (named
        'Datetime' unless the input index carried another name) followed by the
        original and the added columns.

    Raises:
        ValueError: if ``cf`` is None, or there is neither a DatetimeIndex nor
            a 'Datetime' column.
        TypeError: if the 'Datetime' column does not convert to a DatetimeIndex.
    """
    if cf is None:
        raise ValueError("cf is required: it supplies the session hours via trading_hour()")

    # Copy first so the caller's index and 'Datetime' column stay untouched.
    df = df_5m.copy()

    # Ensure a datetime index
    if not isinstance(df.index, pd.DatetimeIndex):
        if 'Datetime' in df.columns:
            df['Datetime'] = pd.to_datetime(df['Datetime'])
            df = df.set_index('Datetime')
        else:
            raise ValueError("DataFrame must have a DatetimeIndex or a 'Datetime' column")

    # The conversion above can still yield a non-datetime index (e.g. values
    # that only parse to generic objects), so verify again.
    if not isinstance(df.index, pd.DatetimeIndex):
         raise TypeError("DataFrame ·Åè Index ·Äû·Ää·Ä∫ DatetimeIndex ·Äñ·Äº·ÄÖ·Ä∫·Äõ·Äï·Ä´·Äô·Ää·Ä∫·Åã")

    if df.index.tz is None:
        # Naive timestamps are wall-clock time in the source timezone.
        df = df.tz_localize(source_tz, ambiguous='NaT', nonexistent='NaT')
        # Timestamps that do not map to exactly one instant became NaT.
        df = df[df.index.notna()]
    # Aware timestamps (given or just localized) are expressed in UTC from here on.
    df = df.tz_convert('UTC')

    # ----------------------------------------------------
    # I. Cyclical encoding of the UTC hour
    # ----------------------------------------------------
    hours = df.index.hour.to_numpy()
    angle = 2 * np.pi * hours / 24
    df['hour_sin'] = np.round(np.sin(angle), 6)
    df['hour_cos'] = np.round(np.cos(angle), 6)

    # ----------------------------------------------------
    # II. DST-aware market sessions
    # ----------------------------------------------------
    # Session hours in the config are local times, so compare against the local
    # hour of each market; this follows daylight-saving changes.
    london_hour = df.index.tz_convert('Europe/London').hour
    ny_hour = df.index.tz_convert('America/New_York').hour

    ny = cf.trading_hour('NewYork')
    ldn = cf.trading_hour('London')

    df['london_session'] = ((london_hour >= ldn['from']) & (london_hour < ldn['to'])).astype(int)
    df['ny_session'] = ((ny_hour >= ny['from']) & (ny_hour < ny['to'])).astype(int)
    df['overlap_session'] = (df['london_session'] & df['ny_session']).astype(int)

    # The timestamps are returned as a regular column.
    return df.reset_index().rename(columns={'index': 'Datetime'})


In [104]:
# The broker's bars start at 00:00 broker time, so pass the broker timezone;
# a region-based zone handles the GMT+2 / GMT+3 switch automatically.
axiory_tz = 'Europe/Kiev'  

dft = add_time_feature(df, cf=cf, source_tz=axiory_tz)

In [105]:
# Inspect the first rows after the time features were added.
dft.head(2)

Unnamed: 0,Datetime,open,high,low,close,Volume,hour_sin,hour_cos,london_session,ny_session,overlap_session
0,2020-01-01 22:00:00+00:00,1.12117,1.12128,1.12087,1.12114,31.0,-0.5,0.866025,0,0,0
1,2020-01-01 22:05:00+00:00,1.12117,1.12124,1.12103,1.12103,44.0,-0.5,0.866025,0,0,0


In [106]:
# Save the time-feature frame; the indicator cell below reloads it from this file.
dft.to_csv('dft.csv', index=False)


In [107]:
from finta import TA

def tech_indicators(df, cf=None, vol_window=100):
    """Add trend, volatility and momentum features for the Forex RL environment.

    NOTE: ``df`` is modified in place (columns are added and every row that
    contains a NaN is dropped) and the same object is returned.

    Indicators come from finta's ``TA`` (``SMA``, ``RSI``, ``ATR``), using the
    periods returned by ``cf.indicator(...)``.

    Added columns:
        fast_ma, mid_ma, slow_ma: simple moving averages.
        rsi: RSI, forward-filled and rounded to 6 decimals.
        fast_ts, mid_ts, slow_ts: (close - MA) / MA.
        fast_td, mid_td, slow_td: sign of the value above (-1, 0, 1).
        fast_mid_gap, mid_slow_gap, fast_slow_gap: relative distance between two MAs.
        atr_base, atr_norm: forward-filled ATR and ATR divided by close.
        low_thr, high_thr: rolling 0.33 / 0.66 quantiles of atr_norm over
            ``vol_window`` bars.
        vol_low, vol_med, vol_high: one-hot volatility regime from those thresholds.
        momentum_score: (rsi - 50) / 50.
        mom_bearish, mom_neutral, mom_bullish: one-hot RSI regime
            (< 45, 45..55 inclusive, > 55).

    Args:
        df: bar data with at least the columns read here ('close') and those
            required by the ``TA`` functions.
        cf: configuration object whose ``indicator(key)`` returns
            'sma_fast_period', 'sma_mid_period', 'sma_slow_period',
            'atr_period' and 'rsi_period'.
        vol_window: number of bars used for the rolling volatility quantiles.

    Returns:
        ``df`` itself, with the new columns and without NaN rows (this removes
        the warm-up bars of the slowest indicator and of the rolling quantiles).

    Raises:
        ValueError: if ``cf`` is None.
    """
    if cf is None:
        raise ValueError("cf is required: it supplies the indicator periods via indicator()")

    price = df['close']
    sma_fast_period = cf.indicator('sma_fast_period')
    sma_mid_period = cf.indicator('sma_mid_period')
    sma_slow_period = cf.indicator('sma_slow_period')
    atr_period = cf.indicator('atr_period')
    rsi_period = cf.indicator('rsi_period')

    # --- Moving averages and RSI ---
    df['fast_ma'] = TA.SMA(df, period=sma_fast_period)
    df['mid_ma'] = TA.SMA(df, period=sma_mid_period)
    df['slow_ma'] = TA.SMA(df, period=sma_slow_period)
    df['rsi'] = TA.RSI(df, period=rsi_period).ffill().round(6)

    # --- Trend strength: relative distance of price from each average ---
    for speed in ('fast', 'mid', 'slow'):
        df[f'{speed}_ts'] = (price - df[f'{speed}_ma']) / df[f'{speed}_ma']

    # --- Trend direction: sign of the trend strength ---
    for speed in ('fast', 'mid', 'slow'):
        df[f'{speed}_td'] = np.sign(df[f'{speed}_ts'])

    # --- Relative gaps between the averages ---
    df['fast_mid_gap'] = (df['fast_ma'] - df['mid_ma']) / df['mid_ma']
    df['mid_slow_gap'] = (df['mid_ma'] - df['slow_ma']) / df['slow_ma']
    df['fast_slow_gap'] = (df['fast_ma'] - df['slow_ma']) / df['slow_ma']

    # --- Volatility: ATR as base measure, normalised by price ---
    df['atr_base'] = TA.ATR(df, period=atr_period).ffill()
    df['atr_norm'] = df['atr_base'] / price

    rolling_atr = df['atr_norm'].rolling(vol_window)
    df['low_thr'] = rolling_atr.quantile(0.33)
    df['high_thr'] = rolling_atr.quantile(0.66)

    # Volatility regime (one-hot). While the thresholds are still NaN all three
    # flags are 0; those rows are removed by the dropna below.
    df['vol_low'] = (df['atr_norm'] < df['low_thr']).astype(int)
    df['vol_med'] = ((df['atr_norm'] >= df['low_thr']) &
                    (df['atr_norm'] < df['high_thr'])).astype(int)
    df['vol_high'] = (df['atr_norm'] >= df['high_thr']).astype(int)

    # --- Momentum: RSI rescaled around 0, plus a one-hot regime ---
    df['momentum_score'] = (df['rsi'] - 50) / 50
    df['mom_bearish'] = (df['rsi'] < 45).astype(int)
    df['mom_neutral'] = ((df['rsi'] >= 45) & (df['rsi'] <= 55)).astype(int)
    df['mom_bullish'] = (df['rsi'] > 55).astype(int)

    # Drop every row that still has a NaN (indicator warm-up, unfilled gaps).
    df.dropna(inplace=True)

    return df

In [108]:
# Reload the time-feature frame from disk, add the technical indicators and save the result.
# tech_indicators modifies its argument in place, so dft and dfi refer to the same frame afterwards.
dft = pd.read_csv('dft.csv')
dfi = tech_indicators(dft, cf=cf)
dfi.to_csv('dfi.csv', index=False)

In [109]:
# dft was modified in place by tech_indicators, so this already shows the indicator columns.
dft.head(3)

Unnamed: 0,Datetime,open,high,low,close,Volume,hour_sin,hour_cos,london_session,ny_session,...,atr_norm,low_thr,high_thr,vol_low,vol_med,vol_high,momentum_score,mom_bearish,mom_neutral,mom_bullish
199,2020-01-02 14:35:00+00:00,1.11767,1.11802,1.11767,1.1178,518.0,-0.5,-0.866025,1,1,...,0.00037,0.000224,0.000272,0,0,1,-0.434099,1,0,0
200,2020-01-02 14:40:00+00:00,1.1178,1.11781,1.11712,1.11725,496.0,-0.5,-0.866025,1,1,...,0.000393,0.000229,0.000273,0,0,1,-0.536659,1,0,0
201,2020-01-02 14:45:00+00:00,1.11727,1.11746,1.11712,1.11727,506.0,-0.5,-0.866025,1,1,...,0.000393,0.000232,0.000276,0,0,1,-0.52583,1,0,0


In [127]:
def add_news_features(df: pd.DataFrame, news_df: pd.DataFrame, window_pre=30, window_post=30):
    """Add proximity-to-news features to a bar frame.

    For every event time in ``news_df['Start']`` (parsed as UTC):
        * bars within ``window_pre`` minutes before the event get a
          ``pre_news`` value that rises linearly from 0 (``window_pre`` minutes
          before) towards 1 (at the event);
        * bars within ``window_post`` minutes after the event get a
          ``post_news`` value that decays linearly from 1 towards 0;
        * a bar exactly at the event time gets 1.0 in both columns.

    When the windows of several events overlap, each bar keeps the largest
    value, so the result does not depend on the order of the events.

    Bar timestamps come from the DatetimeIndex, or from a 'Datetime' column
    when the index is not a DatetimeIndex. Tz-naive bar timestamps are compared
    as if they were UTC. Neither ``df`` nor ``news_df`` is modified.

    Args:
        df: bar data with a DatetimeIndex or a 'Datetime' column.
        news_df: frame with a 'Start' column holding the event times.
        window_pre: length of the pre-news window in minutes.
        window_post: length of the post-news window in minutes.

    Returns:
        A new frame with the timestamps in a leading column (named 'Datetime'
        unless the input index carried another name), the original columns and
        the added 'pre_news' / 'post_news' columns, both within [0, 1].

    Raises:
        ValueError: if there is neither a DatetimeIndex nor a 'Datetime' column.
    """
    # Copy so the caller's frame does not receive new columns or a converted column.
    df = df.copy()
    if not isinstance(df.index, pd.DatetimeIndex):
        if 'Datetime' in df.columns:
            df['Datetime'] = pd.to_datetime(df['Datetime'])
            df = df.set_index('Datetime')
        else:
            raise ValueError("DataFrame must have a DatetimeIndex or a 'Datetime' column")

    # Event times are UTC-aware, so compare against UTC bar times; the frame's
    # own index is left as it is.
    bar_times = df.index
    if bar_times.tz is None:
        bar_times = bar_times.tz_localize('UTC')
    else:
        bar_times = bar_times.tz_convert('UTC')

    # Parse into a new Series (news_df is left untouched); unparseable or
    # missing times match no bar, so they are skipped.
    event_times = pd.to_datetime(news_df['Start'], utc=True).dropna()

    pre = np.zeros(len(df))
    post = np.zeros(len(df))

    for news_time in event_times:
        # Signed distance of every bar from the event: negative before, positive after.
        minutes_from_news = np.asarray((bar_times - news_time).total_seconds()) / 60

        # --- Pre-news: 0 -> 1 ramp ---
        pre_mask = (minutes_from_news >= -window_pre) & (minutes_from_news < 0)
        if pre_mask.any():
            ramp = 1 - (-minutes_from_news[pre_mask] / window_pre)
            pre[pre_mask] = np.maximum(pre[pre_mask], ramp)

        # --- Post-news: 1 -> 0 decay ---
        post_mask = (minutes_from_news > 0) & (minutes_from_news <= window_post)
        if post_mask.any():
            decay = 1 - (minutes_from_news[post_mask] / window_post)
            post[post_mask] = np.maximum(post[post_mask], decay)

        # --- News candle itself ---
        exact_mask = minutes_from_news == 0
        pre[exact_mask] = 1.0
        post[exact_mask] = 1.0

    df['pre_news'] = np.clip(pre, 0, 1)
    df['post_news'] = np.clip(post, 0, 1)

    # The timestamps are returned as a regular column.
    return df.reset_index().rename(columns={'index': 'Datetime'})



In [128]:
# Load the news calendar and the saved indicator frame from disk.
news = pd.read_csv("calendar-event-list.csv")
dfi = pd.read_csv("dfi.csv")

# Add the pre/post-news features (30-minute windows on both sides) and save the result.
dfn = add_news_features(dfi, news, window_pre=30, window_post=30)
dfn.to_csv('dfn.csv', index=False)