In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load pre-market data
pre_df = pd.read_csv('pre_market_data.csv') 

pre_df['time_ny'] = pd.to_datetime(pre_df['time_ny'])
pre_df['date'] = pd.to_datetime(pre_df['date'])

In [3]:
# --- STEP 1: Compute and shift previous day's levels ---
session_levels = pre_df.groupby('date').agg({
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'tick_volume': 'sum'
})

# Shift the values to get previous day's levels
session_levels['prev_high'] = session_levels['high'].shift(1)
session_levels['prev_low'] = session_levels['low'].shift(1)
session_levels['prev_close'] = session_levels['close'].shift(1)

session_levels.drop(columns=['high', 'low'], inplace=True)
session_levels.rename(columns={'tick_volume': 'session_volume'}, inplace=True)

In [4]:
# --- STEP 2: Multi-Day Rolling Levels ---
session_levels['rolling_high'] = session_levels['prev_high'].shift(1)
session_levels['rolling_low'] = session_levels['prev_low'].shift(1)

spread = session_levels['rolling_high'] - session_levels['rolling_low']
session_levels['zone_mid'] = (session_levels['rolling_high'] + session_levels['rolling_low']) / 2
session_levels['zone_top'] = session_levels['zone_mid'] + spread / 4
session_levels['zone_bottom'] = session_levels['zone_mid'] - spread / 4

In [5]:
# --- STEP 4: Volume-Based Zones ---
volume_zones = []

for date, group in pre_df.groupby('date'):
    bins = np.linspace(group['low'].min(), group['high'].max(), 20)
    group = group.copy()
    group['price_bin'] = pd.cut(group['close'], bins=bins)
    vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()

    if vol_per_bin.empty:
        volume_zones.append((date, np.nan, np.nan))
        continue

    top_bins = vol_per_bin.sort_values(ascending=False).head(2).index
    vol_zone_low = top_bins.min().left
    vol_zone_high = top_bins.max().right

    volume_zones.append((date, vol_zone_high, vol_zone_low))

volume_df = pd.DataFrame(volume_zones, columns=['date', 'vol_zone_top', 'vol_zone_bottom'])
volume_df.set_index('date', inplace=True)
session_levels = session_levels.merge(volume_df, left_index=True, right_index=True, how='left')

  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin')['tick_volume'].sum()
  vol_per_bin = group.groupby('price_bin

In [6]:
# --- STEP 5: Add Breakout Features in Pre-Market ---
pre_df = pre_df.merge(session_levels[['prev_high', 'prev_low', 'zone_top', 'zone_bottom', 'prev_close']], on='date', how='left')

pre_df['broke_above_prev_high'] = pre_df['high'] > pre_df['prev_high']
pre_df['broke_below_prev_low'] = pre_df['low'] < pre_df['prev_low']
pre_df['broke_above_zone_top'] = pre_df['high'] > pre_df['zone_top']
pre_df['broke_below_zone_bottom'] = pre_df['low'] < pre_df['zone_bottom']
pre_df['broke_above_prev_close'] = pre_df['high'] > pre_df['prev_close']
pre_df['broke_below_prev_close'] = pre_df['low'] < pre_df['prev_close']

In [7]:
# --- STEP 6: Breakout Summary ---
breakout_stats = pre_df.groupby('date').agg({
    'broke_above_prev_high': ['any', 'sum'],
    'broke_below_prev_low': ['any', 'sum'],
    'broke_above_zone_top': ['any', 'sum'],
    'broke_below_zone_bottom': ['any', 'sum'],
    'broke_above_prev_close': ['any', 'sum'],
    'broke_below_prev_close': ['any', 'sum']
})
breakout_stats.columns = ['_'.join(col) for col in breakout_stats.columns]

first_breakout_above_close = pre_df[pre_df['broke_above_prev_close']].groupby('date')['time_ny'].min()
first_breakout_below_close = pre_df[pre_df['broke_below_prev_close']].groupby('date')['time_ny'].min()
first_breakout_above_close = (first_breakout_above_close.dt.hour * 60 + first_breakout_above_close.dt.minute).rename('minutes_to_breakout_above_close')
first_breakout_below_close = (first_breakout_below_close.dt.hour * 60 + first_breakout_below_close.dt.minute).rename('minutes_to_breakout_below_close')

breakout_stats = breakout_stats.merge(first_breakout_above_close, on='date', how='left')
breakout_stats = breakout_stats.merge(first_breakout_below_close, on='date', how='left')

In [8]:
# --- STEP 6: Outlier Filtering ---
z_thresh = 2.5
session_levels['zscore_high'] = (
    (session_levels['prev_high'] - session_levels['prev_high'].rolling(10).mean()) /
    session_levels['prev_high'].rolling(10).std()
)
session_levels['zscore_low'] = (
    (session_levels['prev_low'] - session_levels['prev_low'].rolling(10).mean()) /
    session_levels['prev_low'].rolling(10).std()
)
session_levels.loc[session_levels['zscore_high'].abs() > z_thresh, 'prev_high'] = np.nan
session_levels.loc[session_levels['zscore_low'].abs() > z_thresh, 'prev_low'] = np.nan
session_levels.drop(['zscore_high', 'zscore_low'], axis=1, inplace=True)

merged = pre_df.merge(
    session_levels[['prev_high', 'prev_low', 'prev_close']].reset_index(),
    on='date',
    how='left'
)

# Rename for clarity
merged.rename(columns={
    'prev_high_y': 'prev_high',
    'prev_low_y': 'prev_low',
    'prev_close_y': 'prev_close'
}, inplace=True)

# (Optional) Drop the other ones if you don’t need them
merged.drop(columns=['prev_high_x', 'prev_low_x', 'prev_close_x'], inplace=True)

# Drop rows with missing previous session levels (normal for first day)
merged.dropna(subset=['prev_high', 'prev_low', 'prev_close'], inplace=True)

# Now you can compute features
merged['dist_to_prev_high'] = merged['close'] - merged['prev_high']
merged['dist_to_prev_low'] = merged['close'] - merged['prev_low']
merged['dist_to_prev_close'] = merged['close'] - merged['prev_close']

range_eps = (merged['prev_high'] - merged['prev_low']) + 1e-6
merged['range_to_prev_high_ratio'] = merged['dist_to_prev_high'] / range_eps
merged['range_to_prev_low_ratio'] = merged['dist_to_prev_low'] / range_eps

merged['broke_prev_high'] = (merged['high'] > merged['prev_high']).astype(int)
merged['broke_prev_low'] = (merged['low'] < merged['prev_low']).astype(int)
merged['breakout_above_magnitude'] = np.maximum(0, merged['high'] - merged['prev_high'])
merged['breakout_below_magnitude'] = np.maximum(0, merged['prev_low'] - merged['low'])


In [9]:
# --- STEP 8: Aggregate Final Support/Resistance Features ---
support_resistance_features = merged.groupby('date').agg({
    'dist_to_prev_high': ['mean', 'min'],
    'dist_to_prev_low': ['mean', 'max'],
    'range_to_prev_high_ratio': 'mean',
    'range_to_prev_low_ratio': 'mean',
    'broke_prev_high': 'max',
    'broke_prev_low': 'max',
    'breakout_above_magnitude': 'max',
    'breakout_below_magnitude': 'max'
})

support_resistance_features.columns = [
    'avg_dist_to_prev_high', 'min_dist_to_prev_high',
    'avg_dist_to_prev_low', 'max_dist_to_prev_low',
    'avg_ratio_to_high', 'avg_ratio_to_low',
    'broke_prev_high', 'broke_prev_low',
    'breakout_above_magnitude', 'breakout_below_magnitude'
]

final_df = pd.concat([session_levels, support_resistance_features, breakout_stats], axis=1).dropna().reset_index()
final_df.to_csv('support_resistance_features_enhanced.csv', index=False)
print(final_df)

          date    close  session_volume  prev_high  prev_low  prev_close  \
0   2024-01-10  2026.15          216736    2042.09   2031.06     2034.31   
1   2024-01-11  2030.77          294884    2040.37   2023.72     2026.15   
2   2024-01-15  2051.82          162827    2062.33   2034.04     2057.99   
3   2024-01-18  2013.86          254387    2029.08   2008.37     2008.93   
4   2024-01-22  2026.34          266587    2039.50   2020.73     2026.09   
..         ...      ...             ...        ...       ...         ...   
185 2025-05-23  3350.93           43683    3339.41   3283.15     3311.14   
186 2025-05-26  3335.30           30313    3364.53   3301.94     3350.93   
187 2025-05-27  3296.49           42888    3351.83   3323.52     3335.30   
188 2025-05-28  3305.94           36115    3341.46   3287.84     3296.49   
189 2025-05-29  3318.99           43633    3325.44   3294.40     3305.94   

     rolling_high  rolling_low  zone_mid   zone_top  ...  \
0         2030.65      2018

In [10]:
print(final_df.columns)

Index(['date', 'close', 'session_volume', 'prev_high', 'prev_low',
       'prev_close', 'rolling_high', 'rolling_low', 'zone_mid', 'zone_top',
       'zone_bottom', 'vol_zone_top', 'vol_zone_bottom',
       'avg_dist_to_prev_high', 'min_dist_to_prev_high',
       'avg_dist_to_prev_low', 'max_dist_to_prev_low', 'avg_ratio_to_high',
       'avg_ratio_to_low', 'broke_prev_high', 'broke_prev_low',
       'breakout_above_magnitude', 'breakout_below_magnitude',
       'broke_above_prev_high_any', 'broke_above_prev_high_sum',
       'broke_below_prev_low_any', 'broke_below_prev_low_sum',
       'broke_above_zone_top_any', 'broke_above_zone_top_sum',
       'broke_below_zone_bottom_any', 'broke_below_zone_bottom_sum',
       'broke_above_prev_close_any', 'broke_above_prev_close_sum',
       'broke_below_prev_close_any', 'broke_below_prev_close_sum',
       'minutes_to_breakout_above_close', 'minutes_to_breakout_below_close'],
      dtype='object')
