In [1]:
pip install ta

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
pre_df = pd.read_csv('pre_market_data.csv')

In [3]:
pre_df['body'] = (pre_df['close'] - pre_df['open']).abs()
pre_df['upper_wick'] = pre_df['high'] - pre_df[['open', 'close']].max(axis=1)
pre_df['lower_wick'] = pre_df[['open', 'close']].min(axis=1) - pre_df['low']
pre_df['candle_range'] = pre_df['high'] - pre_df['low']
pre_df['body_to_wick_ratio'] = pre_df['body'] / (pre_df['upper_wick'] + pre_df['lower_wick'] + 1e-6)
pre_df['bullish'] = (pre_df['close'] > pre_df['open']).astype(int)

In [4]:
grouped = pre_df.groupby('date')

In [5]:
import numpy as np

# Calculate returns
pre_df['return'] = pre_df['close'].pct_change()
pre_df['log_return'] = np.log(pre_df['close'] / pre_df['close'].shift(1))

# Previous close for true range
pre_df['prev_close'] = pre_df['close'].shift(1)

# True Range: max of (high - low, |high - prev_close|, |low - prev_close|)
pre_df['true_range'] = pre_df[['high', 'low', 'prev_close']].apply(
    lambda row: max(
        row['high'] - row['low'],
        abs(row['high'] - row['prev_close']),
        abs(row['low'] - row['prev_close'])
    ), axis=1
)

# Momentum: 3-period return
pre_df['momentum_3'] = pre_df['close'].pct_change(3)

# Group again
grouped = pre_df.groupby('date')

# Aggregate features
price_action_features = grouped.agg({
    'open': 'first',
    'close': 'last',
    'high': 'max',
    'low': 'min',
    'return': ['mean', 'std'],
    'log_return': 'std',
    'tick_volume': 'sum',
    'momentum_3': 'last',
    'true_range': 'mean'
})

# Flatten multi-index columns
price_action_features.columns = [
    'first_open', 'last_close', 'max_high', 'min_low',
    'avg_return', 'volatility_return',
    'volatility_log_return', 'total_volume',
    'last_momentum_3', 'avg_true_range'
]

# Session-level calculations
price_action_features['session_return'] = (
    price_action_features['last_close'] / price_action_features['first_open'] - 1
)
price_action_features['range'] = (
    price_action_features['max_high'] - price_action_features['min_low']
)
price_action_features['range_ratio'] = (
    price_action_features['range'] / price_action_features['first_open']
)
price_action_features['range_ratio_close'] = (
    price_action_features['range'] / price_action_features['last_close']
)

# Up/down candle counts
price_action_features['up_moves'] = grouped.apply(lambda x: (x['close'] > x['open']).sum())
price_action_features['down_moves'] = grouped.apply(lambda x: (x['close'] < x['open']).sum())
price_action_features['up_move_ratio'] = (
    price_action_features['up_moves'] /
    (price_action_features['up_moves'] + price_action_features['down_moves'])
)

# Reset index if needed
price_action_features = price_action_features.reset_index()

# Final output
print(price_action_features.head())

price_action_features.to_csv('price_action_features.csv')


         date  first_open  last_close  max_high  min_low  avg_return  \
0  2024-01-08     2022.60     2028.94   2030.46  2018.31    0.000083   
1  2024-01-09     2032.76     2034.75   2042.09  2031.06    0.000025   
2  2024-01-10     2026.70     2025.54   2040.37  2023.72   -0.000040   
3  2024-01-11     2032.94     2029.39   2045.80  2024.76    0.000017   
4  2024-01-12     2035.98     2057.63   2062.33  2034.04    0.000122   

   volatility_return  volatility_log_return  total_volume  last_momentum_3  \
0           0.000618               0.000618        124663         0.000399   
1           0.000461               0.000461        217012         0.000836   
2           0.000564               0.000564        212728        -0.000587   
3           0.000632               0.000632        289535         0.000794   
4           0.000616               0.000616        332701        -0.000204   

   avg_true_range  session_return  range  range_ratio  range_ratio_close  \
0        2.067021     

  price_action_features['up_moves'] = grouped.apply(lambda x: (x['close'] > x['open']).sum())
  price_action_features['down_moves'] = grouped.apply(lambda x: (x['close'] < x['open']).sum())


In [20]:
geometry_features = grouped.agg({
    'body': ['mean', 'std', 'max', 'min'],
    'candle_range': 'mean',
    'close': ['std', 'last'],
    'bullish': ['sum', lambda x: len(x) - x.sum()],  # bullish + bearish count
    'open': 'first'
})

geometry_features.columns = [
    'avg_body', 'std_body', 'max_body', 'min_body',
    'avg_candle_range',
    'std_close', 'last_close',
    'bullish_count', 'bearish_count',
    'first_open'
]

geometry_features['min_body'].fillna(0.0, inplace=True)

geometry_features['first_last_change'] = geometry_features['last_close'] - geometry_features['first_open']

geometry_features.to_csv('geometry_features.csv')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  geometry_features['min_body'].fillna(0.0, inplace=True)


In [7]:
# Basic candle components
pre_df['body'] = abs(pre_df['close'] - pre_df['open'])
pre_df['upper_wick'] = pre_df['high'] - pre_df[['close', 'open']].max(axis=1)
pre_df['lower_wick'] = pre_df[['close', 'open']].min(axis=1) - pre_df['low']
pre_df['range'] = pre_df['high'] - pre_df['low']

# Avoid division by zero
pre_df['range'] = pre_df['range'].replace(0, np.nan)

# --- Pattern Detection Functions ---

# 1. Doji: Small body, large wicks
pre_df['is_doji'] = (pre_df['body'] / pre_df['range'] < 0.1)

# 2. Hammer: Small body near top, long lower wick
pre_df['is_hammer'] = (
    (pre_df['body'] / pre_df['range'] < 0.3) &
    (pre_df['lower_wick'] > 2 * pre_df['body']) &
    (pre_df['upper_wick'] < pre_df['body'])
)

# 3. Inverted Hammer: Small body near bottom, long upper wick
pre_df['is_inverted_hammer'] = (
    (pre_df['body'] / pre_df['range'] < 0.3) &
    (pre_df['upper_wick'] > 2 * pre_df['body']) &
    (pre_df['lower_wick'] < pre_df['body'])
)

# 4. Engulfing (Bullish & Bearish) - need previous row
pre_df['prev_open'] = pre_df['open'].shift(1)
pre_df['prev_close'] = pre_df['close'].shift(1)

pre_df['is_bullish_engulfing'] = (
    (pre_df['prev_close'] < pre_df['prev_open']) &  # previous candle bearish
    (pre_df['close'] > pre_df['open']) &  # current candle bullish
    (pre_df['open'] < pre_df['prev_close']) & (pre_df['close'] > pre_df['prev_open'])
)

pre_df['is_bearish_engulfing'] = (
    (pre_df['prev_close'] > pre_df['prev_open']) &  # previous candle bullish
    (pre_df['close'] < pre_df['open']) &  # current candle bearish
    (pre_df['open'] > pre_df['prev_close']) & (pre_df['close'] < pre_df['prev_open'])
)

# Shifted values for 3-candle pattern
pre_df['c1_open'] = pre_df['open'].shift(2)
pre_df['c1_close'] = pre_df['close'].shift(2)
pre_df['c2_open'] = pre_df['open'].shift(1)
pre_df['c2_close'] = pre_df['close'].shift(1)
pre_df['c3_open'] = pre_df['open']
pre_df['c3_close'] = pre_df['close']

# Morning Star Pattern
pre_df['is_morning_star'] = (
    # 1st candle: bearish
    (pre_df['c1_close'] < pre_df['c1_open']) &
    
    # 2nd candle: small body (Doji/spinning top)
    (abs(pre_df['c2_close'] - pre_df['c2_open']) / (pre_df['high'].shift(1) - pre_df['low'].shift(1) + 1e-8) < 0.3) &
    
    # 3rd candle: bullish
    (pre_df['c3_close'] > pre_df['c3_open']) &
    
    # 3rd candle closes into or above midpoint of 1st candle
    (pre_df['c3_close'] >= (pre_df['c1_open'] + pre_df['c1_close']) / 2)
)

pattern_cols = ['is_doji', 'is_hammer', 'is_inverted_hammer', 'is_bullish_engulfing', 'is_bearish_engulfing', 'is_morning_star']
pattern_features = pre_df.groupby('date')[pattern_cols].sum().add_prefix('count_')

pattern_features.to_csv('patterns_features.csv')

In [8]:
import ta

# RSI (Relative Strength Index)
pre_df['rsi_14'] = ta.momentum.RSIIndicator(close=pre_df['close'], window=14).rsi()

# EMA (Exponential Moving Average)
pre_df['ema_20'] = ta.trend.EMAIndicator(close=pre_df['close'], window=20).ema_indicator()

# MACD
macd = ta.trend.MACD(close=pre_df['close'])
pre_df['macd'] = macd.macd()
pre_df['macd_signal'] = macd.macd_signal()
pre_df['macd_diff'] = macd.macd_diff()

# ATR (Average True Range)
atr = ta.volatility.AverageTrueRange(high=pre_df['high'], low=pre_df['low'], close=pre_df['close'], window=14)
pre_df['atr_14'] = atr.average_true_range()

# Bollinger Bands
bb = ta.volatility.BollingerBands(close=pre_df['close'], window=20, window_dev=2)
pre_df['bb_upper'] = bb.bollinger_hband()
pre_df['bb_lower'] = bb.bollinger_lband()
pre_df['bb_mavg'] = bb.bollinger_mavg()

In [9]:
grouped = pre_df.groupby('date')
tech_features = grouped.agg({
    'rsi_14': ['mean', 'std'],
    'ema_20': ['mean'],
    'macd': ['mean'],
    'macd_signal': ['mean'],
    'macd_diff': ['mean', 'std'],
    'atr_14': ['mean'],
    'bb_upper': ['mean'],
    'bb_lower': ['mean'],
    'bb_mavg': ['mean']
})
tech_features.columns = ['_'.join(col) for col in tech_features.columns]

tech_features.to_csv('tech_features.csv')

In [10]:
print(price_action_features.columns)

Index(['date', 'first_open', 'last_close', 'max_high', 'min_low', 'avg_return',
       'volatility_return', 'volatility_log_return', 'total_volume',
       'last_momentum_3', 'avg_true_range', 'session_return', 'range',
       'range_ratio', 'range_ratio_close', 'up_moves', 'down_moves',
       'up_move_ratio'],
      dtype='object')


In [11]:
print(geometry_features.columns)

Index(['avg_body', 'std_body', 'max_body', 'min_body', 'avg_candle_range',
       'volatility_close', 'bullish_count', 'bearish_count', 'first_open',
       'last_close', 'first_last_change'],
      dtype='object')


In [12]:
print(tech_features.columns)

Index(['rsi_14_mean', 'rsi_14_std', 'ema_20_mean', 'macd_mean',
       'macd_signal_mean', 'macd_diff_mean', 'macd_diff_std', 'atr_14_mean',
       'bb_upper_mean', 'bb_lower_mean', 'bb_mavg_mean'],
      dtype='object')


In [13]:
print(pattern_features.columns)

Index(['count_is_doji', 'count_is_hammer', 'count_is_inverted_hammer',
       'count_is_bullish_engulfing', 'count_is_bearish_engulfing',
       'count_is_morning_star'],
      dtype='object')


In [19]:
print(geometry_features.head)

<bound method NDFrame.head of             avg_body  std_body  max_body  min_body  avg_candle_range  \
date                                                                   
2024-01-08  1.020000  0.742308      2.89      0.03          2.066809   
2024-01-09  0.708142  0.569034      2.97      0.02          1.630088   
2024-01-10  0.626018  0.587435      3.26      0.01          1.478584   
2024-01-11  0.702832  0.858658      5.13      0.00          1.793451   
2024-01-12  0.789381  0.765020      3.69      0.00          1.768407   
...              ...       ...       ...       ...               ...   
2025-05-26  1.137232  0.926524      5.13      0.01          2.346161   
2025-05-27  1.775000  1.963766     12.61      0.03          3.649636   
2025-05-28  1.423717  1.117491      6.11      0.02          2.819027   
2025-05-29  1.572478  1.300388      5.80      0.01          2.945310   
2025-05-30  1.508761  1.615036     13.91      0.00          3.126372   

            volatility_close  bul