In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import os
import xgboost as xgb
import pywt

In [2]:
def load_price_data(tickers, start_date="1980-01-01"):

    print("Downloading data from Yahoo Finance...")
    raw_data = yf.download(list(tickers.values()), start=start_date, group_by='ticker', auto_adjust=True)

    # Split into one cleaned DataFrame per ticker
    price_data = {}
    for label, yf_ticker in tickers.items():
        if yf_ticker in raw_data.columns.get_level_values(0):
            df = raw_data[yf_ticker].copy()
            df = df.dropna(subset=['Close'])
            df.index = pd.to_datetime(df.index)
            df.sort_index(inplace=True)
            price_data[label] = df
        else:
            print(f"⚠️ Skipping {label} ({yf_ticker}) — no data downloaded.")

    return price_data

tickers = {
    'ES': 'ES=F',
    'CL': 'CL=F',
    'ZN': 'ZN=F',
    '6E': '6E=F'
}

price_data = load_price_data(tickers)

Downloading data from Yahoo Finance...


[*********************100%***********************]  4 of 4 completed


In [15]:
def pre_process(price_data, selected_feature_dict, fred_api_key='946c422a593cc6d5cf114880e3d5a704',
                ewma_span=882, interaction_top_n=50, interaction_corr_threshold=0.5,
                interaction_na_thresh=0.9, lag_days=[1, 5, 21]):

    def engineer_features(df):
        features = pd.DataFrame(index=df.index)
        series = df['Close']

        features['month'] = df.index.month
        features['day_of_week'] = df.index.dayofweek
        features['month_sin'] = np.sin(2 * np.pi * df.index.month / 12)
        features['month_cos'] = np.cos(2 * np.pi * df.index.month / 12)
        features['dow_sin'] = np.sin(2 * np.pi * df.index.dayofweek / 7)
        features['dow_cos'] = np.cos(2 * np.pi * df.index.dayofweek / 7)

        features['log_return'] = np.log(series / series.shift(1))
        features['ma_50'] = series.rolling(50).mean()
        features['ma_200'] = series.rolling(200).mean()
        features['ma_trend_signal'] = np.sign(features['ma_50'] - features['ma_200'])
        features['price_diff_50'] = series - features['ma_50']
        features['price_diff_200'] = series - features['ma_200']

        ema_12 = series.ewm(span=12, adjust=False).mean()
        ema_26 = series.ewm(span=26, adjust=False).mean()
        features['macd'] = ema_12 - ema_26
        features['macd_signal'] = features['macd'].ewm(span=9, adjust=False).mean()
        features['macd_binary'] = np.sign(features['macd'] - features['macd_signal'])

        sma = series.rolling(20).mean()
        std = series.rolling(20).std()
        upper = sma + 2 * std
        lower = sma - 2 * std
        features['bb_upper'] = upper
        features['bb_lower'] = lower
        features['bb_width'] = upper - lower
        features['bb_position'] = (series - lower) / (upper - lower)

        features['fourier_power'] = series.rolling(64).apply(lambda x: np.abs(np.fft.fft(x))[1] if len(x) >= 2 else np.nan, raw=True)
        features['wavelet_detail'] = series.rolling(64).apply(lambda x: pywt.dwt(x, 'haar')[1][-1] if len(x) >= 2 else np.nan, raw=True)

        def compute_hurst(ts):
            lags = range(2, 20)
            tau = [np.std(np.subtract(ts[lag:], ts[:-lag])) for lag in lags]
            poly = np.polyfit(np.log(lags), np.log(tau), 1)
            return poly[0] if not np.isnan(poly[0]) else np.nan
        features['hurst_exponent'] = series.rolling(window=100).apply(compute_hurst, raw=False)

        log_open = np.log(df['Open'])
        log_close = np.log(df['Close'])
        log_high = np.log(df['High'])
        log_low = np.log(df['Low'])
        r_close = log_close.diff()
        r_open = log_open - log_close.shift(1)
        sigma_c = r_close.rolling(20).var()
        sigma_o = r_open.rolling(20).var()
        rs = (log_high - log_close) * (log_high - log_open) + (log_low - log_close) * (log_low - log_open)
        sigma_rs = rs.rolling(20).mean()
        k = 0.34 / (1.34 + (20 + 1) / (20 - 1))
        yz_var = k * sigma_o + (1 - k) * sigma_c + sigma_rs
        features['yang_zhang_vol'] = np.sqrt(yz_var)

        delta = series.diff()
        gain = delta.where(delta > 0, 0.0)
        loss = -delta.where(delta < 0, 0.0)
        avg_gain = gain.rolling(14).mean()
        avg_loss = loss.rolling(14).mean()
        rs = avg_gain / avg_loss
        features['rsi'] = 100 - (100 / (1 + rs))
        features['rsi_threshold'] = features['rsi'].apply(lambda x: 1 if x > 70 else (-1 if x < 30 else 0))

        tp = (df['High'] + df['Low'] + series) / 3
        sma_tp = tp.rolling(20).mean()
        mad = tp.rolling(20).apply(lambda x: np.mean(np.abs(x - np.mean(x))))
        features['cci'] = (tp - sma_tp) / (0.015 * mad)
        features['cci_threshold'] = features['cci'].apply(lambda x: 1 if x > 100 else (-1 if x < -100 else 0))

        prev_close = series.shift(1)
        tr = pd.concat([
            df['High'] - df['Low'],
            (df['High'] - prev_close).abs(),
            (df['Low'] - prev_close).abs()
        ], axis=1).max(axis=1)
        features['atr'] = tr.rolling(14).mean()

        plus_dm = df['High'].diff()
        minus_dm = df['Low'].diff().abs()
        plus_dm[plus_dm < minus_dm] = 0
        minus_dm[minus_dm < plus_dm] = 0
        tr = pd.concat([
            df['High'] - df['Low'],
            (df['High'] - df['Close'].shift()).abs(),
            (df['Low'] - df['Close'].shift()).abs()
        ], axis=1).max(axis=1)
        atr_tr = tr.rolling(14).mean()
        plus_di = 100 * (plus_dm.rolling(14).mean() / atr_tr)
        minus_di = 100 * (minus_dm.rolling(14).mean() / atr_tr)
        dx = 100 * (plus_di - minus_di).abs() / (plus_di + minus_di)
        features['adx'] = dx.rolling(14).mean()

        daily_return = series.pct_change()
        obv = df['Volume'].copy()
        obv[series.diff() < 0] *= -1
        obv[series.diff() == 0] = 0
        features['obv'] = obv.cumsum()
        features['vpt'] = (daily_return * df['Volume']).cumsum()

        features['parkinson_vol'] = (1 / (4 * np.log(2))) * (np.log(df['High'] / df['Low']) ** 2).rolling(10).mean().apply(np.sqrt)

        kurt_df = features.rolling(window=21).kurt().add_suffix('_kurtosis')
        skew_df = features.rolling(window=21).skew().add_suffix('_skew')
        roc_1d = features.pct_change(1).add_suffix('_roc_1d')
        roc_5d = features.pct_change(5).add_suffix('_roc_5d')
        roc_30d = features.pct_change(30).add_suffix('_roc_30d')

        features = pd.concat([features, kurt_df, skew_df, roc_1d, roc_5d, roc_30d], axis=1)

        features['5d_return'] = df['Close'].pct_change(5)


        return features.dropna(thresh=int(features.shape[1] * 0.8))

    def add_interaction_features(df):
        top_features = df.var().sort_values(ascending=False).head(interaction_top_n).index.tolist()
        corr_matrix = df[top_features].corr().abs()
        interaction_features = pd.DataFrame(index=df.index)
        for i in range(len(top_features)):
            for j in range(i + 1, len(top_features)):
                col1, col2 = top_features[i], top_features[j]
                if corr_matrix.loc[col1, col2] < interaction_corr_threshold:
                    interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
        enriched_df = pd.concat([df, interaction_features], axis=1)
        enriched_df = enriched_df.dropna(thresh=int(interaction_na_thresh * enriched_df.shape[1]))
        return enriched_df

    def ewma_standardize(enriched_dict):
        normalized = {}
        for label, df in enriched_dict.items():
            mean_ewma = df.ewm(span=ewma_span, adjust=False).mean()
            std_ewma = df.ewm(span=ewma_span, adjust=False).std()
            normalized[label] = (df - mean_ewma) / std_ewma
        return normalized

    def add_calendar_features(normalized_dict):
        from fredapi import Fred
        from pandas_datareader.data import DataReader
        fred = Fred(api_key=fred_api_key)
        fred_series = {
            'cpi': 'CPIAUCSL', 'gdp': 'GDP', 'nfp': 'PAYEMS', 'durable_goods': 'DGORDER',
            'pce': 'PCE', 'retail_sales': 'RSAFS', 'jolts': 'JTSJOL'
        }
        for label, df in normalized_dict.items():
            calendar_features = pd.DataFrame(index=df.index)
            for key, series_id in fred_series.items():
                try:
                    events = DataReader(series_id, 'fred', df.index.min(), df.index.max()).dropna()
                    event_dates = pd.to_datetime(events.index)
                    countdown_days = [(event_dates[event_dates >= date][0] - date).days
                                      if not event_dates[event_dates >= date].empty else np.nan
                                      for date in df.index]
                    calendar_features[f'{key}_decay'] = np.exp(-0.1 * pd.Series(countdown_days, index=df.index))
                except:
                    continue
            normalized_dict[label] = pd.concat([df, calendar_features], axis=1)
        return normalized_dict

    def select_manual_features(normalized_dict):
        selected = {}
        for label, df in normalized_dict.items():
            feats = selected_feature_dict.get(label, [])
            available = [f for f in feats if f in df.columns]
            if available:
                selected[label] = df[available].copy()
        return selected

    def create_lagged_features(manual_dict, raw_feature_dict):
        lagged_dict = {}
        for label, base_df in manual_dict.items():
            lagged_frames = [base_df]

            # ✅ Include lagged 5d_return from the original engineered features
            if label in raw_feature_dict and '5d_return' in raw_feature_dict[label].columns:
                for lag in lag_days:
                    colname = f"5d_return_lag{lag}"
                    lagged_series = raw_feature_dict[label]['5d_return'].shift(lag).rename(colname)
                    lagged_frames.append(lagged_series)

            # ✅ Add lags for selected features
            for lag in lag_days:
                lagged_frames.append(base_df.shift(lag).add_suffix(f"_lag{lag}"))

            lagged_df = pd.concat(lagged_frames, axis=1).dropna()
            lagged_dict[label] = lagged_df

        return lagged_dict


    print("🔧 Engineering base features...")
    engineered = {label: engineer_features(df) for label, df in price_data.items()}

    print("🔗 Adding interaction features...")
    enriched = {label: add_interaction_features(df) for label, df in engineered.items()}

    print("📏 Applying EWMA standardization...")
    normalized = ewma_standardize(enriched)

    print("📅 Adding calendar decay features from FRED...")
    normalized = add_calendar_features(normalized)

    print("✅ Selecting manual features...")
    manual_selected = select_manual_features(normalized)

    print("🕒 Generating lagged features...")
    lagged = create_lagged_features(manual_selected, raw_feature_dict=engineered)

    for label, df in processed_features.items():
        return_lags = [col for col in df.columns if col.startswith('5d_return_lag')]
        print(f"✅ {label} return lags: {return_lags if return_lags else '❌ Not found!'}")

    return lagged

selected_feature_dict = {
    'CL': [
        'fourier_power_x_cci_roc_1d',
        'cci_roc_1d_x_price_diff_200',
        'cci_roc_1d_x_vpt_roc_30d'
    ],
    '6E': [
        'parkinson_vol', 'cci_threshold_skew', 'atr_skew', 'parkinson_vol_skew',
        'ma_trend_signal_roc_1d', 'yang_zhang_vol_roc_30d',
        'month_cos_roc_30d_x_month_sin_roc_30d', 'month_cos_roc_30d_x_month_sin_roc_5d',
        'month_cos_roc_30d_x_vpt', 'month_cos_roc_30d_x_obv_roc_30d',
        'obv_x_vpt', 'obv_x_macd_binary_kurtosis',
        'price_diff_50_roc_5d_x_cci_roc_30d', 'price_diff_50_roc_5d_x_month_skew',
        'cci_x_month_cos_skew', 'price_diff_200_roc_30d_x_obv_roc_5d',
        'adx_x_ma_trend_signal_kurtosis', 'macd_binary_kurtosis_x_month_sin_skew',
        'close'
    ],
    'ZN': [
        'parkinson_vol_skew', 'yang_zhang_vol_roc_5d', 'parkinson_vol_roc_5d',
        'month_sin_roc_5d_x_macd_signal_roc_5d', 'month_sin_roc_5d_x_cci', 'month_sin_roc_5d_x_adx',
        'month_sin_roc_1d_x_cci_roc_30d', 'obv_x_ma_trend_signal_kurtosis', 'cci_x_obv_kurtosis',
        'price_diff_200_roc_30d_x_obv_roc_30d', 'rsi_x_obv_kurtosis', 'obv_roc_30d_x_obv_kurtosis',
        'obv_roc_30d_x_month_skew', 'month_x_yang_zhang_vol_kurtosis',
        'cci_threshold_kurtosis_x_vpt_roc_30d', 'gdp_decay'
    ],
    'ES': [
        'month_cos_roc_5d_x_price_diff_50_roc_30d', 'vpt_x_rsi_threshold_kurtosis',
        'fourier_power_x_cci', 'fourier_power_x_wavelet_detail',
        'bb_width_x_macd_binary_kurtosis', 'macd_x_rsi_threshold_kurtosis',
        'macd_signal_x_rsi_threshold_kurtosis', 'atr_x_rsi'
    ]
}

processed_features = pre_process(price_data, selected_feature_dict)


🔧 Engineering base features...


  return func(x, start, end, min_periods, *numba_args)
  roc_1d = features.pct_change(1).add_suffix('_roc_1d')
  roc_5d = features.pct_change(5).add_suffix('_roc_5d')
  roc_30d = features.pct_change(30).add_suffix('_roc_30d')
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  return func(x, start, end, min_periods, *numba_args)
  roc_1d = features.pct_change(1).add_suffix('_roc_1d')
  roc_5d = features.pct_change(5).add_suffix('_roc_5d')
  roc_30d = features.pct_change(30).add_suffix('_roc_30d')
  return func(x, start, end, min_periods, *numba_args)
  roc_1d = features.pct_change(1).add_suffix('_roc_1d')
  roc_5d = features.pct_change(5).add_suffix('_roc_5d')
  roc_30d = features.pct_change(30).add_suffix('_roc_30d')
  return func(x, start, end, min_periods, *numba_args)
  roc_1d = features.pct_change(1).add_suffix('_roc_1d')
  roc_5d = features.pct_change(5).add_suffix('_roc_5

🔗 Adding interaction features...


  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'{col1}_x_{col2}'] = df[col1] * df[col2]
  interaction_features[f'

📏 Applying EWMA standardization...
📅 Adding calendar decay features from FRED...
✅ Selecting manual features...
🕒 Generating lagged features...
✅ ES return lags: ❌ Not found!


In [13]:
def run_xgboost_predictions(final_features_lagged, model_dir='Models', output_dir='Predictions'):
    os.makedirs(output_dir, exist_ok=True)

    for label, df in final_features_lagged.items():
        model_path = os.path.join(model_dir, f"{label}.json")
        if not os.path.exists(model_path):
            print(f"⚠️ Model for {label} not found: {model_path}")
            continue

        try:
            print(f"📈 Running XGBoost prediction for {label}")
            model = xgb.XGBRegressor()
            model.load_model(model_path)

            if '5d_return' in df.columns:
                X = df.drop(columns='5d_return')
            else:
                X = df.copy()

            preds = model.predict(X)
            output_df = pd.DataFrame({
                'Date': df.index,
                'Return': preds
            })
            output_path = os.path.join(output_dir, f"{label}_forecast.csv")
            output_df.to_csv(output_path, index=False)
            print(f"  ✔️  Saved forecast to {output_path}")
        
        except Exception as e:
            print(f"❌ Failed to process {label}: {e}")

run_xgboost_predictions(processed_features)



📈 Running XGBoost prediction for ES
❌ Failed to process ES: feature_names mismatch: ['month_cos_roc_5d_x_price_diff_50_roc_30d', 'vpt_x_rsi_threshold_kurtosis', 'fourier_power_x_cci', 'fourier_power_x_wavelet_detail', 'bb_width_x_macd_binary_kurtosis', 'macd_x_rsi_threshold_kurtosis', 'atr_x_rsi', 'macd_signal_x_rsi_threshold_kurtosis', 'month_cos_roc_5d_x_price_diff_50_roc_30d_lag1', 'vpt_x_rsi_threshold_kurtosis_lag1', 'fourier_power_x_cci_lag1', 'fourier_power_x_wavelet_detail_lag1', 'bb_width_x_macd_binary_kurtosis_lag1', 'macd_x_rsi_threshold_kurtosis_lag1', 'atr_x_rsi_lag1', 'macd_signal_x_rsi_threshold_kurtosis_lag1', '5d_return_lag1', 'month_cos_roc_5d_x_price_diff_50_roc_30d_lag5', 'vpt_x_rsi_threshold_kurtosis_lag5', 'fourier_power_x_cci_lag5', 'fourier_power_x_wavelet_detail_lag5', 'bb_width_x_macd_binary_kurtosis_lag5', 'macd_x_rsi_threshold_kurtosis_lag5', 'atr_x_rsi_lag5', 'macd_signal_x_rsi_threshold_kurtosis_lag5', '5d_return_lag5', 'month_cos_roc_5d_x_price_diff_50_roc