In [1]:
import pandas as pd
import numpy as np
from hmmlearn.hmm import GaussianHMM
import datetime
from io import StringIO
from new_strategy import Asset
from pathlib import Path
from new_strategy import Asset 


In [2]:
ASSET = Asset.XAUUSD
asset_name = ASSET.value

# Load the already merged minute-level data
df = pd.read_csv(f"data/raw/{asset_name}/combined_data.csv", parse_dates=['timestamp'], index_col='timestamp')
df.index = df.index.tz_localize(None)
if 'ATR' in df.columns:
    df.drop(columns='ATR', inplace=True)

# === Ensure all days are present in index and forward-fill ===
start_date = df.index.min().normalize()
end_date = df.index.max().normalize()
full_date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Add missing days with last known values to preserve continuity for rolling metrics
df_daily_filled = df.resample('1D').agg({
    'high': 'max',
    'low': 'min',
    'close': 'last'
}).reindex(full_date_range)

df_daily_filled.ffill(inplace=True)

# Use forward-filled daily OHLC to preserve rolling consistency
daily_ohlc = df_daily_filled.copy()

#t10y
t10y = pd.read_csv("data/preprocessing_data/T10YIE.csv", parse_dates=['observation_date'], index_col='observation_date')
t10y.index = t10y.index.tz_localize(None)
t10y.rename(columns={t10y.columns[0]: 't10yie'}, inplace=True)
t10y = t10y.asfreq('D').ffill()
t10y['t10yie'] = t10y['t10yie'].shift(1)
daily_ohlc = daily_ohlc.merge(t10y, how='left', left_index=True, right_index=True)

#CPI
cpi = pd.read_csv("data/preprocessing_data/CPIAUCSL.csv", parse_dates=['observation_date'], index_col='observation_date')
cpi.index = cpi.index.tz_localize(None)
cpi.rename(columns={cpi.columns[0]: 'cpiaucsl'}, inplace=True)
cpi['cpiaucsl'] = cpi['cpiaucsl'].shift(1)
cpi = cpi.asfreq('D').ffill()
daily_ohlc = daily_ohlc.merge(cpi, how='left', left_index=True, right_index=True)

#VIX
vix = pd.read_csv("data/preprocessing_data/VIX_History.csv", parse_dates=['date'], index_col='date')
vix.index = vix.index.tz_localize(None)
vix['vix_close'] = vix['close'].shift(1)
vix = vix[['vix_close']]
daily_ohlc = daily_ohlc.merge(vix, how='left', left_index=True, right_index=True)

#DTW
dollar = pd.read_csv("data/preprocessing_data/DTWEXBGS.csv", parse_dates=['observation_date'], index_col='observation_date')
dollar.index = dollar.index.tz_localize(None)
dollar.rename(columns={dollar.columns[0]: 'dtwexbgs'}, inplace=True)
dollar['dtwexbgs'] = dollar['dtwexbgs'].shift(1)
dollar = dollar.asfreq('D').ffill()
daily_ohlc = daily_ohlc.merge(dollar, how='left', left_index=True, right_index=True)

#DGS10
dgs10 = pd.read_csv("data/preprocessing_data/DGS10.csv", parse_dates=['observation_date'], index_col='observation_date')
dgs10.index = dgs10.index.tz_localize(None)
dgs10.rename(columns={dgs10.columns[0]: 'dgs10'}, inplace=True)
dgs10['dgs10'] = dgs10['dgs10'].shift(1)
dgs10 = dgs10.asfreq('D').ffill()
daily_ohlc = daily_ohlc.merge(dgs10, how='left', left_index=True, right_index=True)

daily_ohlc['prev_close'] = daily_ohlc['close'].shift(1)
# Calculate True Range (TR)
daily_ohlc['true_range'] = daily_ohlc.apply(
    lambda row: max(
        row['high'] - row['low'],
        abs(row['high'] - row['prev_close']),
        abs(row['low'] - row['prev_close'])
    ) if pd.notnull(row['prev_close']) else None,
    axis=1
)

#Calculate 14-day ATR (excluding current day via shift)
daily_ohlc['atr_14'] = daily_ohlc['true_range'].rolling(window=14).mean().shift(1)

#Calculate moving averages on daily close (also shifted)
daily_ohlc['ma_14'] = daily_ohlc['close'].rolling(window=14).mean().shift(1)
daily_ohlc['ma_30'] = daily_ohlc['close'].rolling(window=30).mean().shift(1)
daily_ohlc['ma_100'] = daily_ohlc['close'].rolling(window=100).mean().shift(1)

# Rolling max/min of the daily close
daily_ohlc['max_14'] = daily_ohlc['close'].rolling(window=14).max().shift(1)
daily_ohlc['min_14'] = daily_ohlc['close'].rolling(window=14).min().shift(1)

daily_ohlc['max_30'] = daily_ohlc['close'].rolling(window=30).max().shift(1)
daily_ohlc['min_30'] = daily_ohlc['close'].rolling(window=30).min().shift(1)

daily_ohlc['max_100'] = daily_ohlc['close'].rolling(window=100).max().shift(1)
daily_ohlc['min_100'] = daily_ohlc['close'].rolling(window=100).min().shift(1)

# Cumulative max of close price
daily_ohlc['static_peak'] = daily_ohlc['close'].cummax()

# Static drawdown from that peak
daily_ohlc['drawdown_static'] = (
    (daily_ohlc['close'] - daily_ohlc['static_peak']) / daily_ohlc['static_peak']
).shift(1)

daily_ohlc['drawdown_30'] = (
    (daily_ohlc['close'] - daily_ohlc['close'].rolling(window=30).max()) 
    / daily_ohlc['close'].rolling(window=30).max()
).shift(1)

# Forward-fill daily values into minute-level data
df['daily_high'] = daily_ohlc['high'].reindex(df.index, method='ffill')
df['daily_low'] = daily_ohlc['low'].reindex(df.index, method='ffill')
df['daily_close'] = daily_ohlc['close'].reindex(df.index, method='ffill')
df['true_range'] = daily_ohlc['true_range'].reindex(df.index, method='ffill')
df['atr_14'] = daily_ohlc['atr_14'].reindex(df.index, method='ffill')
df['ma_14'] = daily_ohlc['ma_14'].reindex(df.index, method='ffill')
df['ma_30'] = daily_ohlc['ma_30'].reindex(df.index, method='ffill')
df['ma_100'] = daily_ohlc['ma_100'].reindex(df.index, method='ffill')

#Day and Weeknumber
df['day_of_week'] = df.index.day_name().str[:3]
df['week_number'] = df.index.isocalendar().week

df['max_price_14'] = daily_ohlc['max_14'].reindex(df.index, method='ffill')
df['min_price_14'] = daily_ohlc['min_14'].reindex(df.index, method='ffill')

df['max_price_30'] = daily_ohlc['max_30'].reindex(df.index, method='ffill')
df['min_price_30'] = daily_ohlc['min_30'].reindex(df.index, method='ffill')

df['max_price_100'] = daily_ohlc['max_100'].reindex(df.index, method='ffill')
df['min_price_100'] = daily_ohlc['min_100'].reindex(df.index, method='ffill')

#Drawdown
df['drawdown_static'] = daily_ohlc['drawdown_static'].reindex(df.index, method='ffill')
df['drawdown_30'] = daily_ohlc['drawdown_30'].reindex(df.index, method='ffill')

# Forward-fill T10YIE into minute-level data
df['t10yie'] = daily_ohlc['t10yie'].reindex(df.index, method='ffill')
#FF CPI
df['cpiaucsl'] = daily_ohlc['cpiaucsl'].reindex(df.index, method='ffill')
#FF vix
df['vix_close'] = daily_ohlc['vix_close'].reindex(df.index, method='ffill')
#FF Dollar 
df['dtwexbgs'] = daily_ohlc['dtwexbgs'].reindex(df.index, method='ffill')
#FF DGS10
df['dgs10'] = daily_ohlc['dgs10'].reindex(df.index, method='ffill')

# 1. Compute HMM Features on Daily Data
daily_ohlc['return_raw'] = daily_ohlc['close'].pct_change()
daily_ohlc['volatility_raw'] = (
    daily_ohlc['close']
    .rolling(window=10)
    .apply(lambda x: np.mean((x - x.mean())**2))
).shift(1)

# === Shifted versions for use in model (to avoid lookahead bias) ===
daily_ohlc['return'] = daily_ohlc['return_raw'].shift(1)
daily_ohlc['volatility'] = daily_ohlc['volatility_raw'].shift(1)

df['daily_return'] = daily_ohlc['return'].reindex(df.index, method='ffill')
df['daily_volatility'] = daily_ohlc['volatility'].reindex(df.index, method='ffill')

# 2. Drop NaNs (due to rolling/shift)
hmm_features = daily_ohlc[['return_raw', 'volatility_raw']].dropna()

# 3. Fit HMM on ALL data (no slicing)
model = GaussianHMM(n_components=3, covariance_type='full', n_iter=75, random_state=42)
model.fit(hmm_features)
hmm_features['regime'] = model.predict(hmm_features)

# 4. Label Regimes (Bull, Bear, Neutral) based on return means
regime_means = hmm_features.groupby('regime')['return_raw'].mean().sort_values()
regime_mapping = {
    regime_means.index[0]: 'Bear',
    regime_means.index[1]: 'Neutral',
    regime_means.index[2]: 'Bull'
}
hmm_features['regime_label'] = hmm_features['regime'].map(regime_mapping)

# 5. Merge regime info into daily_ohlc
daily_ohlc['regime'] = hmm_features['regime']
daily_ohlc['regime_label'] = hmm_features['regime_label']

# 6. Forward-fill daily regime into minute-level df
df['regime'] = daily_ohlc['regime'].reindex(df.index, method='ffill')
df['regime_label'] = daily_ohlc['regime_label'].reindex(df.index, method='ffill')

#Optional: Inspect regime distribution
print(df['regime_label'].value_counts())

# Overwrite the original file
df.index = df.index.tz_localize('UTC')
output_path = f"data/processed/{asset_name}/combined_data.csv"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)
print(f"✅ Saved processed data to: {output_path}")


Bull       887040
Bear       885600
Neutral    789120
Name: regime_label, dtype: int64
✅ Saved processed data to: data/processed/XAUUSD/combined_data.csv


In [3]:
"""import pandas as pd

# === Settings ===
asset_name = "WTI"  # or "WTI"
path = f"data/raw/{asset_name}/combined_data.csv"

# === Load the data ===
df = pd.read_csv(path, parse_dates=['timestamp'], index_col='timestamp')
df.index = df.index.tz_localize(None)

# === Determine date range ===
start_date = df.index.min().normalize()
end_date = df.index.max().normalize()
full_range = pd.date_range(start=start_date, end=end_date, freq='D')

# === Actual calendar days in the data ===
actual_days = df.index.normalize().unique()
actual_days = pd.DatetimeIndex(actual_days)

# === Find missing days ===
missing_days = full_range.difference(actual_days)

# === Output ===
print(f"📆 Start date: {start_date.date()}")
print(f"📆 End date:   {end_date.date()}")
print(f"🗓️ Total calendar days: {len(full_range)}")
print(f"📉 Missing days: {len(missing_days)}\n")

if len(missing_days) > 0:
    print("🚫 Missing dates and their weekdays:")
    for day in missing_days:
        print(f" - {day.date()} ({day.strftime('%A')})")
else:
    print("✅ No missing dates in raw data.")"""


📆 Start date: 2020-01-01
📆 End date:   2024-11-01
🗓️ Total calendar days: 1767
📉 Missing days: 126

🚫 Missing dates and their weekdays:
 - 2020-06-06 (Saturday)
 - 2020-06-20 (Saturday)
 - 2020-06-28 (Sunday)
 - 2020-08-01 (Saturday)
 - 2020-08-15 (Saturday)
 - 2020-08-29 (Saturday)
 - 2020-09-12 (Saturday)
 - 2020-09-26 (Saturday)
 - 2020-10-04 (Sunday)
 - 2020-10-10 (Saturday)
 - 2020-10-24 (Saturday)
 - 2020-11-08 (Sunday)
 - 2020-11-21 (Saturday)
 - 2020-12-05 (Saturday)
 - 2020-12-11 (Friday)
 - 2020-12-13 (Sunday)
 - 2020-12-15 (Tuesday)
 - 2020-12-17 (Thursday)
 - 2020-12-19 (Saturday)
 - 2020-12-21 (Monday)
 - 2020-12-23 (Wednesday)
 - 2020-12-25 (Friday)
 - 2020-12-27 (Sunday)
 - 2020-12-29 (Tuesday)
 - 2021-02-06 (Saturday)
 - 2021-02-20 (Saturday)
 - 2021-02-28 (Sunday)
 - 2021-03-06 (Saturday)
 - 2021-03-20 (Saturday)
 - 2021-04-03 (Saturday)
 - 2021-04-17 (Saturday)
 - 2021-05-01 (Saturday)
 - 2021-05-15 (Saturday)
 - 2021-05-29 (Saturday)
 - 2021-06-12 (Saturday)
 - 2021-