In [25]:
!pip install yfinance



In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from scipy.special import gamma
import ta

In [27]:
!pip install ta

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=3577e338facdb83e86382cd1aa82b1f35b98f81fd139fc856d117dab6df7c7d9
  Stored in directory: /root/.cache/pip/wheels/a1/d7/29/7781cc5eb9a3659d032d7d15bdd0f49d07d2b24fec29f44bc4
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


In [86]:
!pip install ta statsmodels --quiet


In [3]:
# Define asset tickers by class
equities = ['AAPL', 'MSFT', 'GOOGL', 'JNJ', 'XOM']        # US large-cap equities
commodities = ['GC=F', 'CL=F', 'NG=F', 'SI=F']            # Gold, Crude Oil, Nat Gas, Silver (futures)
forex = ['EURUSD=X', 'GBPUSD=X', 'USDJPY=X', 'AUDUSD=X'] # Major forex pairs (Yahoo Finance format)

In [5]:
# Combine all assets into one list
all_assets = equities + commodities + forex

# Download historical daily data (e.g., 4 years)
data = yf.download(all_assets, start='2019-01-01', end='2023-12-31')['Close']

# Show sample
data.head()

[*********************100%***********************]  13 of 13 completed


Ticker,AAPL,AUDUSD=X,CL=F,EURUSD=X,GBPUSD=X,GC=F,GOOGL,JNJ,MSFT,NG=F,SI=F,USDJPY=X,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-01-01,,0.704176,,1.149306,1.273804,,,,,,,109.629997,
2019-01-02,37.617859,0.704791,46.540001,1.146171,1.275429,1281.0,52.483086,106.384315,94.945518,2.958,15.542,109.667999,51.247864
2019-01-03,33.870831,0.691898,47.09,1.131811,1.252191,1291.800049,51.02953,104.693817,91.452644,2.945,15.706,107.441002,50.461029
2019-01-04,35.316761,0.700624,47.959999,1.139108,1.262881,1282.699951,53.647015,106.450935,95.706062,3.044,15.695,107.807999,52.321499
2019-01-07,35.238152,0.712378,48.52,1.141044,1.273496,1286.800049,53.540028,105.768074,95.828102,2.944,15.669,108.522003,52.59359


In [6]:
# Drop assets or dates with too many missing values
data = data.dropna(axis=1, thresh=int(0.95 * len(data)))  # Drop assets missing >5% of data
data = data.dropna()  # Drop any rows with remaining missing values

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1257 entries, 2019-01-02 to 2023-12-29
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   AAPL      1257 non-null   float64
 1   AUDUSD=X  1257 non-null   float64
 2   CL=F      1257 non-null   float64
 3   EURUSD=X  1257 non-null   float64
 4   GBPUSD=X  1257 non-null   float64
 5   GC=F      1257 non-null   float64
 6   GOOGL     1257 non-null   float64
 7   JNJ       1257 non-null   float64
 8   MSFT      1257 non-null   float64
 9   NG=F      1257 non-null   float64
 10  SI=F      1257 non-null   float64
 11  USDJPY=X  1257 non-null   float64
 12  XOM       1257 non-null   float64
dtypes: float64(13)
memory usage: 137.5 KB


In [23]:
def frac_diff_weights(d, n):
    """
    Compute weights for fractional differencing using binomial expansion.
    """
    w = [1.0]
    for k in range(1, n):
        w_ = -w[-1] * (d - k + 1) / k
        w.append(w_)
    return np.array(w[::-1])  # reverse for convolution

def apply_frac_diff(series, d, thresh=1e-5):
    """
    Apply fractional differencing to a single series.
    """
    # Generate weights
    T = len(series)
    # Generate weights
    w = frac_diff_weights(d, T)
    w = w[np.abs(w) > thresh]
    width = len(w)

    # Apply convolution manually
    result = [np.nan] * (width - 1)
    for i in range(width - 1, T):
        val = np.dot(w, series.iloc[i - width + 1:i + 1])
        result.append(val)

    # Wrap in a Pandas Series with the original index
    return pd.Series(result, index=series.index)

In [24]:
# Set fractional differencing order (try 0.4–0.5 for financial data)
d = 0.4

# Apply fractional differencing to each column (asset)
fdiff_data = pd.DataFrame(index=data.index)
fdiff_data = data.apply(lambda col: apply_frac_diff(col.dropna(), d), axis=0)

# Drop initial NaNs
fdiff_data = fdiff_data.dropna()


In [30]:
# Initialize empty dictionary for features
tech_features = {}

# Loop through each asset
for col in data.columns:
    df = pd.DataFrame({'close': data[col]})

    # Add common technical indicators
    df['sma_20'] = ta.trend.sma_indicator(df['close'], window=20)
    df['ema_20'] = ta.trend.ema_indicator(df['close'], window=20)

    df['rsi'] = ta.momentum.rsi(df['close'], window=14)

    macd = ta.trend.macd_diff(df['close'])
    df['macd'] = macd

    bb = ta.volatility.BollingerBands(close=df['close'], window=20, window_dev=2)
    df['bb_bbm'] = bb.bollinger_mavg()
    df['bb_bbh'] = bb.bollinger_hband()
    df['bb_bbl'] = bb.bollinger_lband()

    adx = ta.trend.adx(df['close'], df['close'], df['close'], window=14)
    df['adx'] = adx

    # Store in dict
    tech_features[col] = df

# Example: show AAPL technical features
tech_features['AAPL'].tail()

# Concatenate all features (with MultiIndex for assets)
features_all = pd.concat(tech_features, axis=1)

In [109]:
features_all.head()

Unnamed: 0_level_0,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AUDUSD=X,...,USDJPY=X,XOM,XOM,XOM,XOM,XOM,XOM,XOM,XOM,XOM
Unnamed: 0_level_1,close,sma_20,ema_20,rsi,macd,bb_bbm,bb_bbh,bb_bbl,adx,close,...,adx,close,sma_20,ema_20,rsi,macd,bb_bbm,bb_bbh,bb_bbl,adx
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-01-02,37.617859,,,,,,,,0.0,0.704791,...,0.0,51.247864,,,,,,,,0.0
2019-01-03,33.870831,,,,,,,,0.0,0.691898,...,0.0,50.461029,,,,,,,,0.0
2019-01-04,35.316761,,,,,,,,0.0,0.700624,...,0.0,52.321499,,,,,,,,0.0
2019-01-07,35.238152,,,,,,,,0.0,0.712378,...,0.0,52.59359,,,,,,,,0.0
2019-01-08,35.909904,,,,,,,,0.0,0.714592,...,0.0,52.975983,,,,,,,,0.0


In [79]:
def rolling_tsfresh(data, window_size=60, step_size=5):
    """
    Compute tsfresh features over rolling windows of multivariate time series data.

    Parameters:
        data (pd.DataFrame): DataFrame indexed by datetime with asset prices.
        window_size (int): Number of rows per rolling window.
        step_size (int): Step size between windows.

    Returns:
        pd.DataFrame: Extracted features with timestamps.
    """
    all_features = []

    # Ensure datetime index
    if not pd.api.types.is_datetime64_any_dtype(data.index):
        raise ValueError("Input data must have a datetime index.")

    for end in tqdm(range(window_size, len(data), step_size)):
        window = data.iloc[end - window_size:end].copy()

        if window.isnull().all().all():
            continue  # skip completely empty window

        # Reset index to expose datetime for melting
        window_reset = window.reset_index()
        datetime_col = window_reset.columns[0]  # usually the datetime index
        asset_cols = [col for col in window_reset.columns if col != datetime_col]

        # Melt so each asset becomes an 'id'
        window_long = pd.melt(
            window_reset,
            id_vars=[datetime_col],
            value_vars=asset_cols,
            var_name='id',
            value_name='value'
        )
        window_long.rename(columns={datetime_col: 'Date'}, inplace=True)

        # Drop non-numeric values and NaNs
        window_long['value'] = pd.to_numeric(window_long['value'], errors='coerce')
        window_long = window_long.dropna(subset=['value'])

        # Add time index per asset
        window_long['time'] = window_long.groupby('id').cumcount()

        # ✅ Final datetime check
        for col in ['value', 'time']:
            if np.issubdtype(window_long[col].dtype, np.datetime64):
                raise ValueError(f"Column '{col}' contains datetime values!")

        # Rename and drop Date column to avoid tsfresh errors
        window_long.rename(columns={datetime_col: 'Date'}, inplace=True)
        window_long = window_long.drop(columns=['Date'])

        # Add time index
        window_long['time'] = window_long.groupby('id').cumcount()

        # Sanity check: only expected types
        assert np.issubdtype(window_long['value'].dtype, np.number), "Non-numeric values in 'value'"
        assert np.issubdtype(window_long['time'].dtype, np.integer), "Non-integer values in 'time'"


        # Extract features using tsfresh
        features = extract_features(
            window_long,
            column_id='id',
            column_sort='time',
            default_fc_parameters=EfficientFCParameters(),
            n_jobs=0,
            disable_progressbar=True
        )

        impute(features)
        features['timestamp'] = data.index[end]
        all_features.append(features)

    if not all_features:
        raise ValueError("No features extracted. Please check your input data.")

    return pd.concat(all_features).reset_index(drop=True)

In [80]:
rolling_features = rolling_tsfresh(data, window_size=60, step_size=5)
rolling_features.head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 'value__fft_coefficient__attr_"real"__coeff_54'
 'value__fft_coefficient__attr_"real"__coeff_55'
 'value__fft_coefficient__attr_"real"__coeff_56'
 'value__fft_coefficient__attr_"real"__coeff_57'
 'value__fft_coefficient__attr_"real"__coeff_58'
 'value__fft_coefficient__attr_"real"__coeff_59'
 'value__fft_coefficient__attr_"real"__coeff_60'
 'value__fft_coefficient__attr_"real"__coeff_61'
 'value__fft_coefficient__attr_"real"__coeff_62'
 'value__fft_coefficient__attr_"real"__coeff_63'
 'value__fft_coefficient__attr_"real"__coeff_64'
 'value__fft_coefficient__attr_"real"__coeff_65'
 'value__fft_coefficient__attr_"real"__coeff_66'
 'value__fft_coefficient__attr_"real"__coeff_67'
 'value__fft_coefficient__attr_"real"__coeff_68'
 'value__fft_coefficient__attr_"real"__coeff_69'
 'value__fft_coefficient__attr_"real"__coeff_70'
 'value__fft_coefficient__attr_"real"__coeff_71'
 'value__fft_coefficient__attr_"real"__coeff_72'
 'va

Unnamed: 0,value__variance_larger_than_standard_deviation,value__has_duplicate_max,value__has_duplicate_min,value__has_duplicate,value__sum_values,value__abs_energy,value__mean_abs_change,value__mean_change,value__mean_second_derivative_central,value__median,...,value__fourier_entropy__bins_10,value__fourier_entropy__bins_100,value__permutation_entropy__dimension_3__tau_1,value__permutation_entropy__dimension_4__tau_1,value__permutation_entropy__dimension_5__tau_1,value__permutation_entropy__dimension_6__tau_1,value__permutation_entropy__dimension_7__tau_1,value__query_similarity_count__query_None__threshold_0.0,value__mean_n_absolute_max__number_of_maxima_7,timestamp
0,1.0,0.0,0.0,0.0,2426.282787,98719.884475,0.52294,0.127621,0.032818,40.886787,...,0.142506,0.928839,1.633226,2.699754,3.257455,3.615145,3.876606,0.0,45.393879,2019-03-29
1,0.0,0.0,0.0,1.0,42.763625,30.481281,0.003268,6.2e-05,6.6e-05,0.711911,...,0.70139,2.242574,1.755873,2.892315,3.448193,3.751944,3.911968,0.0,0.723513,2019-03-29
2,1.0,0.0,0.0,0.0,3288.579994,180874.79493,0.727458,0.216271,-0.00569,54.91,...,0.457102,1.32238,1.647131,2.759299,3.49894,3.736252,3.860623,0.0,59.512857,2019-03-29
3,0.0,0.0,0.0,1.0,68.19821,77.519701,0.003566,-0.000356,0.000104,1.136499,...,0.56342,1.850735,1.710332,2.72905,3.292357,3.635431,3.834951,0.0,1.14896,2019-03-29
4,0.0,0.0,0.0,0.0,78.197031,101.933462,0.006538,0.000675,0.00015,1.306617,...,0.56342,1.716486,1.695231,2.769615,3.486804,3.906512,3.988984,0.0,1.328004,2019-03-29


In [110]:
rolling_features.head()

Unnamed: 0,value__variance_larger_than_standard_deviation,value__has_duplicate_max,value__has_duplicate_min,value__has_duplicate,value__sum_values,value__abs_energy,value__mean_abs_change,value__mean_change,value__mean_second_derivative_central,value__median,...,value__fourier_entropy__bins_10,value__fourier_entropy__bins_100,value__permutation_entropy__dimension_3__tau_1,value__permutation_entropy__dimension_4__tau_1,value__permutation_entropy__dimension_5__tau_1,value__permutation_entropy__dimension_6__tau_1,value__permutation_entropy__dimension_7__tau_1,value__query_similarity_count__query_None__threshold_0.0,value__mean_n_absolute_max__number_of_maxima_7,timestamp
0,1.0,0.0,0.0,0.0,2426.282787,98719.884475,0.52294,0.127621,0.032818,40.886787,...,0.142506,0.928839,1.633226,2.699754,3.257455,3.615145,3.876606,0.0,45.393879,2019-03-29
1,0.0,0.0,0.0,1.0,42.763625,30.481281,0.003268,6.2e-05,6.6e-05,0.711911,...,0.70139,2.242574,1.755873,2.892315,3.448193,3.751944,3.911968,0.0,0.723513,2019-03-29
2,1.0,0.0,0.0,0.0,3288.579994,180874.79493,0.727458,0.216271,-0.00569,54.91,...,0.457102,1.32238,1.647131,2.759299,3.49894,3.736252,3.860623,0.0,59.512857,2019-03-29
3,0.0,0.0,0.0,1.0,68.19821,77.519701,0.003566,-0.000356,0.000104,1.136499,...,0.56342,1.850735,1.710332,2.72905,3.292357,3.635431,3.834951,0.0,1.14896,2019-03-29
4,0.0,0.0,0.0,0.0,78.197031,101.933462,0.006538,0.000675,0.00015,1.306617,...,0.56342,1.716486,1.695231,2.769615,3.486804,3.906512,3.988984,0.0,1.328004,2019-03-29


In [105]:
def compute_kama(price, window=10, fast=2, slow=30):
    price = price.copy()
    change = price.diff(window).abs()
    volatility = price.diff().abs().rolling(window=window).sum()
    volatility = volatility.replace(0, np.nan)
    efficiency_ratio = change / volatility
    efficiency_ratio = efficiency_ratio.fillna(0)

    fast_sc = 2 / (fast + 1)
    slow_sc = 2 / (slow + 1)

    smoothing_constant = (efficiency_ratio * (fast_sc - slow_sc) + slow_sc) ** 2

    kama = pd.Series(np.nan, index=price.index)
    kama.iloc[window] = price.iloc[window]

    for i in range(window + 1, len(price)):
        kama.iloc[i] = kama.iloc[i - 1] + smoothing_constant.iloc[i] * (price.iloc[i] - kama.iloc[i - 1])

    return kama

def compute_kama_trend_manual(series, window=10):
    kama_series = compute_kama(series, window=window)
    trend = np.where(kama_series > kama_series.shift(1), 'up',
             np.where(kama_series < kama_series.shift(1), 'down', 'flat'))
    return kama_series, trend

In [106]:
def compute_kama_trend_manual(series, window=10):
    kama_series = compute_kama(series, window=window)
    trend = np.where(kama_series > kama_series.shift(1), 'up',
             np.where(kama_series < kama_series.shift(1), 'down', 'flat'))
    return kama_series, trend

In [103]:
def estimate_volatility_regime(returns, k_regimes=2):
    model = MarkovRegression(returns.dropna(), k_regimes=k_regimes, trend='c', switching_variance=True)
    result = model.fit(disp=False)
    # Assign regime index with lower variance to "low volatility"
    low_vol_idx = result.params[-2:].argmin()
    smoothed = result.smoothed_marginal_probabilities[low_vol_idx]
    regimes = smoothed.index.to_series().map(lambda i: 0 if smoothed[i] > 0.5 else 1)
    return regimes.reindex(returns.index)

In [107]:
def kama_msr_multivariate(data, kama_window=10):
    all_results = []

    for asset in data.columns:
        df = data[[asset]].copy()
        df = df.rename(columns={asset: 'Close'})
        df['returns'] = np.log(df['Close']).diff()

        # Compute KAMA + trend
        df['KAMA'], df['Trend'] = compute_kama_trend_manual(df['Close'], window=kama_window)

        # Estimate volatility regime with MSR
        try:
            df['VolatilityRegime'] = estimate_volatility_regime(df['returns'])
        except Exception as e:
            print(f"Skipping MSR for {asset} due to error: {e}")
            continue

        # Map to market regimes
        conditions = [
            (df['Trend'] == 'up') & (df['VolatilityRegime'] == 0),
            (df['Trend'] == 'down') & (df['VolatilityRegime'] == 1)
        ]
        df['MarketRegime'] = np.select(conditions, ['bullish', 'bearish'], default='other')

        df['Asset'] = asset
        all_results.append(df[['Close', 'KAMA', 'returns', 'Trend', 'VolatilityRegime', 'MarketRegime', 'Asset']])

    combined = pd.concat(all_results)
    combined.index.name = 'Date'
    return combined.reset_index()

In [108]:
regime_df = kama_msr_multivariate(data)
regime_df.tail()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Ticker,Date,Close,KAMA,returns,Trend,VolatilityRegime,MarketRegime,Asset
16336,2023-12-22,96.783318,95.594883,0.001768,up,0.0,bullish,XOM
16337,2023-12-26,97.00174,95.65668,0.002254,up,0.0,bullish,XOM
16338,2023-12-27,96.545891,95.734857,-0.00471,up,0.0,bullish,XOM
16339,2023-12-28,95.149834,95.72319,-0.014566,down,0.0,other,XOM
16340,2023-12-29,94.950401,95.694514,-0.002098,down,0.0,other,XOM


In [112]:
# Flatten MultiIndex columns to "Asset_Feature" format:
features_all.columns = ['{}_{}'.format(asset, feat) for asset, feat in features_all.columns]

# Reset index so 'Date' is a column
features_all = features_all.reset_index().rename(columns={'index': 'Date'})

In [113]:
rolling_features = rolling_features.rename(columns={'timestamp': 'Date'})
rolling_features['Date'] = pd.to_datetime(rolling_features['Date'])

In [115]:
# Melt to long format: from wide (one row per Date) to multiple rows per Date, one per Asset-Feature
tech_melted = features_all.melt(id_vars=['Date'], var_name='Asset_Feature', value_name='Value')

# Split 'Asset_Feature' into Asset and Feature
tech_melted[['Asset', 'Feature']] = tech_melted['Asset_Feature'].str.split('_', n=1, expand=True)

# Pivot so each feature is a separate column again, but now long on Asset
tech_long = tech_melted.pivot_table(index=['Date', 'Asset'], columns='Feature', values='Value').reset_index()

In [120]:
regime_df['Date'] = pd.to_datetime(regime_df['Date'])
full_features = pd.merge(tech_long, rolling_features, on='Date', how='left')
final_df = pd.merge(full_features, regime_df[['Date', 'Asset', 'MarketRegime']], on=['Date', 'Asset'], how='left')
# Drop rows without regime labels (optional)
final_df = final_df.dropna()

# Encode MarketRegime for ML if you want
final_df['MarketRegimeLabel'] = final_df['MarketRegime'].map({'bullish': 1, 'bearish': 0, 'other': 2})

print(final_df.head())

          Date Asset        adx     bb_bbh     bb_bbl    bb_bbm      close  \
780 2019-03-29  AAPL  32.614173  47.063387  40.834472  43.94893  45.441723   
781 2019-03-29  AAPL  32.614173  47.063387  40.834472  43.94893  45.441723   
782 2019-03-29  AAPL  32.614173  47.063387  40.834472  43.94893  45.441723   
783 2019-03-29  AAPL  32.614173  47.063387  40.834472  43.94893  45.441723   
784 2019-03-29  AAPL  32.614173  47.063387  40.834472  43.94893  45.441723   

        ema_20      macd        rsi  ...  value__fourier_entropy__bins_100  \
780  44.101846 -0.041663  65.692319  ...                          0.928839   
781  44.101846 -0.041663  65.692319  ...                          2.242574   
782  44.101846 -0.041663  65.692319  ...                          1.322380   
783  44.101846 -0.041663  65.692319  ...                          1.850735   
784  44.101846 -0.041663  65.692319  ...                          1.716486   

     value__permutation_entropy__dimension_3__tau_1  \
780    

In [122]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Separate features and target
X = final_df.drop(columns=['Date', 'Asset', 'MarketRegime', 'MarketRegimeLabel'])
y = final_df['MarketRegimeLabel']

# Optional: fill or drop any remaining NaNs in features
X = X.ffill().bfill()

# Train/test split (e.g., 80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)

In [124]:
!pip install optuna
!pip install pyfinance

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0
Collecting pyfinance
  Downloading pyfinance-1.3.0-py3-none-any.whl.metadata (16 kB)
Collecting xmltodict (from pyfinance)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0

In [125]:
import numpy as np

def sortino_ratio(returns, risk_free_rate=0):
    # returns: array-like of strategy returns (e.g., daily)
    downside_returns = returns[returns < risk_free_rate]
    expected_return = np.mean(returns) - risk_free_rate
    downside_std = np.sqrt(np.mean(downside_returns**2)) if len(downside_returns) > 0 else 0.0
    if downside_std == 0:
        return 0
    return expected_return / downside_std

In [126]:
from sklearn.model_selection import TimeSeriesSplit

class PurgedGroupTimeSeriesSplit:
    def __init__(self, n_splits=5, group_gap=5):
        self.n_splits = n_splits
        self.group_gap = group_gap  # number of samples to purge between train/test

    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        tscv = TimeSeriesSplit(n_splits=self.n_splits)
        indices = np.arange(n_samples)
        for train_idx, test_idx in tscv.split(X):
            # Purge samples within group_gap from test set in train set
            max_train = train_idx.max()
            min_test = test_idx.min()
            purge_start = max_train - self.group_gap
            if purge_start < 0:
                purge_start = 0
            train_idx = train_idx[train_idx < purge_start]
            yield train_idx, test_idx

In [127]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [128]:
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42,
        n_jobs=-1
    )

    # Prepare data
    X = final_df.drop(columns=['Date', 'Asset', 'MarketRegime', 'MarketRegimeLabel', 'returns'])
    y = final_df['MarketRegimeLabel'].values
    returns = final_df['returns'].values  # actual returns aligned with labels

    # Initialize PGTS splitter
    pgts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)

    sortino_scores = []

    for train_idx, test_idx in pgts.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        returns_test = returns[test_idx]

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        # Strategy returns based on predicted regime
        # Example: long (return) if bullish, short (negative return) if bearish, flat otherwise
        strat_returns = []
        for p, r in zip(preds, returns_test):
            if p == 1:  # bullish
                strat_returns.append(r)
            elif p == 0:  # bearish
                strat_returns.append(-r)
            else:
                strat_returns.append(0)

        strat_returns = np.array(strat_returns)
        sr = sortino_ratio(strat_returns)
        sortino_scores.append(sr)

    # Return average Sortino ratio over folds (to maximize)
    return np.mean(sortino_scores)

In [132]:
final_df['returns'] = final_df.groupby('Asset')['close'].pct_change()
final_df = final_df.dropna(subset=['returns'])  # drop rows with NA returns after pct_change

In [135]:
study = optuna.create_study(sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=50, n_jobs=4)

print("Best hyperparameters:", study.best_params)
print("Best Sortino ratio:", study.best_value)

[I 2025-05-31 16:54:48,512] A new study created in memory with name: no-name-3b5e695a-04b2-4a3c-862b-5b160ddf1c19
[I 2025-05-31 16:56:23,315] Trial 3 finished with value: 0.03612831324900292 and parameters: {'n_estimators': 55, 'max_depth': 6, 'min_samples_split': 9}. Best is trial 3 with value: 0.03612831324900292.
[I 2025-05-31 16:59:25,997] Trial 4 finished with value: 0.030757630539423736 and parameters: {'n_estimators': 93, 'max_depth': 7, 'min_samples_split': 6}. Best is trial 4 with value: 0.030757630539423736.
[I 2025-05-31 17:03:34,794] Trial 1 finished with value: 0.034021073918284406 and parameters: {'n_estimators': 120, 'max_depth': 19, 'min_samples_split': 8}. Best is trial 4 with value: 0.030757630539423736.
[I 2025-05-31 17:06:24,705] Trial 0 finished with value: 0.03804602068533748 and parameters: {'n_estimators': 289, 'max_depth': 9, 'min_samples_split': 10}. Best is trial 4 with value: 0.030757630539423736.
[I 2025-05-31 17:07:18,383] Trial 6 finished with value: 0.03

Best hyperparameters: {'n_estimators': 155, 'max_depth': 6, 'min_samples_split': 9}
Best Sortino ratio: 0.025772503022213244


In [136]:
best_params = study.best_params
final_model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)

X = final_df.drop(columns=['Date', 'Asset', 'MarketRegime', 'MarketRegimeLabel', 'returns'])
y = final_df['MarketRegimeLabel']

final_model.fit(X, y)

In [139]:
!pip install wandb --quiet
import wandb

# Log in (will prompt you with a link to get your API key)
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmh-tran[0m ([33mrepres[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [141]:
wandb.init(project="regime-prediction", name="rf-sortino-optimization")


In [142]:
wandb.init(
    project="regime-prediction",
    name="rf-sortino-optimization",
    config={
        "model": "RandomForest",
        "n_estimators": 100,
        "max_depth": 8,
        "min_samples_split": 5,
    "window_size": 20,
    "feature_set": "technical + rolling + tsfresh",
    "target": "regime",
    }
)

In [143]:
import joblib
joblib.dump(final_model, "final_model.pkl")
wandb.save("final_model.pkl")


['/content/wandb/run-20250531_190000-7axcujqv/files/final_model.pkl']