In [54]:
import sys
sys.path.append("../../")

from src.analysis.cointegration import find_cointegrated_pairs, analyze_pairs, plot_cointegration_heatmap
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import coint
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
import statsmodels.api as sm
warnings.filterwarnings('ignore')

# Central date configuration shared by the helper functions below.
# TRAIN_END doubles as the start of the test window (see get_test_period).
DATE_CONFIG = dict(
    TRAIN_START=pd.Timestamp('2021-02-02'),
    TRAIN_END=pd.Timestamp('2024-01-01'),
    TEST_END=pd.Timestamp('2025-01-01'),
    TRADING_DAYS_PER_YEAR=252,
)

def get_training_period():
    """Return the training window as a dict with 'start' and 'end' timestamps.

    Both values are read from DATE_CONFIG ('TRAIN_START' and 'TRAIN_END').
    """
    return dict(
        start=DATE_CONFIG['TRAIN_START'],
        end=DATE_CONFIG['TRAIN_END'],
    )

def get_test_period():
    """Return the test window as a dict with 'start' and 'end' timestamps.

    The test window starts where the training window ends: 'start' is
    DATE_CONFIG['TRAIN_END'] and 'end' is DATE_CONFIG['TEST_END'].
    """
    return dict(
        start=DATE_CONFIG['TRAIN_END'],
        end=DATE_CONFIG['TEST_END'],
    )

def get_training_days():
    """Estimate the number of trading days in the training window.

    The calendar-day span between TRAIN_START and TRAIN_END is converted to
    years using 365 days per year, scaled by TRADING_DAYS_PER_YEAR, and
    truncated to an int.
    """
    training_span = DATE_CONFIG['TRAIN_END'] - DATE_CONFIG['TRAIN_START']
    span_in_years = training_span.days / 365
    return int(span_in_years * DATE_CONFIG['TRADING_DAYS_PER_YEAR'])

# Notebook-wide matplotlib defaults: classic style, 12x6 inch figures at 100 dpi.
plt.style.use('classic')
plt.rcParams.update({
    'figure.figsize': [12, 6],
    'figure.dpi': 100,
})

In [55]:
def load_and_prepare_data(file_path):
    """Load long-format prices from parquet and pivot them into a price matrix.

    The file is expected to provide at least the columns 'date', 'symbol' and
    'close' (those are the ones read here). Rows are restricted to the
    inclusive window [DATE_CONFIG['TRAIN_START'], DATE_CONFIG['TEST_END']].

    Parameters
    ----------
    file_path : path to the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    (price_matrix, symbols)
        ``price_matrix`` is indexed by date with one column of close prices
        per symbol; ``symbols`` is the list of its column labels.
    """
    raw_prices = pd.read_parquet(file_path)
    raw_prices['date'] = pd.to_datetime(raw_prices['date'])

    # Inclusive on both ends, equivalent to (date >= start) & (date <= end).
    in_window = raw_prices['date'].between(
        DATE_CONFIG['TRAIN_START'], DATE_CONFIG['TEST_END']
    )
    windowed_prices = raw_prices[in_window]

    price_matrix = windowed_prices.pivot(index='date', columns='symbol', values='close')
    symbols = price_matrix.columns.tolist()

    # The reported dates are the configured bounds, not the first/last date present.
    print(f"Loaded data from {DATE_CONFIG['TRAIN_START']} to {DATE_CONFIG['TEST_END']}")
    print(f"Total symbols: {len(symbols)}")
    print(f"Total trading days: {len(price_matrix)}")

    return price_matrix, symbols

In [56]:
def generate_pairs(cointegrated_pairs):
    """Return the pairs to build spreads for.

    Currently a pass-through: the input object is handed back unchanged
    (no copy, no filtering).
    """
    selected_pairs = cointegrated_pairs
    return selected_pairs

def calculate_returns_and_spreads(price_matrix, cointegrated_pairs):
    """Compute daily returns and the per-pair return spreads.

    Parameters
    ----------
    price_matrix : DataFrame of prices, one column per symbol.
    cointegrated_pairs : iterable of (symbol1, symbol2) tuples; routed through
        ``generate_pairs`` before use.

    Returns
    -------
    (returns, spreads)
        ``returns`` is ``price_matrix.pct_change()`` with every row containing
        a NaN removed. ``spreads`` shares its index and has one column named
        '<symbol1>_<symbol2>_spread' per pair, holding
        ``returns[symbol1] - returns[symbol2]``.
    """
    returns = price_matrix.pct_change().dropna()

    pairs = generate_pairs(cointegrated_pairs)

    # Collect every column first and build the frame once. Inserting columns
    # one at a time into an existing DataFrame fragments its internal blocks
    # and becomes slow with thousands of pairs.
    spread_columns = {
        f'{s1}_{s2}_spread': returns[s1] - returns[s2]
        for s1, s2 in pairs
    }
    spreads = pd.DataFrame(spread_columns, index=returns.index)

    return returns, spreads

In [57]:
def prepare_ml_data(returns, spreads, train_period, test_period, lookback=3):
    """Build per-pair train/test feature matrices and binary targets.

    For each column of ``spreads`` (named '<sym1>_<sym2>_spread') the features
    are the two symbols' returns lagged by 1..``lookback`` days, and the
    target is 1 when the NEXT day's return spread (sym1 - sym2) is positive,
    else 0.

    Rows are kept only when every lag feature is available AND the next-day
    spread is known. The final row of ``returns`` has no next day, so it is
    dropped instead of being given a made-up label of 0.

    Parameters
    ----------
    returns : DataFrame of returns, one column per symbol, date-indexed.
    spreads : DataFrame whose column names identify the pairs to process.
    train_period, test_period : dicts with 'start' and 'end'; each window is
        half-open, ``start <= date < end``.
    lookback : number of lagged days used as features.

    Returns
    -------
    dict mapping '<sym1>_<sym2>' to a dict with keys 'X_train', 'X_test',
    'y_train' and 'y_test'.

    Notes
    -----
    Pair names are split on '_', so symbols must not contain an underscore.
    """
    ml_datasets = {}

    for spread_col in spreads.columns:
        sym1, sym2 = spread_col.replace('_spread', '').split('_')
        ret1 = returns[sym1]
        ret2 = returns[sym2]

        # Lagged returns, interleaved per lag: sym1 t-1, sym2 t-1, sym1 t-2, ...
        lagged = {}
        for t in range(1, lookback + 1):
            lagged[f'{sym1}_return_t-{t}'] = ret1.shift(t)
            lagged[f'{sym2}_return_t-{t}'] = ret2.shift(t)
        features = pd.DataFrame(lagged, index=returns.index)

        # Spread realised on the following day; NaN on the last row.
        spread_next_day = (ret1 - ret2).shift(-1)

        # A NaN next-day spread would compare as "not > 0" and be labelled 0,
        # so rows without a known outcome are excluded together with rows
        # that lack a full lag history.
        usable = features.notna().all(axis=1) & spread_next_day.notna()
        features = features.loc[usable]
        target = (spread_next_day.loc[usable] > 0).astype(int).rename('target')

        train_mask = (features.index >= train_period['start']) & (features.index < train_period['end'])
        test_mask = (features.index >= test_period['start']) & (features.index < test_period['end'])

        ml_datasets[f'{sym1}_{sym2}'] = {
            'X_train': features[train_mask],
            'X_test': features[test_mask],
            'y_train': target[train_mask],
            'y_test': target[test_mask]
        }

    return ml_datasets

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [59]:
def train_evaluate_models(ml_datasets, coint_results, f1_weight=1.0, pvalue_weight=0.0):
    """Fit one random forest per pair and score it on that pair's test split.

    Parameters
    ----------
    ml_datasets : dict mapping '<sym1>_<sym2>' to a dict with 'X_train',
        'X_test', 'y_train' and 'y_test' (as produced by ``prepare_ml_data``).
    coint_results : DataFrame with columns 'symbol1', 'symbol2' and 'p_value';
        the pair is looked up in either symbol order and the first match is used.
    f1_weight, pvalue_weight : weights of the combined ranking score
        ``f1_weight * f1 + pvalue_weight * (1 - p_value)``. The defaults
        (1.0 and 0.0) rank by F1 alone.

    Returns
    -------
    dict mapping pair name to a dict with keys 'model', 'accuracy',
    'precision', 'recall', 'f1', 'p_value', 'weighted_score' and
    'feature_importance' (DataFrame of feature/importance, sorted descending).

    Raises
    ------
    IndexError
        If a pair has no row in ``coint_results``.

    Notes
    -----
    All metrics are computed on the test split, so ranking pairs by
    'weighted_score' and then backtesting on the same period is subject to
    selection bias.
    """
    results = {}

    for pair, data in tqdm(ml_datasets.items(), desc="Training models"):
        sym1, sym2 = pair.split('_')

        # The pair may be stored as (sym1, sym2) or (sym2, sym1).
        same_order = (coint_results['symbol1'] == sym1) & (coint_results['symbol2'] == sym2)
        swapped_order = (coint_results['symbol1'] == sym2) & (coint_results['symbol2'] == sym1)
        p_value = coint_results[same_order | swapped_order]['p_value'].iloc[0]

        rf = RandomForestClassifier(
            n_estimators=100,
            random_state=42
        )

        rf.fit(data['X_train'], data['y_train'])
        predictions = rf.predict(data['X_test'])

        f1 = f1_score(data['y_test'], predictions)

        weighted_score = f1_weight * f1 + pvalue_weight * (1 - p_value)

        # Previously computed and thrown away on every iteration; it is now
        # returned with the other per-pair results.
        feature_importance = pd.DataFrame({
            'feature': data['X_train'].columns,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)

        results[pair] = {
            'model': rf,
            'accuracy': accuracy_score(data['y_test'], predictions),
            'precision': precision_score(data['y_test'], predictions),
            'recall': recall_score(data['y_test'], predictions),
            'f1': f1,
            'p_value': p_value,
            'weighted_score': weighted_score,
            'feature_importance': feature_importance
        }

    return results

In [60]:
import toml

# Read the project configuration; the path is relative to the notebook's
# working directory.
with open("../config.toml", "r") as f:
    config = toml.load(f)
    
# Build the date-by-symbol close-price matrix from the raw parquet file named
# in the config. The window is the full TRAIN_START..TEST_END span.
price_matrix, symbols = load_and_prepare_data(config['data']['raw_data_path'])

Loaded data from 2021-02-02 00:00:00 to 2025-01-01 00:00:00
Total symbols: 100
Total trading days: 1022


In [61]:
# Pairwise cointegration analysis (project helper from src.analysis.cointegration).
# NOTE(review): price_matrix spans TRAIN_START..TEST_END, so unless analyze_pairs
# restricts the sample internally, pair selection also sees the test period
# (look-ahead) — confirm against the helper.
score_matrix, pvalue_matrix, cointegrated_pairs, coint_results = analyze_pairs(price_matrix)

# plot_cointegration_heatmap(pvalue_matrix, symbols)

# Daily returns plus one return-spread column per cointegrated pair.
returns, spreads = calculate_returns_and_spreads(price_matrix, cointegrated_pairs)

# Train window is [TRAIN_START, TRAIN_END); test window is [TRAIN_END, TEST_END).
train_period = get_training_period()
test_period = get_test_period()

# Lagged-return features and next-day spread-direction targets for each pair.
ml_datasets = prepare_ml_data(returns, spreads, train_period, test_period)

# One random forest per pair, scored on that pair's test split.
model_results = train_evaluate_models(ml_datasets, coint_results)

Analyzing pairs: 100%|██████████| 4950/4950 [01:30<00:00, 54.49it/s]



Analysis complete!
Found 2015 cointegrated pairs
Total pairs analyzed: 4950


Training models: 100%|██████████| 2015/2015 [04:30<00:00,  7.46it/s]


In [62]:
# One row per pair holding the scalar metrics only (the fitted model is left out).
results_df = pd.DataFrame([
    {
        'pair': pair,
        **{
            metric: metrics[metric]
            for metric in ('accuracy', 'precision', 'recall', 'f1', 'p_value', 'weighted_score')
        },
    }
    for pair, metrics in model_results.items()
])

# Keep the 20 pairs with the highest weighted score.
# NOTE(review): these scores come from the test split, and the same period is
# backtested later, so the selection is optimistic.
ranked_results = results_df.sort_values('weighted_score', ascending=False)
top_20 = ranked_results.head(20)
print(top_20[['pair', 'f1', 'p_value', 'weighted_score', 'accuracy', 'precision', 'recall']])

             pair        f1   p_value  weighted_score  accuracy  precision  \
82    AFIWZ_VHRYD  0.634483  0.002792        0.634483  0.595420   0.601307   
1548  MFKBJ_UECLL  0.622951  0.005567        0.622951  0.561069   0.562130   
1863  STEPT_VHRYD  0.614334  0.001187        0.614334  0.568702   0.592105   
1911  UJLJN_WVCYQ  0.607143  0.000872        0.607143  0.580153   0.611511   
533   FAGMW_KQFSM  0.604651  0.007477        0.604651  0.545802   0.532164   
28    ADDRO_WGELK  0.602007  0.009007        0.602007  0.545802   0.592105   
242   AYWVH_QGMXV  0.601266  0.001515        0.601266  0.519084   0.510753   
1495  LURZP_VHRYD  0.600000  0.001386        0.600000  0.572519   0.575342   
1372  KXPSW_VAUSI  0.600000  0.011910        0.600000  0.557252   0.557692   
38    AFIWZ_FVLHU  0.598007  0.012823        0.598007  0.538168   0.566038   
1198  KFZNC_KQFSM  0.598007  0.010082        0.598007  0.538168   0.555556   
1743  QIVTR_QJJFG  0.597938  0.008174        0.597938  0.553435 

In [63]:
def zscore(series):
    """Standardise ``series``: subtract its mean and divide by ``np.std`` of it.

    ``np.std`` is called with its defaults, i.e. the population standard
    deviation (ddof=0). A constant series has zero deviation, so the division
    yields NaN/inf rather than raising.
    """
    centered = series - series.mean()
    spread = np.std(series)
    return centered / spread

def calculate_spread(data, symbol1, symbol2, start_date=None, end_date=None):
    """Compute the price ratio symbol1/symbol2 and its z-score.

    Parameters
    ----------
    data : DataFrame with one column per symbol; the index must be comparable
        with ``start_date`` / ``end_date``.
    symbol1, symbol2 : column labels; the ratio is ``data[symbol1] / data[symbol2]``.
    start_date, end_date : optional inclusive bounds on the index. Each bound
        is applied independently; ``None`` leaves that side open.

    Returns
    -------
    (ratios, zscore_ratios)
        The ratio series over the selected window and its z-score.
    """
    # Apply each bound on its own. The earlier version only filtered when
    # start_date was truthy: start_date alone compared the index against None,
    # and end_date alone was silently ignored.
    if start_date is not None:
        data = data[data.index >= start_date]
    if end_date is not None:
        data = data[data.index <= end_date]

    ratios = data[symbol1] / data[symbol2]
    zscore_ratios = zscore(ratios)

    return ratios, zscore_ratios

def trade(S1_train, S2_train, S1_test, S2_test, symbol1, symbol2, window=50, std_dev=1.5):
    """Simulate a Bollinger-band crossover strategy on one pair's hedged spread.

    A hedge ratio is estimated by regressing ``S1_train`` on ``S2_train`` with
    no intercept (the exog is passed to OLS as-is). The test-period spread is
    ``S1_test - hedge_ratio * S2_test``; the bands are its rolling mean plus or
    minus ``std_dev`` rolling standard deviations over ``window`` observations.

    Rules, evaluated from the second test observation onward:
      * flat, spread crosses below the lower band -> long symbol1 / short symbol2
      * flat, spread crosses above the upper band -> short symbol1 / long symbol2
      * long-spread position closes when the spread crosses above the upper band
      * short-spread position closes when the spread crosses below the lower band

    Parameters
    ----------
    S1_train, S2_train : training price series used for the hedge ratio.
    S1_test, S2_test : test price series (pandas Series sharing an index).
    symbol1, symbol2 : labels recorded in the trade rows.
    window : rolling window length for the bands.
    std_dev : band width in rolling standard deviations.

    Returns
    -------
    list of dict
        Two rows per closed position (one per leg) sharing a ``trade_id``.
        A position still open at the end of the test data is not recorded.
    """
    # Take the single coefficient by position. params is label-indexed when
    # the inputs are pandas objects, and integer keys on a label-indexed
    # Series are deprecated; np.asarray also covers plain array inputs.
    fitted = sm.OLS(S1_train, S2_train).fit()
    hedge_ratio = np.asarray(fitted.params)[0]

    spread_test = S1_test - (S2_test * hedge_ratio)

    # Compute the bands once for the whole series. The rolling value at
    # position i depends only on observations up to i, so this matches
    # recomputing on the prefix [:i+1] every iteration, without the
    # quadratic cost.
    rolling = spread_test.rolling(window=window, center=False)
    rolling_mean = rolling.mean()
    rolling_std = rolling.std()
    upper_bands = rolling_mean + (rolling_std * std_dev)
    lower_bands = rolling_mean - (rolling_std * std_dev)

    trades = []
    trade_id = 0
    position = 0  # 0 = flat, 1 = long spread, -1 = short spread
    entry_prices = None
    entry_date = None

    prev_spread = None

    for i in range(len(spread_test)):
        current_date = spread_test.index[i]
        current_spread = spread_test.iloc[i]

        # NaN until `window` observations exist; comparisons with NaN are
        # False, so no trade can trigger during the warm-up.
        upper_band = upper_bands.iloc[i]
        lower_band = lower_bands.iloc[i]

        if prev_spread is not None:
            if position == 0:
                if prev_spread > lower_band and current_spread < lower_band:
                    entry_date = current_date
                    entry_prices = {
                        symbol1: {"price": S1_test.iloc[i], "type": "long"},
                        symbol2: {"price": S2_test.iloc[i], "type": "short"}
                    }
                    position = 1

                elif prev_spread < upper_band and current_spread > upper_band:
                    entry_date = current_date
                    entry_prices = {
                        symbol1: {"price": S1_test.iloc[i], "type": "short"},
                        symbol2: {"price": S2_test.iloc[i], "type": "long"}
                    }
                    position = -1

            elif ((position == 1 and prev_spread < upper_band and current_spread > upper_band) or
                  (position == -1 and prev_spread > lower_band and current_spread < lower_band)):

                # Close both legs at today's prices; one record per symbol.
                for symbol in [symbol1, symbol2]:
                    trades.append({
                        'trade_id': trade_id,
                        'symbol': symbol,
                        'entry_date': entry_date,
                        'entry_price': entry_prices[symbol]["price"],
                        'exit_date': current_date,
                        'exit_price': S1_test.iloc[i] if symbol == symbol1 else S2_test.iloc[i],
                        'position_type': entry_prices[symbol]["type"],
                        'paired_symbol': symbol2 if symbol == symbol1 else symbol1,
                        'exit_type': 'target'
                    })
                position = 0
                trade_id += 1

        prev_spread = current_spread

    return trades

def backtest_pairs(price_matrix, pairs, train_end_date,
                   output_path='../../data/results/Random-Forest_Bollinger.parquet'):
    """Run ``trade`` for every pair and collect the resulting trade rows.

    Parameters
    ----------
    price_matrix : date-indexed DataFrame of prices, one column per symbol.
    pairs : iterable of (symbol1, symbol2) tuples.
    train_end_date : rows with index < this date are used to fit the hedge
        ratio; the remaining rows are the backtest period.
    output_path : where the combined trades are written as parquet. Defaults
        to the location this notebook has always used; pass ``None`` to skip
        writing.

    Returns
    -------
    DataFrame with one row per trade leg across all pairs (empty if no
    position was closed).
    """
    from pathlib import Path

    # The split depends only on the index, so compute it once for all pairs.
    training_mask = price_matrix.index < train_end_date
    testing_mask = ~training_mask

    all_trades = []

    for symbol1, symbol2 in pairs:
        S1_train = price_matrix[symbol1][training_mask]
        S2_train = price_matrix[symbol2][training_mask]
        S1_test = price_matrix[symbol1][testing_mask]
        S2_test = price_matrix[symbol2][testing_mask]

        pair_trades = trade(S1_train, S2_train, S1_test, S2_test, symbol1, symbol2)
        all_trades.extend(pair_trades)

    trades_df = pd.DataFrame(all_trades)

    if output_path is not None:
        destination = Path(output_path)
        # Create the results directory on a fresh checkout instead of failing.
        destination.parent.mkdir(parents=True, exist_ok=True)
        trades_df.to_parquet(destination)

    return trades_df

In [64]:
# Turn 'SYM1_SYM2' names back into (SYM1, SYM2) tuples for the backtest.
top_pairs = [tuple(pair_name.split('_')) for pair_name in top_20['pair']]

# Hedge ratios are fitted on data before the end of the training window;
# trading is simulated on the remaining rows.
trades_df = backtest_pairs(price_matrix, top_pairs, train_period['end'])

# trades_df holds one row per leg, so each closed position contributes two rows.
print("\nBacktest Results:")
print(f"Total number of trades: {len(trades_df)}")


Backtest Results:
Total number of trades: 86
