In [22]:
import sys
sys.path.append("../")

from src.analysis.cointegration import find_cointegrated_pairs, analyze_pairs, plot_cointegration_heatmap
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import coint
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

DATE_CONFIG = {
    'TRAIN_START': pd.Timestamp('2021-02-02'),
    'TRAIN_END': pd.Timestamp('2024-01-01'),
    'TEST_END': pd.Timestamp('2025-01-01'),
    'TRADING_DAYS_PER_YEAR': 252  
}

def get_training_period():
    return {
        'start': DATE_CONFIG['TRAIN_START'],
        'end': DATE_CONFIG['TRAIN_END']
    }

def get_test_period():
    return {
        'start': DATE_CONFIG['TRAIN_END'],
        'end': DATE_CONFIG['TEST_END']
    }

def get_training_days():
    years = (DATE_CONFIG['TRAIN_END'] - DATE_CONFIG['TRAIN_START']).days / 365
    return int(years * DATE_CONFIG['TRADING_DAYS_PER_YEAR'])

plt.style.use('classic')
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 100

In [23]:
def load_and_prepare_data(file_path):

    df = pd.read_parquet(file_path)
    df['date'] = pd.to_datetime(df['date'])
    
    mask = (df['date'] >= DATE_CONFIG['TRAIN_START']) & \
           (df['date'] <= DATE_CONFIG['TEST_END'])
    df = df[mask]
    
    price_matrix = df.pivot(index='date', columns='symbol', values='close')
    
    symbols = price_matrix.columns.tolist()
    
    print(f"Loaded data from {DATE_CONFIG['TRAIN_START']} to {DATE_CONFIG['TEST_END']}")
    print(f"Total symbols: {len(symbols)}")
    print(f"Total trading days: {len(price_matrix)}")
    
    return price_matrix, symbols

In [24]:
def generate_pairs(cointegrated_pairs):
    return cointegrated_pairs

def calculate_returns_and_spreads(price_matrix, cointegrated_pairs):
    returns = price_matrix.pct_change().dropna()
    
    pairs = generate_pairs(cointegrated_pairs)
    
    spreads = pd.DataFrame(index=returns.index)
    for s1, s2 in pairs:
        spreads[f'{s1}_{s2}_spread'] = returns[s1] - returns[s2]
        
    return returns, spreads

In [25]:
def prepare_ml_data(returns, spreads, train_period, test_period, lookback=3):
    ml_datasets = {}
    
    for spread_col in spreads.columns:
        sym1, sym2 = spread_col.replace('_spread', '').split('_')
        
        df = pd.DataFrame({
            f'{sym1}_return': returns[sym1],
            f'{sym2}_return': returns[sym2]
        })
        
        for t in range(1, lookback+1):
            df[f'{sym1}_return_t-{t}'] = df[f'{sym1}_return'].shift(t)
            df[f'{sym2}_return_t-{t}'] = df[f'{sym2}_return'].shift(t)
        
        spread_next_day = (df[f'{sym1}_return'] - df[f'{sym2}_return']).shift(-1)
        df['target'] = np.where(spread_next_day > 0, 1, 0)
        
        feature_cols = [col for col in df.columns if 't-' in col]
        features = df[feature_cols].copy()
        
        clean_idx = features.dropna().index
        features = features.loc[clean_idx]
        target = df.loc[clean_idx, 'target']
    
        
        train_mask = (features.index >= train_period['start']) & (features.index < train_period['end'])
        test_mask = (features.index >= test_period['start']) & (features.index < test_period['end'])
        
        ml_datasets[f'{sym1}_{sym2}'] = {
            'X_train': features[train_mask],
            'X_test': features[test_mask],
            'y_train': target[train_mask],
            'y_test': target[test_mask]
        }
        
    return ml_datasets

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [27]:
def train_evaluate_models(ml_datasets, coint_results):
   results = {}
   
   for pair, data in tqdm(ml_datasets.items(), desc="Training models"):
       sym1, sym2 = pair.split('_')
       
       p_value = coint_results[
           ((coint_results['symbol1'] == sym1) & (coint_results['symbol2'] == sym2)) |
           ((coint_results['symbol1'] == sym2) & (coint_results['symbol2'] == sym1))
       ]['p_value'].iloc[0]
       
       rf = RandomForestClassifier(
           n_estimators=100,
           random_state=42
       )
       
       rf.fit(data['X_train'], data['y_train'])
       predictions = rf.predict(data['X_test'])
       
       f1 = f1_score(data['y_test'], predictions)
       
       weighted_score = 0.6 * f1 + 0.4 * (1 - p_value)
       
       results[pair] = {
           'model': rf,
           'accuracy': accuracy_score(data['y_test'], predictions),
           'precision': precision_score(data['y_test'], predictions),
           'recall': recall_score(data['y_test'], predictions),
           'f1': f1,
           'p_value': p_value,
           'weighted_score': weighted_score
       }
       
       feature_importance = pd.DataFrame({
           'feature': data['X_train'].columns,
           'importance': rf.feature_importances_
       }).sort_values('importance', ascending=False)
   
   return results

In [28]:
price_matrix, symbols = load_and_prepare_data('../data/raw/nasdaq_daily.parquet')

score_matrix, pvalue_matrix, cointegrated_pairs, coint_results = analyze_pairs(price_matrix)

# plot_cointegration_heatmap(pvalue_matrix, symbols)

returns, spreads = calculate_returns_and_spreads(price_matrix, cointegrated_pairs)

train_period = get_training_period()
test_period = get_test_period()

ml_datasets = prepare_ml_data(returns, spreads, train_period, test_period)

model_results = train_evaluate_models(ml_datasets, coint_results)

Loaded data from 2021-02-02 00:00:00 to 2025-01-01 00:00:00
Total symbols: 94
Total trading days: 985


Analyzing pairs: 100%|██████████| 4371/4371 [00:52<00:00, 83.16it/s]



Analysis complete!
Found 45 cointegrated pairs
Total pairs analyzed: 4371


Training models: 100%|██████████| 45/45 [00:05<00:00,  8.07it/s]


In [29]:
results_df = pd.DataFrame([
   {
       'pair': pair,
       'accuracy': metrics['accuracy'],
       'precision': metrics['precision'],
       'recall': metrics['recall'],
       'f1': metrics['f1'],
       'p_value': metrics['p_value'],
       'weighted_score': metrics['weighted_score']
   }
   for pair, metrics in model_results.items()
])

top_20 = results_df.sort_values('weighted_score', ascending=False).head(20)
print(top_20[['pair', 'f1', 'p_value', 'weighted_score', 'accuracy', 'precision', 'recall']])

         pair        f1   p_value  weighted_score  accuracy  precision  \
30   ISRG_TTD  0.559387  0.003670        0.734164  0.543651   0.536765   
32   KHC_MSFT  0.549020  0.009724        0.725522  0.543651   0.522388   
4    ADSK_HON  0.547445  0.009983        0.724474  0.507937   0.576923   
31   KDP_MNST  0.541176  0.000629        0.724454  0.535714   0.547619   
27     EA_TTD  0.544715  0.006853        0.724088  0.555556   0.531746   
5   AMAT_NXPI  0.539007  0.000091        0.723368  0.484127   0.503311   
15   CSX_TTWO  0.539062  0.009163        0.719772  0.531746   0.543307   
13    CSX_MDB  0.529183  0.009500        0.713710  0.519841   0.544000   
28   GOOG_TTD  0.526718  0.007768        0.712923  0.507937   0.489362   
37  LRCX_NXPI  0.521739  0.001418        0.712476  0.476190   0.483221   
25    EA_NFLX  0.515873  0.002453        0.708543  0.515873   0.442177   
8   CDNS_PCAR  0.517110  0.006162        0.707802  0.496032   0.496350   
38   MAR_PANW  0.508475  0.000692     

In [None]:
def zscore(series):
    return (series - series.mean()) / np.std(series)

def calculate_spread(data, symbol1, symbol2, start_date=None, end_date=None):
    if start_date:
        mask = (data.index >= start_date) & (data.index <= end_date)
        data = data[mask]
    
    # Calculate ratio and z-score
    ratios = data[symbol1] / data[symbol2]
    zscore_ratios = zscore(ratios)
    
    return ratios, zscore_ratios

def trade(S1_train, S2_train, S1_test, S2_test, symbol1, symbol2, window1=5, window2=60):
    ratios_train = S1_train / S2_train
    ma2_train = ratios_train.rolling(window=window2, center=False).mean()
    std_train = ratios_train.rolling(window=window2, center=False).std()
    
    ratios_test = S1_test / S2_test
    trades = []
    trade_id = 0
    position = 0
    entry_prices = None
    entry_date = None
    
    for i in range(len(ratios_test)):
        current_ratio = ratios_test.iloc[i]
        current_date = ratios_test.index[i]
        
        ma2_test = ratios_test.iloc[:i+1].rolling(window=window2, center=False).mean().iloc[-1]
        std_test = ratios_test.iloc[:i+1].rolling(window=window2, center=False).std().iloc[-1]
        zscore = (current_ratio - ma2_test) / std_test
        
        if position == 0:
            if zscore > 1.0:
                entry_date = current_date
                entry_prices = {
                    symbol1: {"price": S1_test.iloc[i], "type": "short"},
                    symbol2: {"price": S2_test.iloc[i], "type": "long"}
                }
                position = -1
                
            elif zscore < -1.0:
                entry_date = current_date
                entry_prices = {
                    symbol1: {"price": S1_test.iloc[i], "type": "long"},
                    symbol2: {"price": S2_test.iloc[i], "type": "short"}
                }
                position = 1
                
        elif abs(zscore) < 0.5 and position != 0:
            for symbol in [symbol1, symbol2]:
                trades.append({
                    'trade_id': trade_id,
                    'symbol': symbol,
                    'entry_date': entry_date,
                    'entry_price': entry_prices[symbol]["price"],
                    'exit_date': current_date,
                    'exit_price': S1_test.iloc[i] if symbol == symbol1 else S2_test.iloc[i],
                    'position_type': entry_prices[symbol]["type"],
                    'paired_symbol': symbol2 if symbol == symbol1 else symbol1,
                    'exit_type': 'target'
                })
            position = 0
            trade_id += 1
            
    return trades

def backtest_pairs(price_matrix, pairs, train_end_date):
    all_trades = []
    
    for symbol1, symbol2 in pairs:
        training_mask = price_matrix.index < train_end_date
        
        S1_train = price_matrix[symbol1][training_mask]
        S2_train = price_matrix[symbol2][training_mask]
        S1_test = price_matrix[symbol1][~training_mask]
        S2_test = price_matrix[symbol2][~training_mask]
        
        pair_trades = trade(S1_train, S2_train, S1_test, S2_test, symbol1, symbol2)
        all_trades.extend(pair_trades)
    
    trades_df = pd.DataFrame(all_trades)
    trades_df.to_parquet('../data/results/ranforest_zscore_results.parquet')
    
    return trades_df

In [33]:
top_pairs = [tuple(pair.split('_')) for pair in top_20['pair']]

trades_df = backtest_pairs(price_matrix, top_pairs, train_period['end'])

print("\nBacktest Results:")
print(f"Total number of trades: {len(trades_df)}")

OSError: Cannot save file into a non-existent directory: '..data/results'