In [None]:
import sys
sys.path.append("../../")
from statsmodels.tsa.stattools import adfuller
from src.statistics.cointegration import find_cointegrated_pairs, analyze_pairs, plot_cointegration_heatmap
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Calendar boundaries shared by the notebook: data is loaded from TRAIN_START
# to TEST_END, and the first test window of the backtest begins at TRAIN_END.
DATE_CONFIG = dict(
    TRAIN_START=pd.Timestamp('2021-02-02'),
    TRAIN_END=pd.Timestamp('2024-01-01'),
    TEST_END=pd.Timestamp('2025-01-01'),
    TRADING_DAYS_PER_YEAR=252,
)

# Notebook-wide matplotlib defaults: classic style sheet first, then the
# figure size and resolution overrides applied on top of it.
plt.style.use('classic')
for _rc_key, _rc_value in (('figure.figsize', [12, 6]), ('figure.dpi', 100)):
    plt.rcParams[_rc_key] = _rc_value

In [None]:
# Pair selection: p-value cut-off handed to analyze_pairs, and the cap on how
# many pairs get_top_pairs returns per window.
p_threshold, min_pairs = 0.05, 20

# Walk-forward schedule: number of windows and the shift per window in months.
window_shifts, shift_size = 12, 1

# Z-score levels for opening and closing a position.
entry_threshold, exit_threshold = 2.0, 0.5

# Rolling look-backs in rows; window2 drives the ratio mean/std in the backtest.
# window1 is not referenced by the code visible in this notebook section.
window1, window2 = 5, 60

# Input price file and output trade log (directory + file name).
base_input_path, input_filename = "../../data/raw/", "nasdaq_daily.parquet"
base_output_path, output_filename = "../../data/results/", "CoInt_Z-Score_Sliding.parquet"

In [None]:
def load_and_prepare_data(file_path):
    """Read the long-format price file and return a wide close-price matrix.

    Rows whose 'date' falls outside [TRAIN_START, TEST_END] (both inclusive,
    taken from DATE_CONFIG) are discarded, the remainder is pivoted to one
    column per symbol, and gaps are forward- then backward-filled.

    Args:
        file_path: Location of a parquet file with 'date', 'symbol' and
            'close' columns.

    Returns:
        Tuple ``(price_matrix, symbols)``: the date-indexed DataFrame of close
        prices and the list of its column labels.
    """
    first_day = DATE_CONFIG['TRAIN_START']
    last_day = DATE_CONFIG['TEST_END']

    raw = pd.read_parquet(file_path)
    raw['date'] = pd.to_datetime(raw['date'])
    in_range = (raw['date'] <= last_day) & (raw['date'] >= first_day)

    wide = raw[in_range].pivot(index='date', columns='symbol', values='close')
    # Forward-fill first; the backward fill then covers leading gaps with the
    # first later observation of that symbol.
    wide = wide.ffill()
    wide = wide.bfill()

    tickers = wide.columns.tolist()
    print(f"Loaded data from {first_day} to {last_day}")
    print(f"Total symbols: {len(tickers)}")
    print(f"Total trading days: {len(wide)}")
    return wide, tickers

In [None]:
# Full file locations are the configured directory followed by the file name.
input_data_path = base_input_path + input_filename
output_path = base_output_path + output_filename

# Wide close-price matrix and its symbol list, used by every later cell.
price_matrix, symbols = load_and_prepare_data(input_data_path)

In [None]:
def get_top_pairs(price_matrix, train_start, train_end, symbols,
                  pvalue_threshold=None, max_pairs=None):
    """Select candidate pairs from the first 70% of a training window.

    The selection period runs from ``train_start`` to ``train_start`` plus 70%
    of the calendar days between ``train_start`` and ``train_end`` (both ends
    inclusive). Prices in that period are gap-filled and handed to
    ``analyze_pairs``; the leading ``max_pairs`` entries of its pair list are
    returned.

    Args:
        price_matrix: Date-indexed DataFrame of prices, one column per symbol.
        train_start: Timestamp at which the training window starts.
        train_end: Timestamp at which the training window ends.
        symbols: Unused; kept so existing callers keep working.
        pvalue_threshold: Threshold forwarded to ``analyze_pairs``. Defaults
            to the module-level ``p_threshold``.
        max_pairs: Maximum number of pairs to return. Defaults to the
            module-level ``min_pairs``.

    Returns:
        At most ``max_pairs`` leading entries of the pair list produced by
        ``analyze_pairs``.
    """
    if pvalue_threshold is None:
        pvalue_threshold = p_threshold
    if max_pairs is None:
        max_pairs = min_pairs

    train_length = (train_end - train_start).days
    pair_selection_end = train_start + pd.Timedelta(days=int(train_length * 0.7))

    selection_mask = (price_matrix.index >= train_start) & (price_matrix.index <= pair_selection_end)
    selection_data = price_matrix[selection_mask].ffill().bfill()

    if selection_data.isnull().any().any() or np.isinf(selection_data).any().any():
        print("Warning: Data contains NaN or Inf values after cleaning")
        # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling
        # and match the gap filling used just above.
        selection_data = selection_data.replace([np.inf, -np.inf], np.nan).ffill().bfill()

    # Only the pair list is needed; the score/p-value matrices and the summary
    # frame returned alongside it are discarded.
    _, _, pairs, _ = analyze_pairs(selection_data, pvalue_threshold=pvalue_threshold)

    # Slicing already returns everything when fewer than max_pairs exist.
    return pairs[:max_pairs]

In [None]:
def _ratio_stats(series_1, series_2):
    """Return (mean, std) of series_1/series_2 over its last `window2` rows.

    Returns (nan, nan) when the training slice is empty instead of raising.
    """
    ratio = series_1 / series_2
    if ratio.empty:
        return np.nan, np.nan
    rolling = ratio.rolling(window=window2)
    return rolling.mean().iloc[-1], rolling.std().iloc[-1]


def _stats_usable(mean, std):
    """True when mean/std can produce a finite z-score (finite, std > 0)."""
    return bool(np.isfinite(mean) and np.isfinite(std) and std > 0)


def _exit_signal(position, zscore):
    """True when the z-score has reverted far enough to close `position`."""
    return (position == -1 and zscore < exit_threshold) or \
           (position == 1 and zscore > -exit_threshold)


def _leg_records(trade, exit_date, exit_prices):
    """Build the two closed-trade rows (one per leg) for `trade`."""
    symbol1, symbol2 = trade['symbol1'], trade['symbol2']
    records = []
    for symbol, partner in ((symbol1, symbol2), (symbol2, symbol1)):
        leg = trade['entry_prices'][symbol]
        records.append({
            'trade_id': trade['trade_id'],
            'symbol': symbol,
            'entry_date': trade['entry_date'],
            'entry_price': leg['price'],
            'exit_date': exit_date,
            'exit_price': exit_prices[symbol],
            'position_type': leg['type'],
            'paired_symbol': partner,
            'exit_type': 'target',
            'window': trade['window']
        })
    return records


def backtest_pairs_rolling(price_matrix, symbols, window_shifts=window_shifts, p_threshold=p_threshold, shift_size=shift_size):
    """Walk-forward backtest of a z-score pairs strategy.

    For each of ``window_shifts`` windows, the training window
    [train_start, train_end) and the test window [month_start, month_end) are
    both shifted forward by ``shift_size`` months. Within a window:

    1. Trades carried over from earlier windows are checked day by day against
       the exit rule and closed on the first day it fires.
    2. ``get_top_pairs`` picks the pairs for the window.
    3. For every pair and every test day, the price ratio is z-scored against
       the mean/std of the last ``window2`` rows of the training ratio. A
       z-score above ``entry_threshold`` opens a short-symbol1/long-symbol2
       trade, below ``-entry_threshold`` the opposite. A new trade is opened
       on every day the signal holds. Open trades of the pair are closed once
       the z-score crosses the ``exit_threshold`` band.

    Closed trades are written to ``output_path`` as parquet when at least one
    exists. Trades still open after the final window are not part of the
    result.

    Args:
        price_matrix: Date-indexed DataFrame of prices, one column per symbol.
        symbols: Symbol list, passed through to ``get_top_pairs``.
        window_shifts: Number of walk-forward windows.
        p_threshold: Accepted for interface compatibility; it is not forwarded
            by the ``get_top_pairs`` call made here.
        shift_size: Window shift and test-window length, in months.

    Returns:
        DataFrame with one row per closed trade leg (two rows per trade);
        empty when no trade was closed.
    """
    all_trades = []
    ongoing_trades = []
    # Single counter for the whole run so every trade gets a unique id;
    # deriving ids from list lengths can reuse the id of a still-open trade.
    next_trade_id = 0
    test_start = DATE_CONFIG['TRAIN_END']
    window_size = pd.DateOffset(months=shift_size)

    for window_number in range(window_shifts):
        offset = window_number * window_size
        month_start = test_start + offset
        month_end = month_start + window_size
        train_start = DATE_CONFIG['TRAIN_START'] + offset
        train_end = test_start + offset

        print(f"\nAnalyse {window_number+1}/{window_shifts}")
        print(f"Training period: {train_start} to {train_end}")
        print(f"Testing period: {month_start} to {month_end}")

        # Both masks depend only on the window, so build them once.
        train_mask = (price_matrix.index >= train_start) & (price_matrix.index < train_end)
        test_mask = (price_matrix.index >= month_start) & (price_matrix.index < month_end)
        has_test_rows = bool(test_mask.any())

        # --- Step 1: exit checks for trades carried over from earlier windows.
        updated_ongoing_trades = []
        closed_trades = []

        for open_trade in ongoing_trades:
            symbol1, symbol2 = open_trade['symbol1'], open_trade['symbol2']

            # Without prices for both legs or without test rows the trade
            # cannot be evaluated; keep it open rather than dropping it.
            both_priced = symbol1 in price_matrix.columns and symbol2 in price_matrix.columns
            if not both_priced or not has_test_rows:
                updated_ongoing_trades.append(open_trade)
                continue

            train_mean, train_std = _ratio_stats(
                price_matrix[symbol1][train_mask],
                price_matrix[symbol2][train_mask],
            )
            if not _stats_usable(train_mean, train_std):
                # No meaningful z-score (too little history or a constant
                # ratio): leave the trade untouched for this window.
                updated_ongoing_trades.append(open_trade)
                continue

            S1_test = price_matrix[symbol1][test_mask]
            S2_test = price_matrix[symbol2][test_mask]

            for i in range(len(S1_test)):
                price1, price2 = S1_test.iloc[i], S2_test.iloc[i]
                zscore = (price1 / price2 - train_mean) / train_std
                if _exit_signal(open_trade['position'], zscore):
                    closed_trades.extend(_leg_records(
                        open_trade,
                        S1_test.index[i],
                        {symbol1: price1, symbol2: price2},
                    ))
                    break
            else:
                # Exit rule never fired during this window.
                updated_ongoing_trades.append(open_trade)

        all_trades.extend(closed_trades)
        ongoing_trades = updated_ongoing_trades

        # --- Step 2: pair selection for this window.
        top_pairs = get_top_pairs(price_matrix, train_start, train_end, symbols)

        # --- Step 3: entries and exits for the selected pairs.
        for symbol1, symbol2 in top_pairs:
            if not has_test_rows:
                continue

            train_mean, train_std = _ratio_stats(
                price_matrix[symbol1][train_mask],
                price_matrix[symbol2][train_mask],
            )
            if not _stats_usable(train_mean, train_std):
                # A zero std would give +/-inf z-scores and spurious entries.
                continue

            S1_test = price_matrix[symbol1][test_mask]
            S2_test = price_matrix[symbol2][test_mask]

            for i in range(len(S1_test)):
                price1, price2 = S1_test.iloc[i], S2_test.iloc[i]
                current_date = S1_test.index[i]
                zscore = (price1 / price2 - train_mean) / train_std

                if zscore > entry_threshold:
                    # Ratio is high: short symbol1, long symbol2.
                    position, leg_types = -1, ("short", "long")
                elif zscore < -entry_threshold:
                    # Ratio is low: long symbol1, short symbol2.
                    position, leg_types = 1, ("long", "short")
                else:
                    position, leg_types = 0, None

                if position != 0:
                    ongoing_trades.append({
                        'trade_id': next_trade_id,
                        'symbol1': symbol1,
                        'symbol2': symbol2,
                        'entry_date': current_date,
                        'entry_prices': {
                            symbol1: {"price": price1, "type": leg_types[0]},
                            symbol2: {"price": price2, "type": leg_types[1]}
                        },
                        'position': position,
                        'window': window_number + 1
                    })
                    next_trade_id += 1

                # Close every open trade of this pair whose exit rule fires
                # today; all other trades stay open in their original order.
                still_open = []
                for trade in ongoing_trades:
                    same_pair = trade['symbol1'] == symbol1 and trade['symbol2'] == symbol2
                    if same_pair and _exit_signal(trade['position'], zscore):
                        all_trades.extend(_leg_records(
                            trade,
                            current_date,
                            {symbol1: price1, symbol2: price2},
                        ))
                    else:
                        still_open.append(trade)
                ongoing_trades = still_open

        # closed_trades holds the leg rows of carried-over trades closed in
        # step 1 only; closures from step 3 go straight into all_trades.
        print(f"Closed trades in this window: {len(closed_trades)}")
        print(f"Currently open trades: {len(ongoing_trades)}")

    trades_df = pd.DataFrame(all_trades)

    if len(trades_df) > 0:
        trades_df.to_parquet(output_path)

        print("\nTrading Summary:")
        print(f"Total trades: {len(trades_df)}")
        print(f"Unique pairs traded: {len(trades_df[['symbol', 'paired_symbol']].drop_duplicates())}")
        print(f"Period: {trades_df['entry_date'].min()} to {trades_df['exit_date'].max()}")

        print("\nTrades per window:")
        print(trades_df['window'].value_counts().sort_index())

        if 'exit_type' in trades_df.columns:
            print("\nExit types:")
            print(trades_df['exit_type'].value_counts())
    else:
        print("No trades generated!")

    return trades_df

In [None]:
# Run the walk-forward backtest over the configured number of windows; the
# remaining arguments keep their defaults.
trades_df = backtest_pairs_rolling(price_matrix, symbols, window_shifts=window_shifts)