In [1]:
import sys
sys.path.append("../../")
from statsmodels.tsa.stattools import adfuller
from src.statistics.cointegration import find_cointegrated_pairs, analyze_pairs, plot_cointegration_heatmap
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Calendar boundaries shared by the notebook cells.
#   TRAIN_START - lower bound of the loaded data and start of the first training window
#   TRAIN_END   - start of the first test window (each later window is shifted from here)
#   TEST_END    - upper bound (inclusive) of the loaded data
#   TRADING_DAYS_PER_YEAR - not referenced by the cells visible in this notebook section
DATE_CONFIG = dict(
    TRAIN_START=pd.Timestamp('2021-02-02'),
    TRAIN_END=pd.Timestamp('2024-01-01'),
    TEST_END=pd.Timestamp('2025-01-01'),
    TRADING_DAYS_PER_YEAR=252,
)

# Global matplotlib defaults: the 'classic' style sheet first, then the figure
# size/resolution overrides on top of it (order matters: the style sheet resets rcParams).
plt.style.use('classic')
plt.rcParams.update({
    'figure.figsize': [12, 6],
    'figure.dpi': 100,
})

In [2]:
# --- Pair selection ---------------------------------------------------------
# Forwarded as `pvalue_threshold` to analyze_pairs() inside get_top_pairs().
p_threshold = 0.05
# NOTE: despite the name this acts as a cap: get_top_pairs() returns at most
# this many pairs (`pairs[:min_pairs]`), and fewer when fewer are found.
min_pairs = 20

# --- Rolling windows --------------------------------------------------------
# Number of train/test windows iterated by backtest_pairs_rolling().
window_shifts = 12
# Length of one test window and of each shift, in months (pd.DateOffset(months=...)).
shift_size = 1

# --- Z-score trading rules --------------------------------------------------
# A trade is opened when the ratio z-score is above +entry_threshold or below -entry_threshold.
entry_threshold = 2.0
# A short-ratio trade is closed once z < exit_threshold, a long-ratio trade once z > -exit_threshold.
exit_threshold = 0.5
# NOTE(review): window1 is not referenced by any cell visible in this section - confirm it is still needed.
window1 = 5 
# Rolling window (in rows) used for the mean/std of the price ratio in the backtest.
window2 = 60

# --- File locations (relative to the notebook's working directory) ----------
# Input: parquet file read by load_and_prepare_data().
base_input_path = "../../data/raw/" 
input_filename = "nasdaq_daily.parquet" 
# Output: parquet file written by backtest_pairs_rolling() when trades exist.
base_output_path = "../../data/results/" 
output_filename = "CoInt_Z-Score_Sliding.parquet"

In [3]:
def load_and_prepare_data(file_path):
    """Read a long-format price file and return a date x symbol close-price matrix.

    The parquet file is expected to provide 'date', 'symbol' and 'close' columns
    (these are the columns the function accesses). Rows are restricted to the
    inclusive range DATE_CONFIG['TRAIN_START'] .. DATE_CONFIG['TEST_END'].

    Gaps are forward-filled and any remaining leading gaps are back-filled.
    NOTE(review): the back-fill copies a symbol's first available price backwards
    in time; confirm this is acceptable for the analysis.

    Args:
        file_path: Location of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        tuple: ``(price_matrix, symbols)`` - the pivoted DataFrame and the list
        of its column labels.
    """
    first_day = DATE_CONFIG['TRAIN_START']
    last_day = DATE_CONFIG['TEST_END']

    raw = pd.read_parquet(file_path)
    raw['date'] = pd.to_datetime(raw['date'])

    # `between` is inclusive on both ends, matching the >= / <= bounds.
    in_range = raw[raw['date'].between(first_day, last_day)]

    price_matrix = (
        in_range
        .pivot(index='date', columns='symbol', values='close')
        .ffill()
        .bfill()
    )
    symbols = price_matrix.columns.to_list()

    print(f"Loaded data from {DATE_CONFIG['TRAIN_START']} to {DATE_CONFIG['TEST_END']}")
    print(f"Total symbols: {len(symbols)}")
    print(f"Total trading days: {len(price_matrix)}")
    return price_matrix, symbols

In [4]:
# Full file locations: configured directory followed by the configured file name.
input_data_path = base_input_path + input_filename
output_path = base_output_path + output_filename

# Close-price matrix (dates x symbols) plus the list of its symbol columns.
price_matrix, symbols = load_and_prepare_data(input_data_path)

Loaded data from 2021-02-02 00:00:00 to 2025-01-01 00:00:00
Total symbols: 94
Total trading days: 985


In [5]:
def get_top_pairs(price_matrix, train_start, train_end, symbols, selection_fraction=0.7):
    """Select candidate pairs from the leading part of a training window.

    Only the first ``selection_fraction`` of the calendar span between
    ``train_start`` and ``train_end`` is passed to ``analyze_pairs``; the rest
    of the training window is not used here.

    Args:
        price_matrix: DataFrame of prices indexed by date, one column per symbol.
        train_start: First date (inclusive) of the training window.
        train_end: End date of the training window; only used to derive the
            length of the selection period.
        symbols: Unused; kept so existing callers keep working.
        selection_fraction: Share of the training span (in calendar days) used
            for pair selection. Defaults to 0.7, the previously hard-coded value.

    Returns:
        The first ``min_pairs`` entries of the pairs returned by
        ``analyze_pairs`` (all of them when fewer are found).
        NOTE(review): this assumes ``analyze_pairs`` returns pairs ordered
        best-first - confirm against its implementation.
    """
    train_length = (train_end - train_start).days
    pair_selection_end = train_start + pd.Timedelta(days=int(train_length * selection_fraction))

    selection_mask = (price_matrix.index >= train_start) & (price_matrix.index <= pair_selection_end)
    selection_data = price_matrix.loc[selection_mask].ffill().bfill()

    if selection_data.isnull().any().any() or np.isinf(selection_data).any().any():
        print("Warning: Data contains NaN or Inf values after cleaning")
        # Use .ffill()/.bfill() rather than the deprecated fillna(method=...),
        # whose deprecation warning is hidden by the notebook's warnings filter.
        selection_data = selection_data.replace([np.inf, -np.inf], np.nan).ffill().bfill()

    # Only the list of pairs is needed; scores, p-values and the summary are discarded.
    _, _, pairs, _ = analyze_pairs(selection_data, pvalue_threshold=p_threshold)

    # Slicing already copes with len(pairs) < min_pairs, so no length check is required.
    return pairs[:min_pairs]

In [6]:
def _latest_ratio_stats(train_prices, symbol1, symbol2):
    """Return (mean, std) of the symbol1/symbol2 ratio over the last `window2` training rows.

    Returns None when the statistics cannot be used for a z-score: empty
    training slice, fewer than `window2` rows (NaN result) or zero dispersion.
    """
    ratio = train_prices[symbol1] / train_prices[symbol2]
    if ratio.empty:
        return None
    rolling = ratio.rolling(window=window2)
    mean = rolling.mean().iloc[-1]
    std = rolling.std().iloc[-1]
    if not (np.isfinite(mean) and np.isfinite(std)) or std == 0:
        return None
    return mean, std


def _is_exit_signal(position, zscore):
    """Tell whether a trade with the given position (-1 or 1) should be closed at `zscore`."""
    if position == -1:
        return zscore < exit_threshold
    if position == 1:
        return zscore > -exit_threshold
    return False


def _closing_records(trade, exit_date, price1, price2):
    """Build the two per-leg result rows for a trade closed on `exit_date`.

    `price1` / `price2` are the exit prices of the trade's symbol1 / symbol2.
    """
    symbol1, symbol2 = trade['symbol1'], trade['symbol2']
    legs = ((symbol1, symbol2, price1), (symbol2, symbol1, price2))
    return [
        {
            'trade_id': trade['trade_id'],
            'symbol': symbol,
            'entry_date': trade['entry_date'],
            'entry_price': trade['entry_prices'][symbol]['price'],
            'exit_date': exit_date,
            'exit_price': exit_price,
            'position_type': trade['entry_prices'][symbol]['type'],
            'paired_symbol': partner,
            'exit_type': 'target',
            'window': trade['window'],
        }
        for symbol, partner, exit_price in legs
    ]


def backtest_pairs_rolling(price_matrix, symbols, window_shifts=window_shifts, p_threshold=p_threshold, shift_size=shift_size):
    """Run a rolling-window z-score pairs backtest and return the closed trades.

    For each of `window_shifts` windows the training period is shifted by
    `shift_size` months and the following `shift_size` months are traded:

    1. Trades carried over from earlier windows are checked day by day against
       the exit rule and closed on the first day it triggers.
    2. Pairs are re-selected with ``get_top_pairs`` and, for every test day, a
       trade is opened when the ratio z-score crosses +/- `entry_threshold`;
       open trades of the same pair are closed when the exit rule triggers.

    The z-score uses the mean/std of the price ratio over the last `window2`
    training rows; these statistics stay fixed for the whole test window.

    Caveats (behavior kept as-is):
      * A new trade is opened on *every* day the entry signal is active, even
        when the pair already has open trades.
      * Trades still open after the last window are not part of the result.
      * The "Closed trades in this window" printout counts result rows (two per
        trade) of carried-over trades only.

    Args:
        price_matrix: DataFrame of prices indexed by date, one column per symbol.
        symbols: Forwarded to ``get_top_pairs``.
        window_shifts: Number of rolling windows to process.
        p_threshold: Unused here (``get_top_pairs`` reads the module-level
            value); kept for backward compatibility.
        shift_size: Window length / shift in months.

    Returns:
        pandas.DataFrame with one row per trade leg (two rows per closed trade).
        When non-empty it is also written to the module-level `output_path`.
    """
    all_trades = []
    ongoing_trades = []
    # Monotonic counter: ids derived from list lengths were not guaranteed to be
    # unique across windows.
    next_trade_id = 0
    test_start = DATE_CONFIG['TRAIN_END']
    window_size = pd.DateOffset(months=shift_size)

    for window_number in range(window_shifts):
        month_start = test_start + window_number * window_size
        month_end = month_start + window_size
        train_start = DATE_CONFIG['TRAIN_START'] + window_number * window_size
        train_end = month_start  # training ends where the test window begins

        print(f"\nAnalyse {window_number+1}/{window_shifts}")
        print(f"Training period: {train_start} to {train_end}")
        print(f"Testing period: {month_start} to {month_end}")

        # The slices depend only on the window, so compute them once.
        train_mask = (price_matrix.index >= train_start) & (price_matrix.index < train_end)
        test_mask = (price_matrix.index >= month_start) & (price_matrix.index < month_end)
        train_prices = price_matrix.loc[train_mask]
        test_prices = price_matrix.loc[test_mask]
        test_dates = test_prices.index

        # --- 1. Trades carried over from previous windows -------------------
        closed_trades = []
        still_open = []
        for open_trade in ongoing_trades:
            symbol1, symbol2 = open_trade['symbol1'], open_trade['symbol2']

            stats = None
            if (not test_prices.empty
                    and symbol1 in price_matrix.columns
                    and symbol2 in price_matrix.columns):
                stats = _latest_ratio_stats(train_prices, symbol1, symbol2)

            if stats is None:
                # Nothing to evaluate: keep the trade open instead of dropping it.
                still_open.append(open_trade)
                continue

            train_mean, train_std = stats
            prices1 = test_prices[symbol1].to_numpy()
            prices2 = test_prices[symbol2].to_numpy()

            for current_date, price1, price2 in zip(test_dates, prices1, prices2):
                zscore = (price1 / price2 - train_mean) / train_std
                if _is_exit_signal(open_trade['position'], zscore):
                    closed_trades.extend(_closing_records(open_trade, current_date, price1, price2))
                    break
            else:
                still_open.append(open_trade)

        all_trades.extend(closed_trades)
        ongoing_trades = still_open

        # --- 2. Fresh pair selection and trading for this window ------------
        top_pairs = get_top_pairs(price_matrix, train_start, train_end, symbols)

        for symbol1, symbol2 in top_pairs:
            stats = None if test_prices.empty else _latest_ratio_stats(train_prices, symbol1, symbol2)
            if stats is None:
                continue

            train_mean, train_std = stats
            prices1 = test_prices[symbol1].to_numpy()
            prices2 = test_prices[symbol2].to_numpy()

            for current_date, price1, price2 in zip(test_dates, prices1, prices2):
                zscore = (price1 / price2 - train_mean) / train_std

                # Ratio too high -> short symbol1 / long symbol2; too low -> the reverse.
                if zscore > entry_threshold:
                    position, leg_types = -1, ("short", "long")
                elif zscore < -entry_threshold:
                    position, leg_types = 1, ("long", "short")
                else:
                    position, leg_types = 0, None

                if position:
                    ongoing_trades.append({
                        'trade_id': next_trade_id,
                        'symbol1': symbol1,
                        'symbol2': symbol2,
                        'entry_date': current_date,
                        'entry_prices': {
                            symbol1: {"price": price1, "type": leg_types[0]},
                            symbol2: {"price": price2, "type": leg_types[1]},
                        },
                        'position': position,
                        'window': window_number + 1
                    })
                    next_trade_id += 1

                # Close every open trade of this pair whose exit rule triggers today.
                remaining = []
                for trade in ongoing_trades:
                    same_pair = trade['symbol1'] == symbol1 and trade['symbol2'] == symbol2
                    if same_pair and _is_exit_signal(trade['position'], zscore):
                        all_trades.extend(_closing_records(trade, current_date, price1, price2))
                    else:
                        remaining.append(trade)
                ongoing_trades = remaining

        print(f"Closed trades in this window: {len(closed_trades)}")
        print(f"Currently open trades: {len(ongoing_trades)}")

    # NOTE(review): trades left in `ongoing_trades` here are discarded, so the
    # result only contains trades that reached their exit target.
    trades_df = pd.DataFrame(all_trades)

    if len(trades_df) > 0:
        trades_df.to_parquet(output_path)

        print("\nTrading Summary:")
        print(f"Total trades: {len(trades_df)}")
        print(f"Unique pairs traded: {len(trades_df[['symbol', 'paired_symbol']].drop_duplicates())}")
        print(f"Period: {trades_df['entry_date'].min()} to {trades_df['exit_date'].max()}")

        print("\nTrades per window:")
        print(trades_df['window'].value_counts().sort_index())

        if 'exit_type' in trades_df.columns:
            print("\nExit types:")
            print(trades_df['exit_type'].value_counts())
    else:
        print("No trades generated!")

    return trades_df

In [7]:
# Run the rolling backtest; p_threshold and shift_size fall back to their defaults.
trades_df = backtest_pairs_rolling(price_matrix=price_matrix, symbols=symbols, window_shifts=window_shifts)


Analyse 1/12
Training period: 2021-02-02 00:00:00 to 2024-01-01 00:00:00
Testing period: 2024-01-01 00:00:00 to 2024-02-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:27<00:00, 156.23it/s]



Analysis complete!
Found 276 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 0
Currently open trades: 34

Analyse 2/12
Training period: 2021-03-02 00:00:00 to 2024-02-01 00:00:00
Testing period: 2024-02-01 00:00:00 to 2024-03-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:28<00:00, 152.66it/s]



Analysis complete!
Found 307 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 50
Currently open trades: 182

Analyse 3/12
Training period: 2021-04-02 00:00:00 to 2024-03-01 00:00:00
Testing period: 2024-03-01 00:00:00 to 2024-04-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:27<00:00, 160.72it/s]



Analysis complete!
Found 265 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 74
Currently open trades: 314

Analyse 4/12
Training period: 2021-05-02 00:00:00 to 2024-04-01 00:00:00
Testing period: 2024-04-01 00:00:00 to 2024-05-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:27<00:00, 160.90it/s]



Analysis complete!
Found 193 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 246
Currently open trades: 272

Analyse 5/12
Training period: 2021-06-02 00:00:00 to 2024-05-01 00:00:00
Testing period: 2024-05-01 00:00:00 to 2024-06-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:27<00:00, 160.39it/s]



Analysis complete!
Found 283 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 226
Currently open trades: 315

Analyse 6/12
Training period: 2021-07-02 00:00:00 to 2024-06-01 00:00:00
Testing period: 2024-06-01 00:00:00 to 2024-07-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:26<00:00, 162.88it/s]



Analysis complete!
Found 228 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 448
Currently open trades: 197

Analyse 7/12
Training period: 2021-08-02 00:00:00 to 2024-07-01 00:00:00
Testing period: 2024-07-01 00:00:00 to 2024-08-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:27<00:00, 160.42it/s]



Analysis complete!
Found 259 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 306
Currently open trades: 68

Analyse 8/12
Training period: 2021-09-02 00:00:00 to 2024-08-01 00:00:00
Testing period: 2024-08-01 00:00:00 to 2024-09-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:26<00:00, 162.83it/s]



Analysis complete!
Found 332 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 90
Currently open trades: 147

Analyse 9/12
Training period: 2021-10-02 00:00:00 to 2024-09-01 00:00:00
Testing period: 2024-09-01 00:00:00 to 2024-10-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:27<00:00, 161.82it/s]



Analysis complete!
Found 298 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 60
Currently open trades: 159

Analyse 10/12
Training period: 2021-11-02 00:00:00 to 2024-10-01 00:00:00
Testing period: 2024-10-01 00:00:00 to 2024-11-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:27<00:00, 158.88it/s]



Analysis complete!
Found 371 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 210
Currently open trades: 205

Analyse 11/12
Training period: 2021-12-02 00:00:00 to 2024-11-01 00:00:00
Testing period: 2024-11-01 00:00:00 to 2024-12-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:27<00:00, 160.75it/s]



Analysis complete!
Found 266 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 58
Currently open trades: 277

Analyse 12/12
Training period: 2022-01-02 00:00:00 to 2024-12-01 00:00:00
Testing period: 2024-12-01 00:00:00 to 2025-01-01 00:00:00


Analyzing pairs: 100%|██████████| 4371/4371 [00:27<00:00, 161.35it/s]



Analysis complete!
Found 501 cointegrated pairs
Total pairs analyzed: 4371
Closed trades in this window: 168
Currently open trades: 313

Trading Summary:
Total trades: 2118
Unique pairs traded: 104
Period: 2024-01-02 00:00:00 to 2024-12-18 00:00:00

Trades per window:
window
1      70
2     348
3     342
4     164
5     366
6     230
7      72
8     180
9      90
10    214
11     42
Name: count, dtype: int64

Exit types:
exit_type
target    2118
Name: count, dtype: int64
