In [1]:
import sys
sys.path.append("../../")
from statsmodels.tsa.stattools import adfuller
from src.analysis.cointegration import find_cointegrated_pairs, analyze_pairs, plot_cointegration_heatmap
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

DATE_CONFIG = {
    'TRAIN_START': pd.Timestamp('2021-02-02'),
    'TRAIN_END': pd.Timestamp('2024-01-01'),
    'TEST_END': pd.Timestamp('2025-01-01'),
    'TRADING_DAYS_PER_YEAR': 252  
}

plt.style.use('classic')
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 100

In [2]:
def load_and_prepare_data(file_path):
    df = pd.read_parquet(file_path)
    df['date'] = pd.to_datetime(df['date'])
    mask = (df['date'] >= DATE_CONFIG['TRAIN_START']) & (df['date'] <= DATE_CONFIG['TEST_END'])
    df = df[mask]
    price_matrix = df.pivot(index='date', columns='symbol', values='close')
    symbols = price_matrix.columns.tolist()
    print(f"Loaded data from {DATE_CONFIG['TRAIN_START']} to {DATE_CONFIG['TEST_END']}")
    print(f"Total symbols: {len(symbols)}")
    print(f"Total trading days: {len(price_matrix)}")
    return price_matrix, symbols

import toml
with open("../config.toml", "r") as f:
    config = toml.load(f)
    
price_matrix, symbols = load_and_prepare_data(config['data']['raw_data_path'])


Loaded data from 2021-02-02 00:00:00 to 2025-01-01 00:00:00
Total symbols: 98
Total trading days: 987


In [3]:
def zscore(series):
    return (series - series.mean()) / np.std(series)

def get_top_pairs(price_matrix, train_start, train_end, symbols):
    train_length = (train_end - train_start).days
    pair_selection_end = train_start + pd.Timedelta(days=int(train_length * 0.7))
    
    selection_mask = (price_matrix.index >= train_start) & (price_matrix.index <= pair_selection_end)
    selection_data = price_matrix[selection_mask]
    
    score_matrix, pvalue_matrix, pairs, summary_df = analyze_pairs(selection_data, pvalue_threshold=0.05)
            
    return pairs[:20] 

In [4]:
def trade(S1_train, S2_train, S1_test, S2_test, symbol1, symbol2, window1=5, window2=60):
    trades = []
    trade_id = 0
    position = 0
    entry_prices = None
    entry_date = None
    
    # Berechne mean und std nur aus Training Daten
    train_ratio = S1_train / S2_train
    train_mean = train_ratio.rolling(window=window2).mean().iloc[-1] 
    train_std = train_ratio.rolling(window=window2).std().iloc[-1]
    
    # Teste nur auf Test-Daten
    test_ratio = S1_test / S2_test
    
    for i in range(len(test_ratio)):
        current_ratio = test_ratio.iloc[i]
        current_date = test_ratio.index[i]
        
        # Z-Score nur mit historischem mean/std
        zscore = (current_ratio - train_mean) / train_std
        
        if position == 0:
            if zscore > 1.0:
                entry_date = current_date
                entry_prices = {
                    symbol1: {"price": S1_test.iloc[i], "type": "short"},
                    symbol2: {"price": S2_test.iloc[i], "type": "long"}
                }
                position = -1
            elif zscore < -1.0:
                entry_date = current_date  
                entry_prices = {
                    symbol1: {"price": S1_test.iloc[i], "type": "long"},
                    symbol2: {"price": S2_test.iloc[i], "type": "short"}
                }
                position = 1
        elif abs(zscore) < 0.5 and position != 0:
            for symbol in [symbol1, symbol2]:
                trades.append({
                    'trade_id': trade_id,
                    'symbol': symbol,
                    'entry_date': entry_date,
                    'entry_price': entry_prices[symbol]["price"],
                    'exit_date': current_date,
                    'exit_price': S1_test.iloc[i] if symbol == symbol1 else S2_test.iloc[i],
                    'position_type': entry_prices[symbol]["type"],
                    'paired_symbol': symbol2 if symbol == symbol1 else symbol1,
                    'exit_type': 'target'
                })
            position = 0
            trade_id += 1
            
    return trades

In [5]:
def backtest_pairs_rolling(price_matrix, symbols):
    all_trades = []
    test_start = DATE_CONFIG['TRAIN_END']
    months = pd.date_range(test_start, DATE_CONFIG['TEST_END'], freq='M')
    window_size = pd.DateOffset(months=1)
    
    for i, month_end in enumerate(months):
        month_start = test_start + i * window_size
        train_start = DATE_CONFIG['TRAIN_START'] + i * window_size
        train_end = test_start + i * window_size
        
        top_pairs = get_top_pairs(price_matrix, train_start, train_end, symbols)
        
        for symbol1, symbol2 in top_pairs:
            training_mask = (price_matrix.index >= train_start) & (price_matrix.index < train_end)
            test_mask = (price_matrix.index >= month_start) & (price_matrix.index < month_end)
            
            S1_train = price_matrix[symbol1][training_mask]
            S2_train = price_matrix[symbol2][training_mask]
            S1_test = price_matrix[symbol1][test_mask]
            S2_test = price_matrix[symbol2][test_mask]
            
            if not S1_test.empty and not S2_test.empty:
                pair_trades = trade(S1_train, S2_train, S1_test, S2_test, symbol1, symbol2)
                all_trades.extend(pair_trades)
    
    trades_df = pd.DataFrame(all_trades)
    trades_df.to_parquet('../../data/results/CoInt_Z-Score.parquet')
    return trades_df

In [6]:
trades_df = backtest_pairs_rolling(price_matrix, symbols)

Analyzing pairs:   2%|▏         | 95/4753 [00:00<00:25, 180.98it/s]


MissingDataError: exog contains inf or nans