In [77]:
import sys
sys.path.append("../../")

from src.analysis.cointegration import find_cointegrated_pairs, analyze_pairs, plot_cointegration_heatmap
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import coint
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

DATE_CONFIG = {
    'TRAIN_START': pd.Timestamp('2021-02-02'),
    'TRAIN_END': pd.Timestamp('2024-01-01'),
    'TEST_END': pd.Timestamp('2025-01-01'),
    'TRADING_DAYS_PER_YEAR': 252  
}

def get_training_period():
    return {
        'start': DATE_CONFIG['TRAIN_START'],
        'end': DATE_CONFIG['TRAIN_END']
    }

def get_test_period():
    return {
        'start': DATE_CONFIG['TRAIN_END'],
        'end': DATE_CONFIG['TEST_END']
    }

def get_training_days():
    years = (DATE_CONFIG['TRAIN_END'] - DATE_CONFIG['TRAIN_START']).days / 365
    return int(years * DATE_CONFIG['TRADING_DAYS_PER_YEAR'])

plt.style.use('classic')
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 100

In [78]:
def load_and_prepare_data(file_path):
    df = pd.read_parquet(file_path)
    df['date'] = pd.to_datetime(df['date'])
    
    mask = (df['date'] >= DATE_CONFIG['TRAIN_START']) & \
           (df['date'] <= DATE_CONFIG['TEST_END'])
    df = df[mask]
    
    price_matrix = df.pivot(index='date', columns='symbol', values='close')
    volume_matrix = df.pivot(index='date', columns='symbol', values='volume')
    
    symbols = price_matrix.columns.tolist()
    
    print(f"Loaded data from {DATE_CONFIG['TRAIN_START']} to {DATE_CONFIG['TEST_END']}")
    print(f"Total symbols: {len(symbols)}")
    print(f"Total trading days: {len(price_matrix)}")
    
    return price_matrix, volume_matrix, symbols

In [79]:
def generate_pairs(cointegrated_pairs):
    return cointegrated_pairs

In [80]:
import toml

with open("../config.toml", "r") as f:
    config = toml.load(f)
    
price_matrix, volume_matrix, symbols = load_and_prepare_data(config['data']['raw_data_path'])

Loaded data from 2021-02-02 00:00:00 to 2025-01-01 00:00:00
Total symbols: 94
Total trading days: 985


In [81]:
def calculate_returns_and_spreads(price_matrix, cointegrated_pairs):
    returns = price_matrix.pct_change().dropna()
    
    pairs = generate_pairs(cointegrated_pairs)
    
    spreads = pd.DataFrame(index=returns.index)
    for s1, s2 in pairs:
        spreads[f'{s1}_{s2}_spread'] = returns[s1] - returns[s2]
        
    return returns, spreads

In [82]:
def prepare_ml_data(price_matrix, volume_matrix, spreads, train_period, test_period, lookback=5):
   ml_datasets = {}
   returns = price_matrix.pct_change().dropna()
   
   for spread_col in spreads.columns:
       sym1, sym2 = spread_col.replace('_spread', '').split('_')
       
       df = pd.DataFrame({
           f'{sym1}_volume': volume_matrix[sym1],
           f'{sym2}_volume': volume_matrix[sym2],
           f'{sym1}_return': returns[sym1],
           f'{sym2}_return': returns[sym2]
       })
       
       for t in range(1, lookback+1):
           df[f'{sym1}_volume_t-{t}'] = df[f'{sym1}_volume'].shift(t)
           df[f'{sym2}_volume_t-{t}'] = df[f'{sym2}_volume'].shift(t)
           df[f'{sym1}_return_t-{t}'] = df[f'{sym1}_return'].shift(t)
           df[f'{sym2}_return_t-{t}'] = df[f'{sym2}_return'].shift(t)
       
       spread_next_day = (returns[sym1] - returns[sym2]).shift(-1)
       df['target'] = spread_next_day
       
       feature_cols = [col for col in df.columns if 't-' in col]
       features = df[feature_cols].copy()
       
       clean_idx = features.dropna().index
       features = features.loc[clean_idx]
       target = df.loc[clean_idx, 'target']
   
       train_mask = (features.index >= train_period['start']) & (features.index < train_period['end'])
       test_mask = (features.index >= test_period['start']) & (features.index < pd.Timestamp('2024-12-31'))
       
       ml_datasets[f'{sym1}_{sym2}'] = {
           'X_train': features[train_mask],
           'X_test': features[test_mask],
           'y_train': target[train_mask],
           'y_test': target[test_mask]
       }
       
   return ml_datasets

In [83]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
from tqdm import tqdm

In [84]:
def train_evaluate_models(ml_datasets, coint_results):
    results = {}
    
    param_grid = {
        'kernel': ['rbf', 'linear', 'poly'],
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.001, 0.01, 0.1, 1],
        'gamma': ['scale', 'auto', 0.1, 0.01],
        'degree': [2, 3, 4]
    }
    
    for pair, data in tqdm(ml_datasets.items(), desc="Training models"):
        sym1, sym2 = pair.split('_')
        
        p_value = coint_results[
            ((coint_results['symbol1'] == sym1) & (coint_results['symbol2'] == sym2)) |
            ((coint_results['symbol1'] == sym2) & (coint_results['symbol2'] == sym1))
        ]['p_value'].iloc[0]
        
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(data['X_train'])
        X_test_scaled = scaler.transform(data['X_test'])
        
        svr = SVR()
        grid_search = GridSearchCV(
            svr, 
            param_grid,
            cv=5,
            scoring='r2',
            n_jobs=-1
        )
        
        grid_search.fit(X_train_scaled, data['y_train'])
        
        best_model = grid_search.best_estimator_
        predictions = best_model.predict(X_test_scaled)
        
        r2 = r2_score(data['y_test'], predictions)
        mse = mean_squared_error(data['y_test'], predictions)
        rmse = np.sqrt(mse)
        
        weighted_score = 0.7 * r2 + 0.3 * (1 - p_value)
        
        results[pair] = {
            'model': best_model,
            'scaler': scaler,
            'r2': r2,
            'rmse': rmse,
            'p_value': p_value,
            'weighted_score': weighted_score,
            'best_params': grid_search.best_params_
        }
    
    return results

In [85]:
score_matrix, pvalue_matrix, cointegrated_pairs, coint_results = analyze_pairs(price_matrix)

# Sortiere nach p-value und nimm die Top 5
top_5_pairs = coint_results.sort_values('p_value').head(1)
top_5_pairs_list = [(row['symbol1'], row['symbol2']) for _, row in top_5_pairs.iterrows()]

# Berechne returns und spreads nur für diese 5 Paare
returns, spreads = calculate_returns_and_spreads(price_matrix, top_5_pairs_list)

train_period = get_training_period()
test_period = get_test_period()

# Bereite ML Daten nur für diese 5 Paare vor
ml_datasets = prepare_ml_data(price_matrix, volume_matrix, spreads, train_period, test_period)

# Training nur für diese 5 Paare
model_results = train_evaluate_models(ml_datasets, top_5_pairs)

Analyzing pairs: 100%|██████████| 4371/4371 [01:00<00:00, 72.82it/s]



Analysis complete!
Found 244 cointegrated pairs
Total pairs analyzed: 4371


Training models: 100%|██████████| 1/1 [09:25<00:00, 565.57s/it]


In [86]:
for pair, res in model_results.items():
    print(f"{pair}: {res['best_params']}")

AMAT_NXPI: {'C': 0.1, 'degree': 2, 'epsilon': 0.001, 'gamma': 0.01, 'kernel': 'rbf'}


In [87]:
results_df = pd.DataFrame([
   {
       'pair': pair,
       'r2': metrics['r2'],
       'rmse': metrics['rmse'],
       'p_value': metrics['p_value'],
       'weighted_score': metrics['weighted_score']
   }
   for pair, metrics in model_results.items()
])

top_20 = results_df.sort_values('weighted_score', ascending=False).head(20)
print(top_20[['pair', 'r2', 'rmse', 'p_value', 'weighted_score']])

        pair        r2     rmse   p_value  weighted_score
0  AMAT_NXPI -0.030879  0.01933  0.000091        0.278357
