In [None]:
import sys
sys.path.append("../../")

from src.analysis.cointegration import find_cointegrated_pairs, analyze_pairs, plot_cointegration_heatmap
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import coint
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Central date configuration for the notebook: start and end of the training
# window, end of the test window, and the number of trading days assumed per
# calendar year.
DATE_CONFIG = dict(
    TRAIN_START=pd.Timestamp('2021-02-02'),
    TRAIN_END=pd.Timestamp('2024-01-01'),
    TEST_END=pd.Timestamp('2025-01-01'),
    TRADING_DAYS_PER_YEAR=252,
)

def get_training_period():
    """Return the training window as a dict with 'start' and 'end' timestamps.

    Both values are read from DATE_CONFIG ('TRAIN_START' and 'TRAIN_END').
    """
    window_start = DATE_CONFIG['TRAIN_START']
    window_end = DATE_CONFIG['TRAIN_END']
    return dict(start=window_start, end=window_end)

def get_test_period():
    """Return the test window as a dict with 'start' and 'end' timestamps.

    The test window begins where the training window ends: 'start' is
    DATE_CONFIG['TRAIN_END'] and 'end' is DATE_CONFIG['TEST_END'].
    """
    window_start = DATE_CONFIG['TRAIN_END']
    window_end = DATE_CONFIG['TEST_END']
    return dict(start=window_start, end=window_end)

def get_training_days():
    """Estimate the number of trading days in the training window.

    The calendar-day length of the window is converted to years using a
    365-day year, scaled by DATE_CONFIG['TRADING_DAYS_PER_YEAR'] and
    truncated to an int.
    """
    calendar_days = (DATE_CONFIG['TRAIN_END'] - DATE_CONFIG['TRAIN_START']).days
    year_fraction = calendar_days / 365
    return int(year_fraction * DATE_CONFIG['TRADING_DAYS_PER_YEAR'])

# Plot defaults: classic matplotlib style, then override figure size and dpi.
plt.style.use('classic')
plt.rcParams.update({
    'figure.figsize': [12, 6],
    'figure.dpi': 100,
})

In [None]:
def load_and_prepare_data(file_path):
    """Load long-format market data and pivot it into date x symbol matrices.

    The parquet file is expected to provide 'date', 'symbol', 'close' and
    'volume' columns (these are the columns the function reads). Rows are
    restricted to the inclusive range DATE_CONFIG['TRAIN_START'] ..
    DATE_CONFIG['TEST_END'] before pivoting.

    Parameters
    ----------
    file_path : path of the parquet file, passed to ``pd.read_parquet``.

    Returns
    -------
    (price_matrix, volume_matrix, symbols)
        ``price_matrix`` holds 'close' and ``volume_matrix`` holds 'volume',
        both indexed by date with one column per symbol; ``symbols`` is the
        list of column labels of ``price_matrix``.
    """
    raw = pd.read_parquet(file_path)
    raw['date'] = pd.to_datetime(raw['date'])

    # Inclusive on both ends, matching the configured overall data window.
    in_window = raw['date'].between(DATE_CONFIG['TRAIN_START'], DATE_CONFIG['TEST_END'])
    windowed = raw[in_window]

    price_matrix, volume_matrix = (
        windowed.pivot(index='date', columns='symbol', values=field)
        for field in ('close', 'volume')
    )
    symbols = price_matrix.columns.tolist()

    # Summary of what was loaded (the reported range is the configured one).
    print(f"Loaded data from {DATE_CONFIG['TRAIN_START']} to {DATE_CONFIG['TEST_END']}")
    print(f"Total symbols: {len(symbols)}")
    print(f"Total trading days: {len(price_matrix)}")

    return price_matrix, volume_matrix, symbols

In [None]:
def generate_pairs(cointegrated_pairs):
    """Return the pairs to model.

    Currently a pass-through: the given collection of cointegrated pairs is
    returned unchanged (the same object, not a copy).
    """
    selected_pairs = cointegrated_pairs
    return selected_pairs

In [None]:
import toml

# Read the notebook configuration, then load the price/volume matrices from
# the raw data file it points to.
with open("../config.toml", "r") as config_file:
    config = toml.load(config_file)

price_matrix, volume_matrix, symbols = load_and_prepare_data(
    config['data']['raw_data_path']
)

In [None]:
def calculate_returns_and_spreads(price_matrix, cointegrated_pairs):
    """Compute daily returns and the return spread of every selected pair.

    Parameters
    ----------
    price_matrix : DataFrame of prices, one column per symbol.
    cointegrated_pairs : iterable of (symbol1, symbol2) pairs; it is passed
        through ``generate_pairs`` before use.

    Returns
    -------
    (returns, spreads)
        ``returns`` is ``price_matrix.pct_change()`` with every row that
        contains a NaN removed. ``spreads`` shares its index and has one
        column per pair, named ``'<symbol1>_<symbol2>_spread'``, holding
        ``returns[symbol1] - returns[symbol2]``.
    """
    returns = price_matrix.pct_change().dropna()

    pairs = generate_pairs(cointegrated_pairs)

    # Build all spread columns first and construct the frame once: inserting
    # columns one at a time fragments the DataFrame when there are many pairs.
    spread_columns = {
        f'{s1}_{s2}_spread': returns[s1] - returns[s2]
        for s1, s2 in pairs
    }
    spreads = pd.DataFrame(spread_columns, index=returns.index)

    return returns, spreads

In [None]:
def _split_pair_name(pair_name, known_symbols):
    """Split 'SYM1_SYM2' into (SYM1, SYM2), tolerating underscores inside symbols."""
    known = set(known_symbols)
    cut = pair_name.find('_')
    while cut != -1:
        left, right = pair_name[:cut], pair_name[cut + 1:]
        if left in known and right in known:
            return left, right
        cut = pair_name.find('_', cut + 1)
    # No split matches two known symbols: fall back to the plain split, which
    # raises ValueError unless the name has exactly two '_'-separated parts.
    left, right = pair_name.split('_')
    return left, right


def prepare_ml_data(price_matrix, volume_matrix, spreads, train_period, test_period, lookback=5):
    """Build per-pair train/test datasets of lagged volume and return features.

    For every column of ``spreads`` (named ``'<sym1>_<sym2>_spread'``) the
    features are the volumes and returns of both symbols lagged by
    1..``lookback`` rows. The target is 1 when the next row's return spread
    (``sym1`` return minus ``sym2`` return) is positive, else 0.

    Rows with any missing feature are dropped. Rows whose next-row spread is
    unknown (e.g. the last available row) are dropped as well, instead of
    being labelled 0.

    Parameters
    ----------
    price_matrix : DataFrame of prices, one column per symbol.
    volume_matrix : DataFrame of volumes, one column per symbol.
    spreads : DataFrame whose column names identify the pairs to prepare.
    train_period, test_period : dicts with 'start' (inclusive) and 'end'
        (exclusive) bounds compared against the feature index.
    lookback : number of lagged rows used as features.

    Returns
    -------
    dict mapping ``'<sym1>_<sym2>'`` to a dict with 'X_train', 'X_test',
    'y_train' and 'y_test'.
    """
    ml_datasets = {}
    returns = price_matrix.pct_change().dropna()
    suffix = '_spread'

    for spread_col in spreads.columns:
        # Strip only the trailing suffix, then resolve the two symbols against
        # the known price columns so symbols containing '_' still work.
        pair_name = spread_col[:-len(suffix)] if spread_col.endswith(suffix) else spread_col
        sym1, sym2 = _split_pair_name(pair_name, price_matrix.columns)

        base = pd.DataFrame({
            # Volume features
            f'{sym1}_volume': volume_matrix[sym1],
            f'{sym2}_volume': volume_matrix[sym2],
            # Return features
            f'{sym1}_return': returns[sym1],
            f'{sym2}_return': returns[sym2]
        })

        # Lagged history: all volume lags first, then all return lags. Only
        # these lagged columns are features; same-row values are never used.
        lagged = {}
        for kind in ('volume', 'return'):
            for t in range(1, lookback + 1):
                for sym in (sym1, sym2):
                    lagged[f'{sym}_{kind}_t-{t}'] = base[f'{sym}_{kind}'].shift(t)
        features = pd.DataFrame(lagged, index=base.index)

        # Target: sign of the next row's return spread.
        spread_next_day = (returns[sym1] - returns[sym2]).shift(-1).reindex(base.index)

        # Keep only rows with complete features AND a known next-row spread;
        # comparing NaN > 0 would otherwise silently yield label 0.
        valid = features.notna().all(axis=1) & spread_next_day.notna()
        features = features.loc[valid]
        target = (spread_next_day.loc[valid] > 0).astype(int).rename('target')

        train_mask = (features.index >= train_period['start']) & (features.index < train_period['end'])
        test_mask = (features.index >= test_period['start']) & (features.index < test_period['end'])

        ml_datasets[f'{sym1}_{sym2}'] = {
            'X_train': features[train_mask],
            'X_test': features[test_mask],
            'y_train': target[train_mask],
            'y_test': target[test_mask]
        }

    return ml_datasets

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
def train_evaluate_models(ml_datasets, coint_results, f1_weight=1.0, pvalue_weight=0.0):
    """Train one RBF-kernel SVM per pair and score it on the test split.

    Parameters
    ----------
    ml_datasets : dict mapping ``'<sym1>_<sym2>'`` to a dict with 'X_train',
        'X_test', 'y_train' and 'y_test'.
    coint_results : DataFrame with 'symbol1', 'symbol2' and 'p_value'
        columns; the pair is matched in either symbol order.
    f1_weight, pvalue_weight : weights of the F1 score and of
        ``1 - p_value`` in 'weighted_score'. The defaults (1.0 and 0.0)
        make the weighted score equal to the F1 score.

    Returns
    -------
    dict mapping each evaluated pair to its fitted 'model' and 'scaler' plus
    'accuracy', 'precision', 'recall', 'f1', 'p_value' and 'weighted_score'.
    Pairs that cannot be trained or evaluated (empty split, single-class
    training labels, no cointegration row) are skipped with a message rather
    than aborting the whole run.
    """
    results = {}

    # Pair keys in both symbol orders, built once. Matching on the joined key
    # avoids splitting the pair name, which fails for symbols containing '_'.
    sym1_names = coint_results['symbol1'].astype(str)
    sym2_names = coint_results['symbol2'].astype(str)
    forward_keys = sym1_names + '_' + sym2_names
    reverse_keys = sym2_names + '_' + sym1_names

    for pair, data in tqdm(ml_datasets.items(), desc="Training models"):
        y_train, y_test = data['y_train'], data['y_test']

        print(f"\nAnalyzing pair: {pair}")
        if len(y_train) == 0 or len(y_test) == 0:
            print("Skipping pair: empty train or test split")
            continue

        # Debug: class balance
        print("Class distribution (train):", np.bincount(y_train) / len(y_train))
        print("Class distribution (test):", np.bincount(y_test) / len(y_test))

        # SVC cannot be fitted on labels that contain a single class.
        if len(np.unique(y_train)) < 2:
            print("Skipping pair: training labels contain a single class")
            continue

        p_values = coint_results.loc[(forward_keys == pair) | (reverse_keys == pair), 'p_value']
        if p_values.empty:
            print("Skipping pair: no cointegration result found")
            continue
        p_value = p_values.iloc[0]

        # Fit the scaler on the training split only, then apply it to both.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(data['X_train'])
        X_test_scaled = scaler.transform(data['X_test'])

        # Adjusted parameters
        svm = SVC(
            kernel='rbf',
            C=0.1,  # reduced C value
            class_weight='balanced',
            random_state=42
        )
        svm.fit(X_train_scaled, y_train)
        predictions = svm.predict(X_test_scaled)

        # Debug: distribution of predictions
        print("\nPrediction distribution:", np.bincount(predictions) / len(predictions))

        f1 = f1_score(y_test, predictions)
        weighted_score = f1_weight * f1
        # Only add the p-value term when it carries weight, so a NaN p-value
        # cannot turn the score into NaN through a zero-weighted term.
        if pvalue_weight:
            weighted_score += pvalue_weight * (1 - p_value)

        results[pair] = {
            'model': svm,
            'scaler': scaler,
            'accuracy': accuracy_score(y_test, predictions),
            'precision': precision_score(y_test, predictions),
            'recall': recall_score(y_test, predictions),
            'f1': f1,
            'p_value': p_value,
            'weighted_score': weighted_score
        }

    return results

In [None]:
# End-to-end run: split windows -> cointegration analysis -> spreads ->
# per-pair ML datasets -> model training and evaluation.
train_period = get_training_period()
test_period = get_test_period()

score_matrix, pvalue_matrix, cointegrated_pairs, coint_results = analyze_pairs(price_matrix)

returns, spreads = calculate_returns_and_spreads(price_matrix, cointegrated_pairs)

ml_datasets = prepare_ml_data(
    price_matrix,
    volume_matrix,
    spreads,
    train_period,
    test_period,
)

model_results = train_evaluate_models(ml_datasets, coint_results)

In [None]:
# Tabulate the per-pair metrics and show the 20 pairs with the highest
# weighted score.
result_columns = ['pair', 'accuracy', 'precision', 'recall', 'f1', 'p_value', 'weighted_score']

# Passing the columns explicitly keeps the frame well-formed when no pair was
# evaluated; otherwise sorting by 'weighted_score' raises KeyError.
results_df = pd.DataFrame(
    [
        {'pair': pair, **{name: metrics[name] for name in result_columns[1:]}}
        for pair, metrics in model_results.items()
    ],
    columns=result_columns,
)

top_20 = results_df.sort_values('weighted_score', ascending=False).head(20)
print(top_20[['pair', 'f1', 'p_value', 'weighted_score', 'accuracy', 'precision', 'recall']])