In [92]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import coint
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

DATE_CONFIG = {
    'TRAIN_START': pd.Timestamp('2021-02-02'),
    'TRAIN_END': pd.Timestamp('2024-01-01'),
    'TEST_END': pd.Timestamp('2025-01-01'),
    'TRADING_DAYS_PER_YEAR': 252  
}

def get_training_period():
    return {
        'start': DATE_CONFIG['TRAIN_START'],
        'end': DATE_CONFIG['TRAIN_END']
    }

def get_test_period():
    return {
        'start': DATE_CONFIG['TRAIN_END'],
        'end': DATE_CONFIG['TEST_END']
    }

def get_training_days():
    years = (DATE_CONFIG['TRAIN_END'] - DATE_CONFIG['TRAIN_START']).days / 365
    return int(years * DATE_CONFIG['TRADING_DAYS_PER_YEAR'])

plt.style.use('classic')
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 100

In [93]:
def load_and_prepare_data(file_path):

    df = pd.read_parquet(file_path)
    df['date'] = pd.to_datetime(df['date'])
    
    mask = (df['date'] >= DATE_CONFIG['TRAIN_START']) & \
           (df['date'] <= DATE_CONFIG['TEST_END'])
    df = df[mask]
    
    price_matrix = df.pivot(index='date', columns='symbol', values='close')
    
    symbols = price_matrix.columns.tolist()
    
    print(f"Loaded data from {DATE_CONFIG['TRAIN_START']} to {DATE_CONFIG['TEST_END']}")
    print(f"Total symbols: {len(symbols)}")
    print(f"Total trading days: {len(price_matrix)}")
    
    return price_matrix, symbols

In [94]:
def generate_pairs(symbols):
   return [(s1, s2) for i, s1 in enumerate(symbols) for s2 in symbols[i+1:]]

def calculate_returns_and_spreads(price_matrix):
    returns = price_matrix.pct_change().dropna()
    
    pairs = generate_pairs(price_matrix.columns)
    
    spreads = pd.DataFrame(index=returns.index)
    for s1, s2 in pairs:
        spreads[f'{s1}_{s2}_spread'] = returns[s1] - returns[s2]
        
    return returns, spreads

In [95]:
def prepare_ml_data(returns, spreads, train_period, test_period, lookback=3):
    ml_datasets = {}
    
    for spread_col in spreads.columns:
        sym1, sym2 = spread_col.replace('_spread', '').split('_')
        
        # Basis-Features
        df = pd.DataFrame({
            f'{sym1}_return': returns[sym1],
            f'{sym2}_return': returns[sym2]
        })
        
        for t in range(1, lookback+1):
            df[f'{sym1}_return_t-{t}'] = df[f'{sym1}_return'].shift(t)
            df[f'{sym2}_return_t-{t}'] = df[f'{sym2}_return'].shift(t)
        
        # Target ist der Spread vom nächsten Tag
        spread_next_day = (df[f'{sym1}_return'] - df[f'{sym2}_return']).shift(-1)
        df['target'] = np.where(spread_next_day > 0, 1, 0)
        
        # Features sind nur die historischen Werte
        feature_cols = [col for col in df.columns if 't-' in col]
        features = df[feature_cols].copy()
        
        # NaN-Zeilen entfernen
        clean_idx = features.dropna().index
        features = features.loc[clean_idx]
        target = df.loc[clean_idx, 'target']
    
        
        train_mask = (features.index >= train_period['start']) & (features.index < train_period['end'])
        test_mask = (features.index >= test_period['start']) & (features.index < test_period['end'])
        
        ml_datasets[f'{sym1}_{sym2}'] = {
            'X_train': features[train_mask],
            'X_test': features[test_mask],
            'y_train': target[train_mask],
            'y_test': target[test_mask]
        }
        
    return ml_datasets

In [96]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [97]:
def train_evaluate_models(ml_datasets):
    results = {}
    
    for pair, data in tqdm(ml_datasets.items(), desc="Training models"):
        # Keine extra Bereinigung mehr nötig, da schon in prepare_ml_data gemacht
        rf = RandomForestClassifier(
            n_estimators=100,
            random_state=42
        )
        
        rf.fit(data['X_train'], data['y_train'])
        predictions = rf.predict(data['X_test'])
        
        results[pair] = {
            'model': rf,
            'accuracy': accuracy_score(data['y_test'], predictions),
            'precision': precision_score(data['y_test'], predictions),
            'recall': recall_score(data['y_test'], predictions),
            'f1': f1_score(data['y_test'], predictions)
        }
        
        
        # Erste 10 Predictions vs. wahre Werte
        #print("\nErste 10 Vorhersagen vs. wahre Werte:")
        #for i in range(min(10, len(predictions))):
        #    print(f"Wahr: {data['y_test'].iloc[i]} | Vorhersage: {predictions[i]}")
            
        # Feature Importance
        feature_importance = pd.DataFrame({
            'feature': data['X_train'].columns,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)
        
    
    return results

In [98]:
# Zuerst Daten laden
price_matrix, symbols = load_and_prepare_data('./nasdaq_daily.parquet')

# Dann Returns und Spreads berechnen
returns, spreads = calculate_returns_and_spreads(price_matrix)

# Perioden definieren
train_period = get_training_period()
test_period = get_test_period()

# ML Daten vorbereiten
ml_datasets = prepare_ml_data(returns, spreads, train_period, test_period)

# Training und Evaluation
model_results = train_evaluate_models(ml_datasets)

Loaded data from 2021-02-02 00:00:00 to 2025-01-01 00:00:00
Total symbols: 94
Total trading days: 985


Training models: 100%|██████████| 4371/4371 [09:02<00:00,  8.05it/s]


In [99]:
# Top 20 Paare nach F1-Score sortieren und anzeigen
results_df = pd.DataFrame([
    {
        'pair': pair,
        'accuracy': metrics['accuracy'],
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'f1': metrics['f1']
    }
    for pair, metrics in model_results.items()
])

top_20 = results_df.sort_values('f1', ascending=False).head(20)
print("\nTop 20 Pairs by F1-Score:")
print(top_20)


Top 20 Pairs by F1-Score:
           pair  accuracy  precision    recall        f1
2793  FTNT_MELI  0.583333   0.565217  0.806202  0.664537
1132  AXON_BIIB  0.539683   0.595506  0.706667  0.646341
1150    AXON_EA  0.571429   0.615385  0.666667  0.640000
1940  COST_ROST  0.583333   0.632653  0.645833  0.639175
1443   BKNG_XEL  0.571429   0.584906  0.688889  0.632653
1138  AXON_CHTR  0.579365   0.616438  0.642857  0.629371
1722    CDW_PEP  0.579365   0.585526  0.674242  0.626761
2032    CRWD_EA  0.567460   0.568750  0.694656  0.625430
1210    AXON_ZS  0.563492   0.558282  0.705426  0.623288
1424   BKNG_PEP  0.551587   0.596154  0.650350  0.622074
1903   COST_HON  0.535714   0.592593  0.653061  0.621359
1954   COST_XEL  0.539683   0.586420  0.659722  0.620915
2294    CTAS_EA  0.543651   0.566265  0.686131  0.620462
1920  COST_MNST  0.563492   0.622378  0.613793  0.618056
287    ADP_BIIB  0.543651   0.560241  0.688889  0.617940
3379   KLAC_XEL  0.579365   0.578231  0.658915  0.615942
351 