In [None]:
!pip install joblib tqdm clickhouse-driver pandas numpy statsmodels seaborn matplotlib pyarrow fastparquet

In [None]:
from datetime import datetime, timedelta
from clickhouse_driver import Client
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import coint
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

DATE_CONFIG = {
    'TRAIN_START': datetime(2015, 2, 2).date(),
    'TRAIN_END': datetime(2020, 2, 2).date(),
    'TEST_END': datetime(2025, 1, 1).date(),
    'TRADING_DAYS_PER_YEAR': 252  
}

def get_training_period():
    return {
        'start': DATE_CONFIG['TRAIN_START'],
        'end': DATE_CONFIG['TRAIN_END']
    }

def get_test_period():
    return {
        'start': DATE_CONFIG['TRAIN_END'],
        'end': DATE_CONFIG['TEST_END']
    }

def get_training_days():
    years = (DATE_CONFIG['TRAIN_END'] - DATE_CONFIG['TRAIN_START']).days / 365
    return int(years * DATE_CONFIG['TRADING_DAYS_PER_YEAR'])

SELECTED_SECTOR = 'Information Technology'

#### Hauptfunktion
Die Hauptfunktion find_cointegrated_pairs():
Diese Funktion ist das Herzstück der Analyse. Sie erstellt zwei Matrizen:
- Eine für Statistik-Scores (score_matrix)
- Eine für p-Werte (pvalue_matrix)

Der Code durchläuft dann jedes mögliche Aktienpaar und führt einen Kointegrationstest durch.

**Zeitbereich:**
Im Code wird der Zeitbereich so festgelegt:
```python
start_date = client.execute('SELECT MIN(date) FROM stock_data')[0][0]
end_date = start_date + timedelta(days=48*30)
```
Das bedeutet, der Code nimmt das früheste verfügbare Datum in der Datenbank und analysiert von dort aus die nächsten 48 Monate (also 4 Jahre). Die Analyse läuft also über einen 4-Jahres-Zeitraum, beginnend mit dem ersten verfügbaren Datenpunkt.

1. Teststatistik:
Stellen Sie sich die Teststatistik wie ein Thermometer vor. Sie misst, wie stark zwei Aktien miteinander verbunden sind. Je negativer der Wert ist, desto stärker ist die Verbindung. Das ist wie bei einem Thermometer, das unter Null fällt - je weiter unter Null, desto "kälter" oder in unserem Fall, desto stärker verbunden sind die Aktien.

2. P-Wert:
Der P-Wert ist wie eine Wahrscheinlichkeitsangabe auf einer Skala von 0 bis 1. Er sagt uns, wie verlässlich unsere Beobachtung ist:
- Ein p-Wert von 0.05 bedeutet eine 5% Chance, dass wir uns irren
- Je kleiner der p-Wert, desto sicherer können wir sein
- Im Code werden Aktienpaare mit p-Wert < 0.05 als bedeutsam eingestuft
Das ist wie bei einer Wettervorhersage: Wenn die Regenwahrscheinlichkeit bei 5% liegt, sind wir ziemlich sicher, dass es nicht regnen wird.

3. Kritische Werte:
Die kritischen Werte sind wie Grenzlinien. Sie helfen uns zu entscheiden, ob die Teststatistik bedeutsam ist:
- Es gibt typischerweise drei kritische Werte (1%, 5% und 10% Niveau)
- Wenn unsere Teststatistik kleiner (negativer) ist als diese Werte, haben wir einen signifikanten Fund
Das ist wie beim Hochsprung: Die kritischen Werte sind wie verschiedene Höhen der Latte. Wenn ein Springer darüber kommt, ist es eine bedeutende Leistung.

Ein praktisches Beispiel:
Nehmen wir an, wir analysieren zwei Energieaktien:
- Teststatistik: -3.5
- P-Wert: 0.02
- Kritische Werte: [-3.4, -2.9, -2.6]

Das würde bedeuten:
- Die Teststatistik (-3.5) ist kleiner als der strengste kritische Wert (-3.4)
- Der p-Wert (0.02 oder 2%) ist kleiner als 0.05 (5%)
- Schlussfolgerung: Diese Aktien haben eine sehr starke, statistisch signifikante Verbindung

Diese Analyse ist besonders wichtig für Händler, die Pairs-Trading-Strategien entwickeln wollen, wo sie auf die Annäherung von zeitweise auseinandergelaufenen, aber grundsätzlich verbundenen Aktien setzen.

Möchten Sie, dass ich einen dieser Aspekte noch genauer erkläre oder sollen wir uns ansehen, wie diese Werte praktisch interpretiert werden können?

----

#### Visualisierung:
```python
plt.figure(figsize=(12, 8))
mask = (pvalues >= 0.98)
sns.heatmap(pvalues, 
            xticklabels=data.columns, 
            yticklabels=data.columns, 
            cmap='RdYlGn_r',
            mask=mask)
```
Die Heatmap zeigt p-Werte für alle Aktienpaare:
- Die Maske blendet sehr hohe p-Werte (≥ 0.98) aus
- Die Farbskala geht von Rot (hohe p-Werte, keine Kointegration) zu Grün (niedrige p-Werte, starke Kointegration)
- Die Achsen zeigen die Aktiensymbole

In [None]:
def find_cointegrated_pairs(data):
    n = data.shape[1]
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = data.keys()
    pairs = []
    results = []  # Für alle Paare

    total_iterations = sum(range(n))
    
    with tqdm(total=total_iterations, desc="Analyzing pairs") as pbar:
        for i in range(n):
            for j in range(i+1, n):
                S1 = data[keys[i]]
                S2 = data[keys[j]]
                result = coint(S1, S2)
                score = result[0]
                pvalue = result[1]
                score_matrix[i, j] = score
                pvalue_matrix[i, j] = pvalue
                
                results.append({
                    'symbol1': keys[i],
                    'symbol2': keys[j],
                    'p_value': pvalue,
                    'score': score
                })
                
                if pvalue <= 0.01:
                    pairs.append((keys[i], keys[j]))
                pbar.update(1)

    return score_matrix, pvalue_matrix, pairs, results

def update_pairs_stats(client, results):
    for result in tqdm(results, desc="Updating database"):
        sector_query = f"""
        SELECT sector 
        FROM stock_pairs 
        WHERE symbol1 = '{result['symbol1']}' AND symbol2 = '{result['symbol2']}'
        LIMIT 1
        """
        sector = client.execute(sector_query)[0][0]
        sector_clean = sector.replace(' ', '_').upper()
        pair_key = f"{sector_clean}_{result['symbol1']}_{result['symbol2']}"
        
        update_query = f"""
        ALTER TABLE stock_pairs 
        UPDATE p_value = {result['p_value']},
               cointegration_score = {result['score']}
        WHERE pair_key = '{pair_key}'
        """
        client.execute(update_query)

scores, pvalues, pairs, results = find_cointegrated_pairs(data)
update_pairs_stats(client, results)


In [None]:
def get_sector_pairs():
    print(f"Fetching pairs for sector: {SELECTED_SECTOR if SELECTED_SECTOR else 'ALL'}")
    query = '''
    SELECT pair_key, sector, symbol1, symbol2 
    FROM stock_pairs
    '''
    if SELECTED_SECTOR:
        query += f" WHERE sector = '{SELECTED_SECTOR}'"
    pairs_df = pd.DataFrame(client.execute(query), 
                          columns=['pair_key', 'sector', 'symbol1', 'symbol2'])
    print(f"Found {len(pairs_df)} pairs")
    return pairs_df

def get_stock_data(symbols):
    placeholders = ', '.join(f"'{s}'" for s in symbols)
    query = f'''
    SELECT symbol, date, close 
    FROM stock_data
    WHERE symbol IN ({placeholders})
    AND date BETWEEN '{DATE_CONFIG['TRAIN_START']}' AND '{DATE_CONFIG['TEST_END']}'
    ORDER BY symbol, date
    '''
    print("Loading stock data...")
    df = pd.DataFrame(
        client.execute(query), 
        columns=['symbol', 'date', 'close']
    )
    print(f"Loaded data for {len(symbols)} symbols")
    return df.pivot(columns='symbol', values='close', index='date')

In [None]:
print("Starting analysis...")
pairs_df = get_sector_pairs()
unique_symbols = pd.concat([pairs_df['symbol1'], pairs_df['symbol2']]).unique()
data = get_stock_data(unique_symbols)

print("Cleaning data...")
min_periods = len(data) * 0.95

print(f"Analysis will be performed on {data.shape[1]} symbols")

print("Starting cointegration analysis...")
scores, pvalues, pairs = find_cointegrated_pairs(data)

# %%
print("Creating visualization...")
plt.figure(figsize=(12, 8))
mask = (pvalues >= 0.98)
sns.heatmap(pvalues, 
            xticklabels=data.columns, 
            yticklabels=data.columns, 
            cmap='RdYlGn_r',
            mask=mask)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title('Cointegration p-values heatmap')
plt.tight_layout()
plt.show()

print("\nCointegrated Pairs (p-value < 0.01):")
for pair in pairs:
    print(f"{pair[0]} - {pair[1]}")

print("\nAnalysis complete!")

In [None]:
def plot_pairs(data, pairs):
    for s1, s2 in pairs:
        S1 = data[s1]
        S2 = data[s2]
        score, pvalue, _ = coint(S1, S2)
        ratios = S1 / S2
        
        plt.figure(figsize=(15,7))
        ratios.plot()
        plt.axhline(ratios.mean(), color='r')
        plt.title(f'{s1} / {s2} Price Ratio (p-value: {pvalue:.4f})')
        plt.legend(['Price Ratio', 'Mean'])
        plt.show()
        print(f"\n{s1} - {s2} p-value: {pvalue}")

plot_pairs(data, pairs)

In [None]:
def zscore(series):
    return (series - series.mean()) / np.std(series)

for pair in pairs:
    symbol1, symbol2 = pair
    
    ratios = data[symbol1] / data[symbol2]
    
    plt.figure(figsize=(15,7))
    zscore(ratios).plot()
    plt.axhline(zscore(ratios).mean(), color='black')
    plt.axhline(1.0, color='red', linestyle='--')
    plt.axhline(-1.0, color='green', linestyle='--')
    plt.title(f'Z-Score: {symbol1} vs {symbol2}')
    plt.legend(['Ratio z-score', 'Mean', '+1', '-1'])
    plt.show()

In [None]:
for pair in pairs:
    symbol1, symbol2 = pair
    
    query = f"""
    WITH pair_data AS (
        SELECT symbol, date, close 
        FROM stock_data
        WHERE symbol IN ('{symbol1}', '{symbol2}')
        AND date BETWEEN '{DATE_CONFIG['TRAIN_START']}' AND '{DATE_CONFIG['TEST_END']}'
        ORDER BY symbol, date
    )
    SELECT * FROM pair_data
    """
    
    df = pd.DataFrame(
        client.execute(query),
        columns=['symbol', 'date', 'close']
    )
    
    pair_data = df.pivot(columns='symbol', values='close', index='date')
    ratios = pair_data[symbol1] / pair_data[symbol2]
    training_days = get_training_days()
    train = ratios[:training_days]
    test = ratios[training_days:]
    
    ratios_mavg5 = train.rolling(window=5, center=False).mean()
    ratios_mavg60 = train.rolling(window=60, center=False).mean()
    std_60 = train.rolling(window=60, center=False).std()
    zscore_60_5 = (ratios_mavg5 - ratios_mavg60)/std_60
    
    plt.figure(figsize=(15,7))
    plt.plot(train.index, train.values)
    plt.plot(ratios_mavg5.index, ratios_mavg5.values)
    plt.plot(ratios_mavg60.index, ratios_mavg60.values)
    
    plt.title(f'Moving Averages: {symbol1} vs {symbol2}')
    plt.legend(['Ratio','5d Ratio MA', '60d Ratio MA'])
    plt.ylabel('Ratio')
    plt.show()

In [None]:
for pair in pairs:
    symbol1, symbol2 = pair
    
    query = f"""
    WITH pair_data AS (
        SELECT symbol, date, close 
        FROM stock_data
        WHERE symbol IN ('{symbol1}', '{symbol2}')
        AND date BETWEEN '{DATE_CONFIG['TRAIN_START']}' AND '{DATE_CONFIG['TEST_END']}'
        ORDER BY symbol, date
    )
    SELECT * FROM pair_data
    """
    
    df = pd.DataFrame(
        client.execute(query),
        columns=['symbol', 'date', 'close']
    )
    
    pair_data = df.pivot(columns='symbol', values='close', index='date')
    ratios = pair_data[symbol1] / pair_data[symbol2]
    training_days = get_training_days()
    train = ratios[:training_days]
    
    ratios_mavg5 = train.rolling(window=5, center=False).mean()
    ratios_mavg60 = train.rolling(window=60, center=False).mean()
    std_60 = train.rolling(window=60, center=False).std()
    zscore_60_5 = (ratios_mavg5 - ratios_mavg60)/std_60
    
    buy = train.copy()
    sell = train.copy()
    buy[zscore_60_5>-1] = 0
    sell[zscore_60_5<1] = 0
    
    plt.figure(figsize=(18,9))
    S1 = pair_data[symbol1].iloc[:training_days]
    S2 = pair_data[symbol2].iloc[:training_days]
    
    S1[60:].plot(color='b')
    S2[60:].plot(color='c')
    
    buyR = 0*S1.copy()
    sellR = 0*S1.copy()
    
    buyR[buy!=0] = S1[buy!=0]
    sellR[buy!=0] = S2[buy!=0]
    buyR[sell!=0] = S2[sell!=0]
    sellR[sell!=0] = S1[sell!=0]
    
    buyR[60:].plot(color='g', linestyle='None', marker='^')
    sellR[60:].plot(color='r', linestyle='None', marker='^')
    
    x1,x2,y1,y2 = plt.axis()
    plt.axis((x1,x2,min(S1.min(),S2.min()),max(S1.max(),S2.max())))
    
    plt.title(f'Price Action: {symbol1} vs {symbol2}')
    plt.legend([symbol1, symbol2, 'Buy Signal', 'Sell Signal'])
    plt.show()

1. **Ratios (ratios_train = S1_train/S2_train)**
- **Bedeutung**: Zeigt die relative Preisbeziehung zwischen zwei Aktien
- **Eigenschaft**: Sollte theoretisch um einen Mittelwert schwanken 
- **Nutzen**: Identifiziert, wenn Aktien aus ihrer historischen Preisbeziehung ausbrechen
- **Beispiel**: Wenn zwei Banken normalerweise im Verhältnis 2:1 handeln und plötzlich auf 2.5:1 steigen, könnte dies eine Handelsmöglichkeit sein

2. **Moving Averages (ma1_train, ma2_train)**
- **Kurzer MA (5 Tage)**:
  - **Eigenschaft**: Reagiert schnell auf Preisänderungen
  - **Nutzen**: Zeigt kurzfristige Trends
  - **Aussage**: Hilft kurzfristige Abweichungen zu erkennen

- **Langer MA (60 Tage)**:
  - **Eigenschaft**: Glättet Preisschwankungen stark
  - **Nutzen**: Bestimmt den langfristigen "normalen" Zustand
  - **Aussage**: Dient als Referenzpunkt für "faire" Bewertung

3. **Standardabweichung (std_train)**
- **Bedeutung**: Misst die "normale" Schwankungsbreite der Ratios
- **Eigenschaft**: Höhere Werte bedeuten mehr Volatilität
- **Nutzen**: Hilft zu bestimmen, ob eine Abweichung signifikant ist
- **Beispiel**: Wenn Ratio normalerweise ±5% schwankt, ist eine 15% Abweichung bedeutend

4. **Z-Score**
- **Bedeutung**: Standardisierte Abweichung vom Durchschnitt
- **Eigenschaften**:
  - \>1: Stark überdurchschnittlich
  - <-1: Stark unterdurchschnittlich
  - Zwischen -0.5 und 0.5: "Normal"
- **Nutzen**: Automatisierte Handelsentscheidungen
- **Beispiel**: 
  - Z-Score = 2 bedeutet: Ratio ist 2 Standardabweichungen über normal
  - Deutet auf überbewertet/unterbewertet hin

5. **Position Tracking (countS1, countS2)**
- **Bedeutung**: Aktuelle Handelsposition
- **Eigenschaften**: 
  - Positiv: Long-Position
  - Negativ: Short-Position
- **Nutzen**: Verfolgt offene Positionen und deren Größe
- **Beispiel**: 
  - countS1 = -1, countS2 = +2 bedeutet:
  - Short 1 Einheit von S1
  - Long 2 Einheiten von S2

6. **Money (Gewinn/Verlust)**
- **Bedeutung**: Kumulierter Handelserfolg
- **Eigenschaft**: Summe aller realisierten Gewinne/Verluste
- **Nutzen**: Misst Strategie-Performance
- **Beispiel**: 
  - Positive Werte: Profitable Strategie
  - Negative Werte: Verlustbringende Strategie

Die Strategie basiert auf der Annahme, dass extreme Abweichungen (gemessen durch Z-Score) sich wieder normalisieren werden. Wenn der Z-Score extrem wird (>1 oder <-1), wird eine Position aufgebaut, und wenn sich das Verhältnis normalisiert (Z-Score zwischen -0.5 und 0.5), wird die Position geschlossen.

In [None]:
def trade(S1_train, S2_train, S1_test, S2_test, window1, window2, symbol1, symbol2):
    # Original ratios und MAs
    ratios_train = S1_train/S2_train
    ma1_train = ratios_train.rolling(window=window1, center=False).mean()
    ma2_train = ratios_train.rolling(window=window2, center=False).mean()
    std_train = ratios_train.rolling(window=window2, center=False).std()
    
    ratios_test = S1_test/S2_test
    
    last_ma1 = ma1_train.iloc[-1]
    last_ma2 = ma2_train.iloc[-1]
    last_std = std_train.iloc[-1]
    
    trades = []
    
    # Feature-Berechnung für ML
    for i in range(len(ratios_test)):
        current_ratio = ratios_test.iloc[i]
        zscore = (current_ratio - last_ma2)/last_std
        
        # Erweiterte Features für ML
        trade_info = {
            'date': ratios_test.index[i],
            'zscore': zscore,
            'ratio': current_ratio,
            'ma1': last_ma1,
            'ma2': last_ma2,
            'S1_price': S1_test.iloc[i],
            'S2_price': S2_test.iloc[i],
            # Neue Features für ML:
            'std_ratio': std_train.iloc[-1],          # Volatilität des Ratios
            'ratio_trend': current_ratio/last_ma2,    # Trend des Ratios
            'ma_spread': (last_ma1 - last_ma2),       # Spread zwischen MAs
            'price_momentum_s1': S1_test.iloc[i]/S1_test.iloc[max(0,i-5)], # 5-Tage Momentum
            'price_momentum_s2': S2_test.iloc[i]/S2_test.iloc[max(0,i-5)]  # 5-Tage Momentum
        }
        
        # Original Trading-Logik bleibt gleich
        if zscore > 1.5:
            trade_info['action'] = 'SHORT'
            trades.append(trade_info)
            
        elif zscore < -1.5:
            trade_info['action'] = 'LONG'
            trades.append(trade_info)
            
        elif abs(zscore) < 0.5:
            trade_info['action'] = 'EXIT'
            trades.append(trade_info)
            
        last_ma1 = 0.8 * last_ma1 + 0.2 * current_ratio
        last_ma2 = 0.983 * last_ma2 + 0.017 * current_ratio
    
    trades_df = pd.DataFrame(trades)
    trades_df['pair'] = f"{symbol1}-{symbol2}"
    
    return trades_df

In [None]:
all_trades = []

for pair in pairs:
    symbol1, symbol2 = pair
    
    query = f"""
    WITH pair_data AS (
        SELECT 
            symbol, 
            date, 
            close,
            volume,
            high,
            low
        FROM stock_data
        WHERE symbol IN ('{symbol1}', '{symbol2}')
        AND date BETWEEN '{DATE_CONFIG['TRAIN_START']}' AND '{DATE_CONFIG['TEST_END']}'
        ORDER BY symbol, date
    )
    SELECT * FROM pair_data
    """
    
    df = pd.DataFrame(
        client.execute(query),
        columns=['symbol', 'date', 'close', 'volume', 'high', 'low']
    )
    
    # Da wir nur close für die pairs brauchen:
    pair_data = df.pivot(columns='symbol', values='close', index='date')
    
    # Zusätzliche features als separate DataFrames wenn nötig  
    volume_data = df.pivot(columns='symbol', values='volume', index='date')
    
    training_mask = pair_data.index < DATE_CONFIG['TRAIN_END']
    
    S1_train = pair_data[symbol1][training_mask]
    S2_train = pair_data[symbol2][training_mask]
    S1_test = pair_data[symbol1][~training_mask]
    S2_test = pair_data[symbol2][~training_mask]
    
    trades_df = trade(S1_train, S2_train, S1_test, S2_test, 5, 60, symbol1, symbol2)
    all_trades.append(trades_df)

final_trades = pd.concat(all_trades)
final_trades.to_parquet('../data/processed/01_static_cointegraton_trading_results.parquet')

# Trading Simulation Environment

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

def simulate_trading(trades_df, initial_capital=10000, commission_per_share=0.005, variable_fee=0.00018, 
                    bid_ask_spread=0.0002, price_impact_coef=0.002):
    capital = initial_capital
    positions = {}
    trades_history = []
    daily_returns = []
    last_date = None
    daily_capital = initial_capital
    
    for _, trade in trades_df.iterrows():
        if last_date is None:
            last_date = trade['date']
        elif trade['date'] != last_date:
            daily_returns.append({
                'date': last_date,
                'return': (capital - daily_capital) / daily_capital
            })
            daily_capital = capital
            last_date = trade['date']
            
        risk_amount = capital * 0.01
        s1_symbol, s2_symbol = trade['pair'].split('-')
        
        if trade['pair'] in positions:
            old_pos = positions[trade['pair']]
            pnl = 0
            hold_time = trade['date'] - old_pos['entry_date']
            
            if old_pos['type'] == 'LONG':
                s1_pnl = old_pos['s1_shares'] * (trade['S1_price'] - old_pos['s1_entry'])
                s2_pnl = old_pos['s2_shares'] * (old_pos['s2_entry'] - trade['S2_price'])
                pnl = s1_pnl + s2_pnl
            else:
                s1_pnl = old_pos['s1_shares'] * (old_pos['s1_entry'] - trade['S1_price'])
                s2_pnl = old_pos['s2_shares'] * (trade['S2_price'] - old_pos['s2_entry'])
                pnl = s1_pnl + s2_pnl
            
            s1_trade_value = old_pos['s1_shares'] * trade['S1_price']
            s2_trade_value = old_pos['s2_shares'] * trade['S2_price']
            total_value = s1_trade_value + s2_trade_value
            
            commission_cost = (old_pos['s1_shares'] + old_pos['s2_shares']) * commission_per_share
            variable_cost = total_value * variable_fee
            spread_cost = total_value * bid_ask_spread
            price_impact = total_value * price_impact_coef * np.sqrt(total_value/1000000)
            total_costs = commission_cost + variable_cost + spread_cost + price_impact
            
            capital += pnl - total_costs
            del positions[trade['pair']]
            
            trades_history.append({
                'date': trade['date'],
                'pair': trade['pair'],
                'type': 'CLOSE',
                'pnl': pnl,
                'capital': capital,
                'transaction_costs': total_costs,
                'hold_time': hold_time.total_seconds() / 3600,
                'return': pnl / risk_amount
            })
        
        if trade['action'] in ['LONG', 'SHORT']:
            s1_shares = risk_amount / trade['S1_price']
            s2_shares = s1_shares * trade['ratio']
            
            s1_trade_value = s1_shares * trade['S1_price']
            s2_trade_value = s2_shares * trade['S2_price']
            total_value = s1_trade_value + s2_trade_value
            
            commission_cost = (s1_shares + s2_shares) * commission_per_share
            variable_cost = total_value * variable_fee
            spread_cost = total_value * bid_ask_spread
            price_impact = total_value * price_impact_coef * np.sqrt(total_value/1000000)
            total_costs = commission_cost + variable_cost + spread_cost + price_impact
            
            cost_adjustment = 1 - (total_costs / risk_amount)
            s1_shares *= cost_adjustment
            s2_shares *= cost_adjustment
            
            positions[trade['pair']] = {
                'type': trade['action'],
                's1_shares': s1_shares,
                's2_shares': s2_shares,
                's1_entry': trade['S1_price'],
                's2_entry': trade['S2_price'],
                'entry_date': trade['date']
            }
            
            trades_history.append({
                'date': trade['date'],
                'pair': trade['pair'],
                'type': 'OPEN',
                'action': trade['action'],
                'capital': capital,
                'transaction_costs': total_costs
            })
    
    trades_df = pd.DataFrame(trades_history)
    daily_returns_df = pd.DataFrame(daily_returns)
    
    closed_trades = trades_df[trades_df['type'] == 'CLOSE']
    metrics = calculate_metrics(trades_df, daily_returns_df, initial_capital)
    
    return trades_df, metrics

def calculate_metrics(trades_df, daily_returns_df, initial_capital):
    closed_trades = trades_df[trades_df['type'] == 'CLOSE']
    
    metrics = {
        'total_trades': len(trades_df[trades_df['type'] == 'OPEN']),
        'total_pnl': closed_trades['pnl'].sum(),
        'total_return': (trades_df['capital'].iloc[-1] / initial_capital - 1) * 100,
        'win_rate': (closed_trades['pnl'] > 0).mean() * 100,
        'avg_profit_per_trade': closed_trades['pnl'].mean(),
        'avg_win': closed_trades[closed_trades['pnl'] > 0]['pnl'].mean(),
        'avg_loss': closed_trades[closed_trades['pnl'] < 0]['pnl'].mean(),
        'largest_win': closed_trades['pnl'].max(),
        'largest_loss': closed_trades['pnl'].min(),
        'profit_factor': abs(closed_trades[closed_trades['pnl'] > 0]['pnl'].sum() / 
                           closed_trades[closed_trades['pnl'] < 0]['pnl'].sum()),
        'avg_hold_time': closed_trades['hold_time'].mean(),
        'total_transaction_costs': trades_df['transaction_costs'].sum(),
        'avg_transaction_costs': trades_df['transaction_costs'].mean()
    }
    
    daily_returns = daily_returns_df['return']
    metrics.update({
        'sharpe_ratio': np.sqrt(252) * (daily_returns.mean() / daily_returns.std()),
        'volatility_annual': daily_returns.std() * np.sqrt(252) * 100,
        'max_drawdown': calculate_max_drawdown(trades_df['capital']),
        'skewness': stats.skew(daily_returns),
        'kurtosis': stats.kurtosis(daily_returns)
    })
    
    pair_metrics = calculate_pair_metrics(closed_trades)
    metrics['pair_metrics'] = pair_metrics
    
    return metrics

def calculate_max_drawdown(capital_series):
    peak = capital_series.expanding(min_periods=1).max()
    drawdown = (capital_series - peak) / peak
    return drawdown.min() * 100

def calculate_pair_metrics(closed_trades):
    pair_metrics = {}
    for pair in closed_trades['pair'].unique():
        pair_trades = closed_trades[closed_trades['pair'] == pair]
        pair_metrics[pair] = {
            'total_trades': len(pair_trades),
            'win_rate': (pair_trades['pnl'] > 0).mean() * 100,
            'total_pnl': pair_trades['pnl'].sum(),
            'avg_profit': pair_trades['pnl'].mean(),
            'avg_hold_time': pair_trades['hold_time'].mean()
        }
    return pair_metrics

trades_df = pd.read_parquet('../data/processed/01_static_cointegraton_trading_results.parquet')
results, metrics = simulate_trading(trades_df)

print(f"=== Grundlegende Metriken ===")
print(f"Endkapital: ${results['capital'].iloc[-1]:.2f}")
print(f"Gesamtrendite: {metrics['total_return']:.2f}%")
print(f"Anzahl Trades: {metrics['total_trades']}")
print(f"\n=== Performance Metriken ===")
print(f"Win Rate: {metrics['win_rate']:.2f}%")
print(f"Profit Faktor: {metrics['profit_factor']:.2f}")
print(f"Sharpe Ratio: {metrics['sharpe_ratio']:.2f}")
print(f"Max Drawdown: {metrics['max_drawdown']:.2f}%")
print(f"Jährliche Volatilität: {metrics['volatility_annual']:.2f}%")
print(f"\n=== Trade Statistiken ===")
print(f"Durchschn. Gewinn: ${metrics['avg_win']:.2f}")
print(f"Durchschn. Verlust: ${metrics['avg_loss']:.2f}")
print(f"Größter Gewinn: ${metrics['largest_win']:.2f}")
print(f"Größter Verlust: ${metrics['largest_loss']:.2f}")
print(f"Durchschn. Haltezeit: {metrics['avg_hold_time']:.2f} Stunden")
print(f"\n=== Kosten ===")
print(f"Gesamte Transaktionskosten: ${metrics['total_transaction_costs']:.2f}")
print(f"Durchschn. Kosten/Trade: ${metrics['avg_transaction_costs']:.2f}")
print(f"\n=== Verteilungsmetriken ===")
print(f"Schiefe: {metrics['skewness']:.2f}")
print(f"Kurtosis: {metrics['kurtosis']:.2f}")

print("\n=== Performance nach Pairs ===")
for pair, pair_metric in metrics['pair_metrics'].items():
    print(f"\n{pair}:")
    print(f"Trades: {pair_metric['total_trades']}")
    print(f"Win Rate: {pair_metric['win_rate']:.2f}%")
    print(f"Gesamt P&L: ${pair_metric['total_pnl']:.2f}")
    print(f"Durchschn. Profit: ${pair_metric['avg_profit']:.2f}")
    print(f"Durchschn. Haltezeit: {pair_metric['avg_hold_time']:.2f} Stunden")

# Maschine Learning

In [36]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3


In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
def enhance_features(features_df):
   df = features_df.copy()
   
   # Core Features mit verschiedenen Zeitfenstern
   df['zscore_abs'] = np.abs(df['zscore'])
   df['zscore_trend_short'] = df.groupby('pair')['zscore'].ewm(span=5).mean().reset_index(0, drop=True)
   df['zscore_trend'] = df.groupby('pair')['zscore'].ewm(span=10).mean().reset_index(0, drop=True)
   df['zscore_trend_long'] = df.groupby('pair')['zscore'].ewm(span=20).mean().reset_index(0, drop=True)
   
   # Reversion Strength in verschiedenen Zeitfenstern
   df['ratio_momentum_short'] = df.groupby('pair')['ratio'].pct_change(3)
   df['ratio_momentum'] = df.groupby('pair')['ratio'].pct_change(5)
   df['ratio_momentum_long'] = df.groupby('pair')['ratio'].pct_change(10)
   
   df['reversion_strength_short'] = df['zscore'] * df['ratio_momentum_short']
   df['reversion_strength'] = df['zscore'] * df['ratio_momentum']
   df['reversion_strength_long'] = df['zscore'] * df['ratio_momentum_long']
   
   # Signal Quality
   df['signal_consistency'] = df.groupby('pair')['zscore'].rolling(10).std().reset_index(0, drop=True)
   
   df = df.fillna(method='ffill').fillna(0)
   
   feature_columns = [
       'zscore', 'zscore_abs',
       'zscore_trend_short', 'zscore_trend', 'zscore_trend_long',
       'reversion_strength_short', 'reversion_strength', 'reversion_strength_long',
       'signal_consistency'
   ]
   
   return df, feature_columns
def prepare_ml_data(trades_df, simulation_results):
    print("Initial columns in trades_df:", trades_df.columns.tolist())
    merged_data = pd.merge(
        trades_df,
        simulation_results[['date', 'pair', 'pnl', 'return', 'hold_time']],
        on=['date', 'pair'],
        how='inner'
    )
    print("Columns after merge:", merged_data.columns.tolist())
    
    features_df = merged_data.copy()
    features_df['profitable'] = features_df['pnl'] > 0
    
    # Generate enhanced features
    features_df, feature_columns = enhance_features(features_df)
    
    return features_df, feature_columns, 'profitable'

In [39]:
def balance_data(X, y):
    over = SMOTE(sampling_strategy=0.8, random_state=42)
    under = RandomUnderSampler(sampling_strategy=0.9, random_state=42)
    
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    
    return pipeline.fit_resample(X, y)

In [40]:
def train_ml_model(features_df, feature_columns, target_column, test_size=0.2):
    X = features_df[feature_columns]
    y = features_df[target_column]
    
    split_idx = int(len(features_df) * (1-test_size))
    X_train = X[:split_idx]
    y_train = y[:split_idx]
    X_test = X[split_idx:]
    y_test = y[split_idx:]
    
    X_balanced, y_balanced = balance_data(X_train, y_train)
    
    param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 4, 5],
        'min_samples_leaf': [50, 100]
    }
    
    model = GridSearchCV(
        GradientBoostingClassifier(random_state=42),
        param_grid,
        cv=5,
        scoring='f1',
        n_jobs=-1
    )
    
    model.fit(X_balanced, y_balanced)
    best_model = model.best_estimator_
    
    print("\nBest Parameters:", model.best_params_)
    print("\nModel Evaluation on Test Set:")
    y_pred = best_model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    feature_importance = pd.DataFrame({
        'feature': feature_columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nFeature Importance:")
    print(feature_importance)
    
    return best_model, feature_importance, (X_train, X_test, y_train, y_test)

In [49]:
def analyze_predictions(model, X_test, y_test):
    predictions = model.predict_proba(X_test)
    probs = predictions[:, 1]  # Wahrscheinlichkeit für die positive Klasse
    thresholds = np.arange(0.3, 0.9, 0.1)
    
    print("\nPrediction quality at different confidence levels:")
    for threshold in thresholds:
        mask = probs > threshold
        if mask.any():
            y_filtered = y_test[mask]  # Nur die relevanten y-Werte
            preds = np.ones(len(y_filtered))  # Alle über dem Threshold sind 1
            acc = (preds == y_filtered).mean()
            n_preds = mask.sum()
            print(f"Threshold {threshold:.1f}: Accuracy = {acc:.2%} (on {n_preds} predictions)")
    
    prediction_df = pd.DataFrame({
        'actual': y_test,
        'probability': probs,
        'high_conf_pred': probs > 0.7
    })
    
    return prediction_df

In [45]:
def clean_data(features_df, feature_columns):
    # Kopie erstellen
    cleaned_df = features_df.copy()
    
    # Forward fill für rolling calculations
    for col in feature_columns:
        if cleaned_df[col].isna().any():
            cleaned_df[col] = cleaned_df.groupby('pair')[col].fillna(method='ffill')
            cleaned_df[col] = cleaned_df.groupby('pair')[col].fillna(method='bfill')
            
    # Restliche NaNs mit 0 füllen
    cleaned_df = cleaned_df.fillna(0)
    
    return cleaned_df

In [64]:
print("Preparing ML data...")
features_df, feature_columns, target_column = prepare_ml_data(trades_df, simulation_results)

print("Cleaning data...")
features_df = clean_data(features_df, feature_columns)

print("\nTraining ML model...")
model, feature_importance, (X_train, X_test, y_train, y_test) = train_ml_model(
    features_df, 
    feature_columns, 
    target_column
)

print("\nAnalyzing predictions...")
prediction_analysis = analyze_predictions(model, X_test, y_test)

Preparing ML data...
Initial columns in trades_df: ['date', 'zscore', 'ratio', 'ma1', 'ma2', 'S1_price', 'S2_price', 'std_ratio', 'ratio_trend', 'ma_spread', 'pair', 'action']
Columns after merge: ['date', 'zscore', 'ratio', 'ma1', 'ma2', 'S1_price', 'S2_price', 'std_ratio', 'ratio_trend', 'ma_spread', 'pair', 'action', 'pnl', 'return', 'hold_time']
Cleaning data...

Training ML model...


  df = df.fillna(method='ffill').fillna(0)



Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 50, 'n_estimators': 200}

Model Evaluation on Test Set:
              precision    recall  f1-score   support

       False       0.81      0.79      0.80       261
        True       0.44      0.47      0.46        93

    accuracy                           0.70       354
   macro avg       0.62      0.63      0.63       354
weighted avg       0.71      0.70      0.71       354


Feature Importance:
                feature  importance
5    reversion_strength    0.189498
2          zscore_trend    0.155599
1            zscore_abs    0.148331
6    signal_consistency    0.099936
3            vol_change    0.089776
7            zscore_vol    0.086961
0                zscore    0.073864
4        ratio_momentum    0.064992
9  momentum_consistency    0.048378
8        trend_strength    0.042666

Analyzing predictions...

Prediction quality at different confidence levels:
Threshold 0.3: Accuracy = 36.78% (on 174 pred