Select for best 5 combinations for rnn

In [None]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf

# === Excel einlesen ===
file_path = r"C:\Users\41799\Desktop\Kopie von market_data.xlsx"
df = pd.read_excel(file_path)

# === Zielvariable & CNN-geeignete Feature-Kandidaten ===
target_col = "_MKT"
allowed_features = [
    "EMP",       # Beschäftigungstrend
    "GDP",       # Wirtschaftswachstum
    "UN",        # Arbeitslosigkeit
    "CPI",       # Inflation
    "M2",        # Geldmengenwachstum
    "Y02",       # Kurzfristige Rendite
    "Y10",       # Langfristige Rendite
    "STP",       # Steilheit Zinskurve
    "IR",        # Nominalzins
    "RR",        # Realzins
    "MOV",       # Volatilität
    "NYF",       # New York Fed Index
    "_TY",       # Treasury Markt
    "_OIL",      # Ölpreis
    "_DXY",      # Dollar Index
    "_LCP",      # Large Cap Index
    "_AU"        # Goldpreis
]

# === Datum verarbeiten ===
if "Date" in df.columns:
    df["Date"] = pd.to_datetime(df["Date"])
    df = df.sort_values("Date")
    df = df.set_index("Date")

# === Nur numerische Daten & Normalisieren ===
df = df.select_dtypes(include=["number"]).dropna()
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# === Split: 15% Training, Rest Validierung ===
split_index = int(len(df_scaled) * 0.15)
train_df = df_scaled[:split_index]
val_df = df_scaled[split_index:]

# === Zeitreihen-Daten generieren ===
def create_dataset(X, y, seq_len=5):
    Xs, ys = [], []
    for i in range(len(X) - seq_len):
        Xs.append(X[i:i + seq_len])
        ys.append(y[i + seq_len])
    return np.array(Xs), np.array(ys)

# === RNN (LSTM) testen mit allen 3er-Kombinationen ===
results = []
for combo in combinations(allowed_features, 3):
    combo = list(combo)
    try:
        X_train, y_train = create_dataset(train_df[combo].values, train_df[target_col].values)
        X_val, y_val = create_dataset(val_df[combo].values, val_df[target_col].values)

        model = tf.keras.Sequential([
            tf.keras.layers.LSTM(64, return_sequences=False, input_shape=(X_train.shape[1], X_train.shape[2])),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1)
        ])
        model.compile(optimizer="adam", loss="mse")
        early_stop = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

        history = model.fit(X_train, y_train,
                            validation_data=(X_val, y_val),
                            epochs=50,
                            batch_size=16,
                            verbose=0,
                            callbacks=[early_stop])

        val_loss = min(history.history["val_loss"])
        results.append((combo, val_loss))
        print(f"✅ Getestet (LSTM): {combo} | val_loss: {val_loss:.5f}")

    except Exception as e:
        print(f"⚠️ Fehler bei Kombination {combo}: {str(e)}")

# === Beste 5 Kombinationen anzeigen ===
results.sort(key=lambda x: x[1])
print("\n🏆 Beste 5 Kombinationen mit genau 3 Features (LSTM):")
for i, (combo, loss) in enumerate(results[:5], 1):
    print(f"{i}. {combo} ➞ val_loss: {loss:.5f}")

besz results  
1. ['GDP', '_TY', '_DXY'] ➞ val_loss: 0.06024
2. ['CPI', '_TY', '_LCP'] ➞ val_loss: 0.06063
3. ['UN', 'Y02', '_TY'] ➞ val_loss: 0.06120
4. ['_TY', '_DXY', '_LCP'] ➞ val_loss: 0.06159
5. ['Y02', '_TY', '_DXY'] ➞ val_loss: 0.06295


best arhitecture for each result

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from itertools import product
import random

# === Reproduzierbarkeit ===
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# === Daten einlesen ===
df = pd.read_excel(r"C:\Users\41799\Desktop\Kopie von market_data.xlsx")
df = df.dropna()
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

df_numeric = df.select_dtypes(include=[np.number])
target_col = "_MKT"

# === Feature-Kombinationen ===
combinations_to_test = [
    ['GDP', '_TY', '_DXY'],       # ➞ val_loss: 0.06024
    ['CPI', '_TY', '_LCP'],       # ➞ val_loss: 0.06063
    ['UN', 'Y02', '_TY'],         # ➞ val_loss: 0.06120
    ['_TY', '_DXY', '_LCP'],      # ➞ val_loss: 0.06159
    ['Y02', '_TY', '_DXY']        # ➞ val_loss: 0.06295
]


# === WindowGenerator ===
class WindowGenerator():
    def __init__(self, input_width, label_width, shift, input_columns=None, label_columns=None, df_train=None):
        self.label_columns = label_columns
        self.input_columns = input_columns
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.total_window_size = input_width + shift
        self.input_slice = slice(0, input_width)
        self.label_start = self.total_window_size - self.label_width

        if df_train is not None:
            self.train_input_indices = {name: i for i, name in enumerate(df_train.columns)}
            self.train_label_indices = {name: i for i, name in enumerate(df_train.columns)}

    def split_window(self, features):
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.label_start:, :]
        if self.input_columns:
            inputs = tf.stack([inputs[:, :, self.train_input_indices[name]] for name in self.input_columns], axis=-1)
        if self.label_columns:
            labels = tf.stack([labels[:, :, self.train_label_indices[name]] for name in self.label_columns], axis=-1)
        return inputs, labels

    def make_dataset(self, data, shuffle=False, batchsize=64):
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.preprocessing.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            sampling_rate=1,
            shuffle=shuffle,
            batch_size=batchsize
        )
        return ds.map(self.split_window)

# === Hyperparameter-Space erweitern ===
hyperparams = list(product(
    [10, 20, 30, 45],
    [(32, 64, 128), (64, 64, 64), (128, 64, 32)],
    [(0.1, 0.3), (0.2, 0.4), (0.3, 0.5)],
    [32, 64, 128, 256]
))
hyperparams = random.sample(hyperparams, 40)

# === Ergebnisliste ===
final_results = []

# === Loop über Feature-Kombinationen ===
for features in combinations_to_test:
    print(f"\n🧪 Testing: {features}")
    selected_cols = features + [target_col]
    data = df_numeric[selected_cols].copy()
    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=selected_cols)

    split = int(len(data_scaled) * 0.8)
    train_df = data_scaled[:split]
    val_df = data_scaled[split:]

    best_loss = np.inf
    best_corr = -1
    best_sharpe = -np.inf
    best_config = None

    for input_width, units, drops, dense in hyperparams:
        window = WindowGenerator(input_width=input_width, label_width=1, shift=1,
                                 input_columns=features, label_columns=[target_col], df_train=train_df)
        train_data = window.make_dataset(train_df, shuffle=True)
        val_data = window.make_dataset(val_df)

        model = Sequential([
            LSTM(units=units[0], return_sequences=True),
            BatchNormalization(),
            Dropout(drops[0]),
            LSTM(units=units[1], return_sequences=True),
            BatchNormalization(),
            Dropout(drops[1]),
            LSTM(units=units[2], return_sequences=False),
            BatchNormalization(),
            Dense(dense, activation='relu'),
            Dropout(0.2),
            Dense(1)
        ])

        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), loss='mse')
        early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

        history = model.fit(
            train_data,
            validation_data=val_data,
            epochs=50,
            callbacks=[early_stop],
            verbose=0
        )

        y_pred_val = model.predict(val_data)
        y_true_val = np.concatenate([y for x, y in val_data], axis=0)

        if y_pred_val.ndim == 3:
            y_pred_val = y_pred_val[:, -1, :]
        if y_true_val.ndim == 3:
            y_true_val = y_true_val[:, -1, :]

        corr, _ = pearsonr(np.ravel(y_true_val), np.ravel(y_pred_val))
        val_loss = min(history.history['val_loss'])

        # Lineare Regression zur Korrektur
        reg = LinearRegression().fit(y_pred_val.reshape(-1, 1), y_true_val.reshape(-1, 1))
        y_pred_corrected = reg.predict(y_pred_val.reshape(-1, 1))

        # Sharpe Ratio berechnen
        returns = y_true_val[1:] - y_true_val[:-1]
        position = np.sign(y_pred_corrected[1:] - y_pred_corrected[:-1])
        strategy_returns = position * returns
        sharpe_ratio = np.mean(strategy_returns) / (np.std(strategy_returns) + 1e-6)

        if sharpe_ratio > best_sharpe or (sharpe_ratio == best_sharpe and corr > best_corr):
            best_loss = val_loss
            best_corr = corr
            best_sharpe = sharpe_ratio
            best_config = (input_width, units, drops, dense)

    print(f"✅ Best Config: {best_config} | loss: {best_loss:.5f} | corr: {best_corr:.3f} | Sharpe: {best_sharpe:.3f}")
    final_results.append((features, best_loss, best_corr, best_config, best_sharpe))

# === Ergebnisse anzeigen (Sharpe-basiert sortiert) ===
final_results.sort(key=lambda x: (x[4], x[2]), reverse=True)
print("\n🏁 Best Feature Combinations:")
for i, (feat, loss, corr, cfg, sharpe) in enumerate(final_results, 1):
    print(f"{i}. {feat} ➔ val_loss: {loss:.5f} | corr: {corr:.3f} | Sharpe: {sharpe:.3f} | config: {cfg}")


FileNotFoundError: [Errno 2] No such file or directory: '/content/Kopie von market_data.xlsx'

1. ['_TY', '_DXY', '_LCP'] ➔ val_loss: 0.11024 | corr: 0.694 | Sharpe: 0.127 | config: (20, (128, 64, 32), (0.1, 0.3), 128)
2. ['CPI', '_TY', '_LCP'] ➔ val_loss: 0.25088 | corr: 0.876 | Sharpe: 0.126 | config: (20, (64, 64, 64), (0.2, 0.4), 256)
3. ['GDP', '_TY', '_DXY'] ➔ val_loss: 0.39701 | corr: 0.347 | Sharpe: 0.099 | config: (20, (32, 64, 128), (0.2, 0.4), 256)
4. ['UN', 'Y02', '_TY'] ➔ val_loss: 0.24832 | corr: 0.384 | Sharpe: 0.087 | config: (30, (128, 64, 32), (0.2, 0.4), 256)
5. ['Y02', '_TY', '_DXY'] ➔ val_loss: 0.31353 | corr: 0.523 | Sharpe: 0.061 | config: (30, (128, 64, 32), (0.1, 0.3), 64)

only number 1 and 2 are relevant because of sharp ratio and korelation

now thind the best traiding strategy for 1 and 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from scipy.stats import spearmanr

# === 1. Excel laden ===
file_path = r"C:\Users\41799\Desktop\Kopie von market_data.xlsx"
df = pd.read_excel(file_path)

# === 2. Feature-Set + Parameter-Kombis definieren ===
experiments = [
    {
        "features": ['_TY', '_DXY', '_LCP'],
        "lstm_units": (128, 64, 32),
        "dropouts": (0.1, 0.3),
        "batch_size": 128
    },
    {
        "features": ['CPI', '_TY', '_LCP'],
        "lstm_units": (64, 64, 64),
        "dropouts": (0.2, 0.4),
        "batch_size": 256
    }
]

target = '_MKT'
seq_len = 20

# === 3. Schleife über Experimente ===
for exp in experiments:
    features = exp["features"]
    lstm_units = exp["lstm_units"]
    dropouts = exp["dropouts"]
    batch_size = exp["batch_size"]

    print("="*70)
    print(f"🔍 Features: {features}")
    print(f"🧠 Config: LSTM={lstm_units}, Dropout={dropouts}, Batch={batch_size}")
    print("="*70)

    # --- Daten ---
    df_clean = df[features + [target]].dropna().reset_index(drop=True)
    df_ret = df_clean.diff().dropna()
    df_ret[target] = (df_ret[target] > 0).astype(int)

    # --- Skalieren ---
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df_ret[features])
    scaled_df = pd.DataFrame(scaled, columns=features)
    scaled_df[target] = df_ret[target].values

    # --- Sequenzen ---
    X, y = [], []
    for i in range(len(scaled_df) - seq_len):
        X.append(scaled_df.iloc[i:i+seq_len][features].values)
        y.append(scaled_df.iloc[i+seq_len][target])
    X, y = np.array(X), np.array(y)

    # --- Split ---
    split = int(len(X) * 0.7)
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    # --- Modell ---
    model = Sequential([
        LSTM(lstm_units[0], return_sequences=True, input_shape=(seq_len, len(features)), kernel_regularizer=l2(0.001)),
        Dropout(dropouts[0]),
        LSTM(lstm_units[1], return_sequences=True, kernel_regularizer=l2(0.001)),
        Dropout(dropouts[1]),
        LSTM(lstm_units[2], kernel_regularizer=l2(0.001)),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])

    # --- Training ---
    early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    history = model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=50,
        batch_size=batch_size,
        callbacks=[early_stop],
        verbose=0
    )

    # --- Loss-Plot ---
    plt.figure(figsize=(10, 5))
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Val Loss')
    plt.title(f"Train vs Validation Loss\n{features}")
    plt.xlabel("Epochs")
    plt.ylabel("Loss (binary_crossentropy)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # --- Vorhersage ---
    y_pred_proba = model.predict(X_test).flatten()
    y_pred = (y_pred_proba > 0.5).astype(int)

    plt.figure(figsize=(14, 6))
    plt.plot(y_pred_proba, label="Predicted Signal (OOS)", color='steelblue')
    plt.plot(y_test, label="True Labels (OOS)", color='darkorange', alpha=0.5)
    plt.axhline(np.mean(y_pred_proba), linestyle='--', color='gray', label="Mean Prediction (Sharpe ref)")
    plt.axvline(0, linestyle=':', color='black', label="Train/Test Split")
    plt.title(f"Prediction vs. True (Test)\n{features}")
    plt.xlabel("Zeitindex")
    plt.ylabel("Signal-Wahrscheinlichkeit")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # --- Strategy Return ---
    true_returns = df_ret.iloc[-len(y_test):][target].values
    signal = 2 * y_pred_proba - 1
    strategy_return = signal * true_returns
    cum_strat = np.cumsum(strategy_return)
    cum_mkt = np.cumsum(true_returns)

    plt.figure(figsize=(14, 6))
    plt.plot(cum_strat, label="Strategy PnL", linewidth=2)
    plt.plot(cum_mkt, label="Market Return", linewidth=2)
    plt.axhline(np.mean(cum_strat), linestyle='--', color='gray', label="Mean Strategy PnL")
    plt.title(f"Strategy Backtest\n{features}")
    plt.xlabel("Zeitindex")
    plt.ylabel("Kumulierte Rendite")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # --- Rolling Sharpe ---
    rolling = pd.Series(strategy_return)
    rolling_sharpe = rolling.rolling(20).mean() / rolling.rolling(20).std()

    plt.figure(figsize=(14, 6))
    plt.plot(rolling_sharpe, label='Rolling Sharpe Ratio')
    plt.axhline(rolling_sharpe.mean(), linestyle='--', color='gray', label='⟨Sharpe⟩')
    plt.title(f"Rolling Sharpe Ratio\n{features}")
    plt.xlabel("Zeitindex")
    plt.ylabel("Sharpe Ratio (20d)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # --- Metriken ---
    ic, _ = spearmanr(y_test, y_pred_proba)
    mean_ret = np.mean(strategy_return)
    std_ret = np.std(strategy_return)
    sharpe = mean_ret / std_ret if std_ret > 0 else 0
    val_loss = history.history['val_loss'][-1]

    # --- Output ---
    print(f"📊 Abschluss für Features {features}")
    print(f"   → Validation Loss: {val_loss:.5f}")
    print(f"   → Information Coefficient (IC): {ic:.3f}")
    print(f"   → Sharpe Ratio: {sharpe:.3f}")
    print()


18/18 [==============================] - 1s 6ms/step
📊 Abschluss für Features ['_TY', '_DXY', '_LCP']
   → Validation Loss: 0.70697
   → Information Coefficient (IC): 0.021
   → Sharpe Ratio: 0.718

======================================================================
🔍 Features: ['CPI', '_TY', '_LCP']
🧠 Config: LSTM=(64, 64, 64), Dropout=(0.2, 0.4), Batch=256
======================================================================
18/18 [==============================] - 1s 5ms/step
📊 Abschluss für Features ['CPI', '_TY', '_LCP']
   → Validation Loss: 0.70564
   → Information Coefficient (IC): 0.029
   → Sharpe Ratio: 0.937