# Interview Notebook — ARIMA-GARCH Thesis

**Scope:** kompakte Version für Gespräch, Tech-Screening und Case-Deep-Dive.  
**Assets:** BTC, ETH, DOGE, SOL  
**Periode:** 2020-05-11 bis 2024-04-20

## Was dieses Notebook zeigt

1. Datenzugriff und Return-Engineering  
2. ARIMA (Mean) + GARCH (Volatility) in einem Pipeline-Schritt  
3. Rolling Forecast-Konzept  
4. Risiko-Validierung über einen 5%-VaR-Backtest (Kupiec-Test)

In [None]:
# Central run configuration read by the cells below.
CONFIG = dict(
    symbol="BTC-USD",           # ticker handed to the data loader
    start="2020-05-11",         # start date handed to the data loader
    end="2024-04-20",           # end date handed to the data loader
    split=(0.70, 0.15, 0.15),   # train / validation / test fractions
    rolling_window=60,          # observations per rolling re-fit
    var_alpha=0.05,             # tail probability of the VaR
    ewma_lambda=0.94,           # decay factor of the EWMA variance benchmark
)

CONFIG

: 

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf

from scipy.stats import norm, chi2
from statsmodels.tsa.arima.model import ARIMA
from arch import arch_model

: 

In [None]:
def load_series(symbol: str, start: str, end: str) -> pd.DataFrame:
    """Download closing prices for one symbol and derive log returns.

    Args:
        symbol: Ticker passed to ``yf.download``.
        start: Start date passed to ``yf.download``.
        end: End date passed to ``yf.download``.

    Returns:
        DataFrame with the flat columns ``close``, ``log_return`` and
        ``sq_return`` (squared log return). Rows containing NaN are dropped,
        which always removes the first row because it has no previous close.
    """
    raw = yf.download(symbol, start=start, end=end, auto_adjust=True, progress=False)

    close = raw["Close"]
    # Depending on the yfinance version the columns are a (field, ticker)
    # MultiIndex even for a single symbol, in which case raw["Close"] is a
    # one-column frame. Reduce it to a Series so the result stays flat.
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]

    data = pd.DataFrame({"close": close})
    data["log_return"] = np.log(data["close"] / data["close"].shift(1))
    data["sq_return"] = data["log_return"] ** 2
    return data.dropna()

# Fetch the configured symbol and preview the first rows.
series = load_series(
    symbol=CONFIG["symbol"],
    start=CONFIG["start"],
    end=CONFIG["end"],
)
series.head()

: 

In [None]:
def split_series(df: pd.DataFrame, split=(0.70, 0.15, 0.15)):
    """Cut a frame into consecutive train / validation / test parts.

    Args:
        df: Frame to split; row order is preserved.
        split: Fractions for train and validation (first two entries). The
            test part receives every row left after those two.

    Returns:
        Tuple ``(train, val, test)`` of positional slices of ``df``.
    """
    total = len(df)
    train_end = int(total * split[0])
    val_end = train_end + int(total * split[1])
    return df.iloc[:train_end], df.iloc[train_end:val_end], df.iloc[val_end:]

# Apply the configured fractions and show how many rows each part received.
train, val, test = split_series(series, split=CONFIG["split"])
pd.Series({
    label: len(part)
    for label, part in zip(("train", "val", "test"), (train, val, test))
})

In [None]:
def fit_arima_garch(returns: pd.Series, arima_order=(1,0,1), garch_type="FIGARCH", dist="t"):
    """Fit an ARIMA mean model, then a zero-mean volatility model on its residuals.

    Args:
        returns: Return series; NaNs are dropped and values cast to float.
        arima_order: ``order`` passed to ``ARIMA``.
        garch_type: ``"FIGARCH"``, ``"EGARCH"`` or ``"GJR"``; any other value
            selects a plain GARCH(1,1).
        dist: Innovation distribution passed to ``arch_model``.

    Returns:
        Tuple ``(arima_res, garch_res)`` of the two fitted result objects. The
        volatility model is fitted on the ARIMA residuals multiplied by 100.
    """
    clean = returns.dropna().astype(float)

    mean_model = ARIMA(
        clean,
        order=arima_order,
        enforce_stationarity=False,
        enforce_invertibility=False,
    )
    arima_res = mean_model.fit()
    scaled_resid = arima_res.resid * 100.0

    # Volatility specifications by name; "GJR" is a GARCH with one asymmetry term.
    named_specs = (
        ("FIGARCH", {"vol": "FIGARCH", "p": 1, "q": 1}),
        ("EGARCH", {"vol": "EGARCH", "p": 1, "o": 1, "q": 1}),
        ("GJR", {"vol": "GARCH", "p": 1, "o": 1, "q": 1}),
    )
    fallback_spec = {"vol": "GARCH", "p": 1, "q": 1}
    spec = next(
        (kwargs for name, kwargs in named_specs if garch_type == name),
        fallback_spec,
    )

    vol_model = arch_model(scaled_resid, dist=dist, mean="Zero", rescale=False, **spec)
    return arima_res, vol_model.fit(disp="off", show_warning=False)

# Fit once on train + validation returns and report fit diagnostics.
fit_returns = pd.concat([part["log_return"] for part in (train, val)])
arima_res, garch_res = fit_arima_garch(fit_returns)

dict(
    arima_aic=float(arima_res.aic),
    garch_aic=float(garch_res.aic),
    n_obs=int(len(fit_returns)),
)

In [None]:
def one_step_forecast(arima_res, garch_res):
    """Return the one-step-ahead mean and variance forecast.

    Args:
        arima_res: Fitted mean model exposing ``get_forecast(steps=...)``.
        garch_res: Fitted volatility model exposing ``forecast(horizon=...)``.

    Returns:
        Tuple ``(mean, variance)`` of floats. The variance forecast is divided
        by 100**2 (``fit_arima_garch`` fits the volatility model on residuals
        multiplied by 100) and floored at zero.
    """
    resid_scale = 100.0

    predicted_mean = arima_res.get_forecast(steps=1).predicted_mean
    mean_fc = float(predicted_mean.iloc[0])

    variance_table = garch_res.forecast(horizon=1, reindex=False).variance
    scaled_variance = float(variance_table.iloc[0, 0])

    return mean_fc, max(scaled_variance / (resid_scale ** 2), 0.0)

# One-step-ahead mean and variance forecast from the train+validation fit;
# the bare tuple on the last line is the cell's displayed output.
mu_1d, var_1d = one_step_forecast(arima_res, garch_res)
mu_1d, var_1d

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between two array-likes as a float."""
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    mean_sq = np.mean(np.square(errors))
    return float(np.sqrt(mean_sq))

def qlike(actual_var, forecast_var, eps=1e-8):
    """Return the mean QLIKE loss ``a/f - log(a/f) - 1`` as a float.

    Args:
        actual_var: Realised variance proxy (array-like).
        forecast_var: Forecast variance (array-like).
        eps: Lower bound applied to both inputs before the ratio is taken, so
            neither the division nor the logarithm sees a value below it.
    """
    realised = np.maximum(np.asarray(actual_var, dtype=float), eps)
    predicted = np.maximum(np.asarray(forecast_var, dtype=float), eps)
    ratio = realised / predicted
    return float(np.mean(ratio - np.log(ratio) - 1))

In [None]:
def rolling_backtest(test_returns: pd.Series, full_returns: pd.Series, window=60, ewma_lambda=None):
    """Walk through the test period with one-step-ahead forecasts.

    For each target position ``t`` from the first test date to the end of
    ``full_returns``, the model is re-fitted on the ``window`` observations
    immediately before ``t`` and its one-step forecast is scored against the
    observation at ``t``. The naive benchmark (previous return) and the EWMA
    variance benchmark use the same information set and the same target.

    Args:
        test_returns: Test-period returns; only the first index label is used,
            to locate where forecasting starts inside ``full_returns``.
        full_returns: Complete return history including the test period.
        window: Number of past observations per re-fit. Targets with fewer
            preceding observations are skipped.
        ewma_lambda: EWMA decay factor. ``None`` (default) reads
            ``CONFIG["ewma_lambda"]``.

    Returns:
        DataFrame with one row per forecast date and the columns ``date``,
        ``actual``, ``model_return``, ``naive_return``, ``actual_var``
        (squared actual return), ``model_var`` and ``ewma_var``. The frame is
        empty, but keeps these columns, when nothing could be forecast.
    """
    columns = [
        "date", "actual", "model_return", "naive_return",
        "actual_var", "model_var", "ewma_var",
    ]
    full_r = full_returns.dropna().astype(float)
    test_r = test_returns.dropna().astype(float)
    if test_r.empty:
        return pd.DataFrame(columns=columns)

    lam = CONFIG["ewma_lambda"] if ewma_lambda is None else ewma_lambda
    first_target = full_r.index.get_loc(test_r.index[0])
    rows = []

    for t in range(first_target, len(full_r)):
        if t < window:
            # Not enough history for a full window before this target.
            continue

        # History ends at t-1, so every one-step forecast below targets t.
        hist = full_r.iloc[t - window:t]

        ar, gr = fit_arima_garch(hist)
        mu, var_model = one_step_forecast(ar, gr)

        actual = float(full_r.iloc[t])
        naive = float(full_r.iloc[t - 1])

        # EWMA recursion seeded with the window's sample variance; after the
        # last update it is the variance forecast for t.
        ewma_var = float(hist.var())
        for r in hist:
            ewma_var = lam * ewma_var + (1 - lam) * (r ** 2)

        rows.append({
            "date": full_r.index[t],
            "actual": actual,
            "model_return": mu,
            "naive_return": naive,
            "actual_var": actual ** 2,
            "model_var": var_model,
            "ewma_var": ewma_var,
        })

    return pd.DataFrame(rows, columns=columns)

# Run the rolling backtest over the test period; the full history is the
# concatenation of the three splits in their original order.
bt = rolling_backtest(
    test_returns=test["log_return"],
    full_returns=pd.concat([part["log_return"] for part in (train, val, test)]),
    window=CONFIG["rolling_window"],
)
bt.head()

In [None]:
def parametric_var(mean, var, alpha=0.05):
    """Return the ``alpha`` quantile of a normal distribution as a float.

    Args:
        mean: Forecast mean.
        var: Forecast variance; floored at 1e-12 before the square root.
        alpha: Tail probability of the quantile.
    """
    floored_var = max(var, 1e-12)
    z_alpha = norm.ppf(alpha)
    return float(mean + z_alpha * np.sqrt(floored_var))

def kupiec_pof(violations: np.ndarray, alpha=0.05):
    """Return the p-value of the Kupiec proportion-of-failures test.

    Compares the observed violation rate with the nominal rate ``alpha`` via
    a binomial likelihood ratio, referred to a chi-square distribution with
    one degree of freedom.

    Args:
        violations: 0/1 (or boolean) indicators, one per forecast; any
            array-like is accepted.
        alpha: Nominal violation probability under the null hypothesis.

    Returns:
        The p-value as a float, or NaN when ``violations`` is empty.
    """
    hits = np.asarray(violations)
    x = int(hits.sum())
    n = int(hits.size)
    if n == 0:
        return np.nan
    # Keep the observed rate strictly inside (0, 1) so both logs stay finite
    # when there are no violations or only violations.
    p_hat = np.clip(x / n, 1e-8, 1 - 1e-8)
    ll_h0 = (n - x) * np.log(1 - alpha) + x * np.log(alpha)
    ll_h1 = (n - x) * np.log(1 - p_hat) + x * np.log(p_hat)
    # The statistic is non-negative in theory; floor away rounding noise.
    lr = max(-2 * (ll_h0 - ll_h1), 0.0)
    # The survival function keeps precision for small p-values, where
    # 1 - cdf would round to exactly zero.
    return float(chi2.sf(lr, 1))

bt = bt.copy()

# VaR at the configured level for each forecast date: the model uses its own
# mean forecast, the EWMA benchmark assumes a zero mean.
bt["model_var_5"] = [
    parametric_var(mu, sigma2, CONFIG["var_alpha"])
    for mu, sigma2 in zip(bt["model_return"], bt["model_var"])
]
bt["ewma_var_5"] = [
    parametric_var(0.0, sigma2, CONFIG["var_alpha"])
    for sigma2 in bt["ewma_var"]
]

# A violation is a realised return below the VaR threshold.
viol_model = bt["actual"].lt(bt["model_var_5"]).astype(int).to_numpy()
viol_ewma = bt["actual"].lt(bt["ewma_var_5"]).astype(int).to_numpy()

# One column per contender, each scored with the same four metrics.
summary = pd.DataFrame({
    "metric": ["RMSE return", "RMSE variance", "QLIKE variance", "Kupiec p-value"],
    **{
        label: [
            rmse(bt["actual"], bt[return_col]),
            rmse(bt["actual_var"], bt[variance_col]),
            qlike(bt["actual_var"], bt[variance_col]),
            kupiec_pof(hits, CONFIG["var_alpha"]),
        ]
        for label, return_col, variance_col, hits in (
            ("ARIMA-GARCH", "model_return", "model_var", viol_model),
            ("Benchmark", "naive_return", "ewma_var", viol_ewma),
        )
    },
})

summary

## Gesprächs-Storyline (2–3 Minuten)

- **Problem:** 24/7-Volatilität in Krypto erschwert robuste Risikoabschätzung.  
- **Ansatz:** ARIMA für Mean-Dynamik, GARCH-Familie für Volatilitätscluster.  
- **Vergleich:** gegen Naive-Return und EWMA-Volatilität.  
- **Validierung:** Rolling Backtest + VaR-Backtests (Kupiec).  
- **Takeaway:** Return-Edge ist oft klein, Risk-Modelling-Mehrwert ist klarer.