In [None]:
# ==============================================================
# Binance BTC/USDT hourly data loader and cleaner
# --------------------------------------------------------------
# • Reads all zipped monthly 1‑hour klines
# • Detects and fixes timestamp unit inconsistencies (ms/µs/ns)
# • Builds exact hourly timeline
# • Fills small gaps (ffill) and longer gaps (log‑linear interpolation)
# ==============================================================

import pandas as pd
import zipfile
from pathlib import Path
import numpy as np

# --------------------------------------------------------------
# 1) Configuration
# --------------------------------------------------------------
# Directory that is scanned (non-recursively) for the ``*.zip`` kline archives.
path = Path("data/spot/monthly/klines/BTCUSDT/1h/2017-01-01_2025-10-04")

# Column names applied, in order, to the CSVs inside the archives; the files
# are read with ``header=None`` so these names are the only schema.
columns = [
    "Open Time",
    "Open",
    "High",
    "Low",
    "Close",
    "Volume",
    "Close Time",
    "Quote Asset Volume",
    "Number of Trades",
    "Taker Buy Base Volume",
    "Taker Buy Quote Volume",
    "Ignore",
]


# --------------------------------------------------------------
# 2) Helper: detect and normalize timestamp units
# --------------------------------------------------------------
def normalize_timestamp_units(series: pd.Series, label: str = "") -> pd.Series:
    """
    Convert epoch timestamps of mixed resolution to milliseconds.

    Each value is classified by magnitude into exactly one band:

    * ``> 1e17``           → nanoseconds  (divided by 1,000,000)
    * ``> 1e14, <= 1e17``  → microseconds (divided by 1,000)
    * everything else      → already milliseconds (left unchanged)

    NaN values compare false against both thresholds, so they land in the
    millisecond band and are returned as NaN.

    Parameters
    ----------
    series : pd.Series
        Numeric epoch timestamps. The input is not modified.
    label : str, optional
        Name shown in the printed summary line.

    Returns
    -------
    pd.Series
        Float series of millisecond timestamps with the same index.
    """
    s = series.astype(float)

    # The bands must be disjoint: a nanosecond value is also > 1e14, so the
    # microsecond mask has to exclude it explicitly. Otherwise it would be
    # counted twice and scaled by 1e3 * 1e6.
    mask_ns = s > 1e17  # likely nanoseconds (rare)
    mask_us = (s > 1e14) & ~mask_ns  # likely microseconds
    mask_ms = ~(mask_us | mask_ns)  # the rest are milliseconds

    n_total = len(s)
    n_ms, n_us, n_ns = int(mask_ms.sum()), int(mask_us.sum()), int(mask_ns.sum())
    label_str = f" for {label}" if label else ""
    print(
        f"🕓 Timestamp units{label_str}: "
        f"{n_ms:,} ms  |  {n_us:,} µs  |  {n_ns:,} ns  |  total = {n_total:,}"
    )

    if n_us:
        s = s.mask(mask_us, s / 1_000.0)
    if n_ns:
        s = s.mask(mask_ns, s / 1_000_000.0)
    return s


# --------------------------------------------------------------
# 3) Read all zipped monthly files
# --------------------------------------------------------------
# Sort the archives so the load order does not depend on the filesystem, and
# fail early with a clear message instead of pandas' "No objects to
# concatenate" when the directory is empty or mistyped.
zip_paths = sorted(path.glob("*.zip"))
if not zip_paths:
    raise FileNotFoundError(f"No .zip kline archives found in {path.resolve()}")

# Every column is parsed as float64 except the trade count, which is integral.
csv_dtypes = {col: "float64" for col in columns}
csv_dtypes["Number of Trades"] = "int32"

frames = []
for zip_path in zip_paths:
    with zipfile.ZipFile(zip_path) as z:
        # Only the first archive member is read.
        csv_name = z.namelist()[0]
        with z.open(csv_name) as f:
            frames.append(
                pd.read_csv(f, header=None, names=columns, dtype=csv_dtypes)
            )

df = pd.concat(frames, ignore_index=True)
print(f"\n📦 Loaded {len(df):,} hourly rows")

# --------------------------------------------------------------
# 4) Normalize timestamps and convert to datetime
# --------------------------------------------------------------
# Bring both timestamp columns to milliseconds, then parse them as datetimes.
# Values that cannot be parsed become NaT (errors="coerce").
for time_col in ("Open Time", "Close Time"):
    ms_values = normalize_timestamp_units(df[time_col], time_col)
    df[time_col] = pd.to_datetime(ms_values, unit="ms", errors="coerce")

# --------------------------------------------------------------
# 5) Set index, remove duplicates, regularize to 1‑hour grid
# --------------------------------------------------------------
df = df.set_index("Open Time").sort_index()

# Keep only the first row for every repeated open time.
dup_mask = df.index.duplicated(keep="first")
if dup_mask.any():
    dups = dup_mask.sum()
    print(f"⚠️  Found {dups:,} duplicate timestamps → dropping duplicates")
    df = df[~dup_mask]

# Reindex onto a strict hourly grid; hours without a candle become NaN rows.
df = df.asfreq("1h")
first_ts, last_ts = df.index.min(), df.index.max()
print(f"🕒 Time range: {first_ts} → {last_ts} | {len(df):,} rows after asfreq")

# --------------------------------------------------------------
# 6) Detect and fill missing hourly data
# --------------------------------------------------------------
# Gaps of at most this many consecutive missing hours are forward-filled;
# longer gaps are interpolated. An unbounded ffill would fill *every* gap and
# leave the interpolation below with nothing to do.
MAX_FFILL_GAP_HOURS = 3

missing_mask = df.isna().any(axis=1)
missing_before = int(missing_mask.sum())
if missing_before == 0:
    print("\n✅ No missing hourly candles detected.")
else:
    print(f"\n🚧 Found {missing_before:,} missing hourly candles → filling...")

    price_cols = ["Open", "High", "Low", "Close"]
    num_cols = df.select_dtypes(include=[np.number]).columns
    other_cols = num_cols.difference(price_cols)

    # Length of the run of consecutive missing rows that each row belongs to.
    run_id = missing_mask.astype(int).diff().ne(0).cumsum()
    run_len = missing_mask.astype(int).groupby(run_id).transform("sum")
    long_gap = missing_mask & (run_len > MAX_FFILL_GAP_HOURS)

    # Numeric cells that were NaN inside a long gap: these get re-opened after
    # the forward fill so that they are interpolated instead.
    reopen = df[num_cols].isna()
    reopen.loc[~long_gap, :] = False

    # Forward fill handles short gaps and all non-numeric columns
    # (e.g. "Close Time", which cannot be interpolated numerically).
    df = df.ffill()
    df[num_cols] = df[num_cols].mask(reopen)

    # Log‑linear interpolation for price columns
    df[price_cols] = np.exp(
        np.log(df[price_cols]).interpolate(method="time", limit_direction="both")
    )

    # Linear interpolation for other numeric columns
    df[other_cols] = df[other_cols].interpolate(method="time", limit_direction="both")

    still_missing = df.isna().any(axis=1).sum()
    if still_missing == 0:
        print("✅ All missing values filled successfully.")
    else:
        print(
            f"⚠️  {still_missing} rows remain partially missing (likely at dataset edges)."
        )

# --------------------------------------------------------------
# 7) Final summary
# --------------------------------------------------------------
# Collect the summary lines first, then emit them in a single write.
summary_lines = [
    "\n===== FINAL SUMMARY =====",
    f"Shape: {df.shape}",
    f"Time span: {df.index.min()} → {df.index.max()}",
    f"Total hours: {len(df):,}",
    f"Columns: {df.columns}\n",
    "\nPreview:",
]
print("\n".join(summary_lines))

# First and last rows, separated by an ellipsis line.
head_rows, tail_rows = df.head(), df.tail()
print(head_rows, "\n...\n", tail_rows)

### How to choose hyperparameters for event count score?

In [None]:
# Expected length of one directional-change cycle, in days.
d = 20

# Span covered by the hourly index, in whole days and in (Julian) years.
start, end = df.index.min(), df.index.max()
n_days = (end - start).days
n_years = n_days / 365.25

# Hourly rows → days → number of cycles of length d.
res = (len(df["Close"]) / 24) / d

print(
    f"If you expect one full directional‑change cycle every {d} days, "
    f"you’d have roughly {res:.0f} events over a {n_years:.1f}‑year interval.\n"
)

# Multipliers (α₁, α₂) applied to the expected event count; remove an entry to
# skip that range.
range_types = {
    "narrow": (0.8, 1.2),  # very strict bounds
    "medium": (0.5, 1.5),  # balanced bounds
    "broad": (0.3, 2.0),  # tolerant bounds
}

# Derive the integer event-count window for every range type.
for name, bounds in range_types.items():
    a1, a2 = bounds
    N_min, N_max = (int(alpha * res) for alpha in bounds)
    print(f"{name.capitalize()} range → α₁={a1}, α₂={a2}")
    print(f"  N_min={N_min:<5d} N_max={N_max:<5d} (target ~{int(res)} events)\n")

### How to choose hyperparameters for up_down_asymmetry?
The up–down asymmetry measure has no theoretical bounds or closed‑form scaling constant.
Its magnitude depends on the realized overshoots (`OSV_EXT`) computed from
the directional-change segmentation, which vary strongly with the threshold θ,
the instrument’s volatility, and the sampling frequency.
The normalization range is therefore estimated empirically: the next cell sweeps θ
and records the global minimum and maximum of `OSV_EXT`.

In [None]:
from core.dc import compute_directional_change_events, attach_OSV_EXT_to_runs

# Sweep 30 thresholds between 1 % and 30 % over the hourly close prices.
thetas = np.linspace(0.01, 0.30, 30)
prices = df["Close"].to_numpy()

# One list of overshoot values per threshold, skipping runs without OSV_EXT.
osv_per_theta = []
for theta in thetas:
    events, runs = compute_directional_change_events(prices, theta)
    runs = attach_OSV_EXT_to_runs(runs, theta)
    osv_per_theta.append(
        [r["OSV_EXT"] for r in runs if r.get("OSV_EXT") is not None]
    )

# Flatten and take the global extremes across all thresholds.
all_osv = [value for chunk in osv_per_theta for value in chunk]
osv_min = np.min(all_osv)
osv_max = np.max(all_osv)
print(f"Global OSV range across θ: min={osv_min:.4f}, max={osv_max:.4f}")

In [None]:
import numpy as np
import plotly.graph_objects as go
from core.dc import (
    compute_directional_change_events,
    attach_OSV_EXT_to_runs,
    attach_TMV_EXT_to_runs,
    attach_T_to_runs,
    attach_R_to_runs,
)
from core.opt import event_count_score

# Threshold grid: 1 % … 30 % in 30 steps, evaluated on the close prices.
thetas = np.linspace(0.01, 0.30, 30)
prices = df["Close"].to_numpy()

penalty_vals, counts = [], []

for theta in thetas:
    events, runs = compute_directional_change_events(prices, theta)
    # Attach the indicators in the same order as the rest of the notebook.
    runs = attach_TMV_EXT_to_runs(runs, theta)
    runs = attach_OSV_EXT_to_runs(runs, theta)
    runs = attach_T_to_runs(runs)
    runs = attach_R_to_runs(runs, theta)

    penalty = event_count_score(runs, N_min=48, N_max=146, p=1.5)
    n_runs = len(runs)
    penalty_vals.append(penalty)
    counts.append(n_runs)
    print(f"θ={theta:.3f} | runs={n_runs:4d} | penalty={penalty:.4f}")

# x-axis in percent
x = thetas * 100

# Penalty curve, drawn against the left axis.
penalty_trace = go.Scatter(
    x=x,
    y=penalty_vals,
    name="Penalty",
    mode="lines+markers",
    marker={"color": "blue"},
    line={"color": "blue"},
    hovertemplate="θ=%{x:.2f}%<br>Penalty=%{y:.4f}",
)

# Run count, drawn against the secondary (right) axis.
count_trace = go.Scatter(
    x=x,
    y=counts,
    name="Event count",
    mode="lines+markers",
    marker={"color": "red", "symbol": "square"},
    line={"color": "red", "dash": "dash"},
    yaxis="y2",
    hovertemplate="θ=%{x:.2f}%<br>Runs=%{y}",
)

fig = go.Figure()
fig.add_trace(penalty_trace)
fig.add_trace(count_trace)

# Two y-axes: blue on the left for the penalty, red on the right for counts.
fig.update_layout(
    title="Event‑count penalty (left) vs Number of runs (right)",
    xaxis={"title": "Threshold θ (%)"},
    yaxis={
        "title": {"text": "Event‑count penalty", "font": {"color": "blue"}},
        "tickfont": {"color": "blue"},
    },
    yaxis2={
        "title": {"text": "Number of runs", "font": {"color": "red"}},
        "tickfont": {"color": "red"},
        "overlaying": "y",
        "side": "right",
    },
    legend={"x": 0.02, "y": 0.98, "bgcolor": "rgba(0,0,0,0)"},
    template="plotly_white",
)

fig.show()

In [None]:
from core.opt import up_down_asymmetry

asym_vals, counts = [], []

for theta in thetas:
    # Segment the series at this threshold and attach every run indicator.
    events, runs = compute_directional_change_events(prices, theta)
    runs = attach_TMV_EXT_to_runs(runs, theta)
    runs = attach_OSV_EXT_to_runs(runs, theta)
    runs = attach_T_to_runs(runs)
    runs = attach_R_to_runs(runs, theta)

    # up_down_asymmetry yields one normalized component per direction; the
    # combined score is their arithmetic mean.
    mu_up_norm, mu_down_norm = up_down_asymmetry(runs, osv_min, osv_max)
    score = (mu_up_norm + mu_down_norm) / 2.0
    n_runs = len(runs)

    asym_vals.append(score)
    counts.append(n_runs)
    print(
        f"θ={theta:.3f} | runs={n_runs:4d} | "
        f"μ_up={mu_up_norm:.4f} | μ_down={mu_down_norm:.4f} | score={score:.4f}"
    )

# x-axis in percent
x = thetas * 100

# Asymmetry score against the left axis.
score_trace = go.Scatter(
    x=x,
    y=asym_vals,
    name="Up–Down Asymmetry Score (avg of μ↑, μ↓)",
    mode="lines+markers",
    marker={"color": "dodgerblue"},
    line={"color": "dodgerblue"},
    hovertemplate="θ = %{x:.2f}%<br>Asymmetry Score = %{y:.4f}",
)

# Number of runs against the secondary (right) axis.
count_trace = go.Scatter(
    x=x,
    y=counts,
    name="Event Count",
    mode="lines+markers",
    marker={"color": "red", "symbol": "square"},
    line={"color": "red", "dash": "dash"},
    yaxis="y2",
    hovertemplate="θ = %{x:.2f}%<br>Runs = %{y}",
)

fig = go.Figure()
fig.add_trace(score_trace)
fig.add_trace(count_trace)

# Left axis = score, right axis = event count.
fig.update_layout(
    title="Normalized Up–Down Asymmetry Score (left) vs Event Count (right)",
    xaxis={"title": "Threshold θ (%)"},
    yaxis={
        "title": {"text": "Asymmetry Score [0–1]", "font": {"color": "dodgerblue"}},
        "tickfont": {"color": "dodgerblue"},
    },
    yaxis2={
        "title": {"text": "Number of Runs", "font": {"color": "red"}},
        "tickfont": {"color": "red"},
        "overlaying": "y",
        "side": "right",
    },
    legend={"x": 0.02, "y": 0.98, "bgcolor": "rgba(0,0,0,0)"},
    template="plotly_white",
)

fig.show()

In [None]:
from deap import base, creator, tools, algorithms
import random
import numpy as np
from matplotlib import pyplot as plt


# =====================================================
# 1️⃣  Evaluation function
# =====================================================


def make_evaluate_theta(prices, osv_min, osv_max, N_min, N_max, p):
    """
    Build the fitness callback that DEAP invokes for each individual.

    The returned function closes over the price series and the scoring
    constants, so it only needs the individual (a one-element sequence
    holding θ) and returns the pair ``(event_count_score, asymmetry_score)``.
    """

    def evaluate_theta(individual):
        theta = individual[0]

        # Segment the series at this threshold; only OSV_EXT is attached here.
        _events, runs = compute_directional_change_events(prices, theta)
        runs = attach_OSV_EXT_to_runs(runs, theta)

        # Objective 1: event-count score.
        count_score = event_count_score(runs, N_min=N_min, N_max=N_max, p=p)

        # Objective 2: mean of the upward and downward asymmetry components.
        mu_up, mu_down = up_down_asymmetry(runs, osv_min, osv_max)
        asymmetry_score = (mu_up + mu_down) / 2.0

        # Both objectives are maximized by the optimizer.
        return count_score, asymmetry_score

    return evaluate_theta


# =====================================================
# 2️⃣  NSGA‑II setup / execution
# =====================================================


def run_nsga2(
    prices,
    osv_min,
    osv_max,
    N_min=40,
    N_max=170,
    p=2,
    ngen=50,
    pop_size=80,
    cxpb=0.7,
    mutpb=0.3,
    seed=42,
    theta_low=0.001,
    theta_high=0.40,
):
    """
    Search for Pareto-optimal thresholds θ with a (μ + λ) loop and NSGA-II
    selection.

    Parameters
    ----------
    prices : price series forwarded to the evaluation function.
    osv_min, osv_max : normalization bounds forwarded to ``up_down_asymmetry``.
    N_min, N_max, p : parameters forwarded to ``event_count_score``.
    ngen : number of generations.
    pop_size : population size μ; ``2 * pop_size`` offspring are bred per
        generation.
    cxpb, mutpb : crossover / mutation probabilities.
    seed : seed applied to both ``random`` and ``numpy.random``.
    theta_low, theta_high : inclusive search bounds for θ. They are used for
        initialization and mutation, and to clip crossover offspring.

    Returns
    -------
    (pareto_thetas, pareto_scores)
        ``pareto_thetas`` is a 1-D array of θ values on the Pareto front;
        ``pareto_scores`` is an ``(n, 2)`` array of the matching
        ``(event_count_score, asymmetry_score)`` pairs.

    Raises
    ------
    ValueError
        If ``theta_low`` is not strictly below ``theta_high``.
    """
    if not theta_low < theta_high:
        raise ValueError(
            f"theta_low ({theta_low}) must be smaller than theta_high ({theta_high})"
        )

    random.seed(seed)
    np.random.seed(seed)

    # --- Setup evolutionary framework -----------------
    # creator.create registers classes globally; creating them again on every
    # call (e.g. re-running the notebook cell) warns and overwrites them.
    if not hasattr(creator, "FitnessMulti"):
        creator.create("FitnessMulti", base.Fitness, weights=(1.0, 1.0))  # maximize both
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMulti)

    def mate_bounded(ind1, ind2):
        # Blend crossover can place children outside the parents' interval
        # (including θ <= 0). Under eaMuPlusLambda an offspring comes from
        # crossover OR mutation, so the bounded mutation cannot be relied on
        # to repair them: clip explicitly.
        tools.cxBlend(ind1, ind2, alpha=0.5)
        for child in (ind1, ind2):
            for i, gene in enumerate(child):
                child[i] = min(max(gene, theta_low), theta_high)
        return ind1, ind2

    toolbox = base.Toolbox()
    toolbox.register("attr_theta", random.uniform, theta_low, theta_high)
    toolbox.register(
        "individual", tools.initRepeat, creator.Individual, toolbox.attr_theta, n=1
    )
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    toolbox.register(
        "evaluate", make_evaluate_theta(prices, osv_min, osv_max, N_min, N_max, p)
    )
    toolbox.register("mate", mate_bounded)
    toolbox.register(
        "mutate",
        tools.mutPolynomialBounded,
        low=theta_low,
        up=theta_high,
        eta=25.0,
        indpb=1.0,
    )
    toolbox.register("select", tools.selNSGA2)

    # --- Initialize population & Hall of Fame ----------
    pop = toolbox.population(n=pop_size)
    hof = tools.ParetoFront()

    # --- Run evolutionary loop (all keyword style) -----
    algorithms.eaMuPlusLambda(
        population=pop,
        toolbox=toolbox,
        mu=pop_size,
        lambda_=2 * pop_size,
        cxpb=cxpb,
        mutpb=mutpb,
        ngen=ngen,
        halloffame=hof,
        verbose=True,
    )

    # --- Extract Pareto‑optimal results ----------------
    pareto_thetas = np.array([ind[0] for ind in hof])
    # reshape keeps the result two-dimensional even for an empty front, so
    # callers can always slice columns.
    pareto_scores = np.array([ind.fitness.values for ind in hof]).reshape(-1, 2)
    return pareto_thetas, pareto_scores


# =====================================================
# 3️⃣  Run optimizer
# =====================================================

# Optimize θ on the hourly close prices with the settings chosen for this run.
prices = df["Close"].to_numpy()

nsga2_settings = {
    "osv_min": osv_min,
    "osv_max": osv_max,
    "N_min": 73,
    "N_max": 220,
    "p": 2,
    "cxpb": 0.7,
    "mutpb": 0.3,
    "ngen": 40,
    "pop_size": 40,
}
pareto_thetas, pareto_scores = run_nsga2(prices, **nsga2_settings)

# =====================================================
# 4️⃣  Visualize Pareto front
# =====================================================

# Column 0 holds the event-count objective, column 1 the asymmetry objective.
score_event = pareto_scores[:, 0]
score_asym = pareto_scores[:, 1]

plt.figure(figsize=(6, 5))
# Points are colored by their θ value.
front_points = plt.scatter(
    score_event, score_asym, s=50, c=pareto_thetas, cmap="viridis"
)
plt.xlabel("Event‑count score [0–1]")
plt.ylabel("Asymmetry score [0–1]")
plt.title("Pareto‑optimal front via NSGA‑II")
plt.colorbar(front_points, label="θ")
plt.grid(alpha=0.4)
plt.tight_layout()
plt.show()

# Tabulate every solution on the front.
for θ, (f1, f2) in zip(pareto_thetas, zip(score_event, score_asym)):
    print(f"θ={θ:.4f} | event_count={f1:.4f} | asymmetry={f2:.4f}")

In [None]:
from core.plotting import plot_directional_change_runs

# Threshold to visualize.
# NOTE(review): presumably picked from the Pareto front printed above — confirm.
theta = 0.1596
prices = df["Close"].to_numpy()

events, runs = compute_directional_change_events(prices, theta)

# Plot window: the twelve months up to today's midnight.
end_date = pd.Timestamp.today().normalize()
start_date = end_date - pd.DateOffset(years=1)

plot_directional_change_runs(
    prices=prices,
    # Pass the threshold the events were actually computed with; the rounded
    # literal 0.16 used previously did not match `theta`.
    theta=theta,
    events=events,
    runs=runs,
    datetimes=df["Close Time"].to_numpy(),
    start=start_date,
    end=end_date,
    mark_events=True,
)