In [None]:
from core.data import load_all_klines
import pandas as pd

# Minimum history (in years) a symbol must cover to be kept.
YEARS = 4

df = load_all_klines(
    root="data/data/spot/monthly/klines/",
    interval="1h",
    range_folder="2017-01-01_2025-10-08",
)

# Every symbol present before filtering.
all_symbols = df.index.get_level_values("Symbol").unique()

# Time span per symbol = latest minus earliest "Open Time".
# max/min (instead of last-row minus first-row) stays correct even when the
# rows of a symbol are not in chronological order, and avoids a Python-level
# lambda per group.
open_times = pd.Series(
    df.index.get_level_values("Open Time"),
    index=df.index.get_level_values("Symbol"),
)
open_times_by_symbol = open_times.groupby(level=0)
time_spans = open_times_by_symbol.max() - open_times_by_symbol.min()

min_timedelta = pd.Timedelta(days=YEARS * 365.25)

# Symbols whose history is long enough, and the ones that are not.
valid_symbols = time_spans[time_spans >= min_timedelta].index
dropped_symbols = all_symbols.difference(valid_symbols)

# Report what is being dropped and how much history each dropped symbol had.
print(f"Dropped {len(dropped_symbols)} symbols (less than {YEARS} years of data):")
for sym in sorted(dropped_symbols):
    span = time_spans.get(sym, pd.Timedelta(0))
    print(f"  {sym}: {span.days} days")

# Keep only the rows that belong to a valid symbol.
df = df[df.index.get_level_values("Symbol").isin(valid_symbols)]

In [None]:
import numpy as np
from core.plotting import plot_z_returns

# Wide frame of close prices: one row per timestamp, one column per symbol.
df_close = df["Close"].unstack("Symbol")

# Log returns for each asset:
# r_{i,t} = ln(P_{i,t} / P_{i,t-1})
returns = np.log(df_close / df_close.shift(1))

# Whole-period volatility per asset (sample standard deviation of returns):
# σ_i = sqrt( (1 / (T-1)) * Σ_t (r_{i,t} - mean(r_i))^2 )
global_vol = returns.std()

# Volatility-normalized returns:
# z_{i,t} = r_{i,t} / σ_i
# Each return is expressed in units of that asset's own volatility
# (dimensionless). The mean is not subtracted.
z_returns = returns / global_vol

# Pick 3 coins at random to show alongside Bitcoin. A seeded generator keeps
# the selection identical across Restart & Run All.
symbols = z_returns.columns.drop("BTCUSDT")
sample_rng = np.random.default_rng(42)
sampled = sample_rng.choice(symbols, 3, replace=False)
selected = ["BTCUSDT"] + sampled.tolist()

# Sanity check: on the period where all selected coins have data, the hourly
# grid must be gap-free (asfreq would otherwise insert NaN rows).
plot_df = z_returns[selected].dropna(how="any")
plot_df = plot_df.asfreq("1h")
assert not plot_df.isna().any().any(), "Unexpected NaNs after asfreq('1h')"

# NOTE(review): plot_df is only used for the check above; the full z_returns
# frame is what gets plotted. Confirm plot_z_returns restricts to the
# overlapping period itself, otherwise pass plot_df here.
plot_z_returns(z_returns, selected)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from tqdm import tqdm

# Set to False to skip the (slow) pairwise correlation computation below.
run = True

# Unique symbols in order of first appearance, and an n x n matrix of zeros
# that the pairwise loop fills in.
symbols = df.index.unique(level="Symbol")
n = symbols.size
correlation_matrix = np.zeros((n, n), dtype=float)


def align_series(series1, series2):
    """Restrict both series to the index labels they share.

    Returns a 2-tuple ``(series1_common, series2_common)``; both are empty
    when the two indexes have no label in common.
    """
    shared = series1.index.intersection(series2.index)
    return tuple(series.loc[shared] for series in (series1, series2))


if run:
    total_combinations = n * (n - 1) // 2

    # Pull each symbol's Close series out of the MultiIndex frame once,
    # instead of repeating the lookup for every pair.
    close_by_symbol = {
        symbol: df.xs(symbol, level="Symbol")["Close"] for symbol in symbols
    }

    # Only the upper triangle (i < j) is visited, and the bar advances exactly
    # once per pair so it ends at total_combinations.
    with tqdm(total=total_combinations, desc="Computing correlations") as progress_bar:
        for i, symbol_i in enumerate(symbols):
            data_i = close_by_symbol[symbol_i]
            for j in range(i + 1, n):
                data_j = close_by_symbol[symbols[j]]
                s1, s2 = align_series(data_i, data_j)

                if len(s1) < 2:
                    # No (or a single) common timestamp: Pearson correlation
                    # is undefined and pearsonr would raise.
                    correlation_matrix[i, j] = np.nan
                else:
                    # Pearson correlation of CLOSE prices on the common
                    # timestamps (price levels, not returns).
                    correlation_matrix[i, j] = pearsonr(s1, s2)[0]
                progress_bar.update(1)

    # Mirror the upper triangle into the lower one and set the diagonal to 1.
    correlation_matrix = correlation_matrix + correlation_matrix.T
    np.fill_diagonal(correlation_matrix, 1.0)

    # Persist as a labelled table; a later cell reads this file back.
    correlation_df = pd.DataFrame(correlation_matrix, index=symbols, columns=symbols)
    correlation_df.to_csv("correlation.csv")

    # --- Heatmap plot ---
    plt.figure(figsize=(10, 8))
    plt.title("Correlation Matrix of Crypto Assets")
    plt.imshow(correlation_matrix, cmap="coolwarm", interpolation="none", aspect="auto")
    plt.colorbar(label="Pearson Correlation")
    plt.xticks(ticks=range(n), labels=symbols, rotation=90)
    plt.yticks(ticks=range(n), labels=symbols)
    plt.tight_layout()
    plt.show()

In [None]:
import pandas as pd
import numpy as np

# Reload the correlation table written by the correlation cell.
correlation_df = pd.read_csv("correlation.csv", index_col="Symbol")

# Absolute correlation of every other asset with BTC.
abs_corr = correlation_df["BTCUSDT"].abs().drop("BTCUSDT")

# Quantile threshold: q = 0.3 keeps the 30% of assets least correlated to BTC.
q = 0.3
t_corr = abs_corr.quantile(q)
print(f"Chosen correlation threshold (q={q:.2f}): {t_corr:.3f}")

# Assets at or below the threshold, with BTC itself put first.
less_corr_assets = abs_corr.loc[abs_corr.le(t_corr)].index.tolist()
selected_symbols = ["BTCUSDT", *less_corr_assets]

# Rows of the selected symbols only, sorted by symbol and then by open time.
in_selection = df.index.get_level_values("Symbol").isin(selected_symbols)
df_btc_lesscorr = (
    df.loc[in_selection].sort_index(level=["Symbol", "Open Time"]).copy()
)

included_symbols = df_btc_lesscorr.index.get_level_values("Symbol").unique().tolist()
print("Included symbols:", included_symbols)

# Plot only the first five included symbols.
plot_z_returns(z_returns, included_symbols[:5])

#### Visualize event_density_score function

In [None]:
import numpy as np
import plotly.graph_objects as go
from core.dc import (
    compute_directional_change_events,
    attach_OSV_EXT_to_runs,
    attach_TMV_EXT_to_runs,
    attach_T_to_runs,
    attach_R_to_runs,
)
from core.opt import event_density_score

# Sweep of 10,000 thresholds between 0.01 and 10.0.
thetas = np.linspace(0.01, 10.0, 10000)

# NOTE: despite the name, this is the z-scored BTC return series, not a
# price series.
prices = z_returns["BTCUSDT"]

penalty_vals, counts = [], []

# Reference: d_target = 0.002 → roughly one event every 500 samples
d_target = 0.002

for theta in thetas:
    # Directional-change segmentation at this threshold, then attach the
    # per-run indicators.
    events, runs = compute_directional_change_events(prices, theta)
    runs = attach_TMV_EXT_to_runs(runs, theta)
    runs = attach_OSV_EXT_to_runs(runs, theta)
    runs = attach_T_to_runs(runs)
    runs = attach_R_to_runs(runs, theta)

    penalty_vals.append(
        event_density_score(prices, events, d_target=d_target, alpha=2, beta=1.5)
    )
    counts.append(len(runs))

x = thetas * 100  # thresholds scaled by 100 for the x-axis

# -------------------------------------------------------------------------
# Additional "log-space" representation
# -------------------------------------------------------------------------
# Density = runs per sample; ratio = density relative to the target density.
T = len(prices)
densities = np.array(counts) / T
ratio = densities / d_target

# Thresholds that produced no runs have ratio 0, whose log10 is -inf; leave
# them out of the log-space curve instead of plotting infinities.
has_runs = ratio > 0

# Use log10 for visual readability
log_ratio = np.log10(ratio[has_runs])

# Sort by log ratio so the curve is drawn with increasing x.
sorted_idx = np.argsort(log_ratio)
log_ratio_sorted = log_ratio[sorted_idx]
penalty_sorted = np.array(penalty_vals)[has_runs][sorted_idx]

# --- Interactive Plotly figure for the threshold sweep ---
fig = go.Figure(
    data=[
        # Penalty against the threshold (primary y-axis).
        go.Scatter(
            x=x,
            y=penalty_vals,
            name="Penalty (vs θ)",
            mode="lines+markers",
            marker={"color": "blue"},
            line={"color": "blue"},
            hovertemplate="θ=%{x:.2f}%<br>Penalty=%{y:.4f}",
        ),
        # Penalty against log10(d / d_target); shares the same x-axis.
        go.Scatter(
            x=log_ratio_sorted,
            y=penalty_sorted,
            name="Penalty (vs log₁₀ density ratio)",
            mode="lines+markers",
            marker={"color": "green", "symbol": "circle"},
            line={"color": "green", "dash": "dot"},
            hovertemplate="log₁₀(d/dₜ)=%{x:.3f}<br>Penalty=%{y:.4f}",
        ),
        # Number of runs against the threshold (secondary y-axis).
        go.Scatter(
            x=x,
            y=counts,
            name="Event count",
            mode="lines+markers",
            marker={"color": "red", "symbol": "square"},
            line={"color": "red", "dash": "dash"},
            yaxis="y2",
            hovertemplate="θ=%{x:.2f}%<br>Runs=%{y}",
        ),
    ]
)

# Two y-axes: penalty on the left (blue), run count on the right (red).
fig.update_layout(
    title="Event‑count penalty (left) vs Number of runs (right)",
    xaxis={"title": "Threshold θ (%) and log₁₀(d/dₜ)", "showgrid": True},
    yaxis={
        "title": {"text": "Event‑count penalty", "font": {"color": "blue"}},
        "tickfont": {"color": "blue"},
    },
    yaxis2={
        "title": {"text": "Number of runs", "font": {"color": "red"}},
        "tickfont": {"color": "red"},
        "overlaying": "y",
        "side": "right",
    },
    legend={"x": 0.02, "y": 0.98, "bgcolor": "rgba(0,0,0,0)"},
    template="plotly_white",
)

fig.show()

## Estimate the squash value for up_down_asymmetry

### **Choosing a good `squash` (bootstrapped, multi‑coin)**

1. **Run the calibration script** — it collects overshoot ratios (|OSV_EXT| / θ) for all coins and performs bootstrapped sampling.  
   You’ll see a **distribution of squash estimates** and a **μ(test r)** stability plot.

2. **Check the histogram of bootstrapped `squash` values.**  
   - If it’s narrow and single‑peaked → calibration is stable.  
   - The mean / median of that distribution is your global `squash`.

3. **Inspect μ(test r) curves.**  
   - If curves almost overlap → excellent stability.  
   - If they spread widely → different coins behave differently → consider grouping or re‑running per group.

4. **Use the median bootstrap result** (printed as *“Recommended global squash”*).  
   - Typical overshoots (around the median r) should yield μ ≈ 0.6 – 0.8.  
   - This keeps the metric responsive but not over‑sensitive.

5. **Practical rule:**  
   - Use the single, bootstrapped global `squash` for all coins → consistent, dimensionless normalization.  
   - Re‑run the bootstrap only if you add many new assets or regimes change noticeably.

---

> 💡 *Intuition:* the bootstrap automatically tests “what if I had different coins?”  
> If the squash estimate hardly changes (small spread), you’ve found a robust universal scaling constant.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from core.dc import compute_directional_change_events, attach_OSV_EXT_to_runs

# --- Configuration ---------------------------------------------------------
# Fraction of the available coins drawn in each bootstrap iteration.
sample_frac = 0.7
# Number of bootstrap iterations.
m_bootstrap = 50
# 30 evenly spaced thresholds from 0.01 to 0.30.
thetas = np.linspace(start=0.01, stop=0.30, num=30)
# Every column (coin) of the z-scored return frame.
all_coins = list(z_returns.columns)
# Seeded generator so the bootstrap draws are reproducible.
rng = np.random.default_rng(seed=42)


# --- Helper: collect |OSV_EXT| / θ ratios for one coin --------------------
def collect_ratios(prices: np.ndarray, thetas: np.ndarray) -> np.ndarray:
    """Gather |OSV_EXT| / θ over all runs and all thresholds for one series.

    For each threshold the series is segmented into directional-change runs,
    OSV_EXT is attached to the runs, and the non-missing, non-NaN values are
    converted to absolute ratios relative to that threshold.

    Returns a 1-D float array with the ratios of every threshold concatenated
    in threshold order; it is empty when no run has a usable OSV_EXT.
    """
    per_theta = []
    for theta in thetas:
        _, runs = compute_directional_change_events(prices, theta)
        runs = attach_OSV_EXT_to_runs(runs, theta)
        overshoots = np.array(
            [run["OSV_EXT"] for run in runs if run.get("OSV_EXT") is not None], float
        )
        usable = overshoots[~np.isnan(overshoots)]
        if usable.size:
            per_theta.append(np.abs(usable) / theta)
    return np.concatenate(per_theta) if per_theta else np.array([])


# --- 1. Collect overshoot ratios per coin ---------------------------------
# A coin is kept only if it yields at least 100 ratios across all thresholds.
ratios_per_coin = {}
for coin in all_coins:
    prices = z_returns[coin].dropna().to_numpy()
    ratios = collect_ratios(prices, thetas)
    if ratios.size >= 100:
        ratios_per_coin[coin] = ratios

available_coins = list(ratios_per_coin.keys())
n_total = len(available_coins)
if n_total == 0:
    raise ValueError("No valid coins with sufficient overshoot data found.")

# --- 2. Baseline: median of coin medians ----------------------------------
coin_medians = {c: np.median(r) for c, r in ratios_per_coin.items()}
baseline = np.median(list(coin_medians.values()))
print(f"\nBaseline median of coin medians: {baseline:.2f} from {n_total} coins")

# --- 3. Bootstrap hierarchical medians ------------------------------------
# Each iteration draws coins with replacement and takes the median of their
# per-coin medians. The sample size does not change between iterations, and
# the per-coin medians are already in coin_medians, so neither is recomputed
# inside the loop.
n_sample = max(1, int(sample_frac * n_total))
boot_squash = np.array(
    [
        np.median(
            [
                coin_medians[c]
                for c in rng.choice(available_coins, size=n_sample, replace=True)
            ]
        )
        for _ in range(m_bootstrap)
    ]
)

# --- 4. Bootstrap diagnostics ---------------------------------------------
# Spread of the bootstrap estimates (population standard deviation).
print("\n--- Bootstrap stability ---")
mean_s = boot_squash.mean()
std_s = boot_squash.std()
cv = std_s / mean_s
print(f"Mean squash  : {mean_s:.2f}")
print(f"Std deviation: {std_s:.2f}")
print(f"Coeff. of variation: {cv:.3f}")

hist_fig, hist_ax = plt.subplots()
hist_ax.hist(boot_squash, bins=12, color="cornflowerblue", edgecolor="k", alpha=0.8)
hist_ax.axvline(mean_s, color="red", linestyle="--", label="mean")
hist_ax.set_title("Bootstrap distribution of squash estimates")
hist_ax.set_xlabel("Estimated squash")
hist_ax.set_ylabel("Frequency")
hist_ax.legend()
hist_fig.tight_layout()
plt.show()

# --- 5. μ(test_r) stability curves ----------------------------------------
# Evaluate μ = 1 - exp(-r / s) on a grid spanning the 5th-95th percentile of
# all pooled ratios, once for every bootstrap estimate s (one row per s).
combined = np.concatenate(list(ratios_per_coin.values()))
test_r = np.linspace(np.percentile(combined, 5), np.percentile(combined, 95), 50)
mu_curves = 1 - np.exp(-test_r[np.newaxis, :] / boot_squash[:, np.newaxis])
mu_mean = mu_curves.mean(axis=0)
mu_std = mu_curves.std(axis=0)

curve_fig, curve_ax = plt.subplots(figsize=(8, 5))
for mu_curve in mu_curves:
    curve_ax.plot(test_r, mu_curve, color="gray", alpha=0.3, lw=1)
curve_ax.plot(test_r, mu_mean, "k", lw=2, label="Mean μ curve")
curve_ax.fill_between(
    test_r,
    mu_mean - mu_std,
    mu_mean + mu_std,
    color="lightgray",
    alpha=0.6,
    label="±1σ band",
)
curve_ax.set_title("μ(test_r) stability (bootstrap over coins)")
curve_ax.set_xlabel("|OSV_EXT| / θ")
curve_ax.set_ylabel("μ = 1 - exp(-r/s)")
curve_ax.grid(True, linestyle="--", alpha=0.4)
curve_ax.legend()
curve_fig.tight_layout()
plt.show()

# --- 6. Final global squash estimate --------------------------------------
# Median of the bootstrap estimates, with the 5th-95th percentile interval.
global_squash = np.median(boot_squash)
ci_low, ci_high = np.percentile(boot_squash, [5, 95])
print(f"\n✅  Recommended global squash: {global_squash:.2f}")
print(f"    90% bootstrap CI: [{ci_low:.2f}, {ci_high:.2f}]")

#### Visualize up_down_asymmetry

In [None]:
from core.opt import up_down_asymmetry

# z-scored BTC returns are used as the input series.
prices = z_returns["BTCUSDT"]
asym_vals = []
counts = []

# NOTE: `thetas` is whatever the most recently executed cell assigned to it.
for theta in thetas:
    # Directional-change segmentation, then the per-run indicators in the
    # order TMV_EXT, OSV_EXT, T, R.
    events, runs = compute_directional_change_events(prices, theta)
    for attach in (attach_TMV_EXT_to_runs, attach_OSV_EXT_to_runs):
        runs = attach(runs, theta)
    runs = attach_R_to_runs(attach_T_to_runs(runs), theta)

    # Up and down components of the asymmetry measure.
    # NOTE(review): the literal 4 is presumably the squash constant estimated
    # in the bootstrap cell — confirm against core.opt.up_down_asymmetry.
    mu_up_norm, mu_down_norm = up_down_asymmetry(runs, theta, 4)
    # Single score per threshold: the mean of both directions.
    score = (mu_up_norm + mu_down_norm) / 2.0

    n_runs = len(runs)
    asym_vals.append(score)
    counts.append(n_runs)
    print(
        f"θ={theta:.3f} | runs={n_runs:4d} | μ_up={mu_up_norm:.4f} | "
        f"μ_down={mu_down_norm:.4f} | score={score:.4f}"
    )

# --- Interactive Plotly figure: asymmetry score and run count vs threshold ---
x = thetas * 100  # thresholds scaled by 100 for the x-axis
fig = go.Figure(
    data=[
        # Asymmetry score (primary y-axis).
        go.Scatter(
            x=x,
            y=asym_vals,
            name="Up–Down Asymmetry Score (avg of μ↑, μ↓)",
            mode="lines+markers",
            marker={"color": "dodgerblue"},
            line={"color": "dodgerblue"},
            hovertemplate="θ = %{x:.2f}%<br>Asymmetry Score = %{y:.4f}",
        ),
        # Number of runs (secondary y-axis).
        go.Scatter(
            x=x,
            y=counts,
            name="Event Count",
            mode="lines+markers",
            marker={"color": "red", "symbol": "square"},
            line={"color": "red", "dash": "dash"},
            yaxis="y2",
            hovertemplate="θ = %{x:.2f}%<br>Runs = %{y}",
        ),
    ]
)

# Left axis carries the score, right axis the run count.
fig.update_layout(
    title="Normalized Up–Down Asymmetry Score (left) vs Event Count (right)",
    xaxis={"title": "Threshold θ (%)"},
    yaxis={
        "title": {"text": "Asymmetry Score [0–1]", "font": {"color": "dodgerblue"}},
        "tickfont": {"color": "dodgerblue"},
    },
    yaxis2={
        "title": {"text": "Number of Runs", "font": {"color": "red"}},
        "tickfont": {"color": "red"},
        "overlaying": "y",
        "side": "right",
    },
    legend={"x": 0.02, "y": 0.98, "bgcolor": "rgba(0,0,0,0)"},
    template="plotly_white",
)

fig.show()

In [None]:
from deap import base, creator, tools, algorithms
import random
import numpy as np
from matplotlib import pyplot as plt


# =====================================================
# 1️⃣  Evaluation function
# =====================================================


def make_evaluate_theta(prices, osv_min, osv_max, N_min, N_max, p):
    """Build the single-argument fitness function that DEAP calls.

    The returned closure captures the data series and the scoring parameters,
    so the optimizer only has to pass the individual being evaluated.

    Returns:
        A function mapping an individual (a sequence whose first element is
        the threshold) to a 2-tuple ``(event_count_objective,
        asymmetry_objective)``.
    """

    def evaluate_theta(individual):
        threshold = individual[0]

        # Directional-change segmentation with OSV_EXT attached to the runs.
        _, dc_runs = compute_directional_change_events(prices, threshold)
        dc_runs = attach_OSV_EXT_to_runs(dc_runs, threshold)

        # Objective 1: event-count score.
        # NOTE(review): event_count_score is not imported anywhere in the
        # visible notebook (only event_density_score is) — confirm where it
        # comes from, otherwise this raises NameError on first evaluation.
        count_objective = event_count_score(dc_runs, N_min=N_min, N_max=N_max, p=p)

        # Objective 2: mean of the up and down asymmetry components.
        # NOTE(review): another cell calls up_down_asymmetry(runs, theta, 4);
        # confirm which argument convention the function expects.
        up_part, down_part = up_down_asymmetry(dc_runs, osv_min, osv_max)
        asymmetry_objective = (up_part + down_part) / 2.0

        return count_objective, asymmetry_objective

    return evaluate_theta


# =====================================================
# 2️⃣  NSGA‑II setup / execution
# =====================================================


def run_nsga2(
    prices,
    osv_min,
    osv_max,
    N_min=40,
    N_max=170,
    p=2,
    ngen=50,
    pop_size=80,
    cxpb=0.7,
    mutpb=0.3,
    seed=42,
):
    """Search for Pareto-optimal thresholds with a (mu + lambda) NSGA-II loop.

    Each individual is a one-element list holding the threshold θ, restricted
    to [0.001, 0.40]. Both objectives returned by the evaluation function are
    maximized.

    Args:
        prices, osv_min, osv_max, N_min, N_max, p: Forwarded unchanged to
            ``make_evaluate_theta``.
        ngen: Number of generations.
        pop_size: Population size (mu); 2 * pop_size offspring are produced
            per generation.
        cxpb: Probability that an offspring is produced by crossover.
        mutpb: Probability that an offspring is produced by mutation.
        seed: Seed applied to both ``random`` and ``numpy.random``.

    Returns:
        ``(pareto_thetas, pareto_scores)``: the thresholds on the Pareto front
        and their fitness values, one row per threshold.
    """
    theta_low, theta_high = 0.001, 0.40

    random.seed(seed)
    np.random.seed(seed)

    # --- Setup evolutionary framework -----------------
    # creator.create adds classes to the shared `creator` module; only create
    # them once so re-running the cell does not re-create (and warn about)
    # existing classes.
    if not hasattr(creator, "FitnessMulti"):
        creator.create("FitnessMulti", base.Fitness, weights=(1.0, 1.0))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMulti)

    def mate_within_bounds(ind1, ind2):
        """Blend crossover followed by clamping every gene to the θ bounds.

        cxBlend can extrapolate beyond the parents, so without the clamp an
        offspring threshold may leave [theta_low, theta_high] (and even become
        negative) before it is evaluated.
        """
        tools.cxBlend(ind1, ind2, alpha=0.5)
        for child in (ind1, ind2):
            for k, gene in enumerate(child):
                child[k] = min(max(gene, theta_low), theta_high)
        return ind1, ind2

    toolbox = base.Toolbox()
    toolbox.register("attr_theta", random.uniform, theta_low, theta_high)
    toolbox.register(
        "individual", tools.initRepeat, creator.Individual, toolbox.attr_theta, n=1
    )
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    toolbox.register(
        "evaluate", make_evaluate_theta(prices, osv_min, osv_max, N_min, N_max, p)
    )
    toolbox.register("mate", mate_within_bounds)
    toolbox.register(
        "mutate",
        tools.mutPolynomialBounded,
        low=theta_low,
        up=theta_high,
        eta=25.0,
        indpb=1.0,
    )
    toolbox.register("select", tools.selNSGA2)

    # --- Initialize population & Hall of Fame ----------
    pop = toolbox.population(n=pop_size)
    hof = tools.ParetoFront()

    # --- Run evolutionary loop -------------------------
    algorithms.eaMuPlusLambda(
        population=pop,
        toolbox=toolbox,
        mu=pop_size,
        lambda_=2 * pop_size,
        cxpb=cxpb,
        mutpb=mutpb,
        ngen=ngen,
        halloffame=hof,
        verbose=True,
    )

    # --- Extract Pareto-optimal results ----------------
    pareto_thetas = np.array([ind[0] for ind in hof])
    pareto_scores = np.array([ind.fitness.values for ind in hof])
    return pareto_thetas, pareto_scores


# =====================================================
# 3️⃣  Run optimizer
# =====================================================

# `df` holds several symbols stacked in one MultiIndex frame, so taking
# df["Close"] directly would concatenate unrelated price series into a single
# array. Optimize on one asset; BTCUSDT is the reference asset in the other
# cells.
prices = df.xs("BTCUSDT", level="Symbol")["Close"].to_numpy()

# NOTE(review): osv_min and osv_max are not assigned anywhere in the visible
# notebook — define them (config cell) before running, otherwise this call
# raises NameError.
pareto_thetas, pareto_scores = run_nsga2(
    prices,
    osv_min=osv_min,
    osv_max=osv_max,
    N_min=73,
    N_max=220,
    p=2,
    cxpb=0.7,
    mutpb=0.3,
    ngen=40,
    pop_size=40,
)

# =====================================================
# 4️⃣  Visualize Pareto front
# =====================================================

# Column 0 holds the event-count objective, column 1 the asymmetry objective.
score_event, score_asym = pareto_scores[:, 0], pareto_scores[:, 1]

# Scatter of the front in objective space, coloured by the threshold.
front_fig, front_ax = plt.subplots(figsize=(6, 5))
front_points = front_ax.scatter(
    score_event, score_asym, s=50, c=pareto_thetas, cmap="viridis"
)
front_ax.set_xlabel("Event‑count score [0–1]")
front_ax.set_ylabel("Asymmetry score [0–1]")
front_ax.set_title("Pareto‑optimal front via NSGA‑II")
front_fig.colorbar(front_points, ax=front_ax, label="θ")
front_ax.grid(alpha=0.4)
front_fig.tight_layout()
plt.show()

# One line per Pareto-optimal threshold with both objective values.
for theta_star, event_obj, asym_obj in zip(pareto_thetas, score_event, score_asym):
    print(f"θ={theta_star:.4f} | event_count={event_obj:.4f} | asymmetry={asym_obj:.4f}")

In [None]:
from core.plotting import plot_directional_change_runs

# Threshold to visualize; the same value is used for segmentation and plot.
theta = 0.1596

# `df` stacks several symbols; restrict to one asset so that prices and
# timestamps form a single, consistent series.
df_btc = df.xs("BTCUSDT", level="Symbol")
prices = df_btc["Close"].to_numpy()

events, runs = compute_directional_change_events(prices, theta)

# Plot window: the twelve months ending today.
# NOTE(review): the window is anchored to the current date, not to the last
# timestamp in the data, so it drifts away from the data set over time.
end_date = pd.Timestamp.today().normalize()
start_date = end_date - pd.DateOffset(years=1)

plot_directional_change_runs(
    prices=prices,
    theta=theta,
    events=events,
    runs=runs,
    datetimes=df_btc["Close Time"].to_numpy(),
    start=start_date,
    end=end_date,
    mark_events=True,
)