In [None]:
import argparse
import glob
import math
from pathlib import Path
import os
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [None]:
def loading(path):
    """Load every ``*.csv`` order-book file in ``path`` into a single frame.

    From each file only ``ts_event`` and the ten bid/ask price and size
    levels (``bid_px_00`` ... ``ask_sz_09``) are read.  Two columns are added:

    * ``ts`` -- ``ts_event`` parsed as nanoseconds since the epoch, in UTC;
    * ``m``  -- minutes elapsed since 09:30 America/New_York, computed from
      the time of day only (rows from different dates share the same bucket).

    Price columns are multiplied by 1e-9 (presumably the files store
    fixed-point integer prices -- TODO confirm against the data source).

    Parameters
    ----------
    path : str or os.PathLike
        Directory containing the CSV files.

    Returns
    -------
    pandas.DataFrame
        Rows with ``0 <= m < 390`` (09:30 up to, not including, 16:00 local
        time), sorted by ``ts``.

    Raises
    ------
    ValueError
        If ``path`` contains no ``*.csv`` file.
    """
    px_cols = [f"{s}_px_{i:02d}" for i in range(10) for s in ("bid", "ask")]
    sz_cols = [f"{s}_sz_{i:02d}" for i in range(10) for s in ("bid", "ask")]
    wanted = ["ts_event"] + px_cols + sz_cols

    # glob order is filesystem-dependent; sort so repeated runs agree.
    files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not files:
        raise ValueError(f"no CSV files found in {path!r}")

    frames = [pd.read_csv(file, usecols=wanted) for file in files]
    df = pd.concat(frames, ignore_index=True)

    df["ts"] = pd.to_datetime(df["ts_event"], unit="ns", utc=True)
    local_ts = df["ts"].dt.tz_convert("America/New_York")
    # `local_ts` is a Series, so the datetime fields live behind `.dt`.
    df["m"] = (local_ts.dt.hour - 9) * 60 + (local_ts.dt.minute - 30)

    df[px_cols] = df[px_cols] * 1e-9

    in_session = (df["m"] >= 0) & (df["m"] < 390)
    # Stable sort keeps file/row order for rows sharing a timestamp.
    return df[in_session].sort_values("ts", kind="stable")

In [None]:
def minute_metrics(df, depth_bps):
    """Summarise the last order-book row of every minute bucket.

    Rows are grouped by the ``m`` column and, for each group, only its last
    row (in the frame's current order) is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``m`` plus ``bid_px_00``, ``bid_sz_00`` and the ten
        ``ask_px_XX`` / ``ask_sz_XX`` columns.
    depth_bps : float
        Width, in basis points above the best ask, of the price band whose
        ask sizes are summed into ``V``.

    Returns
    -------
    pandas.DataFrame
        One row per retained minute with columns
        ``minute, mid, spread, imbalance, V``:

        * ``mid``       -- average of best bid and best ask;
        * ``spread``    -- best ask minus best bid;
        * ``imbalance`` -- ``(ask_sz_00 - bid_sz_00) / (ask_sz_00 + bid_sz_00)``;
        * ``V``         -- sum of ask sizes at levels priced at or below
          ``best_ask * (1 + depth_bps / 1e4)``.

        Minutes whose best bid or best ask is missing are skipped.  The
        columns are present even when no minute is retained.
    """
    columns = ["minute", "mid", "spread", "imbalance", "V"]
    rows = []
    for m, data in df.groupby("m", sort=True, observed=True):
        last = data.iloc[-1]  # fetch the closing row of the minute once

        best_ask = last["ask_px_00"]
        best_bid = last["bid_px_00"]
        if pd.isna(best_ask) or pd.isna(best_bid):
            continue
        spread = best_ask - best_bid
        mid = 0.5 * (best_ask + best_bid)

        ask_sz_0 = last["ask_sz_00"]
        bid_sz_0 = last["bid_sz_00"]
        # The tiny constant keeps the ratio finite when both sizes are zero.
        imbalance = (ask_sz_0 - bid_sz_0) / (ask_sz_0 + bid_sz_0 + 1e-12)

        price_ceiling = best_ask * (1 + depth_bps / 1e4)
        # A NaN price compares False, so missing levels are left out.
        V = sum(last[f"ask_sz_{i:02d}"]
                for i in range(10)
                if last[f"ask_px_{i:02d}"] <= price_ceiling)

        rows.append(dict(minute=m, mid=mid, spread=spread,
                         imbalance=imbalance, V=V))
    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
def g_t_buy(snap, mid, x_grid):
    """Per-unit cost above ``mid`` of buying ``x`` units against the ask side.

    For every ``x`` in ``x_grid`` the ten ask levels of ``snap`` are consumed
    in order (level 00 first) until ``x`` units are filled; the last level
    touched is only partially consumed.  The result is the volume-weighted
    average fill price minus ``mid``.

    Parameters
    ----------
    snap : pandas.Series
        One book row holding ``ask_px_00..09`` and ``ask_sz_00..09``.
    mid : float
        Reference price subtracted from the average fill price.
    x_grid : array-like of float
        Order sizes to evaluate.

    Returns
    -------
    numpy.ndarray
        Float array shaped like ``x_grid``.  Entries are NaN where ``x`` is
        not strictly positive or exceeds the total size of the ten levels.
    """
    # Cast to float up front: with integer sizes the fractional residual
    # written into the weight vector below would be truncated.
    prices = np.asarray(snap[[f"ask_px_{i:02d}" for i in range(10)]], dtype=float)
    sizes = np.asarray(snap[[f"ask_sz_{i:02d}" for i in range(10)]], dtype=float)
    cum_sizes = np.cumsum(sizes)

    x_arr = np.asarray(x_grid, dtype=float)
    g = np.full_like(x_arr, np.nan)

    for k, x in enumerate(x_arr):
        if not x > 0:
            # Zero, negative or NaN quantity: no fill to price (a zero
            # total weight would also make the average undefined).
            continue
        # First level at which the cumulative size covers x.
        idx = int(np.searchsorted(cum_sizes, x, side="left"))
        if idx >= len(prices):
            continue  # x is larger than the visible depth
        filled_before = cum_sizes[idx - 1] if idx > 0 else 0.0
        w = sizes[:idx + 1].copy()
        w[idx] = x - filled_before  # partial fill on the last level
        vwap = np.average(prices[:idx + 1], weights=w)
        g[k] = vwap - mid
    return g


def fit_alpha_eta(x_grid: np.ndarray,
                  g_vec: np.ndarray,
                  V: float) -> tuple[float, float]:
    """Fit ``g(x) = eta * (x / V) ** alpha`` by OLS in log-log space.

    Regresses ``log(g)`` on a constant and ``log(x / V)``; the slope is
    ``alpha`` and the exponential of the intercept is ``eta``.

    Parameters
    ----------
    x_grid : numpy.ndarray
        Order sizes.
    g_vec : numpy.ndarray
        Values of ``g`` at each size in ``x_grid``; NaN marks missing points.
    V : float
        Normalising volume; must be finite and strictly positive.

    Returns
    -------
    (alpha, eta) : tuple of float

    Raises
    ------
    ValueError
        If ``V`` is not finite and positive, if fewer than three points have
        finite, strictly positive ``x`` and ``g`` (only those can be
        log-transformed), or if all usable ``x`` are identical.
    """
    x_arr = np.asarray(x_grid, dtype=float)
    g_arr = np.asarray(g_vec, dtype=float)

    # Only strictly positive, finite pairs survive the log transform.
    with np.errstate(invalid="ignore"):
        mask = (np.isfinite(g_arr) & (g_arr > 0)
                & np.isfinite(x_arr) & (x_arr > 0))

    if not np.isfinite(V) or V <= 0:
        raise ValueError("V must be finite and strictly positive")
    if mask.sum() < 3:
        raise ValueError("need at least three usable (x, g) points")

    logx = np.log(x_arr[mask] / V)
    logy = np.log(g_arr[mask])
    if np.all(logx == logx[0]):
        # With a single distinct x the slope cannot be identified.
        raise ValueError("x_grid has no variation among usable points")

    X = sm.add_constant(logx)
    model = sm.OLS(logy, X).fit()
    alpha = model.params[1]
    eta = math.exp(model.params[0])
    return alpha, eta

In [None]:
def _plot_impact_fit(ticker, minute, x_grid, g_vec, alpha, eta, V):
    """Draw observed g(x) and the fitted power law on log-log axes."""
    import matplotlib.ticker as mtick  # only needed when plotting

    plt.figure(figsize=(4, 3))
    plt.loglog(x_grid, g_vec, "o", label="data")
    plt.loglog(x_grid, eta * (x_grid / V) ** alpha,
               label=f"fit α={alpha:.2f}")
    plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('${x:.3f}'))
    plt.xlabel("amount x")
    plt.ylabel("gₜ(x)")
    plt.title(f"{ticker} minute {minute}")
    plt.legend()
    plt.tight_layout()
    plt.show()


def process_ticker(df, ticker, x_grid, plot=False, depth_bps=20.0):
    """Fit the per-minute impact parameters ``alpha`` and ``eta`` for a ticker.

    For every minute returned by ``minute_metrics`` the last book row of that
    minute is passed to ``g_t_buy`` and the resulting curve to
    ``fit_alpha_eta``.  Minutes for which the fit raises ``ValueError`` are
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Book rows with an ``m`` (minute) column.
    ticker : str
        Used only in the plot title.
    x_grid : numpy.ndarray
        Order sizes at which the impact curve is evaluated.
    plot : bool, default False
        If true, plot data and fit for the middle minute of the metrics
        table (only when that minute's fit succeeded).
    depth_bps : float, default 20.0
        Forwarded to ``minute_metrics``.

    Returns
    -------
    pandas.DataFrame
        One row per fitted minute with columns
        ``minute, alpha, eta, spread, imbalance, V``.
    """
    met = minute_metrics(df, depth_bps)
    results = []
    if met.empty:
        return pd.DataFrame(results)

    # Pick the last row of every minute in one pass rather than filtering
    # the whole frame again for each minute inside the loop.
    last_rows = df.drop_duplicates(subset="m", keep="last")
    row_pos = {int(minute): pos
               for pos, minute in enumerate(last_rows["m"].to_numpy())}

    plot_minute = met["minute"].iloc[len(met) // 2] if plot else None

    for _, r in met.iterrows():
        m = int(r.minute)
        snap = last_rows.iloc[row_pos[m]]
        g_vec = g_t_buy(snap, r.mid, x_grid)
        try:
            alpha, eta = fit_alpha_eta(x_grid, g_vec, r.V)
        except ValueError:
            continue  # not enough usable points for this minute
        results.append(dict(minute=m, alpha=alpha, eta=eta,
                            spread=r.spread, imbalance=r.imbalance, V=r.V))
        if plot and m == plot_minute:
            _plot_impact_fit(ticker, m, x_grid, g_vec, alpha, eta, r.V)
    return pd.DataFrame(results)

def regress_eta(df):
    """Regress ``eta`` on spread, spread volatility and imbalance via OLS.

    The input is copied and ordered by ``minute``.  A ``sigma`` regressor is
    derived as the rolling standard deviation of ``spread`` over a window of
    five rows, with leading NaNs back-filled.  The caller's frame is not
    modified.

    Returns the fitted statsmodels OLS results for
    ``eta ~ const + spread + sigma + imbalance``.
    """
    ordered = df.copy().sort_values("minute")
    rolling_sd = ordered["spread"].rolling(5, min_periods=1).std()
    ordered = ordered.assign(sigma=rolling_sd.bfill())

    regressors = ordered[["spread", "sigma", "imbalance"]]
    design = sm.add_constant(regressors)
    response = ordered["eta"]
    return sm.OLS(response, design).fit()