In [2]:
import os, datetime
import numpy as np
import pandas as pd

path = "./market_data/"
csv_files = [f for f in os.listdir(path) if f.endswith(".csv")]
dfs = {fname.replace(".csv", ""): pd.read_csv(os.path.join(path, fname)) for fname in csv_files}

def build_df_for_side(df, curr_side):
    df_side = df[df["side"] == curr_side]
    if df_side.empty:
        return pd.DataFrame(columns=["fecha_nano", "prices", "quantities", "side"])
    f = df_side["fecha_nano"].to_numpy(dtype=np.int64)
    p = df_side["price"].to_numpy(dtype=np.float64)
    q = df_side["quantity"].to_numpy(dtype=np.float64)
    order = np.argsort(f, kind="mergesort")
    f_s, p_s, q_s = f[order], p[order], q[order]
    keys, idx, cnt = np.unique(f_s, return_index=True, return_counts=True)
    starts, ends = idx, idx + cnt
    rows = []
    for s, e, k in zip(starts, ends, keys):
        seg_p = p_s[s:e].astype(np.float64)
        seg_q = q_s[s:e].astype(np.float64)
        valid = np.isfinite(seg_p) & np.isfinite(seg_q) & (seg_q > 0) & (seg_p > 0)
        seg_p = seg_p[valid]
        seg_q = seg_q[valid]
        rows.append({"fecha_nano": int(k), "prices": seg_p.tolist(), "quantities": seg_q.tolist(), "side": curr_side})
    return pd.DataFrame(rows)

def build_metrics(df):
    def _row_metrics(row):
        prices = np.asarray(row["prices"], dtype=float)
        qtys = np.asarray(row["quantities"], dtype=float)
        valid = np.isfinite(prices) & np.isfinite(qtys) & (qtys > 0)
        prices = prices[valid]
        qtys = qtys[valid]
        if qtys.size == 0 or np.nansum(qtys) == 0:
            return pd.Series({"vwap": np.nan, "spread": np.nan})
        vwap = np.average(prices, weights=qtys)
        var = np.average((prices - vwap) ** 2, weights=qtys)
        return pd.Series({"vwap": vwap, "spread": np.sqrt(var)})
    metrics = df.apply(_row_metrics, axis=1)
    return pd.concat([df[["fecha_nano", "side"]].reset_index(drop=True), metrics], axis=1)

def compute_instantaneous_returns(g):
    g = g.sort_values("ts").copy()
    p = g["vwap"].astype(float)
    dt = g["ts"].diff().dt.total_seconds()
    log_ret = np.log(p).diff()
    with np.errstate(divide="ignore", invalid="ignore"):
        r = log_ret / dt
    g["r_instant"] = pd.Series(r).replace([np.inf, -np.inf], np.nan).to_numpy()
    return g

all_series = {}
all_frames = []
for inst, df0 in dfs.items():
    df0 = df0[["fecha_nano", "price", "quantity", "side"]].dropna(subset=["price", "quantity", "side"])
    sides_present = df0["side"].dropna().unique().tolist()
    dfs_by_side = {s: build_df_for_side(df0, s) for s in sides_present}
    if len(dfs_by_side) == 0:
        continue
    df_concat = pd.concat(dfs_by_side.values(), ignore_index=True)
    df_metrics = build_metrics(df_concat).sort_values("fecha_nano").reset_index(drop=True)
    df_tmp = df_metrics.copy()
    df_tmp["ts"] = pd.to_datetime(df_tmp["fecha_nano"], unit="ns")
    df_tmp = df_tmp.sort_values(["side", "ts"])
    df_ret = df_tmp.groupby("side", group_keys=False).apply(compute_instantaneous_returns)
    series_by_side = {s: df_ret[df_ret["side"] == s].set_index("fecha_nano")["r_instant"] for s in df_ret["side"].unique()}
    all_series[inst] = series_by_side
    df_ret["instrument"] = inst
    all_frames.append(df_ret)

df_all = pd.concat(all_frames, ignore_index=True)

required_sides = {"BI", "OF", "TRADE"}
eligible = []
for inst, d in all_series.items():
    if not required_sides.issubset(set(d.keys())):
        continue
    ok = True
    for s in required_sides:
        ser = pd.Series(d[s]).dropna()
        if ser.size == 0:
            ok = False
            break
    if ok:
        eligible.append(inst)

selected_instruments = sorted(eligible)[:5]

series15_by_inst = {
    inst: {s: pd.Series(all_series[inst][s]).dropna() for s in ["BI", "OF", "TRADE"]}
    for inst in selected_instruments
}

df15 = df_all[df_all["instrument"].isin(selected_instruments) & df_all["side"].isin(["BI","OF","TRADE"])]
panel_wide = df15.pivot_table(index="fecha_nano", columns=["instrument","side"], values="r_instant").sort_index(axis=1)


  df_ret = df_tmp.groupby("side", group_keys=False).apply(compute_instantaneous_returns)
  df_ret = df_tmp.groupby("side", group_keys=False).apply(compute_instantaneous_returns)
  df_ret = df_tmp.groupby("side", group_keys=False).apply(compute_instantaneous_returns)
  df_ret = df_tmp.groupby("side", group_keys=False).apply(compute_instantaneous_returns)
  df_ret = df_tmp.groupby("side", group_keys=False).apply(compute_instantaneous_returns)


In [4]:
df15.sample(15)

Unnamed: 0,fecha_nano,side,vwap,spread,ts,r_instant,instrument
154466,1747073610333755486,BI,67.394012,0.026558,2025-05-12 18:13:30.333755486,6.323003e-06,AL30_1205_CI_CCL
639846,1747063723393361005,OF,77483.079208,15.154889,2025-05-12 15:28:43.393361005,0.0002343323,AL30_1205_CI_PESOS
61914,1747076024484504055,TRADE,23.01,0.0,2025-05-12 18:53:44.484504055,-0.02658391,PESOS-1305
635896,1747063565293130808,OF,77400.292252,15.449256,2025-05-12 15:26:05.293130808,0.01549268,AL30_1205_CI_PESOS
171238,1747077193796899303,BI,67.357419,0.012713,2025-05-12 19:13:13.796899303,3.819151,AL30_1205_CI_CCL
1096719,1747073693996290210,BI,68.347801,0.0124,2025-05-12 18:14:53.996290210,2.529992e-09,AL30_1205_CI_MEP
5380,1747064832207877940,BI,23.455725,0.010347,2025-05-12 15:47:12.207877940,-0.000500366,PESOS-1305
301662,1747075814038827317,OF,67.588846,0.024877,2025-05-12 18:50:14.038827317,-0.002387676,AL30_1205_CI_CCL
918611,1747067879246046266,TRADE,77760.0,0.0,2025-05-12 16:37:59.246046266,0.0,AL30_1205_CI_PESOS
941154,1747073306286405665,TRADE,78030.0,0.0,2025-05-12 18:08:26.286405665,-0.001594572,AL30_1205_CI_PESOS


In [None]:
import numpy as np
import pandas as pd

target_instrument = "AL30_1205_CI_CCL"
k_last = 3
dt_median_window = 20

df0 = df_all.copy()
if "ts" not in df0.columns:
    df0["ts"] = pd.to_datetime(df0["fecha_nano"], unit="ns")

trade = df0[df0["side"]=="TRADE"][["instrument","ts","vwap"]].dropna().sort_values(["instrument","ts"])
counts = trade.groupby("instrument").size().sort_values(ascending=False)
inst_pool = counts.index.tolist()
if target_instrument not in inst_pool:
    raise ValueError("Target instrument not found in df_all.")
others = [i for i in inst_pool if i!=target_instrument]
selected = [target_instrument] + others[:4]

def get_trade_df(inst):
    d = trade[trade["instrument"]==inst].sort_values("ts").reset_index(drop=True)
    d["t_sec"] = d["ts"].astype("int64")/1e9
    return d

def fit_line_last3_at_t(d, t):
    sub = d[d["ts"]<=t].tail(k_last)
    if len(sub)<k_last:
        return np.nan, np.nan
    x = sub["t_sec"].to_numpy()
    y = sub["vwap"].astype(float).to_numpy()
    xm = x.mean()
    X = np.vstack([np.ones_like(x), (x-xm)]).T
    a_c, b = np.linalg.lstsq(X, y, rcond=None)[0]
    t_sec = t.value/1e9
    y_t = a_c + b*(t_sec - xm)
    return float(y_t), float(b)

dfs_trade = {inst: get_trade_df(inst) for inst in selected}

t_grid = dfs_trade[target_instrument]["ts"].reset_index(drop=True)
valid_rows = []
for i in range(k_last-1, len(t_grid)-1):
    t0 = t_grid.iloc[i]
    t1 = t_grid.iloc[i+1]
    row = {"t0": t0, "t1": t1}
    ok = True
    p_vec = {}
    m_vec = {}
    for inst in selected:
        d = dfs_trade[inst]
        if inst==target_instrument:
            sub = d[d["ts"]<=t0].tail(k_last)
            if len(sub)<k_last:
                ok=False; break
            p_now_true = float(sub["vwap"].iloc[-1])
            _, m_now = fit_line_last3_at_t(d, t0)
            p_vec[f"p__{inst}"] = p_now_true
            m_vec[f"m__{inst}"] = m_now
        else:
            p_hat, m_hat = fit_line_last3_at_t(d, t0)
            if np.isnan(p_hat) or np.isnan(m_hat):
                ok=False; break
            p_vec[f"p__{inst}"] = p_hat
            m_vec[f"m__{inst}"] = m_hat
    if not ok:
        continue
    d_tar = dfs_trade[target_instrument]
    sub1 = d_tar[d_tar["ts"]<=t1].tail(k_last)
    if len(sub1)<k_last:
        continue
    _, m_next = fit_line_last3_at_t(d_tar, t1)
    dt_next = (t1 - t0).total_seconds()
    row.update(p_vec)
    row.update(m_vec)
    row["m_next"] = m_next
    row["dt_next"] = dt_next
    row["p_now"] = p_vec[f"p__{target_instrument}"]
    valid_rows.append(row)

train = pd.DataFrame(valid_rows).dropna().reset_index(drop=True)

feat_cols = [f"p__{inst}" for inst in selected] + [f"m__{inst}" for inst in selected]
X = train[feat_cols].to_numpy(float)
y = train["m_next"].to_numpy(float)
beta = np.linalg.lstsq(X, y, rcond=None)[0]

x_last = train[feat_cols].iloc[-1].to_numpy(float)
m_hat = float(x_last @ beta)

d_tar = dfs_trade[target_instrument]
dt_hist = d_tar[d_tar["ts"]<=train["t0"].iloc[-1]]["ts"].diff().dt.total_seconds().dropna()
dt_hat = float(dt_hist.tail(dt_median_window).median()) if len(dt_hist)>0 else 1.0

p0 = float(train["p_now"].iloc[-1])
p_next_hat = p0 + m_hat*dt_hat

out_summary = {
    "selected_instruments": selected,
    "n_samples": int(len(train)),
    "last_t0": str(train["t0"].iloc[-1]),
    "p0": p0,
    "m_hat": m_hat,
    "dt_hat_sec": dt_hat,
    "p_next_hat": p_next_hat
}
out_summary
