In [10]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA

DATA_DIR = "data/2025-09-30"
round = 'round_5'

prices  = pd.read_csv(f"{DATA_DIR}/prices.csv",     parse_dates=["date"]).set_index("date").sort_index()
volumes = pd.read_csv(f"{DATA_DIR}/volumes.csv",    parse_dates=["date"]).set_index("date").sort_index()
signals = pd.read_csv(f"{DATA_DIR}/signals.csv",    parse_dates=["date"]).set_index("date").sort_index()
cash    = pd.read_csv(f"{DATA_DIR}/cash_rate.csv",  parse_dates=["date"]).set_index("date").sort_index()

# Align assets & dates across the main panels
common_assets = sorted(set(prices.columns) & set(volumes.columns) & set(signals.columns))
common_dates = prices.index.intersection(volumes.index).intersection(signals.index)

print("Dates:", prices.index.min(), "->", prices.index.max(), "| Assets:", len(common_assets))

Dates: 2017-01-03 00:00:00 -> 2025-09-30 00:00:00 | Assets: 0


In [11]:
# Assets come from prices.csv
assets = list(prices.columns)
assert len(assets) > 0, "prices has zero asset columns."

# Align to common dates across prices/volumes/signals
common_dates = prices.index.intersection(volumes.index).intersection(signals.index)
prices  = prices.loc[common_dates, assets]
volumes = volumes.loc[common_dates]
signals = signals.loc[common_dates]

# volumes.csv has columns like INSTRUMENT_1_vol -> rename back to INSTRUMENT_1
vol_cols = [f"{a}_vol" for a in assets]
missing_vol = [c for c in vol_cols if c not in volumes.columns]
assert len(missing_vol) == 0, f"Missing volume columns like: {missing_vol[:10]}"

volumes_renamed = volumes[vol_cols].copy()
volumes_renamed.columns = assets

# signals.csv has INSTRUMENT_k_trend{4,8,16,32}
trend_horizons = [4, 8, 16, 32]
sig_trend = {}
for h in trend_horizons:
    cols = [f"{a}_trend{h}" for a in assets]
    missing = [c for c in cols if c not in signals.columns]
    assert len(missing) == 0, f"Missing trend columns for trend{h}, e.g. {missing[:10]}"
    tmp = signals[cols].copy()
    tmp.columns = assets
    sig_trend[f"trend{h}"] = tmp

print("Aligned shapes -> prices:", prices.shape, "| volumes_renamed:", volumes_renamed.shape, "| signals:", signals.shape)
print("Date range:", prices.index.min(), "->", prices.index.max())

Aligned shapes -> prices: (3124, 10) | volumes_renamed: (3124, 10) | signals: (3124, 40)
Date range: 2017-01-03 00:00:00 -> 2025-09-30 00:00:00


In [12]:
logp = np.log(prices)
ret1 = logp.diff()

H = 60
# Forward 60 trading-day log return: sum_{t+1..t+60} r
y = ret1.rolling(H).sum().shift(-H)

print("ret1 shape:", ret1.shape, "| y shape:", y.shape)

ret1 shape: (3124, 10) | y shape: (3124, 10)


In [13]:
feat_dict = {}

# momentum base
mom_3  = ret1.rolling(3).sum()
mom_10 = ret1.rolling(10).sum()
mom_20 = ret1.rolling(20).sum()

# momentum extra windows
mom_windows_extra = [30, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 220, 240, 260]
mom_extra = {f"mom_{w}": ret1.rolling(w).sum() for w in mom_windows_extra}

# reversal
rev_1 = -ret1

# vol base
vol_10 = ret1.rolling(10).std()
vol_20 = ret1.rolling(20).std()

# vol extra
vol_windows_extra = [30, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 220, 240, 260]
vol_extra = {f"vol_{w}": ret1.rolling(w).std() for w in vol_windows_extra}

# volume z-score (handle zeros safely)
volumes_safe = volumes_renamed.replace(0, np.nan)
volz_20 = (volumes_safe - volumes_safe.rolling(20).mean()) / (volumes_safe.rolling(20).std() + 1e-12)

# cross-sectional dispersion replicated across assets
disp = ret1.std(axis=1)
dispersion = pd.DataFrame(np.repeat(disp.values[:, None], len(assets), axis=1),
                          index=prices.index, columns=assets)

# relative features
rel_mom_10 = mom_10.sub(mom_10.mean(axis=1), axis=0)
rel_ret_1  = ret1.sub(ret1.mean(axis=1), axis=0)

# avg correlation vs market proxy
market = ret1.mean(axis=1)
avg_corr = pd.DataFrame(index=prices.index, columns=assets, dtype=float)
window_corr = 60
for a in assets:
    avg_corr[a] = ret1[a].rolling(window_corr).corr(market)

# rolling PCA features (factor, loading, residual) for multiple windows
def rolling_pca_features(ret_df: pd.DataFrame, window: int):
    dates = ret_df.index
    assets_all = ret_df.columns

    pc1_factor  = pd.DataFrame(np.nan, index=dates, columns=assets_all)
    pc1_loading = pd.DataFrame(np.nan, index=dates, columns=assets_all)
    pc1_resid_1 = pd.DataFrame(np.nan, index=dates, columns=assets_all)

    pca = PCA(n_components=1)

    for i in range(window, len(dates)):
        window_slice = ret_df.iloc[i-window:i].dropna(axis=1, how="any")
        if window_slice.shape[1] < 2:
            continue

        X = window_slice.values
        X = X - X.mean(axis=0, keepdims=True)

        pca.fit(X)
        x_last = X[-1:]
        score = pca.transform(x_last)[0, 0]
        loadings = pca.components_[0]

        recon = score * loadings
        resid = x_last.flatten() - recon

        slice_assets = window_slice.columns
        pc1_factor.loc[dates[i], slice_assets] = score
        pc1_loading.loc[dates[i], slice_assets] = loadings
        pc1_resid_1.loc[dates[i], slice_assets] = resid

    return pc1_factor, pc1_loading, pc1_resid_1

pca_windows = [20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 300]
pca_factor_feats, pca_loading_feats, pca_resid_feats = {}, {}, {}

for w in pca_windows:
    fac_df, load_df, resid_df = rolling_pca_features(ret1, window=w)
    pca_factor_feats[f"pc1_factor_w{w}"]   = fac_df
    pca_loading_feats[f"pc1_loading_w{w}"] = load_df
    pca_resid_feats[f"pc1_resid_1_w{w}"]   = resid_df

# cash3mo macro replicated across assets (your cash_rate.csv has '3mo')
cash_ff = cash.reindex(prices.index).ffill()
cash3mo = cash_ff["3mo"]
macro_3mo = pd.DataFrame(np.repeat(cash3mo.values[:, None], len(assets), axis=1),
                         index=prices.index, columns=assets)

# assemble full feat_dict
feat_dict = {
    "rel_mom_10": rel_mom_10,
    "rel_ret_1": rel_ret_1,
    "avg_corr": avg_corr,

    "mom_3": mom_3,
    "mom_10": mom_10,
    "mom_20": mom_20,
    "rev_1": rev_1,
    "vol_10": vol_10,
    "vol_20": vol_20,
    "volz_20": volz_20,
    "disp": dispersion,

    "trend4": sig_trend["trend4"],
    "trend8": sig_trend["trend8"],
    "trend16": sig_trend["trend16"],
    "trend32": sig_trend["trend32"],

    "cash3mo": macro_3mo,
}

feat_dict.update(mom_extra)
feat_dict.update(vol_extra)
feat_dict.update(pca_factor_feats)
feat_dict.update(pca_loading_feats)
feat_dict.update(pca_resid_feats)

print("Total features:", len(feat_dict))

Total features: 88


In [14]:
# Build wide df with MultiIndex columns: (feature, asset)
X_wide = pd.concat(feat_dict, axis=1)  # (feature, asset)

# Convert to panel with rows (date, asset) and columns feature
# Since our concat is (feature, asset), stacking level=1 will stack the asset level.
X_panel_all = X_wide.stack(level=1)
X_panel_all.index.names = ["date", "asset"]  # (date, asset) index
# columns are feature names now

# Replace inf -> NaN
X_panel_all = X_panel_all.replace([np.inf, -np.inf], np.nan)

# Cross-sectional rank per date
X_panel_all = X_panel_all.groupby(level=0).rank(pct=True)

# Fill NaNs per date with per-date median (transform keeps same index always)
med = X_panel_all.groupby(level=0).transform("median")
X_panel_all = X_panel_all.fillna(med)

# Any remaining NaNs (e.g., all NaN on that date/feature) -> neutral 0.5
X_panel_all = X_panel_all.fillna(0.5)

feature_cols = list(X_panel_all.columns)
assert len(feature_cols) > 0, "feature_cols ended up empty (should not happen with this data)."

print("X_panel_all shape:", X_panel_all.shape, "| #features:", len(feature_cols))

X_panel_all shape: (31240, 88) | #features: 88


In [15]:
y_panel = y.stack().rename("y")
y_panel.index.names = ["date", "asset"]

panel_train = X_panel_all.join(y_panel, how="inner").dropna(subset=["y"])

dates = panel_train.index.get_level_values(0).unique().sort_values()
print("panel_train shape:", panel_train.shape, "| trainable dates:", len(dates))
print("Trainable date range:", dates.min(), "->", dates.max())

panel_train shape: (30640, 89) | trainable dates: 3064
Trainable date range: 2017-01-03 00:00:00 -> 2025-08-01 00:00:00


In [16]:
split = int(len(dates) * 0.8)
train_dates = dates[:split]
train = panel_train.loc[train_dates]

model = Ridge(alpha=1.0)
model.fit(train[feature_cols].to_numpy(dtype=float), train["y"].to_numpy(dtype=float))

latest_date = X_panel_all.index.get_level_values(0).max()
latest_X = X_panel_all.loc[latest_date, feature_cols]  # index=asset

mu_hat = pd.Series(model.predict(latest_X.to_numpy(dtype=float)), index=latest_X.index, name="mu_hat")

print("Train last date:", train_dates.max())
print("Predict date:", latest_date)
print(mu_hat.sort_values(ascending=False).head(10))

Train last date: 2023-11-27 00:00:00
Predict date: 2025-09-30 00:00:00
asset
INSTRUMENT_7     0.114257
INSTRUMENT_9     0.064927
INSTRUMENT_10    0.035492
INSTRUMENT_2     0.021769
INSTRUMENT_5     0.006207
INSTRUMENT_3    -0.013325
INSTRUMENT_1    -0.020425
INSTRUMENT_6    -0.029810
INSTRUMENT_8    -0.033218
INSTRUMENT_4    -0.034371
Name: mu_hat, dtype: float64


In [17]:
# Risk-aware score ~ Sharpe proxy: mu / vol
VOL_WIN = 20
vol = ret1[mu_hat.index].rolling(VOL_WIN).std().loc[latest_date]
score = mu_hat / (vol + 1e-8)
score = score.replace([np.inf, -np.inf], np.nan).fillna(score.median())

# Long-only weights with true zeros: center then clip
s = score - score.median()
w = s.clip(lower=0.0)

# fallback if everything <= 0
if w.sum() <= 0:
    w = pd.Series(1.0, index=score.index)

w = w / w.sum()

submission = pd.DataFrame({"asset": w.index, "weight": w.values}).sort_values("weight", ascending=False)
submission.to_csv(f"submissions/AAKK_{round}.csv", index=False)

print(f"Saved: AAKK_{round}.csv")
print("Sum weights:", submission["weight"].sum(), "Min weight:", submission["weight"].min(), "(can be 0 now)")
print(submission.head(10))

Saved: AAKK_round_5.csv
Sum weights: 1.0 Min weight: 0.0 (can be 0 now)
           asset    weight
6   INSTRUMENT_7  0.531870
1   INSTRUMENT_2  0.180075
8   INSTRUMENT_9  0.164758
4   INSTRUMENT_5  0.065826
9  INSTRUMENT_10  0.057472
0   INSTRUMENT_1  0.000000
2   INSTRUMENT_3  0.000000
3   INSTRUMENT_4  0.000000
5   INSTRUMENT_6  0.000000
7   INSTRUMENT_8  0.000000


In [18]:
# pick a date/asset where label exists and verify y = sum of next 60 daily returns
t = dates[-200]
a = assets[0]
manual = ret1[a].loc[t:].iloc[1:61].sum()
print("y vs manual:", float(y.loc[t, a]), float(manual))

y vs manual: -0.03421587763547951 -0.03421587763547951
