In [1]:
!python3 -m pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, balanced_accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from IPython.display import display


In [3]:
RANDOM_STATE   = 42
PREDICT_MONTH  = pd.Period('2023-07', 'M')   # month to predict
TRAIN_END   = pd.Period('2023-06', 'M')   # last month with labels for training
SHIFT_MACRO_BY = 1                           # shift macro features by +1M to avoid release leakage


In [4]:
stock = pd.read_csv("stock_data.csv")
index  = pd.read_csv("monashIndex.csv")
company  = pd.read_csv("company_info.csv")
vix    = pd.read_csv("vix_index.csv")
us10yt     = pd.read_csv("us_10_year_treasury.csv")
us5yt      = pd.read_csv("us_5_year_treasury.csv")
infl     = pd.read_csv("fed_inflation_rate.csv")
fedfunds = pd.read_csv("fed_funds_rate.csv")
unemp     = pd.read_csv("fed_unemployment_rate.csv")
train_tgt= pd.read_csv("training_targets.csv").astype({"stock_id":"str"})
test_tgt  = pd.read_csv("testing_targets.csv").astype({"stock_id":"str"})


In [5]:
# Sort by time for rolling/lag operations
def lag_and_roll(df, col, l1=True, mean3=True, time_col="month_id"):
    df = df.sort_values(time_col).copy()
    out = pd.DataFrame({time_col: df[time_col]})
    if l1:
        out[f"{col}_lag1"] = df[col].shift(1)
    if mean3:
        out[f"{col}_mean_3m"] = df[col].shift(1).rolling(3, min_periods=1).mean()
    return out

vix_feat = lag_and_roll(vix, "vix")  # vix_lag1, vix_mean_3m

yields = (us10yt.merge(us5yt, on="month_id", how="outer")
            .sort_values("month_id"))
yields_feat = yields[["month_id"]].copy()
yields_feat["y10_lag1"] = yields["10y_treasury"].shift(1)
yields_feat["y5_lag1"]  = yields["5y_treasury"].shift(1)
yields_feat["term_slope_lag1"] = yields_feat["y10_lag1"] - yields_feat["y5_lag1"]

fed = fedfunds.sort_values("month_id")
fed_feat = fed[["month_id"]].copy()
fed_feat["fed_rate_lag1"]   = fed["fed_rate"].shift(1)
# 3-month change (t-1 vs t-4)
fed_feat["fed_rate_chg_3m"] = fed["fed_rate"].shift(1) - fed["fed_rate"].shift(4)

cpi_feat = lag_and_roll(infl, "inflation_rate")  # inflation_rate_lag1, inflation_rate_mean_3m
cpi_feat = cpi_feat.rename(columns={
    "inflation_rate_lag1": "inflation_rate_lag1",
    "inflation_rate_mean_3m": "inflation_3m_mean"
})

unemp = unemp.sort_values("month_id")
unemp_feat = unemp[["month_id"]].copy()
unemp_feat["unemployment_rate_lag1"] = unemp["unemployment_rate"].shift(1)

# Index context
idx = index.sort_values("month_id")
idx_feat = idx[["month_id"]].copy()
idx_feat["index_return_lag1"]     = idx["index_return"].shift(1)
idx_feat["index_return_mean_3m"]  = idx["index_return"].shift(1).rolling(3, min_periods=1).mean()
# trailing 3m drawdown using index_value known by t-1
roll_max = idx["index_value"].shift(1).rolling(3, min_periods=1).max()
idx_feat["index_value_dd_3m"] = (idx["index_value"].shift(1) - roll_max) / roll_max

# Combine all macro + index features
macro_feat = (
    vix_feat.merge(yields_feat, on="month_id", how="outer")
            .merge(fed_feat, on="month_id", how="outer")
            .merge(cpi_feat, on="month_id", how="outer")
            .merge(unemp_feat, on="month_id", how="outer")
            .merge(idx_feat, on="month_id", how="outer")
            .sort_values("month_id")
)

print("macro_feat shape:", macro_feat.shape)


macro_feat shape: (42, 14)


In [6]:
stock = stock.sort_values(["stock_id", "month_id"]).copy()

# Pick the columns to lag (all time-varying)
to_lag = [
    "month_start_open_usd","month_end_close_usd",
    "month_high_usd","month_low_usd",
    "monthly_volume","intramonth_return","intramonth_volatility",
    "return_1m","return_3m","return_6m",
    "volatility_3m","volatility_6m",
    "trading_days","avg_volume_3m","volume_ratio","price_range_ratio"
]

missing_cols = [c for c in to_lag if c not in stock.columns]
if missing_cols:
    raise ValueError(f"Missing columns in stock_data.csv: {missing_cols}")

# Create *_lag1 versions aligned to the current month_id
lagged = stock.groupby("stock_id")[to_lag].shift(1)
lagged.columns = [f"{c}_lag1" for c in to_lag]

stock_feat = pd.concat([stock[["stock_id","month_id"]], lagged], axis=1)

# Drop the first available month per stock (no lag yet)
stock_feat = stock_feat.dropna(subset=[col for col in stock_feat.columns if col.endswith("_lag1")])

print("stock_feat shape:", stock_feat.shape)


stock_feat shape: (24867, 18)


In [7]:
# Ensure company_ohe exists (one-hot of company_info)
if 'company_ohe' not in globals():
    cat_cols = ['sector','business_model','geographic_focus','business_maturity','competitive_position',
                'market_cap_category','revenue_tier','profitability_profile','asset_intensity','financial_strength']
    comp = company[['stock_id'] + cat_cols].copy()
    company_ohe = pd.get_dummies(comp, columns=cat_cols, prefix=cat_cols, prefix_sep='=')

features_panel = (
    stock_feat
    .merge(macro_feat, on="month_id", how="left")
    .merge(company_ohe, on="stock_id", how="left")
    .sort_values(["stock_id","month_id"])
)

print("features_panel shape:", features_panel.shape)
features_panel.head(3)


features_panel shape: (24867, 70)


Unnamed: 0,stock_id,month_id,month_start_open_usd_lag1,month_end_close_usd_lag1,month_high_usd_lag1,month_low_usd_lag1,monthly_volume_lag1,intramonth_return_lag1,intramonth_volatility_lag1,return_1m_lag1,...,revenue_tier=Tier_3,profitability_profile=High_Margin,profitability_profile=Low_Margin,profitability_profile=Standard,asset_intensity=Asset_Light,asset_intensity=Capital_Intensive,asset_intensity=Moderate,financial_strength=Developing,financial_strength=Stable,financial_strength=Strong
0,US001,2020_02,120.192,107.326,123.486,107.035,84539259.0,-0.107045,0.253304,-0.100669,...,False,False,False,True,False,False,True,False,True,False
1,US001,2020_03,108.252,101.868,111.641,99.636,91313882.0,-0.058974,0.259208,-0.050854,...,False,False,False,True,False,False,True,False,True,False
2,US001,2020_04,103.301,93.178,105.622,77.841,177930833.0,-0.097995,0.892709,-0.085299,...,False,False,False,True,False,False,True,False,True,False


In [8]:
macro = (
    vix[["month_id", "vix"]]
    .merge(us10yt[["month_id", "10y_treasury"]], on="month_id", how="outer")
    .merge(us5yt[["month_id", "5y_treasury"]], on="month_id", how="outer")
    .merge(infl[["month_id", "inflation_rate"]], on="month_id", how="outer")
    .merge(fedfunds[["month_id", "fed_rate"]], on="month_id", how="outer")
    .merge(unemp[["month_id", "unemployment_rate"]], on="month_id", how="outer")
    .merge(index[["month_id", "index_return", "index_value"]], on="month_id", how="outer")
    .sort_values("month_id")
)

# engineered macro fields
macro["TERM_SPREAD"] = macro["10y_treasury"] - macro["5y_treasury"]
macro["REAL_RATE_PROXY"] = macro["10y_treasury"] - macro["inflation_rate"]
for col in ["vix", "10y_treasury", "5y_treasury", "inflation_rate", "fed_rate", "unemployment_rate"]:
    macro[f"{col}_CHG_1M"] = macro[col].diff()

# lag ALL macro features by 1 month to avoid leak
macro_lag = macro.copy()
macro_cols = [c for c in macro.columns if c != "month_id"]
macro_lag[macro_cols] = macro_lag[macro_cols].shift(1)
macro_lag = macro_lag.rename(columns={c: f"{c}_lag1" for c in macro_cols})


In [9]:
stock = stock.sort_values(["stock_id", "month_id"])

In [10]:
to_lag = [
    "return_1m","return_3m","return_6m",
    "intramonth_volatility","volatility_3m","volatility_6m",
    "volume_ratio","avg_volume_3m","monthly_volume",
    "price_range_ratio","trading_days"
]
to_lag = [c for c in to_lag if c in stock.columns]

for c in to_lag:
    stock[f"{c}_lag1"] = stock.groupby("stock_id")[c].shift(1)

lag_feats = sorted([f"{c}_lag1" for c in to_lag if f"{c}_lag1" in stock.columns])

# Build panel of features only (lagged stock + lagged macro) + company info
panel = stock[["stock_id","month_id"] + lag_feats].merge(macro_lag, on="month_id", how="left")
panel = panel.merge(company, on="stock_id", how="left")

# Add training labels by merge (do NOT compute from prices here)
panel = panel.merge(
    train_tgt[["stock_id","month_id","outperform_binary"]],
    on=["stock_id","month_id"], how="left"
)


In [11]:
# ---- HARD CAST month_id to Period[M] everywhere (fix for '<=' TypeError) ----
def force_period_m(df, col="month_id"):
    if col in df.columns:
        # Use .dt.to_period("M") for correct conversion
        df[col] = pd.to_datetime(df[col].astype(str), errors="coerce").dt.to_period("M")
    return df

stock     = force_period_m(stock)
macro_lag = force_period_m(macro_lag)
train_tgt = force_period_m(train_tgt)
test_tgt  = force_period_m(test_tgt)
panel     = force_period_m(panel)

# (optional sanity) - Removed print statement causing NameError
# print("DTypes:",
#       "panel.month_id ->", panel["month_id"].dtype,
#       "| tjm1 type ->", type(tjm1))

  df[col] = pd.to_datetime(df[col].astype(str), errors="coerce").dt.to_period("M")
  df[col] = pd.to_datetime(df[col].astype(str), errors="coerce").dt.to_period("M")
  df[col] = pd.to_datetime(df[col].astype(str), errors="coerce").dt.to_period("M")
  df[col] = pd.to_datetime(df[col].astype(str), errors="coerce").dt.to_period("M")


In [12]:
train_df = (train_tgt
            .merge(panel, on=["stock_id","month_id"], how="inner")
            .sort_values(["stock_id","month_id"]))
test_df  = (test_tgt
            .merge(panel, on=["stock_id","month_id"], how="left")
            .sort_values(["stock_id","month_id"]))

print(train_df["month_id"].min(), train_df["month_id"].max())
print("Unique test months:", sorted(test_df["month_id"].unique().astype(str)))

# Feature columns = everything except identifiers & labels
# Adjusted to exclude the merged target columns
drop_cols = ["stock_id","month_id","outperform_binary_x","outperform_binary_y","test_outperform","excess_return"]
feat_cols = [c for c in train_df.columns if c not in drop_cols]

X_train = train_df[feat_cols].replace([np.inf, -np.inf], np.nan)
# Use the correctly named target column from the merge for y_train
y_train = train_df["outperform_binary_x"].astype(int)

X_test  = test_df[feat_cols].replace([np.inf, -np.inf], np.nan)
# Removed y_test creation as y_test_jul is used for evaluation


# Consistent NA imputation from TRAIN medians
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test  = X_test.fillna(train_medians)

# Define numeric and categorical columns based on X_train
numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes(include='object').columns.tolist()


print("X_train/test shapes:", X_train.shape, X_test.shape)
print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

NaT NaT
Unique test months: ['NaT']


X_train/test shapes: (1069738, 37) (25618, 37)
Numeric columns: ['avg_volume_3m_lag1', 'intramonth_volatility_lag1', 'monthly_volume_lag1', 'price_range_ratio_lag1', 'return_1m_lag1', 'return_3m_lag1', 'return_6m_lag1', 'trading_days_lag1', 'volatility_3m_lag1', 'volatility_6m_lag1', 'volume_ratio_lag1', 'vix_lag1', '10y_treasury_lag1', '5y_treasury_lag1', 'inflation_rate_lag1', 'fed_rate_lag1', 'unemployment_rate_lag1', 'index_return_lag1', 'index_value_lag1', 'TERM_SPREAD_lag1', 'REAL_RATE_PROXY_lag1', 'vix_CHG_1M_lag1', '10y_treasury_CHG_1M_lag1', '5y_treasury_CHG_1M_lag1', 'inflation_rate_CHG_1M_lag1', 'fed_rate_CHG_1M_lag1', 'unemployment_rate_CHG_1M_lag1']
Categorical columns: ['sector', 'business_model', 'geographic_focus', 'business_maturity', 'competitive_position', 'market_cap_category', 'revenue_tier', 'profitability_profile', 'asset_intensity', 'financial_strength']


In [13]:
print("Panel month range:", panel["month_id"].min(), "→", panel["month_id"].max())
print("Counts by month (tail):")
print(panel["month_id"].value_counts().sort_index().tail(6))

missing_cols = [c for c in ["monthly_volume","avg_volume_3m","volume_ratio","price_range_ratio","trading_days"]
                if c not in stock.columns]
if missing_cols:
    print("Missing in stock_data.csv:", missing_cols)

Panel month range: NaT → NaT
Counts by month (tail):
Series([], Freq: M, Name: count, dtype: int64)


MODELO 1

In [14]:
# ===== Model A: Logistic Regression (with scaling) =====
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, balanced_accuracy_score, f1_score

# preprocessors
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:  # older sklearn
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe)
])

pre = ColumnTransformer(
    transformers=[
        ("num", num_pipe, numeric_cols),
        ("cat", cat_pipe, categorical_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

clf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    C=1.0,
    random_state=RANDOM_STATE
)

pipe_lr = Pipeline([
    ("pre", pre),
    ("clf", clf)
])

print("X_train/X_test shapes:", X_train.shape, X_test.shape)
pipe_lr.fit(X_train, y_train)

# July 2023 predictions
proba = pipe_lr.predict_proba(X_test)[:, 1]
pred  = (proba >= 0.5).astype(int)

preds_lr = test_df[["stock_id","month_id"]].drop_duplicates().copy()
preds_lr["proba"] = proba
preds_lr["pred"] = pred

# If ground truth for July exists, print quick metrics
y_test_jul = globals().get('y_test_jul', None)
if y_test_jul is not None and y_test_jul.notna().any():
    y_true = y_test_jul.values.astype(int)
    print("LR — July 2023")
    print("  AUC:                ", round(roc_auc_score(y_true, proba), 4))
    print("  Accuracy:           ", round(accuracy_score(y_true, pred), 4))
    print("  Balanced Accuracy:  ", round(balanced_accuracy_score(y_true, pred), 4))
    print("  F1 (positive=1):    ", round(f1_score(y_true, pred), 4))

# Top candidates
preds_lr.sort_values("proba", ascending=False).head(10)


X_train/X_test shapes: (1069738, 37) (25618, 37)


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


  ret = a @ b
  ret = a @ b
  ret = a @ b


ValueError: Length of values (25618) does not match length of index (616)

MODEL 2

In [None]:
# ===== Model B: Random Forest (no scaling needed) =====
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, balanced_accuracy_score, f1_score

# Preprocessing for trees: impute only + OHE
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe)
])

pre_tree = ColumnTransformer(
    transformers=[
        ("num", num_pipe, numeric_cols),
        ("cat", cat_pipe, categorical_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

rf = RandomForestClassifier(
    n_estimators=600,
    max_depth=None,
    min_samples_leaf=2,
    class_weight="balanced_subsample",
    random_state=RANDOM_STATE,
    n_jobs=-1
)

pipe_rf = Pipeline([
    ("pre", pre_tree),
    ("clf", rf)
])

pipe_rf.fit(X_train, y_train)

proba = pipe_rf.predict_proba(X_test)[:, 1]
pred  = (proba >= 0.5).astype(int)

preds_rf = test_df[["stock_id","month_id"]].drop_duplicates().copy()
preds_rf["proba"] = proba
preds_rf["pred"] = pred

y_test_jul = globals().get('y_test_jul', None)
if y_test_jul is not None and y_test_jul.notna().any():
    y_true = y_test_jul.values.astype(int)
    print("RF — July 2023")
    print("  AUC:                ", round(roc_auc_score(y_true, proba), 4))
    print("  Accuracy:           ", round(accuracy_score(y_true, pred), 4))
    print("  Balanced Accuracy:  ", round(balanced_accuracy_score(y_true, pred), 4))
    print("  F1 (positive=1):    ", round(f1_score(y_true, pred), 4))

preds_rf.sort_values("proba", ascending=False).head(10)


MODELO 3

In [None]:
# ===== Model C: HistGradientBoosting (tree boosting) =====
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, balanced_accuracy_score, f1_score

# Preprocess: impute + dense OHE (HGB needs dense)
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe)
])

pre_hgb = ColumnTransformer(
    transformers=[
        ("num", num_pipe, numeric_cols),
        ("cat", cat_pipe, categorical_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

hgb = HistGradientBoostingClassifier(
    learning_rate=0.06,
    max_depth=7,            # None also works; 7 is a good start
    max_iter=400,
    min_samples_leaf=30,
    l2_regularization=0.0,
    random_state=RANDOM_STATE
)

pipe_hgb = Pipeline([
    ("pre", pre_hgb),
    ("clf", hgb)
])

pipe_hgb.fit(X_train, y_train)

proba = pipe_hgb.predict_proba(X_test)[:, 1]
pred  = (proba >= 0.5).astype(int)

preds_hgb = test_df[["stock_id","month_id"]].drop_duplicates().copy()
preds_hgb["proba"] = proba
preds_hgb["pred"] = pred

y_test_jul = globals().get('y_test_jul', None)
if y_test_jul is not None and y_test_jul.notna().any():
    y_true = y_test_jul.values.astype(int)
    print("HGB — July 2023")
    print("  AUC:                ", round(roc_auc_score(y_true, proba), 4))
    print("  Accuracy:           ", round(accuracy_score(y_true, pred), 4))
    print("  Balanced Accuracy:  ", round(balanced_accuracy_score(y_true, pred), 4))
    print("  F1 (positive=1):    ", round(f1_score(y_true, pred), 4))

preds_hgb.sort_values("proba", ascending=False).head(10)


In [None]:
def merge_preds(meta, **pred_dfs):
    base = meta[["stock_id","month_id"]].drop_duplicates().copy()
    for name, dfp in pred_dfs.items():
        if dfp is not None:
            base = base.merge(
                dfp[["stock_id","proba"]].rename(columns={"proba": f"proba_{name}"}),
                on="stock_id", how="left"
            )
    return base

preds_all = merge_preds(
    test_df[["stock_id","month_id"]].drop_duplicates(),
    lr=preds_lr if "preds_lr" in globals() else None,
    rf=preds_rf if "preds_rf" in globals() else None,
    hgb=preds_hgb if "preds_hgb" in globals() else None,
)

proba_cols = [c for c in preds_all.columns if c.startswith("proba_")]
preds_all["mean_proba"] = preds_all[proba_cols].mean(axis=1, skipna=True)
preds_all["vote_1s"]    = (preds_all[proba_cols] >= 0.5).sum(axis=1)
preds_all["pred_ens"]   = (preds_all["mean_proba"] >= 0.5).astype(int)

print("Rows in July universe:", len(preds_all))
preds_all.sort_values("mean_proba", ascending=False).head(10)

In [None]:
def eval_on_july(name, proba, y_true):
    m = ~np.isnan(proba)
    y = y_true[m].astype(int)
    p = proba[m]
    pred = (p >= 0.5).astype(int)
    print(f"{name:>6} | AUC={roc_auc_score(y,p):.3f}  Acc={accuracy_score(y,pred):.3f}  "
          f"BalAcc={balanced_accuracy_score(y,pred):.3f}  F1={f1_score(y,pred):.3f}  n={m.sum()}")

if y_test_jul is not None and y_test_jul.notna().any():
    y_true = y_test_jul.values
    for c in proba_cols:
        eval_on_july(c.replace("proba_","").upper(), preds_all[c].values, y_true)
    eval_on_july("ENSEMB", preds_all["mean_proba"].values, y_true)


In [None]:
# Ranked picks by ensemble probability
ranked = preds_all.sort_values(["mean_proba","vote_1s"], ascending=[False, False]).reset_index(drop=True)

# Choose a portfolio size (e.g., top 10%); feel free to change
k = max(1, int(0.10 * len(ranked)))
top_k = ranked.head(k).copy()

# Join optional metadata for reporting
if "company" in globals():
    top_k = top_k.merge(company[["stock_id","sector"]], on="stock_id", how="left")

print("Top-k (k=", k, ") preview:")
display(top_k.head(15))

# If ground truth & excess returns available, grade the portfolio:
tj = test_df["month_id"].iloc[0]
try:
    jul_truth = (test_tgt.loc[test_tgt["month_id"] == tj, ["stock_id","outperform_binary","excess_return"]]
                         .drop_duplicates("stock_id"))
except Exception:
    jul_truth = None

if jul_truth is not None and not jul_truth.empty:
    top_eval = top_k.merge(jul_truth, on="stock_id", how="left")
    hit_rate = np.nanmean(top_eval["outperform_binary"])
    avg_excess = np.nanmean(top_eval["excess_return"])
    print(f"Top-{k} hit rate: {hit_rate:.3f} | mean excess return: {avg_excess:.3%}")

    # Bottom decile baseline
    bot_k = ranked.tail(k).merge(jul_truth, on="stock_id", how="left")
    bot_hit = np.nanmean(bot_k["outperform_binary"])
    bot_excess = np.nanmean(bot_k["excess_return"])
    print(f"Bottom-{k} hit rate: {bot_hit:.3f} | mean excess return: {bot_excess:.3%}")
    if "sector" in top_k.columns:
        print("\nTop-k sector mix (% of names):")
        print((top_k["sector"].value_counts(normalize=True)*100).round(1).to_string())


In [None]:
# Random Forest importances
try:
    rf_feats = list(pipe_rf.named_steps["pre"].get_feature_names_out())
    rf_imps  = pipe_rf.named_steps["clf"].feature_importances_
    feat_imp_rf = (pd.DataFrame({"feature": rf_feats, "importance": rf_imps})
                    .sort_values("importance", ascending=False)
                    .head(25))
    display(feat_imp_rf)
except Exception as e:
    print("RF importance not available:", e)

# HistGB importances (if exposed)
try:
    hgb_feats = list(pipe_hgb.named_steps["pre"].get_feature_names_out())
    hgb_imps  = getattr(pipe_hgb.named_steps["clf"], "feature_importances_", None)
    if hgb_imps is not None:
        feat_imp_hgb = (pd.DataFrame({"feature": hgb_feats, "importance": hgb_imps})
                         .sort_values("importance", ascending=False)
                         .head(25))
        display(feat_imp_hgb)
except Exception as e:
    print("HGB importance not available:", e)
