In [6]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Ranker training with auto-selected features (core + derived + request-relative)
- Reads: /mnt/data/blood_request_ranking_dataset.csv
- GroupKFold (5 folds) by Request_ID
- Early stopping on NDCG@5
- Graded relevance if available, else falls back cleanly
- Final fit on ALL data with avg best iteration
- Saves artifacts to /mnt/data
"""

import os, json, numpy as np, pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ndcg_score
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
import joblib

CSV_PATH = r"D:\Thal-AI\thalcare-AI\backend\blood_request_ranking_dataset.csv"
OUT_DIR  = r"D:\Thal-AI\thalcare-AI\backend\output"
MODEL_OUT = os.path.join(OUT_DIR, "ranker_auto.pkl")
ENC_DIR   = os.path.join(OUT_DIR, "encoders_auto")
FEATS_OUT = os.path.join(OUT_DIR, "features_auto.json")

# -----------------------------
# Utilities
# -----------------------------
def keep_valid_groups(d: pd.DataFrame, label_col: str) -> pd.DataFrame:
    gsize = d.groupby("Request_ID").size()
    gpos  = d.groupby("Request_ID")[label_col].sum()
    good  = gsize[(gsize >= 2) & (gpos >= 1)].index
    return d[d["Request_ID"].isin(good)].copy()

def ndcg_atk_grouped(y_true: np.ndarray, y_pred: np.ndarray, group_sizes: list, k=5) -> float:
    off, sc = 0, []
    for sz in group_sizes:
        yt = y_true[off:off+sz]
        yp = y_pred[off:off+sz]
        sc.append(ndcg_score([yt], [yp], k=k))
        off += sz
    return float(np.mean(sc))

def safe_add(col, df, fn):
    try:
        df[col] = fn(df)
        return True
    except Exception:
        return False

# -----------------------------
# Load
# -----------------------------
df = pd.read_csv(CSV_PATH)

# -----------------------------
# Labels & groups
# -----------------------------
label = None
if "Relevance" in df.columns:
    label = "Relevance"
else:
    # graded relevance if possible
    if "Was_Fulfilled" in df.columns and "Was_Chosen_By_User" in df.columns:
        df["Relevance"] = np.where(df["Was_Fulfilled"]==1, 2,
                            np.where(df["Was_Chosen_By_User"]==1, 1, 0)).astype(int)
        label = "Relevance"
    elif "Was_Chosen_By_User" in df.columns:
        label = "Was_Chosen_By_User"
    else:
        raise ValueError("No suitable label found. Provide Relevance or Was_Chosen_By_User.")

if "Request_ID" not in df.columns:
    raise ValueError("Request_ID column is required for grouped ranking.")

# -----------------------------
# Core features (only if present)
# -----------------------------
core_candidates = [
    "Distance_km",
    "Available_Units_For_Type",
    "Meets_Demand_Bool",
    "Last_Updated_Min_Ago",
    "Units_Requested",
    "Blood_Group_Requested",
    "Urgency_Level",
    "City",
    "Component_Requested",  # optional
]
core = [c for c in core_candidates if c in df.columns]

# -----------------------------
# Derived features (API-safe)
# -----------------------------
# Availability & freshness
safe_add("Availability_Ratio", df, lambda d: d["Available_Units_For_Type"] / d["Units_Requested"].clip(lower=1))
safe_add("Availability_Gap",   df, lambda d: d["Available_Units_For_Type"] - d["Units_Requested"])
safe_add("Staleness_Score",    df, lambda d: 1.0 / (1.0 + d["Last_Updated_Min_Ago"].astype(float)))

# Distance shaping
safe_add("Inv_Distance", df, lambda d: 1.0 / (1.0 + d["Distance_km"]))

# Urgency numerics + interactions
if "Urgency_Level" in df.columns:
    urg_map = {"Emergency": 2, "Routine": 1, "Scheduled": 0}
    df["Urgency_Num"] = df["Urgency_Level"].map(urg_map).fillna(1).astype(int)
    if "Distance_km" in df.columns:
        df["Urgency_x_Distance"] = df["Urgency_Num"] * df["Distance_km"]
    if "Staleness_Score" in df.columns:
        df["Urgency_x_Recency"] = df["Urgency_Num"] * df["Staleness_Score"]
    if "Availability_Ratio" in df.columns:
        df["Adequacy_Urgent"] = (df["Urgency_Level"].eq("Emergency").astype(int) * df["Availability_Ratio"])

derived = [c for c in [
    "Availability_Ratio","Availability_Gap","Staleness_Score","Inv_Distance",
    "Urgency_Num","Urgency_x_Distance","Urgency_x_Recency","Adequacy_Urgent"
] if c in df.columns]

# -----------------------------
# Request-relative features (per Request_ID)
# -----------------------------
if "Available_Units_For_Type" in df.columns and "Distance_km" in df.columns:
    g = df.groupby("Request_ID", sort=False)
    df["Rel_Availability"] = df["Available_Units_For_Type"] / g["Available_Units_For_Type"].transform("mean").clip(lower=1e-6)
    df["Rel_Distance"]     = df["Distance_km"] / (g["Distance_km"].transform("min") + 1e-6)
    # 25th percentile proximity flag
    q25 = g["Distance_km"].transform(lambda s: np.quantile(s, 0.25) if len(s) else np.nan)
    df["TopK_Proximity"]   = (df["Distance_km"] <= q25).astype(int)

rel_derived = [c for c in ["Rel_Availability","Rel_Distance","TopK_Proximity"] if c in df.columns]

# -----------------------------
# Rich extras (only if present in your dataset)
# -----------------------------
rich_candidates = [
    "Total_Units","Fulfillment_Rate_%","Avg_Response_Time_Min","Patient_Satisfaction_%",
    "Blood_Safety_Score_%","Emergency_Service","24x7_Availability",
    "Beds_Available","Doctors_On_Duty"
]
rich = [c for c in rich_candidates if c in df.columns]

# Normalized rich-derived (if present)
if "Fulfillment_Rate_%" in df.columns:
    df["Fulfillment_Rate_N"] = df["Fulfillment_Rate_%"] / 100.0
if "Patient_Satisfaction_%" in df.columns:
    df["Satisfaction_N"] = df["Patient_Satisfaction_%"] / 100.0
if "Blood_Safety_Score_%" in df.columns:
    df["Safety_N"] = df["Blood_Safety_Score_%"] / 100.0
if "Avg_Response_Time_Min" in df.columns:
    df["Response_Speed"] = 1.0 / (1.0 + df["Avg_Response_Time_Min"])
    if "Urgency_Num" in df.columns:
        df["Urgency_x_Speed"] = df["Urgency_Num"] * df["Response_Speed"]
if {"Beds_Available","Doctors_On_Duty"}.issubset(df.columns):
    df["Staffing_Level"] = df["Beds_Available"].fillna(0) + df["Doctors_On_Duty"].fillna(0)
    if "Availability_Ratio" in df.columns:
        df["Adequacy_x_Staff"] = df["Availability_Ratio"] * df["Staffing_Level"]

rich_derived = [c for c in [
    "Fulfillment_Rate_N","Satisfaction_N","Safety_N","Response_Speed","Urgency_x_Speed",
    "Staffing_Level","Adequacy_x_Staff"
] if c in df.columns]

# -----------------------------
# Final feature list
# -----------------------------
features = core + derived + rel_derived + rich + rich_derived
features = list(dict.fromkeys(features))  # preserve order, drop dups

# Categorical columns to encode (subset of features)
cat_cols = [c for c in ["Blood_Group_Requested","Urgency_Level","City","Component_Requested"] if c in features]
# Boolean columns to cast as int if present
bool_cols = [c for c in ["Meets_Demand_Bool","Emergency_Service","24x7_Availability"] if c in features]

# -----------------------------
# GroupKFold CV (5 folds)
# -----------------------------
groups = df["Request_ID"].values
gkf = GroupKFold(n_splits=5)

fold_scores, best_iters = [], []
fold_id = 1

for tr_idx, te_idx in gkf.split(df, groups=groups):
    tr, te = df.iloc[tr_idx].copy(), df.iloc[te_idx].copy()

    # Encode categoricals per fold to avoid leakage
    encs = {}
    for c in cat_cols:
        le = LabelEncoder()
        tr[c] = le.fit_transform(tr[c].astype(str))
        mapping = {cls: i for i, cls in enumerate(le.classes_)}
        te[c] = te[c].astype(str).map(lambda v: mapping.get(v, -1))
        encs[c] = le

    # Cast booleans to ints
    for bc in bool_cols:
        tr[bc] = tr[bc].astype(int)
        te[bc] = te[bc].astype(int)

    # Filter invalid groups
    tr = keep_valid_groups(tr, label)
    te = keep_valid_groups(te, label)

    Xtr, ytr = tr[features], tr[label].astype(int).values
    Xte, yte = te[features], te[label].astype(int).values
    gtr = tr.groupby("Request_ID").size().to_list()
    gte = te.groupby("Request_ID").size().to_list()

    ranker = lgb.LGBMRanker(
        objective="lambdarank",
        metric="ndcg",
        eval_at=[5],
        learning_rate=0.03,
        n_estimators=3000,
        num_leaves=95,
        min_child_samples=30,
        colsample_bytree=0.9,
        subsample=0.8,
        subsample_freq=1,
        reg_lambda=1.0,
        random_state=42,
    )

    ranker.fit(
        Xtr, ytr,
        group=gtr,
        eval_set=[(Xte, yte)],
        eval_group=[gte],
        callbacks=[early_stopping(100), log_evaluation(0)],  # set to 50 for logs
    )

    yp = ranker.predict(Xte, num_iteration=ranker.best_iteration_)
    ndcg5 = ndcg_atk_grouped(yte, yp, gte, k=5)
    print(f"Fold {fold_id}: NDCG@5 = {ndcg5:.4f}, best_iter = {ranker.best_iteration_ or 3000}")
    fold_scores.append(ndcg5)
    best_iters.append(ranker.best_iteration_ or 3000)
    fold_id += 1

mean_score, std_score = float(np.mean(fold_scores)), float(np.std(fold_scores))
avg_best = int(np.mean(best_iters))

print("\n=== CV Summary ===")
print(f"NDCG@5 per fold: {[round(s,4) for s in fold_scores]}")
print(f"Mean ± Std NDCG@5: {mean_score:.4f} ± {std_score:.4f}")
print(f"Avg best_iteration: {avg_best}")

# -----------------------------
# Final fit on ALL data
# -----------------------------
full = df.copy()

# Encode categoricals on full
os.makedirs(ENC_DIR, exist_ok=True)
encoders = {}
for c in cat_cols:
    le = LabelEncoder()
    full[c] = le.fit_transform(full[c].astype(str))
    encoders[c] = le
    joblib.dump(le, os.path.join(ENC_DIR, f"enc_{c}.pkl"))

# Booleans as int
for bc in bool_cols:
    full[bc] = full[bc].astype(int)

# Filter invalid groups
full = keep_valid_groups(full, label)
X_all, y_all = full[features], full[label].astype(int).values
g_all = full.groupby("Request_ID").size().to_list()

final_ranker = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    eval_at=[5],
    learning_rate=0.03,
    n_estimators=avg_best,
    num_leaves=95,
    min_child_samples=30,
    colsample_bytree=0.9,
    subsample=0.8,
    subsample_freq=1,
    reg_lambda=1.0,
    random_state=42,
)
final_ranker.fit(X_all, y_all, group=g_all)

# Save artifacts
joblib.dump(final_ranker, MODEL_OUT)
with open(FEATS_OUT, "w", encoding="utf-8") as f:
    json.dump(features, f, indent=2)

# Show top importances
fi = pd.Series(final_ranker.feature_importances_, index=X_all.columns).sort_values(ascending=False)
print("\nTop 20 feature importances (final):")
print(fi.head(20).to_string())

print(f"\nSaved model → {MODEL_OUT}")
print(f"Saved encoders → {ENC_DIR}/enc_*.pkl")
print(f"Saved feature list → {FEATS_OUT}")




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4302
[LightGBM] [Info] Number of data points in the train set: 7573, number of used features: 36
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[64]	valid_0's ndcg@5: 0.671651
Fold 1: NDCG@5 = 0.6717, best_iter = 64
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000849 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4303
[LightGBM] [Info] Number of data points in the train set: 7573, number of used features: 36
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[39]	valid_0's ndcg@5: 0.698675
Fold 2: NDCG@5 = 0.6987, best_iter = 39
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000939 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4297
[LightGBM] [Info] Number of data points in the train set: 7574, number of used features: 36
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[190]	valid_0's ndcg@5: 0.735115
Fold 3: NDCG@5 = 0.7351, best_iter = 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000801 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4304
[LightGBM] [Info] Number of data points in the train set: 7574, number of used features: 36
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[84]	valid_0's ndcg@5: 0.699902
Fold 4: NDCG@5 = 0.6999, best_iter = 84
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000594 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4299
[LightGBM] [Info] Number of data points in the train set: 7574, number of used features: 36
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[16]	valid_0's ndcg@5: 0.706246
Fold 5: NDCG@5 = 0.7062, best_iter = 16

=== CV Summary ===
NDCG@5 per fold: [0.6717, 0.6987, 0.7351, 0.6999, 0.7062]
Mean ± Std NDCG@5: 0.7023 ± 0.0203
Avg best_iteration: 78
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000938 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4309
[LightGBM] [Info] Number of data points in the train set: 9467, number of used features: 36





Top 20 feature importances (final):
Rel_Distance                635
Rel_Availability            501
Last_Updated_Min_Ago        437
Distance_km                 366
Fulfillment_Rate_%          358
Patient_Satisfaction_%      351
Urgency_x_Distance          337
Urgency_x_Speed             314
Adequacy_x_Staff            302
Blood_Safety_Score_%        297
Availability_Ratio          295
Avg_Response_Time_Min       290
Inv_Distance                289
Urgency_x_Recency           284
Total_Units                 257
Staleness_Score             219
Beds_Available              215
Staffing_Level              215
Doctors_On_Duty             182
Available_Units_For_Type    176

Saved model → D:\Thal-AI\thalcare-AI\backend\output\ranker_auto.pkl
Saved encoders → D:\Thal-AI\thalcare-AI\backend\output\encoders_auto/enc_*.pkl
Saved feature list → D:\Thal-AI\thalcare-AI\backend\output\features_auto.json


In [3]:
df.head()

Unnamed: 0,Request_ID,Request_Timestamp,User_Latitude,User_Longitude,Blood_Group_Requested,Component_Requested,Units_Requested,Urgency_Level,Hospital_ID,Hospital_Name,...,Was_Fulfilled,Was_Chosen_By_User,Relevance,Availability_Ratio,Staleness_Score,Rel_Availability,Rel_Distance,Inv_Distance,Urgency_Num,Urgency_x_Distance
0,R2000,11-10-2025 22:02,26.889626,75.808118,A+,Whole,4,Routine,H1011,CityCare 12,...,0,0,0,4.5,0.03876,1.2,5.032962,0.179211,1,4.58
1,R2000,11-10-2025 22:02,26.889626,75.808118,A+,Whole,4,Routine,H1012,CityCare 13,...,0,0,0,2.25,0.026385,0.6,1007.097794,0.00109,1,916.46
2,R2000,11-10-2025 22:02,26.889626,75.808118,A+,Whole,4,Routine,H1036,CityCare 37,...,0,0,0,2.75,0.045045,0.733333,590.263088,0.001858,1,537.14
3,R2000,11-10-2025 22:02,26.889626,75.808118,A+,Whole,4,Routine,H1057,CityCare 58,...,0,0,0,6.0,0.04065,1.6,783.405733,0.001401,1,712.9
4,R2000,11-10-2025 22:02,26.889626,75.808118,A+,Whole,4,Routine,H1102,CityCare 103,...,1,1,2,4.0,0.153846,1.066667,5.681312,0.162075,1,5.17


In [8]:
from sklearn.metrics import ndcg_score

map_scores = []
ndcg_scores = []

def apk(actual, predicted, k=5):
    predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

for fold_id, (tr_idx, te_idx) in enumerate(gkf.split(df, groups=groups), 1):
    # ... fit your model ...
    yp = ranker.predict(Xte, num_iteration=ranker.best_iteration_)
    # group boundaries
    off = 0
    fold_map_scores = []
    for sz in gte:
        yt = yte[off:off+sz]
        yp_sub = yp[off:off+sz]
        # Get sorted indices of top predictions
        preds_ranked = np.argsort(-yp_sub)
        # Ground truth: indices where relevance > 0
        actual_idx = np.where(yt > 0)[0].tolist()
        fold_map_scores.append(apk(actual_idx, preds_ranked.tolist(), k=5))
        off += sz
    map_fold = np.mean(fold_map_scores)
    map_scores.append(map_fold)

    ndcg5 = ndcg_atk_grouped(yte, yp, gte, k=5)
    ndcg_scores.append(ndcg5)
    print(f"Fold {fold_id}: NDCG@5={ndcg5:.4f}, MAP@5={map_fold:.4f}")

print(f"Mean NDCG@5: {np.mean(ndcg_scores):.4f}")
print(f"Mean MAP@5: {np.mean(map_scores):.4f}")




Fold 1: NDCG@5=0.7062, MAP@5=0.5908
Fold 2: NDCG@5=0.7062, MAP@5=0.5908
Fold 3: NDCG@5=0.7062, MAP@5=0.5908
Fold 4: NDCG@5=0.7062, MAP@5=0.5908
Fold 5: NDCG@5=0.7062, MAP@5=0.5908
Mean NDCG@5: 0.7062
Mean MAP@5: 0.5908


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ndcg_score
import lightgbm as lgb
from lightgbm import early_stopping
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# === Load dataset ===
df = pd.read_csv(r"D:\Thal-AI\thalcare-AI\backend\blood_request_ranking_dataset.csv")

# === Label (graded relevance) ===
df["Relevance"] = np.where(df["Was_Fulfilled"] == 1, 2,
                    np.where(df["Was_Chosen_By_User"] == 1, 1, 0)).astype(int)

# === Derived features ===
df["Availability_Ratio"] = df["Available_Units_For_Type"] / df["Units_Requested"].clip(lower=1)
df["Staleness_Score"] = 1.0 / (1.0 + df["Last_Updated_Min_Ago"])
df["Inv_Distance"] = 1.0 / (1.0 + df["Distance_km"])
df["Urgency_Num"] = df["Urgency_Level"].map({"Emergency": 2, "Routine": 1, "Scheduled": 0}).fillna(1).astype(int)
df["Urgency_x_Distance"] = df["Urgency_Num"] * df["Distance_km"]

# === Request-relative ===
g = df.groupby("Request_ID", sort=False)
df["Rel_Availability"] = df["Available_Units_For_Type"] / g["Available_Units_For_Type"].transform("mean").clip(lower=1e-6)
df["Rel_Distance"] = df["Distance_km"] / (g["Distance_km"].transform("min") + 1e-6)

# === Feature list ===
features = [
    "Distance_km", "Available_Units_For_Type", "Meets_Demand_Bool",
    "Last_Updated_Min_Ago", "Units_Requested", "Blood_Group_Requested",
    "Urgency_Level", "City",
    "Availability_Ratio", "Staleness_Score", "Inv_Distance",
    "Urgency_Num", "Urgency_x_Distance",
    "Rel_Availability", "Rel_Distance"
]
features = [f for f in features if f in df.columns]

# === Encode categoricals ===
cat_cols = ["Blood_Group_Requested", "Urgency_Level", "City"]
for c in cat_cols:
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c].astype(str))

# === Helpers ===
def keep_valid_groups(d):
    gsize = d.groupby("Request_ID").size()
    gpos = d.groupby("Request_ID")["Relevance"].sum()
    good = gsize[(gsize >= 2) & (gpos >= 1)].index
    return d[d["Request_ID"].isin(good)].copy()

def ndcg_atk_grouped(y_true, y_pred, group_sizes, k=5):
    off, sc = 0, []
    for sz in group_sizes:
        yt, yp = y_true[off:off+sz], y_pred[off:off+sz]
        sc.append(ndcg_score([yt], [yp], k=k))
        off += sz
    return np.mean(sc)

def apk(actual, predicted, k=5):
    if len(predicted) > k:
        predicted = predicted[:k]
    score, hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            hits += 1.0
            score += hits / (i+1.0)
    return score / min(len(actual), k) if actual else 0.0

def mapk(y_true, y_pred, groups, k=5):
    off, scores = 0, []
    for sz in groups:
        yt, yp = y_true[off:off+sz], y_pred[off:off+sz]
        preds_ranked = np.argsort(-yp)
        actual_idx = np.where(yt > 0)[0].tolist()
        scores.append(apk(actual_idx, preds_ranked.tolist(), k))
        off += sz
    return np.mean(scores)

# === GroupKFold CV ===
df = keep_valid_groups(df)
groups = df["Request_ID"].values
X, y = df[features], df["Relevance"].astype(int).values
gkf = GroupKFold(n_splits=5)

fold_ndcg, fold_map = [], []

for fold, (tr_idx, te_idx) in enumerate(gkf.split(X, y, groups), 1):
    Xtr, Xte = X.iloc[tr_idx], X.iloc[te_idx]
    ytr, yte = y[tr_idx], y[te_idx]
    gtr = df.iloc[tr_idx].groupby("Request_ID").size().to_list()
    gte = df.iloc[te_idx].groupby("Request_ID").size().to_list()

    ranker = lgb.LGBMRanker(
        objective="lambdarank",
        metric="ndcg",
        eval_at=[5],
        num_leaves=63,
        learning_rate=0.05,
        n_estimators=2000,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    ranker.fit(
        Xtr, ytr, group=gtr,
        eval_set=[(Xte, yte)], eval_group=[gte],
        callbacks=[early_stopping(100)]
    )

    yp = ranker.predict(Xte, num_iteration=ranker.best_iteration_)
    ndcg5 = ndcg_atk_grouped(yte, yp, gte, k=5)
    map5 = mapk(yte, yp, gte, k=5)
    fold_ndcg.append(ndcg5)
    fold_map.append(map5)
    print(f"Fold {fold}: NDCG@5={ndcg5:.4f}, MAP@5={map5:.4f}")

print(f"\nMean NDCG@5: {np.mean(fold_ndcg):.4f}")
print(f"Mean MAP@5:  {np.mean(fold_map):.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000360 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1949
[LightGBM] [Info] Number of data points in the train set: 7573, number of used features: 15
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[8]	valid_0's ndcg@5: 0.66573
Fold 1: NDCG@5=0.6658, MAP@5=0.5442
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1950
[LightGBM] [Info] Number of data points in the train set: 7573, number of used features: 15
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]	valid_0's ndcg@5: 0.688273
Fold 2: NDCG@5=0.6883, MAP@5=0.5715
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000268 s