In [5]:
!pip install lightgbm

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/5e/23/f8b28ca248bb629b9e08f877dd2965d1994e1674a03d67cd10c5246da248/lightgbm-4.6.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------- ----------------- 0.8/1.5 MB 17.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 22.9 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0



[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
import pandas
import numpy as np
import lightgbm as lgb


In [1]:
# save as train_ranker.py and run in the same env as your original
import os
import json
import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ndcg_score
from lightgbm import early_stopping, log_evaluation

# ============================================================
#  CONFIG
# ============================================================
DATA_PATH = r"D:\Thal-AI\thalcare-AI\backend\blood_request_ranking_dataset.csv"
OUTPUT_DIR = r"D:\Thal-AI\thalcare-AI\backend\output_api"
ENC_DIR = os.path.join(OUTPUT_DIR, "encoders_api")
MODEL_PATH = os.path.join(OUTPUT_DIR, "ranker_api_aligned.txt")   # LightGBM booster save
PICKLE_MODEL = os.path.join(OUTPUT_DIR, "ranker_api_aligned.pkl") # optional joblib
FEATURES_PATH = os.path.join(OUTPUT_DIR, "features_api.json")
MANIFEST_PATH = os.path.join(OUTPUT_DIR, "manifest.json")

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(ENC_DIR, exist_ok=True)

# ============================================================
#  LOAD DATA
# ============================================================
df = pd.read_csv(DATA_PATH)
# graded relevance: 2 (fulfilled) > 1 (chosen) > 0
df["Relevance"] = np.where(
    df["Was_Fulfilled"] == 1, 2,
    np.where(df["Was_Chosen_By_User"] == 1, 1, 0)
).astype(int)

# ============================================================
#  FEATURE ENGINEERING (your existing features)
# ============================================================
df["Availability_Ratio"] = df["Available_Units_For_Type"] / df["Units_Requested"].clip(lower=1)
df["Staleness_Score"] = 1.0 / (1.0 + df["Last_Updated_Min_Ago"])
g = df.groupby("Request_ID", sort=False)
df["Rel_Availability"] = df["Available_Units_For_Type"] / g["Available_Units_For_Type"].transform("mean").clip(lower=1e-6)
df["Rel_Distance"] = df["Distance_km"] / (g["Distance_km"].transform("min") + 1e-6)
df["Inv_Distance"] = 1.0 / (1.0 + df["Distance_km"])
df["Urgency_Num"] = df["Urgency_Level"].map({"Emergency": 2, "Routine": 1, "Scheduled": 0}).fillna(1).astype(int)
df["Urgency_x_Distance"] = df["Urgency_Num"] * df["Distance_km"]

features = [
    "Distance_km", "Available_Units_For_Type", "Meets_Demand_Bool",
    "Last_Updated_Min_Ago", "Units_Requested", "Blood_Group_Requested",
    "Urgency_Level", "City", "Availability_Ratio", "Staleness_Score",
    "Rel_Availability", "Rel_Distance", "Inv_Distance", "Urgency_Num",
    "Urgency_x_Distance",
]
label = "Relevance"

# ============================================================
#  ENCODING (LabelEncoder approach but persisted)
#  Alternative: use LightGBM categorical_feature instead of encoding
# ============================================================
encoders = {}
categorical_cols = ["Blood_Group_Requested", "Urgency_Level", "City"]
for col in categorical_cols:
    le = LabelEncoder()
    # fillna and cast to str to avoid issues
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le
    enc_path = os.path.join(ENC_DIR, f"enc_{col}.pkl")
    joblib.dump(le, enc_path)
    print(f"✅ Saved encoder: {enc_path}")

# Persist features list
with open(FEATURES_PATH, "w") as f:
    json.dump(features, f)
print("✅ Saved features list:", FEATURES_PATH)

# ============================================================
#  TRAIN / TEST SPLIT (by Request_ID)
# ============================================================
req_ids = df["Request_ID"].unique()
train_ids, test_ids = train_test_split(req_ids, test_size=0.2, random_state=42)

train_df = df[df["Request_ID"].isin(train_ids)].copy()
test_df = df[df["Request_ID"].isin(test_ids)].copy()

# Optionally: keep only groups with at least 2 candidates and >=1 positive
def keep_valid_groups(d, label_col):
    gsize = d.groupby("Request_ID").size()
    gpos = d.groupby("Request_ID")[label_col].sum()
    good = gsize[(gsize >= 2) & (gpos >= 1)].index
    return d[d["Request_ID"].isin(good)].copy()

train_df = keep_valid_groups(train_df, label)
test_df = keep_valid_groups(test_df, label)

# ============================================================
#  CRITICAL: Sort by Request_ID so LightGBM group sizes align to row order
# ============================================================
train_df = train_df.sort_values("Request_ID").reset_index(drop=True)
test_df = test_df.sort_values("Request_ID").reset_index(drop=True)

# Build groups AFTER sorting
group_train = train_df.groupby("Request_ID").size().to_list()
group_test  = test_df.groupby("Request_ID").size().to_list()

X_train, y_train = train_df[features], train_df[label].astype(int)
X_test,  y_test  = test_df[features], test_df[label].astype(int)

# Optional sample weights (e.g., upweight fulfilled)
# sample_weight = np.where(train_df["Was_Fulfilled"]==1, 2.0, 1.0)
sample_weight = None

# ============================================================
#  TRAIN MODEL (LightGBM Ranker)
# ============================================================
ranker = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    eval_at=[3,5],
    learning_rate=0.03,
    n_estimators=3000,
    num_leaves=95,
    min_child_samples=30,
    colsample_bytree=0.9,
    subsample=0.8,
    subsample_freq=1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)

ranker.fit(
    X_train, y_train,
    group=group_train,
    sample_weight=sample_weight,
    eval_set=[(X_test, y_test)],
    eval_group=[group_test],
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=50)],
)

# Save the trained model (LightGBM booster format and joblib fallback)
booster = ranker.booster_
booster.save_model(MODEL_PATH)
joblib.dump(ranker, PICKLE_MODEL)
print("✅ Saved LightGBM model:", MODEL_PATH)
print("✅ Saved sklearn wrapper model (joblib):", PICKLE_MODEL)

# Save manifest (what we saved + version)
manifest = {
    "model_booster": MODEL_PATH,
    "model_joblib": PICKLE_MODEL,
    "features": FEATURES_PATH,
    "encoders_dir": ENC_DIR,
    "training_rows": int(len(train_df)),
    "test_rows": int(len(test_df)),
    "best_iteration": int(getattr(ranker, "best_iteration_", -1) or booster.best_iteration)
}
with open(MANIFEST_PATH, "w") as f:
    json.dump(manifest, f, indent=2)
print("✅ Saved manifest:", MANIFEST_PATH)

# ============================================================
#  EVALUATION (grouped NDCG)
# ============================================================
def ndcg_grouped(y_true, y_score, group_sizes, k=5):
    off = 0
    s = []
    for sz in group_sizes:
        yt = y_true.iloc[off:off+sz].values
        ys = y_score[off:off+sz]
        # if all zeros in yt, ndcg_score returns 0 — that's fine
        s.append(ndcg_score([yt],[ys], k=k))
        off += sz
    return float(np.mean(s))

y_pred = ranker.predict(X_test, num_iteration=ranker.best_iteration_)
print(f"Best iteration: {getattr(ranker,'best_iteration_', -1)}")
print(f"NDCG@3:  {ndcg_grouped(y_test, y_pred, group_test, k=3):.4f}")
print(f"NDCG@5:  {ndcg_grouped(y_test, y_pred, group_test, k=5):.4f}")
print(f"NDCG@10: {ndcg_grouped(y_test, y_pred, group_test, k=10):.4f}")

# ============================================================
#  Feature importance (save)
# ============================================================
fi = booster.feature_importance(importance_type="gain")
fi_names = booster.feature_name()
feat_imp = sorted(zip(fi_names, fi.tolist()), key=lambda x: x[1], reverse=True)
feat_imp_path = os.path.join(OUTPUT_DIR, "feature_importance.json")
with open(feat_imp_path, "w") as f:
    json.dump(feat_imp, f, indent=2)
print("✅ Saved feature importance:", feat_imp_path)
print(feat_imp[:20])


✅ Saved encoder: D:\Thal-AI\thalcare-AI\backend\output_api\encoders_api\enc_Blood_Group_Requested.pkl
✅ Saved encoder: D:\Thal-AI\thalcare-AI\backend\output_api\encoders_api\enc_Urgency_Level.pkl
✅ Saved encoder: D:\Thal-AI\thalcare-AI\backend\output_api\encoders_api\enc_City.pkl
✅ Saved features list: D:\Thal-AI\thalcare-AI\backend\output_api\features_api.json
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000598 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1950
[LightGBM] [Info] Number of data points in the train set: 7578, number of used features: 15
Training until validation scores don't improve for 100 rounds
[50]	valid_0's ndcg@3: 0.668333	valid_0's ndcg@5: 0.696564




[100]	valid_0's ndcg@3: 0.661163	valid_0's ndcg@5: 0.684759
Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.677206	valid_0's ndcg@5: 0.697986
✅ Saved LightGBM model: D:\Thal-AI\thalcare-AI\backend\output_api\ranker_api_aligned.txt
✅ Saved sklearn wrapper model (joblib): D:\Thal-AI\thalcare-AI\backend\output_api\ranker_api_aligned.pkl
✅ Saved manifest: D:\Thal-AI\thalcare-AI\backend\output_api\manifest.json
Best iteration: 9
NDCG@3:  0.6779
NDCG@5:  0.6979
NDCG@10: 0.8159
✅ Saved feature importance: D:\Thal-AI\thalcare-AI\backend\output_api\feature_importance.json
[('Rel_Distance', 2829.71607221663), ('Distance_km', 1653.1525871753693), ('Inv_Distance', 1559.6537709534168), ('Last_Updated_Min_Ago', 1004.7979336678982), ('Rel_Availability', 853.6202109009027), ('Availability_Ratio', 602.247102484107), ('Staleness_Score', 282.03175711631775), ('Available_Units_For_Type', 267.99151235818863), ('Urgency_x_Distance', 265.42469388246536), ('City', 169.10793149471283), ('Blood_Group

