In [5]:
!pip install lightgbm

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/5e/23/f8b28ca248bb629b9e08f877dd2965d1994e1674a03d67cd10c5246da248/lightgbm-4.6.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------- ----------------- 0.8/1.5 MB 17.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 22.9 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0



[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
import pandas
import numpy as np
import lightgbm as lgb


In [37]:
dataset = pandas.read_csv(r'D:\Thal-AI\thalcare-AI\backend\blood_request_ranking_dataset.csv')

dataset.head()

dataset["Relevance"] = np.where(dataset["Was_Fulfilled"] == 1, 2, np.where(dataset["Was_Chosen_By_User"] == 1, 1, 0)).astype('int')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ndcg_score
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
import joblib

# -----------------------------
# Load & labels
# -----------------------------
dataset = pd.read_csv(r'D:\Thal-AI\thalcare-AI\backend\blood_request_ranking_dataset.csv')

# graded relevance: 2 (fulfilled) > 1 (chosen) > 0
dataset["Relevance"] = np.where(
    dataset["Was_Fulfilled"] == 1, 2,
    np.where(dataset["Was_Chosen_By_User"] == 1, 1, 0)
).astype(int)

df = dataset.copy()

# -----------------------------
# Derived features (API-safe)
# -----------------------------
df["Availability_Ratio"] = df["Available_Units_For_Type"] / df["Units_Requested"].clip(lower=1)
df["Staleness_Score"] = 1.0 / (1.0 + df["Last_Updated_Min_Ago"])

g = df.groupby("Request_ID", sort=False)
df["Rel_Availability"] = df["Available_Units_For_Type"] / g["Available_Units_For_Type"].transform("mean").clip(lower=1e-6)
df["Rel_Distance"] = df["Distance_km"] / (g["Distance_km"].transform("min") + 1e-6)

df["Inv_Distance"] = 1.0 / (1.0 + df["Distance_km"])
df["Urgency_Num"] = df["Urgency_Level"].map({"Emergency": 2, "Routine": 1, "Scheduled": 0}).fillna(1).astype(int)
df["Urgency_x_Distance"] = df["Urgency_Num"] * df["Distance_km"]

# -----------------------------
# Features & label
# -----------------------------
features = [
    "Distance_km",
    "Available_Units_For_Type",
    "Meets_Demand_Bool",
    "Last_Updated_Min_Ago",
    "Units_Requested",
    "Blood_Group_Requested",
    "Urgency_Level",
    "City",
    "Availability_Ratio",
    "Staleness_Score",
    "Rel_Availability",
    "Rel_Distance",
    "Inv_Distance",
    "Urgency_Num",
    "Urgency_x_Distance",
]
label = "Relevance"

# -----------------------------
# Split by Request_ID (group-aware)
# -----------------------------
req_ids = df["Request_ID"].unique()
train_ids, test_ids = train_test_split(req_ids, test_size=0.2, random_state=42)

train_df = df[df["Request_ID"].isin(train_ids)].copy()
test_df  = df[df["Request_ID"].isin(test_ids)].copy()

# -----------------------------
# Safe label encoding (fit on train, transform test)
# -----------------------------
encoders = {}
for col in ["Blood_Group_Requested", "Urgency_Level", "City"]:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    encoders[col] = le

# -----------------------------
# Filter invalid groups
# -----------------------------
def keep_valid_groups(d: pd.DataFrame, label_col: str):
    gsize = d.groupby("Request_ID").size()
    gpos  = d.groupby("Request_ID")[label_col].sum()
    good  = gsize[(gsize >= 2) & (gpos >= 1)].index
    return d[d["Request_ID"].isin(good)].copy()

train_df = keep_valid_groups(train_df, label)
test_df  = keep_valid_groups(test_df, label)

train_df = (train_df.sort_values(["Request_ID","Distance_km"])
                   .groupby("Request_ID").head(30).reset_index(drop=True))
test_df = (test_df.sort_values(["Request_ID","Distance_km"])
                  .groupby("Request_ID").head(30).reset_index(drop=True))

# -----------------------------
# Matrices & groups
# -----------------------------
group_train = train_df.groupby("Request_ID").size().to_list()
group_test  = test_df.groupby("Request_ID").size().to_list()

X_train, y_train = train_df[features], train_df[label].astype(int)
X_test,  y_test  = test_df[features],  test_df[label].astype(int)

# -----------------------------
# LightGBM ranker (optimize NDCG@5)
# -----------------------------
ranker = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    eval_at=[5],
    learning_rate=0.03,
    n_estimators=3000,
    num_leaves=95,
    min_child_samples=30,
    colsample_bytree=0.9,
    subsample=0.8,
    subsample_freq=1,
    reg_lambda=1.0,
    random_state=42,
)

ranker.fit(
    X_train, y_train,
    group=group_train,
    eval_set=[(X_test, y_test)],
    eval_group=[group_test],
    callbacks=[early_stopping(100), log_evaluation(50)],
)

# -----------------------------
# Evaluation @3/@5/@10
# -----------------------------
def ndcg_grouped(y_true, y_score, group_sizes, k=5):
    off = 0
    s = []
    for sz in group_sizes:
        yt = y_true.iloc[off:off+sz].values
        ys = y_score[off:off+sz]
        s.append(ndcg_score([yt],[ys],k=k))
        off += sz
    return float(np.mean(s))

y_pred = ranker.predict(X_test, num_iteration=ranker.best_iteration_)
print(f"Best iteration: {ranker.best_iteration_}")
print(f"NDCG@3:  {ndcg_grouped(y_test, y_pred, group_test, k=3):.4f}")
print(f"NDCG@5:  {ndcg_grouped(y_test, y_pred, group_test, k=5):.4f}")
print(f"NDCG@10: {ndcg_grouped(y_test, y_pred, group_test, k=10):.4f}")

# # -----------------------------
# # Save artifacts (for inference)
# # -----------------------------
# joblib.dump(ranker, r"D:\Thal-AI\thalcare-AI\backend\ranker_api_aligned.pkl")
# for col, le in encoders.items():
#     joblib.dump(le, rf"D:\Thal-AI\thalcare-AI\backend\enc_{col}.pkl")
# print("Saved model + encoders.")




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1950
[LightGBM] [Info] Number of data points in the train set: 7578, number of used features: 15
Training until validation scores don't improve for 100 rounds
[50]	valid_0's ndcg@5: 0.696126
[100]	valid_0's ndcg@5: 0.696311
[150]	valid_0's ndcg@5: 0.685032
Early stopping, best iteration is:
[71]	valid_0's ndcg@5: 0.699608
Best iteration: 71
NDCG@3:  0.6714
NDCG@5:  0.6996
NDCG@10: 0.8122




In [46]:
# Rebuild full, filtered dataset exactly as you did for train/test
full = df.copy()

# (Re)encode categoricals on FULL data for the final model
from sklearn.preprocessing import LabelEncoder
encoders = {}
for col in ["Blood_Group_Requested", "Urgency_Level", "City"]:
    le = LabelEncoder()
    full[col] = le.fit_transform(full[col].astype(str))
    encoders[col] = le

# Keep only valid groups (>=2 candidates and >=1 positive)
def keep_valid_groups(d, label_col):
    gsize = d.groupby("Request_ID").size()
    gpos  = d.groupby("Request_ID")[label_col].sum()
    good  = gsize[(gsize>=2) & (gpos>=1)].index
    return d[d["Request_ID"].isin(good)].copy()

full = keep_valid_groups(full, label_col="Relevance")

# Build X/y and group sizes
X_all = full[features]
y_all = full["Relevance"].astype(int).values
group_all = full.groupby("Request_ID").size().to_list()

# Refit final model with best iteration (71)
import lightgbm as lgb
final_ranker = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    eval_at=[5],
    learning_rate=0.03,     # same as your tuned run
    n_estimators=71,        # lock best iteration
    num_leaves=95,
    min_child_samples=30,
    colsample_bytree=0.8,
    subsample=0.8,
    subsample_freq=2,
    reg_lambda=1.0,
    random_state=42,
)
final_ranker.fit(X_all, y_all, group=group_all)

y_pred = ranker.predict(X_test, num_iteration=ranker.best_iteration_)
print(f"Best iteration: {ranker.best_iteration_}")
print(f"NDCG@3:  {ndcg_grouped(y_test, y_pred, group_test, k=3):.4f}")
print(f"NDCG@5:  {ndcg_grouped(y_test, y_pred, group_test, k=5):.4f}")
print(f"NDCG@10: {ndcg_grouped(y_test, y_pred, group_test, k=10):.4f}")


# Save artifacts
import joblib
joblib.dump(final_ranker, r"D:\Thal-AI\thalcare-AI\backend\ranker_api_aligned.pkl")
for col, le in encoders.items():
    joblib.dump(le, rf"D:\Thal-AI\thalcare-AI\backend\enc_{col}.pkl")
print("Final model + encoders saved.")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000336 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1950
[LightGBM] [Info] Number of data points in the train set: 9467, number of used features: 15
Best iteration: 71
NDCG@3:  0.6714
NDCG@5:  0.6996
NDCG@10: 0.8122
Final model + encoders saved.
