In [10]:
pip install lightgbm



In [11]:
import pandas as pd
import numpy as np

import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    roc_auc_score,
    classification_report
)

In [12]:
# Load your generated dataset
df = pd.read_csv("pharmacy_dataset_improved_v2.csv")

# Convert date to datetime
df["date"] = pd.to_datetime(df["date"])


In [16]:
df.isnull().sum()

Unnamed: 0,0
record_id,0
date,0
pharmacy_id,0
medicine_id,0
medicine_category,0
pharmacy_location_code,0
supplier_count,0
current_stock_level,0
avg_weekly_sales,0
reorder_quantity,0


In [17]:
df = df.drop_duplicates()

In [18]:
df.describe()

Unnamed: 0,record_id,date,pharmacy_id,medicine_id,medicine_category,pharmacy_location_code,supplier_count,current_stock_level,avg_weekly_sales,reorder_quantity,...,price_change_rate,storage_capacity,previous_shortage_count,dos_per_patient,category_shortage_rate,stock_to_sales_ratio,demand_volatility,seasonal_demand_factor,num_patients,target_stockout
count,3000.0,3000,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,...,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,1500.5,2024-07-04 05:59:31.200000,50.655667,25.063667,2.005333,10.587667,2.276667,98.877333,36.878333,83.63,...,0.04838,299.783333,0.056667,2.09452,0.872133,3.75556,0.505157,1.040933,19.527333,0.963667
min,1.0,2024-01-01 00:00:00,1.0,1.0,0.0,1.0,1.0,0.0,5.0,20.0,...,-0.1,100.0,0.0,0.0,0.0,0.0,0.1,1.0,1.0,0.0
25%,750.75,2024-04-05 00:00:00,25.0,12.0,1.0,6.0,1.0,64.0,25.0,70.75,...,-0.01,201.0,0.0,0.57,1.0,1.6375,0.31,1.0,14.0,1.0
50%,1500.5,2024-07-07 00:00:00,50.0,25.0,2.0,11.0,2.0,98.0,36.0,100.0,...,0.05,302.0,0.0,1.0,1.0,2.68,0.51,1.0,19.0,1.0
75%,2250.25,2024-10-05 00:00:00,76.0,38.0,3.0,16.0,3.0,132.0,48.0,100.0,...,0.1,396.0,0.0,1.91,1.0,4.3,0.7,1.0,25.0,1.0
max,3000.0,2024-12-31 00:00:00,100.0,50.0,4.0,20.0,4.0,278.0,97.0,100.0,...,0.2,500.0,2.0,170.8,1.0,38.2,0.9,1.4,45.0,1.0
std,866.169729,,28.759847,14.624143,1.40948,5.721004,1.007535,48.803301,16.2798,23.781231,...,0.07572,114.179391,0.239739,5.716402,0.274509,4.068805,0.227979,0.10306,7.873283,0.187149


In [22]:
df.corr()["target_stockout"].sort_values(ascending=False)

Unnamed: 0,target_stockout
target_stockout,1.0
category_shortage_rate,0.25743
record_id,0.156713
supplier_delay_frequency,0.111628
avg_weekly_sales,0.10438
reorder_quantity,0.093851
num_patients,0.07456
previous_shortage_count,0.045904
seasonal_demand_factor,0.039101
demand_volatility,0.030261


In [23]:
# Sort chronologically to avoid data leakage
df = df.sort_values("date").reset_index(drop=True)

In [24]:
categorical_cols = ["medicine_category"]

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [25]:
TARGET = "target_stockout"

FEATURES = [
    # Identifiers (keep pharmacy & medicine IDs)
    "pharmacy_id",
    "medicine_id",

    # Core inventory & demand features
    "current_stock_level",
    "avg_weekly_sales",
    "reorder_quantity",
    "lead_time_days",
    "supplier_count",
    "supplier_delay_frequency",

    # Economic & capacity features
    "price_change_rate",
    "storage_capacity",

    # Research-based engineered features (Pall et al.)
    "previous_shortage_count",
    "dos_per_patient",
    "category_shortage_rate",
    "stock_to_sales_ratio",
    "demand_volatility",
    "seasonal_demand_factor",
    "num_patients",

    # Contextual features
    "medicine_category",
    "pharmacy_location_code"
]

X = df[FEATURES]
y = df[TARGET]


In [26]:
# Time-based split (example)
train_end = "2024-09-30"
val_end = "2024-10-31"

X_train = X[df["date"] <= train_end]
y_train = y[df["date"] <= train_end]

X_val = X[(df["date"] > train_end) & (df["date"] <= val_end)]
y_val = y[(df["date"] > train_end) & (df["date"] <= val_end)]

X_test = X[df["date"] > val_end]
y_test = y[df["date"] > val_end]

print("Train size:", X_train.shape)
print("Validation size:", X_val.shape)
print("Test size:", X_test.shape)


Train size: (2200, 19)
Validation size: (301, 19)
Test size: (499, 19)


In [27]:
model = lgb.LGBMClassifier(
    objective="binary",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    class_weight="balanced"  # IMPORTANT for shortage imbalance
)


In [29]:
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="f1"
)



[LightGBM] [Info] Number of positive: 2120, number of negative: 80
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000870 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1665
[LightGBM] [Info] Number of data points in the train set: 2200, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [30]:
# Predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print("ðŸ“Š TEST SET PERFORMANCE")
print(f"Accuracy : {accuracy:.3f}")
print(f"F1-score : {f1:.3f}")
print(f"Recall   : {recall:.3f}")
print(f"ROC-AUC  : {roc_auc:.3f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


ðŸ“Š TEST SET PERFORMANCE
Accuracy : 0.988
F1-score : 0.994
Recall   : 0.996
ROC-AUC  : 0.993

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.76      0.81        17
           1       0.99      1.00      0.99       482

    accuracy                           0.99       499
   macro avg       0.93      0.88      0.90       499
weighted avg       0.99      0.99      0.99       499



In [32]:


y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.67      0.70        12
           1       0.99      0.99      0.99       289

    accuracy                           0.98       301
   macro avg       0.86      0.83      0.84       301
weighted avg       0.98      0.98      0.98       301

