In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    roc_auc_score,
    classification_report
)

In [4]:
# Read the raw CSV, parse the "date" column into datetimes, and order the rows
# chronologically with a fresh 0..n-1 index. The later train/validation/test
# split is time-based, so the frame is kept in date order.
df = (
    pd.read_csv("pharmacy_dataset_improved_v2.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date", ignore_index=True)
)


In [5]:
# Columns holding category labels that must be converted to integer codes
# before they can be fed to the model.
categorical_cols = ["medicine_category"]

# Fitted encoders, keyed by column name. Keeping them allows the integer codes
# to be mapped back to the original labels (`inverse_transform`) and the same
# mapping to be applied to new data (`transform`).
label_encoders = {}

# NOTE: each encoder is fitted on the full frame, i.e. before the
# chronological split, so every category present anywhere in the data
# receives a code.
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [6]:
# Label column predicted by the model.
TARGET = "target_stockout"

# Model input columns. The order here is the column order of X, and therefore
# the order of the fitted scaler statistics and model coefficients.
FEATURES = [
    "pharmacy_id", "medicine_id",
    "current_stock_level", "avg_weekly_sales", "reorder_quantity",
    "lead_time_days", "supplier_count", "supplier_delay_frequency",
    "price_change_rate", "storage_capacity", "previous_shortage_count",
    "dos_per_patient", "category_shortage_rate", "stock_to_sales_ratio",
    "demand_volatility", "seasonal_demand_factor", "num_patients",
    "medicine_category", "pharmacy_location_code",
]

# Design matrix and label vector; both keep df's row index.
X = df.loc[:, FEATURES]
y = df.loc[:, TARGET]


In [7]:
# Chronological split boundaries (both are inclusive upper bounds):
#   train      : date <= train_end
#   validation : train_end < date <= val_end
#   test       : date > val_end
train_end = "2024-09-30"
val_end = "2024-10-31"

# Build each boolean mask once so X and y are always sliced with the same rows.
train_mask = df["date"] <= train_end
val_mask = (df["date"] > train_end) & (df["date"] <= val_end)
test_mask = df["date"] > val_end

X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask], y[val_mask]
X_test, y_test = X[test_mask], y[test_mask]

# A row with a missing date (NaT) compares False against every boundary and
# would silently vanish from all three splits. Fail loudly instead.
n_assigned = len(X_train) + len(X_val) + len(X_test)
assert n_assigned == len(X), (
    f"{len(X) - n_assigned} rows fall outside every split (missing dates?)"
)

print("Train:", X_train.shape)
print("Validation:", X_val.shape)
print("Test:", X_test.shape)


Train: (2238, 19)
Validation: (259, 19)
Test: (503, 19)


In [9]:
# Standardization: the scaler is fitted on the training split only, and that
# same fitted transform is then applied to all three splits, so no validation
# or test rows contribute to the scaling parameters.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


In [10]:
# Hyperparameters for the logistic-regression model.
logreg_params = {
    "max_iter": 1000,
    "class_weight": "balanced",   # VERY important for imbalance
    "solver": "lbfgs",
    "random_state": 42,
}
log_model = LogisticRegression(**logreg_params)

# Train on the standardized training split only.
log_model.fit(X_train_scaled, y_train)


In [11]:
# Score the validation split: hard class predictions for the classification
# report, and the second predict_proba column (the second entry of
# log_model.classes_) as the score for ROC-AUC.
y_val_pred = log_model.predict(X_val_scaled)
y_val_proba = log_model.predict_proba(X_val_scaled)[:, 1]

# The header emoji was mojibake (UTF-8 bytes decoded as cp1252); restored.
print("📊 VALIDATION PERFORMANCE")
print(classification_report(y_val, y_val_pred))
print("ROC-AUC:", roc_auc_score(y_val, y_val_proba))


📊 VALIDATION PERFORMANCE
              precision    recall  f1-score   support

           0       0.24      1.00      0.38         8
           1       1.00      0.90      0.95       251

    accuracy                           0.90       259
   macro avg       0.62      0.95      0.66       259
weighted avg       0.98      0.90      0.93       259

ROC-AUC: 0.9825697211155379


In [12]:
# Score the held-out test split: hard class predictions for the threshold
# metrics, and the second predict_proba column (the second entry of
# log_model.classes_) as the score for ROC-AUC.
y_test_pred = log_model.predict(X_test_scaled)
y_test_proba = log_model.predict_proba(X_test_scaled)[:, 1]

# f1_score and recall_score are called with their defaults, so they report
# the metric for label 1 only; the per-class breakdown is in the
# classification report printed below.
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_proba)

# The header emoji was mojibake (UTF-8 bytes decoded as cp1252); restored.
print("📊 TEST SET PERFORMANCE")
print(f"Accuracy : {accuracy:.3f}")
print(f"F1-score : {f1:.3f}")
print(f"Recall   : {recall:.3f}")
print(f"ROC-AUC  : {roc_auc:.3f}")

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))


📊 TEST SET PERFORMANCE
Accuracy : 0.891
F1-score : 0.939
Recall   : 0.889
ROC-AUC  : 0.957

Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.92      0.44        24
           1       1.00      0.89      0.94       479

    accuracy                           0.89       503
   macro avg       0.64      0.90      0.69       503
weighted avg       0.96      0.89      0.92       503

