# Modeling for E-Commerce Shipping Data

This notebook covers:
1. Data loading and preprocessing pipeline.
2. Data scaling & encoding checks.
3. Pre-Modeling: metrics, thresholds, candidate models, hypotheses.
4. Post-Modeling: training, evaluation, visualization, and summary.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Global plot defaults: seaborn's whitegrid style and a 10x4 default figure size.
# `set_theme` is the preferred name for what `sns.set` aliases.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 4)})

## Load dataset and inspect structure
We load the dataset, separate features and target, and identify numerical and categorical features.

In [None]:
# Load the processed training split, then separate the target from the features.
TARGET_COLUMN = "Reached.on.Time_Y.N"

df = pd.read_csv("../data/processed/train_split.csv")
print(f"Dataset loaded with shape: {df.shape}")
display(df.head())

# Separate X and y
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN]

# Any numeric dtype counts as numerical (not only int64/float64, which would
# miss e.g. int32/float32 columns). Every remaining column is treated as
# categorical so that no feature is left out of both lists: the
# ColumnTransformer built from these lists discards columns it was not given.
numerical_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = [col for col in X.columns if col not in numerical_features]

# NOTE(review): if the CSV still carries a row-identifier column it will be
# picked up as a numerical feature here - confirm and drop it if so.

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

## Build preprocessing pipeline
- Numerical: StandardScaler
- Categorical: OneHotEncoder

Both are combined into a ColumnTransformer.

In [None]:
# Per-type transformers: standardize the numeric columns and one-hot encode the
# categorical ones. Categories not seen during fit are ignored rather than
# raising an error at transform time.
num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
cat_pipeline = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each feature list to its transformer.
column_routes = [
    ("num", num_pipeline, numerical_features),
    ("cat", cat_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

print("Data preprocessing pipeline created.")

## Scaling check (before vs after)

In [None]:
# Scale the numerical columns on their own so that each feature's distribution
# can be compared side by side before and after StandardScaler.
X_num_before = X[numerical_features]
X_num_after = num_pipeline.fit_transform(X_num_before)
X_num_after_df = pd.DataFrame(X_num_after, columns=numerical_features)

# (data, title suffix) for the left and right panel of every figure.
panels = (
    (X_num_before, "Before Scaling"),
    (X_num_after_df, "After Scaling"),
)

for col in numerical_features:
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))
    for ax, (frame, stage) in zip(axes, panels):
        sns.histplot(frame[col], ax=ax, kde=True)
        ax.set_title(f"{col} - {stage}")
    plt.tight_layout()
    plt.show()

## Encoding check: number of columns & sparsity

In [None]:
# Fit a standalone encoder to see how many indicator columns the categorical
# features expand into and how sparse the encoded matrix is.
ohe = OneHotEncoder(handle_unknown="ignore")
X_cat_transformed = ohe.fit_transform(X[categorical_features])
num_new_columns = len(ohe.get_feature_names_out())
print("Number of new columns from categorical encoding:", num_new_columns)

n_rows, n_cols = X_cat_transformed.shape
if n_cols == 0:
    print("No categorical features found, skipping sparsity calc.")
else:
    # Sparsity = 1 - (stored non-zero entries / total cells).
    sparsity = 1 - (X_cat_transformed.nnz / (n_rows * n_cols))
    print(f"Sparsity after encoding: {sparsity:.4f}")

# Run the full preprocessor once to compare the feature count before and after.
# NOTE(review): this fit uses every row of X; any train/test split taken from
# X_transformed afterwards has already seen the held-out rows' statistics.
X_transformed = preprocessor.fit_transform(X)
print("Shape before preprocessing:", X.shape)
print("Shape after preprocessing:", X_transformed.shape)

## Modeling Phase
This section includes both Pre-Modeling and Post-Modeling:

- Metrics & thresholds definition
- Model selection & hypothesis
- Model training & evaluation
- Visualization & summary

In [None]:
# Train-test split
# Split the raw features first and fit the preprocessor on the training rows
# only, so the scaler statistics and encoder categories never see the test
# rows. Fitting on all of X before splitting would leak test-set information
# into training and inflate the reported scores.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)
print("Train shape:", X_train.shape, "| Test shape:", X_test.shape)

# Define metrics (display name -> scoring function taking (y_true, y_score_or_pred))
metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
    "ROC-AUC": roc_auc_score
}
print("Metrics selected:", list(metrics.keys()))

# Define thresholds (minimum acceptable value per metric, keyed like `metrics`)
thresholds = {
    "Accuracy": 0.80,
    "Precision": 0.75,
    "Recall": 0.70,
    "F1 Score": 0.72,
    "ROC-AUC": 0.80
}
print("\nThresholds:")
for m, t in thresholds.items():
    print(f"{m}: ≥ {t}")

# Candidate models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42)
}

# Notes before training
notes = {
    "Logistic Regression": "Simple, interpretable, may struggle with non-linearity.",
    "Random Forest": "Captures non-linear patterns, robust to noise, less interpretable."
}
print("\n📌 Pre-training notes:")
for name, note in notes.items():
    print(f"{name}: {note}")

print("\nHypothesis: Random Forest may outperform Logistic Regression.")

## Model Training, Evaluation & Visualization
We'll train both models, evaluate them on all metrics, plot ROC and Precision-Recall curves, and summarize results.

In [None]:
# Fit every candidate model, score it on the test split, and draw ROC and
# precision-recall curves for the models that expose class probabilities.
results = []
# Model name -> test-set probability of the second class in `model.classes_`.
# Cached so the precision-recall plot does not call predict_proba again.
# NOTE(review): assumes a binary 0/1 target where class 1 is the positive class.
positive_scores = {}

fig_roc, ax_roc = plt.subplots()
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    if y_proba is not None:
        positive_scores[name] = y_proba

    # Store metrics: ROC-AUC is scored on probabilities (NaN when the model
    # has none); every other metric is scored on the hard predictions.
    row = {"Model": name}
    for m, func in metrics.items():
        if m == "ROC-AUC" and y_proba is not None:
            row[m] = func(y_test, y_proba)
        elif m == "ROC-AUC":
            row[m] = np.nan
        else:
            row[m] = func(y_test, y_pred)
    results.append(row)

    # ROC curve
    if y_proba is not None:
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        ax_roc.plot(fpr, tpr, label=f"{name} (AUC={auc(fpr, tpr):.2f})")

ax_roc.plot([0, 1], [0, 1], "k--")  # chance-level diagonal
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("ROC Curves")
if positive_scores:  # legend() warns when there is no labelled curve
    ax_roc.legend()
plt.show()

# Precision-Recall curve
fig_pr, ax_pr = plt.subplots()
for name, y_proba in positive_scores.items():
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    ax_pr.plot(recall, precision, label=name)
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Precision-Recall Curves")
if positive_scores:
    ax_pr.legend()
plt.show()

# Summary table
results_df = pd.DataFrame(results)
print("\nFinal Evaluation Summary:")
display(results_df)

# Threshold check: compare each score with the minimum set in `thresholds`.
# A NaN score compares as False, i.e. it does not meet the minimum.
if not results_df.empty:
    checked_metrics = [m for m in thresholds if m in results_df.columns]
    threshold_check_df = (
        results_df.set_index("Model")[checked_metrics]
        .ge(pd.Series(thresholds)[checked_metrics])
    )
    print("\nThreshold check (True = meets the minimum):")
    display(threshold_check_df)