# NYC Delivery Promise Engine: ETA Prediction & Delay Classification Models

## Notebook Overview
This notebook trains machine learning models for delivery promise optimization:
1. **ETA Regression Model**: Predicts expected trip duration, which serves as the typical-case (P50) estimate
2. **Delay Classification Model**: Predicts probability of exceeding P90 threshold
3. **Model Evaluation**: Performance metrics and feature importance analysis

### Business Context
- **ETA Model**: Provides realistic delivery time estimates
- **Delay Model**: Identifies high-risk deliveries for dynamic promise adjustments
- **Combined**: Enables P50 vs P90 promise strategy optimization


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, roc_auc_score, precision_recall_curve
from sklearn.preprocessing import LabelEncoder
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

# Notebook banner.
print("🤖 NYC Delivery Promise Engine - Model Training")
print("=" * 60)

# Read the train/test CSVs written by the feature-engineering notebook.
# Both files must load before any record counts are reported; a missing
# file prints a pointer to the prerequisite notebook and then re-raises.
print("Loading processed datasets...")
train_csv = '../data/processed_train_data.csv'
test_csv = '../data/processed_test_data.csv'
try:
    df_train = pd.read_csv(train_csv)
    df_test = pd.read_csv(test_csv)
except FileNotFoundError:
    print("❌ Processed data not found. Please run 02_feature_engineering.ipynb first.")
    raise
else:
    print(f"✅ Training data: {len(df_train):,} records")
    print(f"✅ Test data: {len(df_test):,} records")


## 1. Feature Preparation


In [None]:
# Build the model inputs: pick the feature columns, integer-encode the
# categorical ones, and slice out X/y for the train and test frames.
print("\n🔧 Feature Preparation")
print("-" * 30)

# Columns used as model inputs.
numeric_features = ['trip_distance', 'pickup_hour', 'pickup_dow']
categorical_features = ['pickup_borough', 'dropoff_borough']

# One LabelEncoder per categorical column, keyed by the column name.
le_dict = {}
for feature in categorical_features:
    encoded_name = f'{feature}_encoded'

    # The encoder is fitted on the distinct values seen in train AND test so
    # that both frames share one value -> integer mapping.
    all_levels = pd.concat([df_train[feature], df_test[feature]]).unique()
    encoder = LabelEncoder().fit(all_levels)

    # Adds the '<feature>_encoded' column to each frame in place.
    for frame in (df_train, df_test):
        frame[encoded_name] = encoder.transform(frame[feature])

    le_dict[feature] = encoder
    print(f"✅ Encoded {feature}: {len(encoder.classes_)} categories")

# Final input columns: numeric features first, then the encoded categoricals.
feature_cols = [*numeric_features, *(f'{name}_encoded' for name in categorical_features)]
target_col = 'trip_duration_minutes'

print(f"\n📊 Model Features: {feature_cols}")
print(f"🎯 Target: {target_col}")

# Independent copies so later changes to X/y do not touch the source frames.
X_train, y_train = df_train[feature_cols].copy(), df_train[target_col].copy()
X_test, y_test = df_test[feature_cols].copy(), df_test[target_col].copy()

print(f"\n✅ Feature matrices prepared:")
print(f"Training: {X_train.shape}, Test: {X_test.shape}")


## 2. ETA Regression Model


In [None]:
# Train ETA regression model
#
# Fits a random forest on X_train/y_train, reports MAE on the test set,
# calibrates a P90 offset, reports its test-set coverage, prints feature
# importances, and saves the model plus the label encoders.
import os  # cell-local: only needed to create the artifact directory below

print("\n📈 Training ETA Regression Model")
print("-" * 40)

# Initialize Random Forest regressor.
# oob_score=True additionally stores an out-of-bag prediction for every
# training row (prediction from the trees that did not see that row). Those
# predictions are used below to calibrate the P90 offset without touching
# the test set.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=100,
    oob_score=True,
    random_state=42,
    n_jobs=-1
)

print("Training model...")
rf_regressor.fit(X_train, y_train)
print("✅ ETA model training completed!")

# Make predictions
y_pred = rf_regressor.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
print(f"\n📊 ETA Model Performance:")
print(f"Mean Absolute Error: {mae:.2f} minutes")

# Calculate P90 coverage (important for delivery promises).
#
# The offset must be calibrated on data other than the rows it is evaluated
# on: taking the 90th percentile of the *test* residuals and then checking
# coverage on the same test rows yields ~90% by construction and says nothing
# about how the promise generalizes. Out-of-bag training residuals give an
# honest calibration set; coverage is then measured on the untouched test set.
calibration_residuals = y_train - rf_regressor.oob_prediction_
p90_adjustment = np.percentile(calibration_residuals, 90)

# Test-set residuals (actual - predicted), kept for downstream analysis.
residuals = y_test - y_pred
y_pred_p90 = y_pred + p90_adjustment
p90_coverage = (y_test <= y_pred_p90).mean()

print(f"P90 Coverage: {p90_coverage:.1%} (target: ~90%)")

# Model statistics
print(f"\n📈 Prediction Quality:")
print(f"Actual - Mean: {y_test.mean():.1f}, Median: {y_test.median():.1f}")
print(f"Predicted - Mean: {y_pred.mean():.1f}, Median: {np.median(y_pred):.1f}")

# Feature importance, most important first
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_regressor.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n🔍 Top 5 Feature Importance:")
for row in feature_importance.head(5).itertuples(index=False):
    print(f"{row.feature:20s}: {row.importance:.3f}")

# Save model
print(f"\n💾 Saving ETA model...")
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs('../artifacts/models', exist_ok=True)
joblib.dump(rf_regressor, '../artifacts/models/eta_model.pkl')
joblib.dump(le_dict, '../artifacts/models/label_encoders.pkl')
print("✅ Model saved to artifacts/models/")
