# Passenger Demand Forecasting Model Development

This notebook contains the development and training of the XGBoost model for passenger demand forecasting at jeepney stops.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Custom modules
import sys
sys.path.append('../')
from data_generator import PassengerDataGenerator
from ml_pipeline import PassengerForecastingModel

# Set style
# plt.style.use() loads a matplotlib style sheet; sns.set_palette() then makes
# seaborn's "husl" palette the default colour cycle. Keep this order: a style
# sheet applied after set_palette() could replace the colour cycle again.
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a recent
# matplotlib release (3.6+) — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Marker so a reader can see the setup cell ran to completion.
print("Libraries imported successfully!")

## 1. Data Generation and Exploration

In [None]:
# Build the synthetic passenger-demand dataset used by the rest of the notebook
print("Generating synthetic passenger demand data...")
generator = PassengerDataGenerator()

# 60,000 records covering the two years 2023-01-01 .. 2024-12-31
df = generator.generate_dataset(
    '2023-01-01',
    '2024-12-31',
    60000,
)

# Quick overview of what was generated, emitted as one block of text
overview_lines = [
    f"Dataset shape: {df.shape}",
    f"Date range: {df['datetime'].min()} to {df['datetime'].max()}",
    f"Unique stops: {df['stop_name'].nunique()}",
    f"Average passenger count: {df['passenger_count'].mean():.2f}",
]
print("\n".join(overview_lines))

# Last expression of the cell: rich preview of the first rows
df.head()

In [None]:
# Basic statistics
# Each entry is (heading, width of the "=" underline, summary to print).
report_sections = (
    ("Dataset Statistics:", 50, df.describe()),
    ("\nStop Types Distribution:", 30, df['stop_type'].value_counts()),
    ("\nPassenger Count Distribution:", 30, df['passenger_count'].describe()),
)

for heading, rule_width, summary in report_sections:
    print(heading)
    print("=" * rule_width)
    print(summary)

## 2. Data Visualization

In [None]:
# Create visualizations
# 2x2 dashboard: distribution, hourly profile, weekday profile, stop-type ranking
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_dist, ax_hourly), (ax_daily, ax_stops) = axes

# 1. Passenger count distribution
ax_dist.hist(
    df['passenger_count'],
    bins=50,
    alpha=0.7,
    color='skyblue',
    edgecolor='black',
)
ax_dist.set(
    title='Passenger Count Distribution',
    xlabel='Passenger Count',
    ylabel='Frequency',
)

# 2. Hourly passenger patterns
hourly_avg = df.groupby('hour_of_day')['passenger_count'].mean()
ax_hourly.plot(hourly_avg.index, hourly_avg.values, marker='o', linewidth=2, markersize=6)
ax_hourly.set(
    title='Average Passengers by Hour of Day',
    xlabel='Hour of Day',
    ylabel='Average Passengers',
)
ax_hourly.grid(True, alpha=0.3)

# 3. Daily patterns (labels are matched to the groupby result by position)
daily_avg = df.groupby('day_of_week')['passenger_count'].mean()
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
weekday_colours = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8', '#F7DC6F', '#BB8FCE']
ax_daily.bar(day_names, daily_avg.values, color=weekday_colours)
ax_daily.set(
    title='Average Passengers by Day of Week',
    xlabel='Day of Week',
    ylabel='Average Passengers',
)

# 4. Stop type analysis, sorted by descending mean demand
stop_type_avg = (
    df.groupby('stop_type')['passenger_count']
    .mean()
    .sort_values(ascending=False)
)
ax_stops.barh(stop_type_avg.index, stop_type_avg.values, color='lightcoral')
ax_stops.set(
    title='Average Passengers by Stop Type',
    xlabel='Average Passengers',
)

plt.tight_layout()
plt.show()

print("Visualizations completed!")

In [None]:
# Feature correlation analysis
# Model inputs plus the target, so the heatmap shows input/target correlations too
feature_cols = [
    'hour_of_day',
    'day_of_week',
    'is_weekend',
    'is_public_holiday',
    'is_school_dismissal_time',
    'is_hightide',
    'lag_1_hour_demand',
    'lag_24_hour_demand',
    'rolling_3_hour_avg_demand',
    'rolling_6_hour_avg_demand',
    'hour_sin',
    'hour_cos',
    'day_of_week_sin',
    'day_of_week_cos',
    'passenger_count',
]

correlation_matrix = df[feature_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

print("Correlation analysis completed!")

## 3. Feature Engineering

In [None]:
# Feature engineering
print("Feature Engineering:")
print("=" * 20)

# Define feature columns
# The order of this list is the column order the model is trained on.
FEATURES = [
    'hour_of_day',
    'day_of_week',
    'is_weekend',
    'is_public_holiday',
    'is_school_dismissal_time',
    'is_hightide',
    'lag_1_hour_demand',
    'lag_24_hour_demand',
    'rolling_3_hour_avg_demand',
    'rolling_6_hour_avg_demand',
    'hour_sin',
    'hour_cos',
    'day_of_week_sin',
    'day_of_week_cos'
]

print(f"Total features: {len(FEATURES)}")
print(f"Features: {FEATURES}")

# Check for missing values
print("\nMissing values:")
print(df[FEATURES].isnull().sum())

# Fill any remaining missing values: forward-fill first, then back-fill to
# cover gaps at the very start of the frame.
# `fillna(method=...)` is deprecated in recent pandas releases; the dedicated
# `.ffill()` / `.bfill()` methods do the same thing and keep working.
# NOTE(review): the fill follows the current row order of `df`, so values can
# be carried across different stops — confirm that is acceptable.
df[FEATURES] = df[FEATURES].ffill().bfill()

print("\nFeature engineering completed!")

## 4. Model Training

In [None]:
# Prepare data for training
print("Preparing data for training...")

# Fraction of the most recent observations held out for testing.
TEST_SIZE = 0.2

# Order the records chronologically before splitting. The feature set holds
# lagged and rolling demand values, so a random shuffle would place
# neighbouring hours on both sides of the split and leak target information
# into the test set, inflating the reported scores.
df_sorted = df.sort_values('datetime', kind='stable')

# Features and target (row-aligned because both come from df_sorted)
X = df_sorted[FEATURES].astype(float)  # ensure all features are numeric
y = df_sorted['passenger_count'].copy()

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split data: the earliest rows train the model and the latest rows test it.
# The test size is rounded up, matching what train_test_split(test_size=0.2)
# would have produced, so downstream sample counts are unchanged.
n_test = int(np.ceil(len(X) * TEST_SIZE))
split_at = len(X) - n_test
X_train, X_test = X.iloc[:split_at], X.iloc[split_at:]
y_train, y_test = y.iloc[:split_at], y.iloc[split_at:]

# Sanity check: the split must partition the data without loss or overlap.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

print("Data preparation completed!")

In [None]:
# Train XGBoost model
print("Training XGBoost model...")

# Hyperparameters for the gradient-boosted regressor: squared-error objective,
# 1000 trees of depth 8, with row and column subsampling and a fixed seed.
params = dict(
    objective='reg:squarederror',
    n_estimators=1000,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

# Create the estimator and fit it on the training split
model = xgb.XGBRegressor(**params)
model.fit(X=X_train, y=y_train)

print("Model training completed!")

## 5. Model Evaluation

In [None]:
# Make predictions
print("Making predictions...")

# Score both splits with the fitted model: training first, then test
y_train_pred, y_test_pred = (
    model.predict(split_features) for split_features in (X_train, X_test)
)

# Calculate metrics
def calculate_metrics(y_true, y_pred, dataset_name):
    """Print and return R², MAE and RMSE for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Model predictions aligned with ``y_true``.
        dataset_name: Label used in the printed heading (e.g. "Training").

    Returns:
        dict with keys ``'r2'``, ``'mae'`` and ``'rmse'``.
    """
    r2, mae, rmse = (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),  # RMSE = sqrt(MSE)
    )

    report = (
        f"\n{dataset_name} Metrics:",
        f"R² Score: {r2:.4f}",
        f"MAE: {mae:.4f}",
        f"RMSE: {rmse:.4f}",
    )
    print("\n".join(report))

    return dict(r2=r2, mae=mae, rmse=rmse)

# Training metrics first, then test metrics (evaluated and printed in that order)
train_metrics, test_metrics = (
    calculate_metrics(actual, predicted, label)
    for actual, predicted, label in (
        (y_train, y_train_pred, "Training"),
        (y_test, y_test_pred, "Test"),
    )
)

print("\nModel evaluation completed!")

In [None]:
# Feature importance
# Pair each training column with the importance score reported by the model
feature_names = FEATURES
feature_importance = model.feature_importances_

# Table ranked from most to least important
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
)
importance_df = importance_df.sort_values('importance', ascending=False)

print("Feature Importance:")
print(importance_df)

# Horizontal bar chart of the ranking
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(importance_df['feature'], importance_df['importance'])
ax.set_title('XGBoost Feature Importance')
ax.set_xlabel('Importance')
plt.tight_layout()
plt.show()

print("Feature importance analysis completed!")

## 6. Model Validation

In [None]:
# Prediction vs Actual plots
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# One panel per split: (axis, actual values, predictions, marker colour, title)
panels = (
    (axes[0], y_train, y_train_pred, 'blue',
     f'Training Predictions (R² = {train_metrics["r2"]:.4f})'),
    (axes[1], y_test, y_test_pred, 'green',
     f'Test Predictions (R² = {test_metrics["r2"]:.4f})'),
)

for ax, actual, predicted, colour, title in panels:
    ax.scatter(actual, predicted, alpha=0.5, color=colour)
    # Dashed red diagonal marks perfect agreement between actual and predicted
    lowest, highest = actual.min(), actual.max()
    ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    ax.set_xlabel('Actual Passengers')
    ax.set_ylabel('Predicted Passengers')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Validation plots completed!")

In [None]:
# Cross-validation
print("Performing cross-validation...")

# 5-fold R² on the training split; cross_val_score refits a clone per fold
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)

cv_report = (
    f"Cross-validation R² scores: {cv_scores}",
    f"Mean CV R² score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})",
)
print("\n".join(cv_report))

print("Cross-validation completed!")

## 7. Model Persistence

In [None]:
# Save model
# NOTE(review): this import belongs in the notebook's import cell; it is kept
# here so the cell still runs on its own.
import pickle

# Single source of truth for the artifact location, so the path reported below
# always matches the file that was actually written.
MODEL_PATH = '../passenger_forecasting_model.pkl'

# Bundle the fitted model with the metadata needed to use and audit it:
# training column order, evaluation metrics and feature-importance table.
model_data = {
    'model': model,
    'feature_columns': FEATURES,
    'train_metrics': train_metrics,
    'test_metrics': test_metrics,
    'feature_importance': importance_df
}

# Pickle files can execute arbitrary code when loaded; only load this artifact
# from trusted locations.
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(model_data, f)

print("Model saved successfully!")
print(f"Model file: {MODEL_PATH}")
print(f"Final Test R² Score: {test_metrics['r2']:.4f}")
print(f"Final Test MAE: {test_metrics['mae']:.4f}")
print(f"Final Test RMSE: {test_metrics['rmse']:.4f}")

## 8. Sample Predictions

In [None]:
# Generate sample predictions for different scenarios
print("Generating sample predictions...")

# Illustrative recent-demand values shared by every scenario; they are
# example placeholders, not observations.
EXAMPLE_DEMAND = {
    'lag_1_hour_demand': 15,
    'lag_24_hour_demand': 12,
    'rolling_3_hour_avg_demand': 14,
    'rolling_6_hour_avg_demand': 13,
}

# Sample scenarios
scenarios = [
    {'hour': 8, 'day': 1, 'weekend': 0, 'holiday': 0, 'school': 1, 'tide': 0, 'desc': 'School dismissal time on Tuesday'},
    {'hour': 17, 'day': 5, 'weekend': 1, 'holiday': 0, 'school': 0, 'tide': 1, 'desc': 'Weekend evening with high tide'},
    {'hour': 12, 'day': 2, 'weekend': 0, 'holiday': 1, 'school': 0, 'tide': 0, 'desc': 'Lunch time on holiday'},
]

for scenario in scenarios:
    # Cyclical encodings use a 24-hour and a 7-day period
    hour_angle = 2 * np.pi * scenario['hour'] / 24
    day_angle = 2 * np.pi * scenario['day'] / 7

    # Build the inputs by feature name rather than by position, so the row
    # cannot silently drift out of sync with the column order in FEATURES.
    feature_values = {
        'hour_of_day': scenario['hour'],
        'day_of_week': scenario['day'],
        'is_weekend': scenario['weekend'],
        'is_public_holiday': scenario['holiday'],
        'is_school_dismissal_time': scenario['school'],
        'is_hightide': scenario['tide'],
        **EXAMPLE_DEMAND,
        'hour_sin': np.sin(hour_angle),
        'hour_cos': np.cos(hour_angle),
        'day_of_week_sin': np.sin(day_angle),
        'day_of_week_cos': np.cos(day_angle),
    }

    # Selecting by FEATURES raises KeyError if the training feature list gains
    # a column this cell does not supply, instead of predicting on a
    # misaligned vector.
    features = pd.DataFrame(
        [[feature_values[name] for name in FEATURES]],
        columns=FEATURES,
        dtype=float,
    )

    prediction = model.predict(features)[0]

    # Report a whole, non-negative head count: round to nearest instead of
    # truncating, and clip because a regressor can output values below zero.
    predicted_passengers = max(0, int(round(float(prediction))))

    print(f"Scenario: {scenario['desc']}")
    print(f"Predicted passengers: {predicted_passengers}")
    print("-" * 50)

print("Sample predictions completed!")

## 9. Summary

In [None]:
# Model summary
# importance_df is sorted by descending importance, so its head is the top five
top_features = zip(
    importance_df['feature'].head(),
    importance_df['importance'].head(),
)

summary_lines = [
    "MODEL TRAINING SUMMARY",
    "=" * 50,
    f"Dataset size: {len(df):,} records",
    f"Features used: {len(FEATURES)}",
    f"Training samples: {len(X_train):,}",
    f"Test samples: {len(X_test):,}",
    "",
    "PERFORMANCE METRICS:",
    f"Test R² Score: {test_metrics['r2']:.4f}",
    f"Test MAE: {test_metrics['mae']:.4f}",
    f"Test RMSE: {test_metrics['rmse']:.4f}",
    "",
    "TOP 5 MOST IMPORTANT FEATURES:",
    *(
        f"{i+1}. {feature}: {importance:.4f}"
        for i, (feature, importance) in enumerate(top_features)
    ),
    "",
    "Model ready for deployment!",
]
print("\n".join(summary_lines))