In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# NOTE(review): with no category argument this filter ignores every warning
# category for the rest of the session. That presumably also hides solver
# convergence warnings from the models fitted later -- confirm this is intended,
# or narrow the filter to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

# Model imports
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Set random seed for reproducibility
# This seeds NumPy's legacy global RNG only; the Random Forest models in this
# notebook additionally pass random_state=42 explicitly.
np.random.seed(42)

In [None]:
# =============================================================================
# STEP 5: BUILD NAIVE BASELINE MODELS
# =============================================================================
# This cell only defines the two baseline forecasters; nothing is fitted or
# scored here. They are called per fold in the STEP 6 cross-validation loop.
print("\n[STEP 5] Building naive baseline models...")

def naive_forecast(y_train, y_test):
    """Naive baseline: repeat the last observed training value.

    Parameters
    ----------
    y_train : pandas Series or any sequence / array-like
        Training targets in the order they were observed. Only the final
        element (by position) is used.
    y_test : sized object
        Only ``len(y_test)`` is used, to decide how many predictions to emit.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(y_test)`` filled with the last training value.

    Raises
    ------
    IndexError
        If ``y_train`` is empty, since there is no "last value" to repeat.
    """
    if len(y_train) == 0:
        raise IndexError("naive_forecast needs at least one training observation")

    # pandas objects must be indexed positionally via .iloc (their [] operator is
    # label-based); plain lists / arrays are handled through NumPy.
    if hasattr(y_train, "iloc"):
        last_value = y_train.iloc[-1]
    else:
        last_value = np.asarray(y_train)[-1]

    return np.full(len(y_test), last_value)

def seasonal_naive_forecast(daily_street_train, daily_street_test):
    """Seasonal baseline: historical mean for the same street and day of week.

    For every test row the prediction is the mean ``TicketCount`` over ALL
    training rows that share its ``StreetName`` and ``DayOfWeek`` (not just the
    previous week). If the training data has no such rows, the overall mean
    training ``TicketCount`` is used instead.

    Parameters
    ----------
    daily_street_train : pandas.DataFrame
        Must contain ``StreetName``, ``DayOfWeek`` and ``TicketCount`` columns.
    daily_street_test : pandas.DataFrame
        Must contain ``StreetName`` and ``DayOfWeek`` columns.

    Returns
    -------
    numpy.ndarray
        One float prediction per test row, in test-row order.
    """
    if len(daily_street_test) == 0:
        return np.array([])

    # Aggregate the training data once instead of re-filtering the whole frame
    # for every test row. observed=True keeps only combinations that actually
    # occur, so unseen (street, day) pairs fall through to the fallback below.
    group_means = (
        daily_street_train
        .groupby(['StreetName', 'DayOfWeek'], observed=True)['TicketCount']
        .mean()
        .to_dict()
    )
    fallback = daily_street_train['TicketCount'].mean()

    test_keys = zip(daily_street_test['StreetName'], daily_street_test['DayOfWeek'])
    predictions = [group_means.get(key, fallback) for key in test_keys]

    return np.array(predictions, dtype=float)

In [None]:
# =============================================================================
# STEP 6: TRAIN AND EVALUATE MODELS WITH TIME SERIES CROSS-VALIDATION
# =============================================================================
print("\n[STEP 6] Training and evaluating models with time series cross-validation...")


def _rmse_and_mape(y_true, y_pred):
    """Return ``(RMSE, MAPE in percent)`` for one set of predictions."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    return rmse, mape


# Folds are cut by row position, so X, y and daily_street must describe the same
# rows in the same order; otherwise the seasonal baseline would be scored against
# targets from different rows without any error being raised.
assert len(X) == len(y) == len(daily_street), (
    "X, y and daily_street must have the same number of rows "
    f"(got {len(X)}, {len(y)}, {len(daily_street)})"
)
# NOTE(review): TimeSeriesSplit always trains on earlier row positions and tests
# on later ones, so it is only a temporal split if the rows are sorted by date --
# confirm where X is built.
# NOTE(review): MAPE divides by the true value; if y can contain zeros the
# reported percentages will be dominated by those rows -- confirm.

# Use TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Storage for results (one entry per fold; read by the STEP 7 summary)
naive_rmse_scores = []
naive_mape_scores = []
seasonal_rmse_scores = []
seasonal_mape_scores = []
poisson_rmse_scores = []
poisson_mape_scores = []
rf_rmse_scores = []
rf_mape_scores = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
    print(f"\n--- Fold {fold} ---")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    daily_train = daily_street.iloc[train_idx]
    daily_test = daily_street.iloc[test_idx]

    # Naive baseline
    naive_pred = naive_forecast(y_train, y_test)
    naive_rmse, naive_mape = _rmse_and_mape(y_test, naive_pred)
    naive_rmse_scores.append(naive_rmse)
    naive_mape_scores.append(naive_mape)

    # Seasonal naive baseline
    seasonal_pred = seasonal_naive_forecast(daily_train, daily_test)
    seasonal_rmse, seasonal_mape = _rmse_and_mape(y_test, seasonal_pred)
    seasonal_rmse_scores.append(seasonal_rmse)
    seasonal_mape_scores.append(seasonal_mape)

    # Poisson Regression
    poisson_model = PoissonRegressor(max_iter=500)
    poisson_model.fit(X_train, y_train)
    poisson_pred = poisson_model.predict(X_test)
    poisson_rmse, poisson_mape = _rmse_and_mape(y_test, poisson_pred)
    poisson_rmse_scores.append(poisson_rmse)
    poisson_mape_scores.append(poisson_mape)

    # Random Forest Regressor
    rf_model = RandomForestRegressor(n_estimators=100, max_depth=10,
                                     min_samples_split=10, random_state=42, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    rf_rmse, rf_mape = _rmse_and_mape(y_test, rf_pred)
    rf_rmse_scores.append(rf_rmse)
    rf_mape_scores.append(rf_mape)

    print(f"Naive RMSE: {naive_rmse:.3f}, MAPE: {naive_mape:.2f}%")
    print(f"Seasonal Naive RMSE: {seasonal_rmse:.3f}, MAPE: {seasonal_mape:.2f}%")
    print(f"Poisson RMSE: {poisson_rmse:.3f}, MAPE: {poisson_mape:.2f}%")
    print(f"Random Forest RMSE: {rf_rmse:.3f}, MAPE: {rf_mape:.2f}%")

In [None]:
# =============================================================================
# STEP 7: SUMMARIZE RESULTS
# =============================================================================
print("\n" + "="*80)
print("CROSS-VALIDATION RESULTS SUMMARY")
print("="*80)

# Per-model (RMSE scores, MAPE scores) collected per fold; insertion order
# fixes the row order of the summary table.
_cv_scores = {
    'Naive Baseline': (naive_rmse_scores, naive_mape_scores),
    'Seasonal Naive': (seasonal_rmse_scores, seasonal_mape_scores),
    'Poisson Regression': (poisson_rmse_scores, poisson_mape_scores),
    'Random Forest': (rf_rmse_scores, rf_mape_scores),
}

results_df = pd.DataFrame({
    'Model': list(_cv_scores),
    'Avg RMSE': [np.mean(rmse) for rmse, mape in _cv_scores.values()],
    'Std RMSE': [np.std(rmse) for rmse, mape in _cv_scores.values()],
    'Avg MAPE (%)': [np.mean(mape) for rmse, mape in _cv_scores.values()],
    'Std MAPE (%)': [np.std(mape) for rmse, mape in _cv_scores.values()],
})

print("\n", results_df.to_string(index=False))

# Relative RMSE reduction versus the naive baseline (row 0), in percent.
# Rows 2 and 3 are Poisson Regression and Random Forest respectively.
baseline_rmse = results_df.at[0, 'Avg RMSE']
poisson_improvement = (baseline_rmse - results_df.at[2, 'Avg RMSE']) / baseline_rmse * 100
rf_improvement = (baseline_rmse - results_df.at[3, 'Avg RMSE']) / baseline_rmse * 100

print(f"\n--- IMPROVEMENT OVER NAIVE BASELINE ---")
print(f"Poisson Regression: {poisson_improvement:.2f}% improvement in RMSE")
print(f"Random Forest: {rf_improvement:.2f}% improvement in RMSE")

# The goal counts as met when either model reaches a 15% RMSE improvement.
goal_met = any(gain >= 15 for gain in (poisson_improvement, rf_improvement))
if goal_met:
    print(f"\n✓ SUCCESS: Goal of 15% improvement achieved!")
else:
    print(f"\n⚠ Goal of 15% improvement not fully achieved. Consider feature engineering.")

In [None]:
# =============================================================================
# STEP 8: TRAIN FINAL MODELS ON FULL DATA
# =============================================================================
print("\n[STEP 8] Training final models on full dataset...")

# NOTE(review): despite the step title and the message above, both models are
# fitted on the first 80% of rows only; the last 20% is held out for prediction.
# Positional 80/20 split: earlier rows train, later rows test.
split_idx = int(0.8 * len(X))
X_train_final, X_test_final = X.iloc[:split_idx], X.iloc[split_idx:]
y_train_final, y_test_final = y.iloc[:split_idx], y.iloc[split_idx:]

# Final Poisson model (fit() returns the estimator itself)
final_poisson = PoissonRegressor(max_iter=500).fit(X_train_final, y_train_final)
poisson_final_pred = final_poisson.predict(X_test_final)

# Final Random Forest model
# NOTE(review): n_estimators=200 / max_depth=15 differ from the 100 / 10 used
# in the cross-validation loop, so the CV scores describe a different model.
final_rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_split=10,
    random_state=42,
    n_jobs=-1,
).fit(X_train_final, y_train_final)
rf_final_pred = final_rf.predict(X_test_final)

print("Final models trained successfully.")