# 08 - Predictive Modeling (Planning & Approach)

This notebook documents the approach planned for the predictive modeling work in Phase 3. In Phase 2 the scope is limited to methodology planning and preparing the datasets the models will use; no models are trained here.

## Objectives
- Document predictive modeling approach
- Plan time series forecasting models (ARIMA, Prophet)
- Plan customer behavior prediction models
- Define evaluation metrics
- Set up initial data preparation for modeling

## Phase 2 Requirements
- ✅ Model planning and approach documentation
- ✅ Methodology selection and justification
- ✅ Evaluation metrics definition
- ✅ Data preparation for modeling
- ⚠️ Full model implementation (deferred to Phase 3)


In [None]:
# Load required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# Notebook-wide display configuration.
# NOTE: the filter below silences every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# Title banner: an 80-character rule above and below the heading.
print("=" * 80, "PREDICTIVE MODELING - PLANNING & APPROACH", "=" * 80, sep="\n")

# Load data
# The CSV location is resolved relative to the current working directory:
# two levels up, then data/raw/.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
data_path = os.path.join(project_root, 'data', 'raw', 'Online Retail.csv')

# Keep the raw frame under its own name so re-running this cell always starts
# from the same input instead of re-filtering an already filtered `df`.
transactions_raw = pd.read_csv(data_path, encoding='latin-1')
# Unparseable dates become NaT and are dropped by the mask below.
transactions_raw['InvoiceDate'] = pd.to_datetime(transactions_raw['InvoiceDate'], errors='coerce')

# One combined row filter (same conditions the step-by-step version applied):
#   - InvoiceNo must not start with 'C' (presumably cancellations — confirm
#     against the dataset description)
#   - Quantity and UnitPrice strictly positive
#   - Description and InvoiceDate present
invoice_starts_with_c = transactions_raw['InvoiceNo'].astype(str).str.startswith('C')
keep_row = (
    ~invoice_starts_with_c
    & (transactions_raw['Quantity'] > 0)
    & (transactions_raw['UnitPrice'] > 0)
    & transactions_raw['Description'].notna()
    & transactions_raw['InvoiceDate'].notna()
)

# .copy() makes `df` an independent frame, so adding a column below is a
# plain assignment rather than a write onto a filtered slice (the pattern
# whose warning the global warnings filter would otherwise hide).
df = transactions_raw.loc[keep_row].copy()
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']  # value of each row (line item)

print(f"\nDataset loaded: {df.shape[0]:,} transactions")
print(f"Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
print(f"\nNote: This notebook focuses on planning and approach for Phase 3 implementation.")


## Step 1: Predictive Modeling Objectives

Define the key predictive modeling goals for Phase 3.


In [None]:
# Modeling objectives
def _objective(goal, use_case, time_horizon):
    """Return one objective's fields as a dict, in the order they are printed."""
    return {"Goal": goal, "Use Case": use_case, "Time Horizon": time_horizon}


# Maps a numbered objective title to its goal, use case and time horizon.
objectives = {
    "1. Demand Forecasting": _objective(
        goal="Predict future daily/monthly revenue and transaction volumes",
        use_case="Stock planning, inventory optimization",
        time_horizon="Short-term (1-30 days), Medium-term (1-6 months)",
    ),
    "2. Customer Behavior Prediction": _objective(
        goal="Predict customer purchase likelihood and churn risk",
        use_case="Targeted marketing, retention campaigns",
        time_horizon="Next purchase timing, 30/60/90 day churn",
    ),
    "3. Product Demand Prediction": _objective(
        goal="Forecast product-level demand",
        use_case="Product-specific stock allocation",
        time_horizon="Weekly and monthly forecasts",
    ),
    "4. Basket Size Prediction": _objective(
        goal="Predict transaction value and quantity",
        use_case="Revenue forecasting, pricing strategies",
        time_horizon="Next transaction prediction",
    ),
}

# Section banner, then one indented "field: text" line per entry.
print("=" * 80, "PREDICTIVE MODELING OBJECTIVES", "=" * 80, sep="\n")
for heading, fields in objectives.items():
    print(f"\n{heading}:")
    print("\n".join(f"  {label}: {text}" for label, text in fields.items()))

print("\n" + "=" * 80)


## Step 2: Model Selection & Justification

Document planned models and justify their selection.


In [None]:
# Model selection
def _model_card(description, justification, use_case, limitations):
    """Return one model's write-up as a dict, in the order it is printed."""
    return {
        "Description": description,
        "Justification": justification,
        "Use Case": use_case,
        "Limitations": limitations,
    }


# Candidate models grouped by problem family; each entry is a model card.
models = {
    "Time Series Forecasting": {
        "ARIMA": _model_card(
            description="AutoRegressive Integrated Moving Average",
            justification="Handles trend and seasonality, interpretable, good for univariate time series",
            use_case="Daily/monthly revenue forecasting",
            limitations="Requires stationarity, assumes linear relationships",
        ),
        "Prophet": _model_card(
            description="Facebook's time series forecasting tool",
            justification="Handles seasonality, holidays, trend changes automatically, robust to missing data",
            use_case="Revenue forecasting with multiple seasonality patterns",
            limitations="Less interpretable than ARIMA, requires sufficient historical data",
        ),
    },
    "Customer Behavior": {
        "Logistic Regression": _model_card(
            description="Binary classification for churn prediction",
            justification="Interpretable, handles categorical features well, baseline model",
            use_case="Customer churn prediction",
            limitations="Assumes linear relationships, may need feature engineering",
        ),
        "Random Forest": _model_card(
            description="Ensemble method for classification/regression",
            justification="Handles non-linear relationships, feature importance, robust to outliers",
            use_case="Purchase likelihood, basket size prediction",
            limitations="Less interpretable, can overfit with small datasets",
        ),
    },
}

# Section banner, then each family (60-char rule) followed by its model cards.
print("=" * 80, "MODEL SELECTION & JUSTIFICATION", "=" * 80, sep="\n")
for family, cards in models.items():
    print(f"\n{family}:")
    print("=" * 60)
    for model_title, card in cards.items():
        print(f"\n{model_title}:")
        print("\n".join(f"  {label}: {text}" for label, text in card.items()))

print("\n" + "=" * 80)


## Step 3: Data Preparation for Modeling

Prepare time-series and customer-level datasets for modeling.


In [None]:
# Data preparation
print("=" * 80, "DATA PREPARATION FOR MODELING", "=" * 80, sep="\n")

# 1. Time-series data for forecasting
print("\n1. TIME-SERIES DATA PREPARATION:")

# One row per calendar day that has at least one transaction in `df`.
invoice_day = df['InvoiceDate'].dt.date
daily_data = (
    df.groupby(invoice_day)
      .agg(
          DailyRevenue=('TotalPrice', 'sum'),
          DailyQuantity=('Quantity', 'sum'),
          DailyTransactions=('InvoiceNo', 'nunique'),
          DailyCustomers=('CustomerID', 'nunique'),
      )
      .rename_axis('Date')
      .reset_index()
)
# The group keys are plain date objects; convert back to datetimes for merging.
daily_data['Date'] = pd.to_datetime(daily_data['Date'])
daily_data = daily_data.sort_values('Date').reset_index(drop=True)

# Create complete date range
first_day, last_day = daily_data['Date'].min(), daily_data['Date'].max()
date_range = pd.date_range(start=first_day, end=last_day, freq='D')
# Left-merge onto the full calendar; days with no transactions get 0 in every column.
daily_complete = (
    pd.DataFrame({'Date': date_range})
      .merge(daily_data, on='Date', how='left')
      .fillna(0)
)

print(f"  Daily time-series: {len(daily_complete)} days")
print(f"  Date range: {daily_complete['Date'].min()} to {daily_complete['Date'].max()}")
print(f"  Missing days filled: {len(daily_complete) - len(daily_data)}")

# 2. Monthly aggregation
# Same four measures as the daily series, grouped by calendar month.
invoice_month = df['InvoiceDate'].dt.to_period('M')
monthly_data = (
    df.groupby(invoice_month)
      .agg(
          MonthlyRevenue=('TotalPrice', 'sum'),
          MonthlyQuantity=('Quantity', 'sum'),
          MonthlyTransactions=('InvoiceNo', 'nunique'),
          MonthlyCustomers=('CustomerID', 'nunique'),
      )
      .rename_axis('YearMonth')
      .reset_index()
)

# Datetime version of the period, parsed from its string form.
monthly_data['Date'] = pd.to_datetime(monthly_data['YearMonth'].astype(str))
print(f"\n  Monthly time-series: {len(monthly_data)} months")

# 3. Customer-level features for behavior prediction
print("\n2. CUSTOMER-LEVEL FEATURES:")
# Recency is measured from the day after the latest transaction in the data.
reference_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# Named aggregation is used instead of a dict spec: a dict cannot hold two
# entries for 'InvoiceDate' (the later one silently replaces the earlier),
# which dropped the Recency aggregation and left 7 columns for 8 names.
customer_features = (
    df.groupby('CustomerID')
      .agg(
          Frequency=('InvoiceNo', 'nunique'),        # distinct invoices
          TotalSpent=('TotalPrice', 'sum'),          # Monetary
          AvgTransaction=('TotalPrice', 'mean'),     # mean per row of df, not per invoice
          TotalQuantity=('Quantity', 'sum'),
          FirstPurchase=('InvoiceDate', 'min'),
          LastPurchase=('InvoiceDate', 'max'),
      )
      .reset_index()
)

# Recency: whole days between each customer's last purchase and reference_date.
# Inserted right after CustomerID to keep the intended column order.
customer_features.insert(
    1,
    'Recency',
    (reference_date - customer_features['LastPurchase']).dt.days,
)

customer_features['CustomerLifetime'] = (customer_features['LastPurchase'] - customer_features['FirstPurchase']).dt.days
customer_features['AvgDaysBetweenPurchases'] = customer_features['CustomerLifetime'] / customer_features['Frequency']

print(f"  Customer features: {len(customer_features)} customers")
print(f"  Features: Recency, Frequency, Monetary, Lifetime, AvgDaysBetweenPurchases")

# Display sample
# Last ten calendar days of the gap-filled daily series.
recent_days = daily_complete[['Date', 'DailyRevenue', 'DailyTransactions']].tail(10)
print("\nSample Time-Series Data (Last 10 days):")
print(recent_days.to_string(index=False))

# Ten customers with the largest TotalSpent.
top_spenders = customer_features.nlargest(10, 'TotalSpent')
print("\nSample Customer Features (Top 10 by Total Spent):")
print(top_spenders[['CustomerID', 'Recency', 'Frequency', 'TotalSpent']].to_string(index=False))

# Closing banner for the data-preparation section.
print("\n" + "=" * 80, "DATA PREPARATION COMPLETE", "=" * 80, sep="\n")


In [None]:
# Evaluation metrics
# Maps each problem family to its metrics; every metric has a one-line description.
metrics = {
    "Time Series Forecasting": dict([
        ("MAE (Mean Absolute Error)", "Average absolute difference between predicted and actual values"),
        ("RMSE (Root Mean Squared Error)", "Penalizes larger errors more, good for business impact"),
        ("MAPE (Mean Absolute Percentage Error)", "Percentage error, interpretable for stakeholders"),
        ("R² (Coefficient of Determination)", "Proportion of variance explained by the model"),
    ]),
    "Classification (Churn/Purchase Prediction)": dict([
        ("Accuracy", "Overall correctness of predictions"),
        ("Precision", "Proportion of positive predictions that are correct"),
        ("Recall", "Proportion of actual positives correctly identified"),
        ("F1-Score", "Harmonic mean of precision and recall"),
        ("ROC-AUC", "Area under ROC curve, measures classification performance"),
    ]),
    "Regression (Basket Size)": dict([
        ("MAE", "Average absolute error in basket size prediction"),
        ("RMSE", "Penalizes larger errors"),
        ("R²", "Model fit quality"),
    ]),
}

# Section banner, then a bulleted list of metrics under each family.
print("=" * 80, "EVALUATION METRICS", "=" * 80, sep="\n")
for family, definitions in metrics.items():
    print(f"\n{family}:")
    print("-" * 60)
    print("\n".join(f"  • {name}: {meaning}" for name, meaning in definitions.items()))

print("\n" + "=" * 80)


## Step 5: Modeling Approach & Implementation Plan

Document the step-by-step approach for Phase 3 implementation.


In [None]:
# Implementation plan
def _numbered(items, indent=""):
    """Prefix each item with `indent` and its 1-based position ("1. ", "2. ", ...)."""
    return [f"{indent}{position}. {text}" for position, text in enumerate(items, 1)]


# Ordered stages of the modeling work; each stage maps to its numbered steps.
plan = {
    "Phase 1: Baseline Models": _numbered([
        "Implement ARIMA for daily revenue forecasting",
        "Train baseline logistic regression for churn prediction",
        "Evaluate baseline models using defined metrics",
        "Document baseline performance",
    ]),
    "Phase 2: Advanced Models": _numbered([
        "Implement Prophet for revenue forecasting with seasonality",
        "Train Random Forest for purchase likelihood prediction",
        "Compare advanced models with baselines",
        "Feature engineering and hyperparameter tuning",
    ]),
    "Phase 3: Model Validation": _numbered([
        "Time-series cross-validation (walk-forward validation)",
        "Hold-out test set evaluation",
        "Statistical significance testing",
        "Business impact assessment",
    ]),
    "Phase 4: Model Deployment": _numbered([
        "Model serialization and versioning",
        "Prediction pipeline development",
        "Model monitoring framework",
        "Documentation and reporting",
    ]),
}

# Cross-cutting points; stored without numbers, numbered when printed.
considerations = [
    "Train/Test Split: Use temporal split (e.g., last 3 months as test set)",
    "Cross-Validation: Time-series cross-validation to avoid data leakage",
    "Feature Engineering: Create lag features, rolling statistics, temporal features",
    "Model Interpretability: Balance accuracy with interpretability for business stakeholders",
    "Scalability: Ensure models can handle production-scale data",
    "Monitoring: Plan for model performance monitoring and retraining",
]

# Section banner, then each stage with its steps indented beneath it.
print("=" * 80, "MODELING APPROACH & IMPLEMENTATION PLAN", "=" * 80, sep="\n")
for stage, stage_steps in plan.items():
    print(f"\n{stage}:")
    print("-" * 60)
    print("\n".join(f"  {entry}" for entry in stage_steps))

print("\n" + "=" * 80, "KEY CONSIDERATIONS:", "=" * 80, sep="\n")
print("\n".join(_numbered(considerations)))

# Closing banner and the follow-up work list.
print("\n" + "=" * 80, "PREDICTIVE MODELING PLANNING COMPLETE", "=" * 80, sep="\n")
print("\nNext Steps (Phase 3):")
print("\n".join(_numbered(
    [
        "Implement baseline ARIMA model",
        "Implement Prophet model",
        "Build customer churn prediction model",
        "Evaluate and compare all models",
        "Document results and business implications",
    ],
    indent="  ",
)))
