In [1]:
import mlflow
import mlflow.prophet
import pandas as pd
import numpy as np
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from dotenv import load_dotenv
import datetime
# Load environment variables from .env file
load_dotenv()


experiment_name = "/Users/j.huertas@closerstillmedia.com/prophet"

# Create the experiment only when it is missing. An explicit lookup replaces the
# former bare `except:`, which reported "experiment exists" for *any* failure
# (bad credentials, network errors, even KeyboardInterrupt) and hid the real cause.
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)
else:
    print("experiment exists")

# Kept as the last expression so the cell displays the active Experiment.
mlflow.set_experiment(experiment_name)


experiment exists


<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/3039328626124250', creation_time=1755624146266, experiment_id='3039328626124250', last_update_time=1755875139361, lifecycle_stage='active', name='/Users/j.huertas@closerstillmedia.com/prophet', tags={'mlflow.databricks.filesystem.experiment_permissions_check': 'test',
 'mlflow.experiment.sourceName': '/Users/j.huertas@closerstillmedia.com/prophet',
 'mlflow.experimentKind': 'custom_model_development',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'b.relf@closerstillmedia.com',
 'mlflow.ownerId': '7931383772120950'}>

# Data Preparation Best Practices

In [5]:
def prepare_prophet_data(data, date_col, value_col, freq="D"):
    """
    Prepare data for Prophet training.

    Selects the date and value columns, renames them to Prophet's required
    'ds' / 'y', parses 'ds' as datetime, sorts chronologically and, when
    ``freq`` is given, reindexes onto a gap-free date range so every missing
    timestamp becomes a row with ``y = NaN``.

    Args:
        data: DataFrame with time series data
        date_col: Name of date column
        value_col: Name of value column
        freq: Frequency of the time series (pandas offset alias). Pass a
            falsy value (``None`` / ``""``) to skip gap filling.

    Returns:
        DataFrame with columns ['ds', 'y'] sorted by 'ds'.

    Raises:
        KeyError: if ``date_col`` or ``value_col`` is not in ``data``.
        ValueError: raised by pandas when reindexing if 'ds' contains
            duplicate timestamps while ``freq`` is set.
    """

    # Prophet requires columns named 'ds' (datestamp) and 'y' (value)
    prophet_df = data[[date_col, value_col]].copy()
    prophet_df.columns = ["ds", "y"]

    # Ensure ds is datetime
    prophet_df["ds"] = pd.to_datetime(prophet_df["ds"])

    # Sort by date
    prophet_df = prophet_df.sort_values("ds").reset_index(drop=True)

    # Handle missing dates if needed. An empty frame has no min/max date, so
    # there is no range to build and nothing to fill.
    if freq and not prophet_df.empty:
        full_date_range = pd.date_range(
            start=prophet_df["ds"].min(), end=prophet_df["ds"].max(), freq=freq
        )

        # Count the calendar slots that are absent from the input *before*
        # reindexing. Counting NaNs in 'y' afterwards would also include
        # values that were already missing in the source data.
        missing_dates = int((~full_date_range.isin(prophet_df["ds"])).sum())

        # Reindex to fill missing dates
        prophet_df = prophet_df.set_index("ds").reindex(full_date_range).reset_index()
        prophet_df.columns = ["ds", "y"]

        # Log data quality metrics
        print(f"Missing dates filled: {missing_dates}")

    return prophet_df

def prophet_data_best_practices():
    """Print and return a checklist of Prophet data-preparation guidance.

    Returns:
        dict mapping each practice name to a one-line description, in the
        same order as the printed bullets.
    """

    guidance = dict(
        [
            ("data_frequency", "Use consistent frequency (daily, weekly, monthly)"),
            ("missing_values", "Prophet handles missing values, but document them"),
            ("outliers", "Consider outlier detection and handling"),
            (
                "data_volume",
                "Minimum 2-3 seasonal cycles (2-3 years for yearly seasonality)",
            ),
            ("column_names", "Always use 'ds' for dates and 'y' for values"),
            ("date_format", "Ensure dates are properly parsed as datetime"),
            ("timezone_handling", "Be consistent with timezone handling"),
        ]
    )

    # Header line followed by one bullet per entry, emitted in a single write.
    bullets = [f"- {topic}: {advice}" for topic, advice in guidance.items()]
    print("\n".join(["Prophet Data Preparation Best Practices:", *bullets]))

    return guidance


# Data validation function
def validate_prophet_data(df):
    """Validate data for Prophet modeling.

    Every check is guarded so that a problem which is *reported* (missing
    column, non-datetime 'ds') never also crashes the validator.

    Args:
        df: DataFrame expected to contain 'ds' (datetime) and 'y' (value).

    Returns:
        dict with keys:
            "issues": list of blocking problems found,
            "recommendations": list of non-blocking suggestions,
            "data_points": number of rows in ``df``,
            "date_range_days": whole days between the earliest and latest
                'ds', or 0 when that cannot be computed (no 'ds' column,
                'ds' not datetime, empty frame, or all dates missing).
    """

    issues = []
    recommendations = []

    has_ds = "ds" in df.columns
    has_y = "y" in df.columns

    # Check required columns
    if not (has_ds and has_y):
        issues.append("Missing required columns 'ds' and/or 'y'")

    # Check data types
    ds_is_datetime = has_ds and pd.api.types.is_datetime64_any_dtype(df["ds"])
    if has_ds and not ds_is_datetime:
        issues.append("Column 'ds' is not datetime type")
        recommendations.append(
            "Convert 'ds' to datetime: df['ds'] = pd.to_datetime(df['ds'])"
        )

    # Check for sufficient data (exactly 100 rows passes, so the message says >=)
    if len(df) < 100:
        issues.append(f"Insufficient data points: {len(df)} (recommend >=100)")

    # Check for missing values -- only possible when 'y' exists; its absence
    # has already been recorded as an issue above.
    if has_y:
        missing_y = df["y"].isnull().sum()
        if missing_y > 0:
            recommendations.append(
                f"Consider handling {missing_y} missing values in 'y'"
            )

    # Check for duplicate dates
    if has_ds:
        duplicates = df["ds"].duplicated().sum()
        if duplicates > 0:
            issues.append(f"Found {duplicates} duplicate dates")

    # Check data range. Date arithmetic is only meaningful on a real datetime
    # column; subtracting e.g. strings would raise TypeError.
    date_range = 0
    if ds_is_datetime and len(df) > 0:
        span = df["ds"].max() - df["ds"].min()
        # span is NaT when every 'ds' value is missing
        if not pd.isna(span):
            date_range = span.days
            if date_range < 365:
                recommendations.append(
                    "Less than 1 year of data may limit seasonality detection"
                )

    return {
        "issues": issues,
        "recommendations": recommendations,
        "data_points": len(df),
        "date_range_days": date_range,
    }


# Usage


In [6]:
# Read the raw sales export, then reshape it into Prophet's ds/y layout at
# daily frequency; the preview below is the cell's displayed output.
raw_data = pd.read_csv("sales_data_2023_2025_v3.csv")
df = prepare_prophet_data(
    raw_data,
    date_col="date",
    value_col="sales",
    freq="D",
)
df.head()

Missing dates filled: 0


Unnamed: 0,ds,y
0,2021-01-01,941
1,2021-01-02,1157
2,2021-01-03,1149
3,2021-01-04,944
4,2021-01-05,1071


In [7]:

# Validate the full prepared frame and show the resulting report.
validation_results = validate_prophet_data(df)
print(f"Validation Results: {validation_results}")

Validation Results: {'issues': [], 'recommendations': [], 'data_points': 1826, 'date_range_days': 1825}


In [8]:
# Validate only the first 100 rows to see how the report changes for a short history.
validation_results = validate_prophet_data(df.iloc[:100])
print(f"Validation Results: {validation_results}")

Validation Results: {'issues': [], 'recommendations': ['Less than 1 year of data may limit seasonality detection'], 'data_points': 100, 'date_range_days': 99}


# Performance Optimization

In [9]:
def optimize_prophet_performance():
    """Return Prophet performance tips grouped by topic.

    Returns:
        dict of three sections ("parallel_processing", "model_configuration",
        "data_preprocessing"), each mapping a tip name to a one-line hint.
    """

    # Running work concurrently
    parallel_tips = {
        "cross_validation": "Use parallel='threads' or 'processes' in cross_validation()",
        "multiple_models": "Use joblib.Parallel for training multiple models",
    }

    # Model settings that trade accuracy/uncertainty detail for speed
    configuration_tips = {
        "mcmc_samples": "Set mcmc_samples=0 for faster MAP estimation",
        "uncertainty_samples": "Reduce uncertainty_samples for faster predictions",
        "stan_backend": "Use 'CMDSTANPY' backend for better performance",
    }

    # Shrinking or cleaning the input before fitting
    preprocessing_tips = {
        "frequency": "Aggregate to appropriate frequency (daily vs hourly)",
        "outliers": "Remove extreme outliers before training",
        "data_size": "Consider sampling for very large datasets",
    }

    return {
        "parallel_processing": parallel_tips,
        "model_configuration": configuration_tips,
        "data_preprocessing": preprocessing_tips,
    }


# Example of parallel cross-validation
def parallel_prophet_evaluation(models_dict, df):
    """Evaluate multiple Prophet models in parallel.

    Each model is fitted on ``df`` and scored with Prophet's
    ``cross_validation`` (initial="365 days", period="90 days",
    horizon="30 days"); the score is the mean of the "mape" column returned
    by ``performance_metrics``.

    A model whose evaluation raises is scored ``float("inf")`` so that one bad
    candidate does not abort the comparison. The failure is no longer silent:
    the error is carried back to the calling process and reported there as a
    ``RuntimeWarning``.

    Args:
        models_dict: Mapping of model name -> Prophet model to fit and score.
        df: Training DataFrame passed to each model's ``fit``.

    Returns:
        dict mapping model name -> mean MAPE (``inf`` for failed models).
    """

    import warnings

    from joblib import Parallel, delayed

    def evaluate_single_model(name, model):
        # Returns (name, score, error_text); error_text is None on success.
        try:
            cv_results = cross_validation(
                model.fit(df),
                initial="365 days",
                period="90 days",
                horizon="30 days",
                parallel="threads",
            )
            metrics = performance_metrics(cv_results)
            return name, metrics["mape"].mean(), None
        except Exception as exc:
            # Pass the error back as text so the parent can report it; a warning
            # raised inside a worker may never reach the notebook.
            return name, float("inf"), f"{type(exc).__name__}: {exc}"

    # Parallel evaluation
    outcomes = Parallel(n_jobs=-1)(
        delayed(evaluate_single_model)(name, model)
        for name, model in models_dict.items()
    )

    scores = {}
    for name, score, error in outcomes:
        if error is not None:
            warnings.warn(
                f"Evaluation of model {name!r} failed ({error}); scoring it as inf",
                RuntimeWarning,
                stacklevel=2,
            )
        scores[name] = score

    return scores