# Predicte 2025 sales: test on different models

In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error,
    root_mean_squared_error,
    r2_score,
    median_absolute_error,
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# =========================================================
# 0. Global configuration
# =========================================================
FILE_PATH = "../data/processed/bmw_sales_2020_2024_clean.csv"

CAT_FEATURES = ["model", "region", "fuel_type", "transmission", "color"]
NUM_FEATURES = ["year", "engine_size_l", "mileage_km", "price_usd"]
FEATURE_COLS = CAT_FEATURES + NUM_FEATURES

plt.rcParams["figure.dpi"] = 120


# =========================================================
# 1. Aggregate raw data (by year + category combination)
# =========================================================
def load_and_aggregate(path: str) -> pd.DataFrame:
    """
    Load the raw data and aggregate by the combination of:
        year + model + region + fuel_type + transmission + color.

    Aggregation rules:
        - engine_size_l, mileage_km, price_usd: mean value
        - sales_volume: sum
    """
    df = pd.read_csv(path)

    group_cols = ["year", "model", "region", "fuel_type", "transmission", "color"]

    agg_df = (
        df.groupby(group_cols, as_index=False)
        .agg(
            {
                "engine_size_l": "mean",
                "mileage_km": "mean",
                "price_usd": "mean",
                "sales_volume": "sum",
            }
        )
    )

    return agg_df


# =========================================================
# 2. Generic metric computation
# =========================================================
def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """
    Compute regression metrics between ground truth and predictions.

    Metrics:
        - MAE
        - RMSE
        - MedianAE
        - MAPE
        - R2

    Additionally, compute percentage metrics relative to the mean of y_true:
        - MAE_pct
        - RMSE_pct
        - MedianAE_pct
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    med_ae = median_absolute_error(y_true, y_pred)

    # Avoid division by zero when computing MAPE
    mask = y_true != 0
    if mask.sum() > 0:
        mape = (np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])).mean() * 100
    else:
        mape = np.nan

    # R2 is not meaningful for a single data point
    if len(y_true) > 1:
        r2 = r2_score(y_true, y_pred)
    else:
        r2 = np.nan

    mean_y = y_true.mean()
    mae_pct = mae / mean_y * 100 if mean_y != 0 else np.nan
    rmse_pct = rmse / mean_y * 100 if mean_y != 0 else np.nan
    med_ae_pct = med_ae / mean_y * 100 if mean_y != 0 else np.nan

    return {
        "MAE": mae,
        "RMSE": rmse,
        "MedianAE": med_ae,
        "MAPE": mape,
        "R2": r2,
        "MAE_pct": mae_pct,
        "RMSE_pct": rmse_pct,
        "MedianAE_pct": med_ae_pct,
    }


# =========================================================
# 3. Lag feature construction (for ML models)
# =========================================================
def add_lag_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add lag-based features for each combination of:
        model + region + fuel_type + transmission + color.

    Newly created features:
        - sales_last1, sales_last2
        - sales_avg3
        - sales_growth
        - price_last1, price_trend
        - mileage_last1, mileage_trend

    The global NUM_FEATURES and FEATURE_COLS variables are updated accordingly.
    """
    df = df.sort_values(
        ["model", "region", "fuel_type", "transmission", "color", "year"]
    ).copy()

    group_cols = ["model", "region", "fuel_type", "transmission", "color"]

    df["sales_last1"] = df.groupby(group_cols)["sales_volume"].shift(1)
    df["sales_last2"] = df.groupby(group_cols)["sales_volume"].shift(2)

    df["sales_avg3"] = df.groupby(group_cols)["sales_volume"].transform(
        lambda x: x.rolling(3).mean()
    )

    df["sales_growth"] = (df["sales_last1"] - df["sales_last2"]) / df["sales_last2"]

    df["price_last1"] = df.groupby(group_cols)["price_usd"].shift(1)
    df["price_trend"] = (df["price_usd"] - df["price_last1"]) / df["price_last1"]

    df["mileage_last1"] = df.groupby(group_cols)["mileage_km"].shift(1)
    df["mileage_trend"] = (df["mileage_km"] - df["mileage_last1"]) / df["mileage_last1"]

    # Update the global numeric feature list with lag-based features
    lag_num_features = [
        "sales_last1",
        "sales_last2",
        "sales_avg3",
        "sales_growth",
        "price_last1",
        "price_trend",
        "mileage_last1",
        "mileage_trend",
    ]

    global NUM_FEATURES, FEATURE_COLS
    NUM_FEATURES = ["year", "engine_size_l", "mileage_km", "price_usd"] + lag_num_features
    FEATURE_COLS = CAT_FEATURES + NUM_FEATURES

    return df


# =========================================================
# 4. Model builders (pipelines and CatBoost model)
# =========================================================
def build_xgb_pipeline():
    """
    Build an XGBoost regression pipeline with preprocessing.

    The pipeline includes:
        - One-hot encoding for categorical features
        - Passthrough for numeric features
        - XGBRegressor as the final estimator
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_FEATURES),
            ("num", "passthrough", NUM_FEATURES),
        ]
    )

    xgb = XGBRegressor(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )

    return Pipeline(steps=[("preprocess", preprocessor), ("model", xgb)])


def build_lgbm_pipeline():
    """
    Build a LightGBM regression pipeline with preprocessing.

    The pipeline includes:
        - One-hot encoding for categorical features
        - Passthrough for numeric features
        - LGBMRegressor as the final estimator
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_FEATURES),
            ("num", "passthrough", NUM_FEATURES),
        ]
    )

    lgbm = LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        objective="regression",
        verbose=-1,
    )

    return Pipeline(steps=[("preprocess", preprocessor), ("model", lgbm)])


def build_rf_pipeline():
    """
    Build a RandomForest regression pipeline with preprocessing.

    The pipeline includes:
        - One-hot encoding for categorical features
        - Passthrough for numeric features
        - RandomForestRegressor as the final estimator
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_FEATURES),
            ("num", "passthrough", NUM_FEATURES),
        ]
    )

    rf = RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        n_jobs=-1,
    )

    return Pipeline(steps=[("preprocess", preprocessor), ("model", rf)])


def build_catboost_model():
    """
    Build a CatBoostRegressor model.

    CatBoost consumes the raw DataFrame directly, with categorical
    feature indices specified via cat_features.
    """
    return CatBoostRegressor(
        depth=8,
        learning_rate=0.05,
        n_estimators=800,
        loss_function="RMSE",
        eval_metric="RMSE",
        random_seed=42,
        verbose=False,
    )


# =========================================================
# 5. Training and validation (2020–2023 train, 2024 validate)
# =========================================================
def train_and_validate_models(df_lagged: pd.DataFrame):
    """
    Train XGBoost, LightGBM, RandomForest, and CatBoost on data with lag features
    and evaluate them on the 2024 validation set.

    The training/validation split:
        - Training: years < 2024
        - Validation: year == 2024

    Parameters
    ----------
    df_lagged : pd.DataFrame
        Input DataFrame containing lag-augmented features and sales_volume.

    Returns
    -------
    metrics_all : dict
        Dictionary mapping model names to their evaluation metrics on 2024.
    cat_model : CatBoostRegressor
        Trained CatBoost model (for potential reuse).
    """
    # Drop rows with missing lag features or target values
    df_clean = df_lagged.dropna(subset=FEATURE_COLS + ["sales_volume"]).copy()

    # Split by year
    train_df = df_clean[df_clean["year"] < 2024]
    valid_df = df_clean[df_clean["year"] == 2024]

    X_train = train_df[FEATURE_COLS]
    y_train = train_df["sales_volume"].values

    X_valid = valid_df[FEATURE_COLS]
    y_valid = valid_df["sales_volume"].values

    print(
        f"Shape train (with lag): {X_train.shape}, "
        f"Shape valid (with lag): {X_valid.shape}\n"
    )

    metrics_all = {}

    # ---------- XGBoost ----------
    xgb_model = build_xgb_pipeline()
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_valid)
    metrics_xgb = compute_metrics(y_valid, y_pred_xgb)
    metrics_all["xgb"] = metrics_xgb

    print("Model: XGBoost")
    print(
        f"  MAE      : {metrics_xgb['MAE']:.2f} ({metrics_xgb['MAE_pct']:.2f}%)\n"
        f"  RMSE     : {metrics_xgb['RMSE']:.2f} ({metrics_xgb['RMSE_pct']:.2f}%)\n"
        f"  MedianAE : {metrics_xgb['MedianAE']:.2f} ({metrics_xgb['MedianAE_pct']:.2f}%)\n"
        f"  MAPE     : {metrics_xgb['MAPE']:.2f}%\n"
        f"  R2       : {metrics_xgb['R2']:.4f}\n"
    )

    # ---------- LightGBM ----------
    lgbm_model = build_lgbm_pipeline()
    lgbm_model.fit(X_train, y_train)
    y_pred_lgbm = lgbm_model.predict(X_valid)
    metrics_lgbm = compute_metrics(y_valid, y_pred_lgbm)
    metrics_all["lgbm"] = metrics_lgbm

    print("Model: LightGBM")
    print(
        f"  MAE      : {metrics_lgbm['MAE']:.2f} ({metrics_lgbm['MAE_pct']:.2f}%)\n"
        f"  RMSE     : {metrics_lgbm['RMSE']:.2f} ({metrics_lgbm['RMSE_pct']:.2f}%)\n"
        f"  MedianAE : {metrics_lgbm['MedianAE']:.2f} ({metrics_lgbm['MedianAE_pct']:.2f}%)\n"
        f"  MAPE     : {metrics_lgbm['MAPE']:.2f}%\n"
        f"  R2       : {metrics_lgbm['R2']:.4f}\n"
    )

    # ---------- RandomForest ----------
    rf_model = build_rf_pipeline()
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_valid)
    metrics_rf = compute_metrics(y_valid, y_pred_rf)
    metrics_all["rf"] = metrics_rf

    print("Model: RandomForest")
    print(
        f"  MAE      : {metrics_rf['MAE']:.2f} ({metrics_rf['MAE_pct']:.2f}%)\n"
        f"  RMSE     : {metrics_rf['RMSE']:.2f} ({metrics_rf['RMSE_pct']:.2f}%)\n"
        f"  MedianAE : {metrics_rf['MedianAE']:.2f} ({metrics_rf['MedianAE_pct']:.2f}%)\n"
        f"  MAPE     : {metrics_rf['MAPE']:.2f}%\n"
        f"  R2       : {metrics_rf['R2']:.4f}\n"
    )

    # ---------- CatBoost ----------
    cat_model = build_catboost_model()
    cat_feature_indices = [X_train.columns.get_loc(col) for col in CAT_FEATURES]

    cat_model.fit(X_train, y_train, cat_features=cat_feature_indices)
    y_pred_cat = cat_model.predict(X_valid)
    metrics_cat = compute_metrics(y_valid, y_pred_cat)
    metrics_all["catboost"] = metrics_cat

    print("Model: CatBoost")
    print(
        f"  MAE      : {metrics_cat['MAE']:.2f} ({metrics_cat['MAE_pct']:.2f}%)\n"
        f"  RMSE     : {metrics_cat['RMSE']:.2f} ({metrics_cat['RMSE_pct']:.2f}%)\n"
        f"  MedianAE : {metrics_cat['MedianAE']:.2f} ({metrics_cat['MedianAE_pct']:.2f}%)\n"
        f"  MAPE     : {metrics_cat['MAPE']:.2f}%\n"
        f"  R2       : {metrics_cat['R2']:.4f}\n"
    )

    return metrics_all, cat_model


# =========================================================
# 6. Build 2025 feature rows and predict with CatBoost
# =========================================================
def build_2025_features_from_lagged(df_lagged: pd.DataFrame) -> pd.DataFrame:
    """
    Construct feature rows for the year 2025 based on lag information
    available up to 2024.

    For each (model, region, fuel_type, transmission, color) group:
        - sales_last1_2025 = 2024 sales_volume
        - sales_last2_2025 = 2024 sales_last1
        - sales_avg3_2025  = average of (2024, 2023, 2022) sales_volume
        - price_last1, mileage_last1 for 2025 are copied from 2024
        - price_trend and mileage_trend are reused from 2024

    The target column sales_volume is set to NaN for 2025 rows.
    """
    group_cols = ["model", "region", "fuel_type", "transmission", "color"]

    df_sorted = df_lagged.sort_values(group_cols + ["year"])
    last_rows = df_sorted.groupby(group_cols, as_index=False).tail(1)

    # Keep only combinations with complete lag information
    last_rows = last_rows.dropna(
        subset=["sales_last1", "sales_last2", "sales_avg3", "price_last1", "mileage_last1"]
    )

    rows_2025 = []

    for _, row in last_rows.iterrows():
        new_row = row.copy()
        new_row["year"] = 2025

        sales_last1_2025 = row["sales_volume"]
        sales_last2_2025 = row["sales_last1"]
        sales_avg3_2025 = (
            row["sales_volume"] + row["sales_last1"] + row["sales_last2"]
        ) / 3

        new_row["sales_last1"] = sales_last1_2025
        new_row["sales_last2"] = sales_last2_2025
        new_row["sales_avg3"] = sales_avg3_2025

        if not pd.isna(sales_last2_2025) and sales_last2_2025 != 0:
            new_row["sales_growth"] = (
                sales_last1_2025 - sales_last2_2025
            ) / sales_last2_2025
        else:
            new_row["sales_growth"] = 0.0

        new_row["price_last1"] = row["price_usd"]
        new_row["mileage_last1"] = row["mileage_km"]

        # Reuse the existing trend values from the latest year
        new_row["price_trend"] = row.get("price_trend", 0.0)
        new_row["mileage_trend"] = row.get("mileage_trend", 0.0)

        # Actual 2025 sales are unknown
        new_row["sales_volume"] = np.nan

        rows_2025.append(new_row)

    feat_2025 = pd.DataFrame(rows_2025)
    # Ensure column ordering: all features first, then sales_volume
    feat_2025 = feat_2025[[c for c in feat_2025.columns if c not in ["sales_volume"]] + ["sales_volume"]]

    return feat_2025


def retrain_catboost_and_predict_2025(df_lagged: pd.DataFrame):
    """
    Retrain CatBoost on all available data from 2020–2024 and
    predict 2025 sales using constructed 2025 feature rows.

    The function:
        - Cleans and uses all rows with non-missing features and target
        - Builds 2025 feature rows from lagged data
        - Fits a new CatBoost model on 2020–2024
        - Predicts row-level 2025 sales
        - Aggregates predictions by several dimensions (model, region,
          fuel_type, transmission, color)

    Parameters
    ----------
    df_lagged : pd.DataFrame
        Lag-augmented historical data including sales_volume.

    Returns
    -------
    total_2025 : float
        Total predicted sales volume for 2025.
    feat_2025_out : pd.DataFrame
        Row-level 2025 feature DataFrame with predictions attached.
    by_model : pd.DataFrame
        Predicted 2025 sales aggregated by model.
    by_region : pd.DataFrame
        Predicted 2025 sales aggregated by region.
    by_fuel : pd.DataFrame
        Predicted 2025 sales aggregated by fuel_type.
    by_trans : pd.DataFrame
        Predicted 2025 sales aggregated by transmission.
    by_color : pd.DataFrame
        Predicted 2025 sales aggregated by color.
    """
    df_clean = df_lagged.dropna(subset=FEATURE_COLS + ["sales_volume"]).copy()

    X_full = df_clean[FEATURE_COLS]
    y_full = df_clean["sales_volume"].values

    feat_2025 = build_2025_features_from_lagged(df_lagged)
    X_2025 = feat_2025[FEATURE_COLS]

    cat_model = build_catboost_model()
    cat_feature_indices = [X_full.columns.get_loc(col) for col in CAT_FEATURES]
    cat_model.fit(X_full, y_full, cat_features=cat_feature_indices)

    preds_2025 = cat_model.predict(X_2025)
    total_2025 = preds_2025.sum()

    feat_2025_out = feat_2025.copy()
    feat_2025_out["pred_sales_volume_2025_catboost"] = preds_2025

    print(f"\n=== CatBoost predicted total sales volume in 2025: {total_2025:.2f} ===\n")

    # Aggregate predictions by different dimensions
    col_pred = "pred_sales_volume_2025_catboost"

    by_model = (
        feat_2025_out.groupby("model")[col_pred]
        .sum()
        .reset_index()
        .sort_values(col_pred, ascending=False)
    )
    by_region = (
        feat_2025_out.groupby("region")[col_pred]
        .sum()
        .reset_index()
        .sort_values(col_pred, ascending=False)
    )
    by_fuel = (
        feat_2025_out.groupby("fuel_type")[col_pred]
        .sum()
        .reset_index()
        .sort_values(col_pred, ascending=False)
    )
    by_trans = (
        feat_2025_out.groupby("transmission")[col_pred]
        .sum()
        .reset_index()
        .sort_values(col_pred, ascending=False)
    )
    by_color = (
        feat_2025_out.groupby("color")[col_pred]
        .sum()
        .reset_index()
        .sort_values(col_pred, ascending=False)
    )

    return total_2025, feat_2025_out, by_model, by_region, by_fuel, by_trans, by_color


# =========================================================
# 7. Metrics comparison table and bar chart
# =========================================================
def build_metrics_table(metrics_all: dict) -> pd.DataFrame:
    """
    Convert the metrics dictionary into a DataFrame and
    enforce a consistent row ordering for models.

    Parameters
    ----------
    metrics_all : dict
        Dictionary mapping model names to metric dictionaries.

    Returns
    -------
    pd.DataFrame
        DataFrame indexed by model name with one row per model.
    """
    df = pd.DataFrame.from_dict(metrics_all, orient="index")
    df.index.name = "model"
    df = df.loc[["xgb", "lgbm", "rf", "catboost"]]
    return df


def plot_metrics_bar(metrics_df: pd.DataFrame, save_path: str = None):
    """
    Plot a single bar chart comparing four error-related metrics
    across multiple models:

        - MAE_pct
        - RMSE_pct
        - MedianAE_pct
        - MAPE (already expressed as a percentage)

    Parameters
    ----------
    metrics_df : pd.DataFrame
        DataFrame containing metrics for each model (one row per model).
    save_path : str, optional
        If provided, the bar chart is saved to this path.
        Otherwise, the figure is not written to disk.
    """
    models = metrics_df.index.tolist()
    x = np.arange(len(models))

    metrics_to_plot = ["MAE_pct", "RMSE_pct", "MedianAE_pct", "MAPE"]
    colors = ["#1C69D4", "#4C8BF5", "#82B1FF", "#FFB300"]
    width = 0.18

    fig, ax = plt.subplots(figsize=(10, 5))

    for i, (metric, c) in enumerate(zip(metrics_to_plot, colors)):
        vals = metrics_df[metric].values
        ax.bar(x + i * width - 1.5 * width, vals, width, label=metric, color=c)

    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=15)
    ax.set_ylabel("Error / Percentage")
    ax.set_title("Model Performance Comparison on 2024 (with lag features)")

    ax.legend()
    ax.grid(axis="y", linestyle="--", alpha=0.3)

    fig.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.close(fig)


# =========================================================
# 8. Main script entry point
# =========================================================
if __name__ == "__main__":
    # 1. Load and aggregate data
    agg_df = load_and_aggregate(FILE_PATH)
    print("Aggregated shape (before lag):", agg_df.shape)
    print(agg_df.head(), "\n")

    # 2. Add lag features
    agg_df_lagged = add_lag_features(agg_df)
    print(
        "Shape after adding lag features (before dropna):",
        agg_df_lagged.shape,
        "\n",
    )

    # 3. Train and validate models (2020–2023 train, 2024 validate)
    metrics_all, cat_model_trained = train_and_validate_models(agg_df_lagged)

    # 4. Build metrics comparison table
    metrics_df = build_metrics_table(metrics_all)
    print("\n=== Model comparison on 2024 validation (with lag, row-level) ===")
    print(metrics_df, "\n")

    # 5. Plot bar chart comparing metrics across models
    plot_metrics_bar(metrics_df, save_path="model_performance_2024_with_lag.png")
    print("Saved bar chart to: model_performance_2024_with_lag.png\n")

    # 6. Use CatBoost to predict 2025 total and segmented sales
    (
        total_2025_cat,
        df_cat_2025,
        by_model_2025,
        by_region_2025,
        by_fuel_2025,
        by_trans_2025,
        by_color_2025,
    ) = retrain_catboost_and_predict_2025(agg_df_lagged)

    print("CatBoost - 2025 predicted sales by model:")
    print(by_model_2025.head(), "\n")

    print("CatBoost - 2025 predicted sales by region:")
    print(by_region_2025, "\n")

    print("CatBoost - 2025 predicted sales by fuel_type:")
    print(by_fuel_2025, "\n")

    print("CatBoost - 2025 predicted sales by transmission:")
    print(by_trans_2025, "\n")

    print("CatBoost - 2025 predicted sales by color:")
    print(by_color_2025, "\n")

    print(f"CatBoost (row-level) predicted 2025 total sales: {total_2025_cat:.2f}")


Aggregated shape (before lag): (10306, 10)
   year     model  region fuel_type transmission   color  engine_size_l  \
0  2020  3 Series  Africa    Diesel    Automatic   Black           2.90   
1  2020  3 Series  Africa    Diesel    Automatic    Blue           3.00   
2  2020  3 Series  Africa    Diesel    Automatic     Red           3.55   
3  2020  3 Series  Africa    Diesel    Automatic  Silver           3.30   
4  2020  3 Series  Africa    Diesel       Manual   Black           4.60   

   mileage_km  price_usd  sales_volume  
0     54293.0   100920.0          8279  
1     22846.0    79435.5          6282  
2     85078.5   106306.0         14847  
3    182077.0    59951.0          2355  
4    107799.0    75078.0          3975   

Shape after adding lag features (before dropna): (10306, 18) 

Shape train (with lag): (2343, 17), Shape valid (with lag): (1808, 17)

Model: XGBoost
  MAE      : 789.90 (9.41%)
  RMSE     : 1181.66 (14.08%)
  MedianAE : 575.95 (6.86%)
  MAPE     : 29.28%
  

# Comparing to Arima

In [None]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error,
    root_mean_squared_error,
    r2_score,
    median_absolute_error,
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from statsmodels.tsa.arima.model import ARIMA

import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# =========================================================
# 0. Global configuration
# =========================================================
FILE_PATH = "../data/processed/bmw_sales_2020_2024_clean.csv"

CAT_FEATURES = ["model", "region", "fuel_type", "transmission", "color"]
NUM_FEATURES = ["year", "engine_size_l", "mileage_km", "price_usd"]
FEATURE_COLS = CAT_FEATURES + NUM_FEATURES

plt.rcParams["figure.dpi"] = 120


# =========================================================
# 1. Aggregate raw data (by year + category combination)
# =========================================================
def load_and_aggregate(path: str) -> pd.DataFrame:
    """
    Load the raw dataset and aggregate it by the combination of:
        year + model + region + fuel_type + transmission + color.

    Aggregation rules:
        - engine_size_l, mileage_km, price_usd: mean value
        - sales_volume: sum
    """
    df = pd.read_csv(path)

    group_cols = ["year", "model", "region", "fuel_type", "transmission", "color"]

    agg_df = (
        df.groupby(group_cols, as_index=False)
        .agg(
            {
                "engine_size_l": "mean",
                "mileage_km": "mean",
                "price_usd": "mean",
                "sales_volume": "sum",
            }
        )
    )

    return agg_df


# =========================================================
# 2. ARIMA baseline on yearly total sales
# =========================================================
def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """
    Compute generic regression metrics, including:
        - MAE / RMSE / MedianAE / MAPE / R2
    and percentage metrics relative to the mean of y_true:
        - MAE_pct / RMSE_pct / MedianAE_pct
    """
    mae = mean_absolute_error(y_true, y_pred)
    # rmse = root_mean_squared_error(y_true, y_pred, squared=False)
    rmse = root_mean_squared_error(y_true, y_pred)
    med_ae = median_absolute_error(y_true, y_pred)

    # Avoid division by zero when computing MAPE
    mask = y_true != 0
    if mask.sum() > 0:
        mape = (np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])).mean() * 100
    else:
        mape = np.nan

    if len(y_true) > 1:
        r2 = r2_score(y_true, y_pred)
    else:
        # R2 is not meaningful for a single data point
        r2 = np.nan

    mean_y = y_true.mean()
    mae_pct = mae / mean_y * 100 if mean_y != 0 else np.nan
    rmse_pct = rmse / mean_y * 100 if mean_y != 0 else np.nan
    med_ae_pct = med_ae / mean_y * 100 if mean_y != 0 else np.nan

    return {
        "MAE": mae,
        "RMSE": rmse,
        "MedianAE": med_ae,
        "MAPE": mape,
        "R2": r2,
        "MAE_pct": mae_pct,
        "RMSE_pct": rmse_pct,
        "MedianAE_pct": med_ae_pct,
    }


def arima_baseline_yearly(df_agg: pd.DataFrame):
    """
    Build an ARIMA baseline model on yearly total sales:

    - Training: 2020–2023
    - Testing: 2024
    - Forecast: 2025 total sales

    The model is fitted on annual total sales and used to:
        1) Evaluate 2024 as a one-step-ahead forecast
        2) Forecast 2025 total sales
    """
    df_yearly = (
        df_agg.groupby("year")["sales_volume"].sum().sort_index()
    )  # Series: index=year, value=total sales

    print("=== ARIMA baseline on yearly total sales ===")
    print(df_yearly, "\n")

    train_series = df_yearly[df_yearly.index < 2024]
    test_value = df_yearly.loc[2024]

    # Simple ARIMA(1,1,1) configuration
    model = ARIMA(train_series, order=(1, 1, 1))
    fitted = model.fit()

    # One-step forecast for 2024 (used as the test point)
    pred_2024 = float(fitted.forecast(steps=1).iloc[0])
    print(f"ARIMA predicted 2024 total sales: {pred_2024:.2f}")
    print(f"Actual 2024 total sales        : {float(test_value):.2f}")

    metrics_2024 = compute_metrics(
        np.array([test_value], dtype=float), np.array([pred_2024], dtype=float)
    )

    print("ARIMA metrics (yearly total, 2024 as test):")
    for k, v in metrics_2024.items():
        print(f"  {k}: {v:.4f}")
    print()

    # Forecast two steps ahead and take the second value as the 2025 forecast
    pred_2025 = float(fitted.forecast(steps=2).iloc[-1])
    print(f"=== ARIMA predicted total sales in 2025 (yearly total): {pred_2025:.2f} ===\n")

    return metrics_2024, pred_2025


# =========================================================
# 3. Lag feature construction (for ML models)
# =========================================================
def add_lag_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add lag-based features for each combination of:
        model + region + fuel_type + transmission + color.

    Newly created features:
        - sales_last1, sales_last2
        - sales_avg3
        - sales_growth
        - price_last1, price_trend
        - mileage_last1, mileage_trend

    The global NUM_FEATURES and FEATURE_COLS lists are updated to
    include these lag-based numeric features.
    """
    df = df.sort_values(
        ["model", "region", "fuel_type", "transmission", "color", "year"]
    ).copy()

    group_cols = ["model", "region", "fuel_type", "transmission", "color"]

    df["sales_last1"] = df.groupby(group_cols)["sales_volume"].shift(1)
    df["sales_last2"] = df.groupby(group_cols)["sales_volume"].shift(2)

    df["sales_avg3"] = df.groupby(group_cols)["sales_volume"].transform(
        lambda x: x.rolling(3).mean()
    )

    df["sales_growth"] = (df["sales_last1"] - df["sales_last2"]) / df["sales_last2"]

    df["price_last1"] = df.groupby(group_cols)["price_usd"].shift(1)
    df["price_trend"] = (df["price_usd"] - df["price_last1"]) / df["price_last1"]

    df["mileage_last1"] = df.groupby(group_cols)["mileage_km"].shift(1)
    df["mileage_trend"] = (df["mileage_km"] - df["mileage_last1"]) / df["mileage_last1"]

    # Update numeric feature list with lag features
    lag_num_features = [
        "sales_last1",
        "sales_last2",
        "sales_avg3",
        "sales_growth",
        "price_last1",
        "price_trend",
        "mileage_last1",
        "mileage_trend",
    ]

    global NUM_FEATURES, FEATURE_COLS
    NUM_FEATURES = ["year", "engine_size_l", "mileage_km", "price_usd"] + lag_num_features
    FEATURE_COLS = CAT_FEATURES + NUM_FEATURES

    return df


# =========================================================
# 4. Build model pipelines
# =========================================================
def build_xgb_pipeline():
    """
    Build an XGBoost regression pipeline with preprocessing.

    The pipeline includes:
        - One-hot encoding for categorical features
        - Passthrough for numeric features
        - XGBRegressor as the final estimator
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_FEATURES),
            ("num", "passthrough", NUM_FEATURES),
        ]
    )

    xgb = XGBRegressor(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )

    return Pipeline(steps=[("preprocess", preprocessor), ("model", xgb)])


def build_lgbm_pipeline():
    """
    Build a LightGBM regression pipeline with preprocessing.

    The pipeline includes:
        - One-hot encoding for categorical features
        - Passthrough for numeric features
        - LGBMRegressor as the final estimator
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_FEATURES),
            ("num", "passthrough", NUM_FEATURES),
        ]
    )

    lgbm = LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        objective="regression",
        verbose=-1,
    )

    return Pipeline(steps=[("preprocess", preprocessor), ("model", lgbm)])


def build_rf_pipeline():
    """
    Build a RandomForest regression pipeline with preprocessing.

    The pipeline includes:
        - One-hot encoding for categorical features
        - Passthrough for numeric features
        - RandomForestRegressor as the final estimator
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_FEATURES),
            ("num", "passthrough", NUM_FEATURES),
        ]
    )

    rf = RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        n_jobs=-1,
    )

    return Pipeline(steps=[("preprocess", preprocessor), ("model", rf)])


def build_catboost_model():
    """
    Build a CatBoostRegressor model.

    CatBoost directly uses the DataFrame as input and expects
    categorical feature indices passed via the cat_features parameter.
    """
    return CatBoostRegressor(
        depth=8,
        learning_rate=0.05,
        n_estimators=800,
        loss_function="RMSE",
        eval_metric="RMSE",
        random_seed=42,
        verbose=False,
    )


# =========================================================
# 5. Training and validation (2020–2023 train, 2024 validate)
# =========================================================
def train_and_validate_models(df_lagged: pd.DataFrame):
    """
    Train XGBoost, LightGBM, RandomForest, and CatBoost on data with lag features,
    and evaluate them on the 2024 validation set.

    Training/validation split:
        - Training: years < 2024
        - Validation: year == 2024

    Parameters
    ----------
    df_lagged : pd.DataFrame
        Input DataFrame with lag features and sales_volume.

    Returns
    -------
    metrics_all : dict
        Dictionary of evaluation metrics for each model on 2024.
    cat_model : CatBoostRegressor
        Trained CatBoost model (can be reused later).
    """
    # Drop rows with NaN in lag features or target
    df_clean = df_lagged.dropna(subset=FEATURE_COLS + ["sales_volume"]).copy()

    # Split by year
    train_df = df_clean[df_clean["year"] < 2024]
    valid_df = df_clean[df_clean["year"] == 2024]

    X_train = train_df[FEATURE_COLS]
    y_train = train_df["sales_volume"].values

    X_valid = valid_df[FEATURE_COLS]
    y_valid = valid_df["sales_volume"].values

    print(
        f"Shape train (with lag): {X_train.shape}, "
        f"Shape valid (with lag): {X_valid.shape}\n"
    )

    metrics_all = {}

    # ---------- XGBoost ----------
    xgb_model = build_xgb_pipeline()
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_valid)
    metrics_xgb = compute_metrics(y_valid, y_pred_xgb)
    metrics_all["xgb"] = metrics_xgb

    print("Model: XGBoost")
    print(
        f"  MAE      : {metrics_xgb['MAE']:.2f} ({metrics_xgb['MAE_pct']:.2f}%)\n"
        f"  RMSE     : {metrics_xgb['RMSE']:.2f} ({metrics_xgb['RMSE_pct']:.2f}%)\n"
        f"  MedianAE : {metrics_xgb['MedianAE']:.2f} ({metrics_xgb['MedianAE_pct']:.2f}%)\n"
        f"  MAPE     : {metrics_xgb['MAPE']:.2f}%\n"
        f"  R2       : {metrics_xgb['R2']:.4f}\n"
    )

    # ---------- LightGBM ----------
    lgbm_model = build_lgbm_pipeline()
    lgbm_model.fit(X_train, y_train)
    y_pred_lgbm = lgbm_model.predict(X_valid)
    metrics_lgbm = compute_metrics(y_valid, y_pred_lgbm)
    metrics_all["lgbm"] = metrics_lgbm

    print("Model: LightGBM")
    print(
        f"  MAE      : {metrics_lgbm['MAE']:.2f} ({metrics_lgbm['MAE_pct']:.2f}%)\n"
        f"  RMSE     : {metrics_lgbm['RMSE']:.2f} ({metrics_lgbm['RMSE_pct']:.2f}%)\n"
        f"  MedianAE : {metrics_lgbm['MedianAE']:.2f} ({metrics_lgbm['MedianAE_pct']:.2f}%)\n"
        f"  MAPE     : {metrics_lgbm['MAPE']:.2f}%\n"
        f"  R2       : {metrics_lgbm['R2']:.4f}\n"
    )

    # ---------- RandomForest ----------
    rf_model = build_rf_pipeline()
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_valid)
    metrics_rf = compute_metrics(y_valid, y_pred_rf)
    metrics_all["rf"] = metrics_rf

    print("Model: RandomForest")
    print(
        f"  MAE      : {metrics_rf['MAE']:.2f} ({metrics_rf['MAE_pct']:.2f}%)\n"
        f"  RMSE     : {metrics_rf['RMSE']:.2f} ({metrics_rf['RMSE_pct']:.2f}%)\n"
        f"  MedianAE : {metrics_rf['MedianAE']:.2f} ({metrics_rf['MedianAE_pct']:.2f}%)\n"
        f"  MAPE     : {metrics_rf['MAPE']:.2f}%\n"
        f"  R2       : {metrics_rf['R2']:.4f}\n"
    )

    # ---------- CatBoost ----------
    cat_model = build_catboost_model()
    cat_feature_indices = [X_train.columns.get_loc(col) for col in CAT_FEATURES]

    cat_model.fit(X_train, y_train, cat_features=cat_feature_indices)
    y_pred_cat = cat_model.predict(X_valid)
    metrics_cat = compute_metrics(y_valid, y_pred_cat)
    metrics_all["catboost"] = metrics_cat

    print("Model: CatBoost")
    print(
        f"  MAE      : {metrics_cat['MAE']:.2f} ({metrics_cat['MAE_pct']:.2f}%)\n"
        f"  RMSE     : {metrics_cat['RMSE']:.2f} ({metrics_cat['RMSE_pct']:.2f}%)\n"
        f"  MedianAE : {metrics_cat['MedianAE']:.2f} ({metrics_cat['MedianAE_pct']:.2f}%)\n"
        f"  MAPE     : {metrics_cat['MAPE']:.2f}%\n"
        f"  R2       : {metrics_cat['R2']:.4f}\n"
    )

    return metrics_all, cat_model


# =========================================================
# 6. Construct 2025 features and predict with CatBoost
# =========================================================
def build_2025_features_from_lagged(df_lagged: pd.DataFrame) -> pd.DataFrame:
    """
    Construct feature rows for the year 2025 using lag information
    available up to 2024.

    For each (model, region, fuel_type, transmission, color) group:
        - sales_last1_2025 = 2024 sales_volume
        - sales_last2_2025 = 2024 sales_last1
        - sales_avg3_2025  = average of (2024, 2023, 2022)
        - price_last1_2025, mileage_last1_2025 are copied from 2024
        - price_trend and mileage_trend are reused from the latest year

    The 2025 sales_volume is set to NaN (unknown target).
    """
    group_cols = ["model", "region", "fuel_type", "transmission", "color"]

    df_sorted = df_lagged.sort_values(group_cols + ["year"])
    last_rows = df_sorted.groupby(group_cols, as_index=False).tail(1)

    # Filter combinations that must have complete lag information
    last_rows = last_rows.dropna(
        subset=["sales_last1", "sales_last2", "sales_avg3", "price_last1", "mileage_last1"]
    )

    rows_2025 = []

    for _, row in last_rows.iterrows():
        new_row = row.copy()
        new_row["year"] = 2025

        sales_last1_2025 = row["sales_volume"]
        sales_last2_2025 = row["sales_last1"]
        sales_avg3_2025 = (
            row["sales_volume"] + row["sales_last1"] + row["sales_last2"]
        ) / 3

        new_row["sales_last1"] = sales_last1_2025
        new_row["sales_last2"] = sales_last2_2025
        new_row["sales_avg3"] = sales_avg3_2025

        if not pd.isna(sales_last2_2025) and sales_last2_2025 != 0:
            new_row["sales_growth"] = (
                sales_last1_2025 - sales_last2_2025
            ) / sales_last2_2025
        else:
            new_row["sales_growth"] = 0.0

        new_row["price_last1"] = row["price_usd"]
        new_row["mileage_last1"] = row["mileage_km"]

        # Reuse the original trend values (can also be recomputed if needed)
        new_row["price_trend"] = row.get("price_trend", 0.0)
        new_row["mileage_trend"] = row.get("mileage_trend", 0.0)

        # Actual 2025 sales are unknown
        new_row["sales_volume"] = np.nan

        rows_2025.append(new_row)

    feat_2025 = pd.DataFrame(rows_2025)
    # Ensure column ordering is consistent with FEATURE_COLS, with sales_volume at the end
    feat_2025 = feat_2025[[c for c in feat_2025.columns if c not in ["sales_volume"]] + ["sales_volume"]]

    return feat_2025


def retrain_catboost_and_predict_2025(df_lagged: pd.DataFrame):
    """
    Retrain CatBoost using all data from 2020–2024 and
    predict 2025 row-level sales using constructed 2025 feature rows.

    Steps:
        - Clean and use all rows with valid features and target.
        - Build 2025 feature rows from lagged data.
        - Fit a new CatBoost model on full 2020–2024 data.
        - Predict 2025 sales at row level.
        - Aggregate predictions by model, region, fuel_type,
          transmission, and color.

    Parameters
    ----------
    df_lagged : pd.DataFrame
        Historical data with lag-based features and sales_volume.

    Returns
    -------
    total_2025 : float
        Total predicted 2025 sales volume.
    feat_2025_out : pd.DataFrame
        Row-level 2025 feature DataFrame with predictions appended.
    by_model : pd.DataFrame
        Predicted sales aggregated by model.
    by_region : pd.DataFrame
        Predicted sales aggregated by region.
    by_fuel : pd.DataFrame
        Predicted sales aggregated by fuel_type.
    by_trans : pd.DataFrame
        Predicted sales aggregated by transmission.
    by_color : pd.DataFrame
        Predicted sales aggregated by color.
    """
    df_clean = df_lagged.dropna(subset=FEATURE_COLS + ["sales_volume"]).copy()

    X_full = df_clean[FEATURE_COLS]
    y_full = df_clean["sales_volume"].values

    feat_2025 = build_2025_features_from_lagged(df_lagged)
    X_2025 = feat_2025[FEATURE_COLS]

    cat_model = build_catboost_model()
    cat_feature_indices = [X_full.columns.get_loc(col) for col in CAT_FEATURES]
    cat_model.fit(X_full, y_full, cat_features=cat_feature_indices)

    preds_2025 = cat_model.predict(X_2025)
    total_2025 = preds_2025.sum()

    feat_2025_out = feat_2025.copy()
    feat_2025_out["pred_sales_volume_2025_catboost"] = preds_2025

    print(f"\n=== CatBoost predicted total sales volume in 2025: {total_2025:.2f} ===\n")

    # Aggregate predictions by different dimensions
    col_pred = "pred_sales_volume_2025_catboost"

    by_model = (
        feat_2025_out.groupby("model")[col_pred]
        .sum()
        .reset_index()
        .sort_values(col_pred, ascending=False)
    )
    by_region = (
        feat_2025_out.groupby("region")[col_pred]
        .sum()
        .reset_index()
        .sort_values(col_pred, ascending=False)
    )
    by_fuel = (
        feat_2025_out.groupby("fuel_type")[col_pred]
        .sum()
        .reset_index()
        .sort_values(col_pred, ascending=False)
    )
    by_trans = (
        feat_2025_out.groupby("transmission")[col_pred]
        .sum()
        .reset_index()
        .sort_values(col_pred, ascending=False)
    )
    by_color = (
        feat_2025_out.groupby("color")[col_pred]
        .sum()
        .reset_index()
        .sort_values(col_pred, ascending=False)
    )

    return total_2025, feat_2025_out, by_model, by_region, by_fuel, by_trans, by_color


# =========================================================
# 7. Metrics comparison table and single bar chart
# =========================================================
def build_metrics_table(metrics_all: dict, arima_metrics: dict) -> pd.DataFrame:
    """
    Build a DataFrame to compare performance metrics across
    ARIMA (yearly baseline) and all ML models.

    Parameters
    ----------
    metrics_all : dict
        Dictionary of metrics for ML models (XGB, LGBM, RF, CatBoost).
    arima_metrics : dict
        Metrics dictionary for the yearly ARIMA baseline.

    Returns
    -------
    pd.DataFrame
        DataFrame with one row per model (including ARIMA) and
        metric names as columns.
    """
    df = pd.DataFrame.from_dict(metrics_all, orient="index")
    df.index.name = "model"

    # Add ARIMA baseline (yearly total)
    df.loc["arima_yearly"] = arima_metrics

    # Reorder rows for clearer comparison
    df = df.loc[["arima_yearly", "xgb", "lgbm", "rf", "catboost"]]

    return df


def plot_metrics_bar(metrics_df: pd.DataFrame, save_path: str = None):
    """
    Plot a single bar chart comparing four error-related metrics
    for all models:

        - MAE_pct
        - RMSE_pct
        - MedianAE_pct
        - MAPE (already in percentage)

    Parameters
    ----------
    metrics_df : pd.DataFrame
        DataFrame containing metrics for each model (one row per model).
    save_path : str, optional
        If provided, save the figure to the given path.
    """
    models = metrics_df.index.tolist()
    x = np.arange(len(models))

    metrics_to_plot = ["MAE_pct", "RMSE_pct", "MedianAE_pct", "MAPE"]
    colors = ["#1C69D4", "#4C8BF5", "#82B1FF", "#FFB300"]
    width = 0.18

    fig, ax = plt.subplots(figsize=(10, 5))

    for i, (metric, c) in enumerate(zip(metrics_to_plot, colors)):
        vals = metrics_df[metric].values
        ax.bar(x + i * width - 1.5 * width, vals, width, label=metric, color=c)

    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=15)
    ax.set_ylabel("Error / Percentage")
    ax.set_title("Model Performance Comparison on 2024 (with lag features)")

    ax.legend()
    ax.grid(axis="y", linestyle="--", alpha=0.3)

    fig.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.close(fig)


# =========================================================
# 8. Main workflow
# =========================================================
if __name__ == "__main__":
    # 1. Load and aggregate raw data
    agg_df = load_and_aggregate(FILE_PATH)
    print("Aggregated shape (before lag):", agg_df.shape)
    print(agg_df.head(), "\n")

    # 2. ARIMA yearly baseline (total sales)
    arima_metrics_2024, arima_2025_pred = arima_baseline_yearly(agg_df)

    # 3. Add lag features
    agg_df_lagged = add_lag_features(agg_df)
    print(
        "Shape after adding lag features (before dropna):",
        agg_df_lagged.shape,
        "\n",
    )

    # 4. Train and validate models (2020–2023 train, 2024 validate)
    metrics_all, cat_model_trained = train_and_validate_models(agg_df_lagged)

    # 5. Build metrics comparison table (including percentages)
    metrics_df = build_metrics_table(metrics_all, arima_metrics_2024)
    print("\n=== Model comparison on 2024 validation (with lag, row-level vs yearly) ===")
    print(metrics_df, "\n")

    # 6. Plot a single bar chart comparing metrics across models
    plot_metrics_bar(metrics_df, save_path="model_performance_2024_with_lag.png")
    print("Saved bar chart to: model_performance_2024_with_lag.png\n")

    # 7. Use CatBoost to forecast 2025 total and segmented sales
    (
        total_2025_cat,
        df_cat_2025,
        by_model_2025,
        by_region_2025,
        by_fuel_2025,
        by_trans_2025,
        by_color_2025,
    ) = retrain_catboost_and_predict_2025(agg_df_lagged)

    print("CatBoost - 2025 predicted sales by model:")
    print(by_model_2025.head(), "\n")

    print("CatBoost - 2025 predicted sales by region:")
    print(by_region_2025, "\n")

    print("CatBoost - 2025 predicted sales by fuel_type:")
    print(by_fuel_2025, "\n")

    print("CatBoost - 2025 predicted sales by transmission:")
    print(by_trans_2025, "\n")

    print("CatBoost - 2025 predicted sales by color:")
    print(by_color_2025, "\n")

    print(f"ARIMA (yearly total) predicted 2025 sales: {arima_2025_pred:.2f}")
    print(f"CatBoost (row-level) predicted 2025 sales: {total_2025_cat:.2f}")


Aggregated shape (before lag): (10306, 10)
   year     model  region fuel_type transmission   color  engine_size_l  \
0  2020  3 Series  Africa    Diesel    Automatic   Black           2.90   
1  2020  3 Series  Africa    Diesel    Automatic    Blue           3.00   
2  2020  3 Series  Africa    Diesel    Automatic     Red           3.55   
3  2020  3 Series  Africa    Diesel    Automatic  Silver           3.30   
4  2020  3 Series  Africa    Diesel       Manual   Black           4.60   

   mileage_km  price_usd  sales_volume  
0     54293.0   100920.0          8279  
1     22846.0    79435.5          6282  
2     85078.5   106306.0         14847  
3    182077.0    59951.0          2355  
4    107799.0    75078.0          3975   

=== ARIMA baseline on yearly total sales ===
year
2020    16310843
2021    16884666
2022    17920946
2023    16268654
2024    17527854
Name: sales_volume, dtype: int64 

ARIMA predicted 2024 total sales: 16227225.32
Actual 2024 total sales        : 17527854.