# 0. Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import matplotlib.gridspec as gridspec

from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from kneed import KneeLocator
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from collections import Counter
from tabulate import tabulate
from tsfeatures import tsfeatures

In [2]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.holtwinters import Holt
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [3]:
# Seaborn theme: dark grid drawn on a light-grey ('#E6E6E6') axes background
sns.set(
    style='darkgrid',
    rc={'axes.facecolor': '#E6E6E6'},
)

In [4]:
# Read the three input CSVs (first column used as the index), in the same
# order as before: train series, test series, per-product cluster table.
df_train, df_test, cluster = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "data/train_clustered.csv",
        "data/test_clustered.csv",
        "data/clustered_products.csv",
    )
)

In [5]:
# Convert the `ds` column of both frames from "%Y-%m-%d" text to datetimes.
for frame in (df_train, df_test):
    frame["ds"] = pd.to_datetime(frame["ds"], format="%Y-%m-%d")

In [6]:
# Number of distinct cluster labels present in the training data
nb_clusters = df_train["cluster"].nunique()

# IV. Model selection per cluster

In [7]:
# Evaluate the candidate forecasting models on one cluster's representative series
def _mae_or_inf(label, fit_model, test):
    """Fit one candidate model and score its forecast against ``test``.

    Parameters
    ----------
    label : str
        Model name; used only in the warning emitted on failure.
    fit_model : callable
        Zero-argument callable returning a fitted results object that
        exposes ``forecast(steps=...)``.
    test : sequence
        Held-out observations; ``len(test)`` is the forecast horizon.

    Returns
    -------
    float
        Mean absolute error between ``test`` and the forecast, or
        ``float('inf')`` when fitting, forecasting or scoring raised an
        ``Exception``.
    """
    import warnings  # local import: not among the notebook's top-level imports

    try:
        # Resolve the horizon first so an unusable `test` is detected
        # before paying for a model fit.
        horizon = len(test)
        forecast = fit_model().forecast(steps=horizon)
        return mean_absolute_error(test, forecast)
    except Exception as err:
        # Deliberately `Exception`, not a bare `except:`: KeyboardInterrupt and
        # SystemExit must still be able to stop a long-running fit.
        warnings.warn(
            f"{label} could not be evaluated and is scored as inf: {err!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return float('inf')


def evaluate_models_on_median(train, test):
    """Score each candidate forecasting model on a single train/test split.

    Candidates currently evaluated:

    * ``'SARIMA'`` - ``SARIMAX`` with ``order=(1, 1, 1)`` and
      ``seasonal_order=(1, 1, 1, 12)``.
    * ``'Exponential Smoothing'`` - additive seasonality, 12 seasonal periods.
    * ``"Holt's Linear Trend"`` - ``Holt`` with default settings.

    Both seasonal models hard-code a period of 12.
    NOTE(review): presumably the series are monthly - confirm against the data.

    Earlier commented-out experiments (ARIMA, linear regression, XGBoost,
    random forest, LSTM) were never executed and have been removed from
    this function.

    Parameters
    ----------
    train : array-like
        Training observations handed unchanged to each statsmodels model.
    test : array-like
        Held-out observations; every model forecasts ``len(test)`` steps.

    Returns
    -------
    dict
        Maps model name to its mean absolute error on ``test``. A model that
        fails is reported as ``float('inf')`` (with a ``RuntimeWarning``)
        so that it can never be selected as the minimum.
    """
    # Lambdas defer the (possibly failing, possibly slow) fit until it runs
    # inside the helper's try block.
    candidates = {
        'SARIMA': lambda: SARIMAX(
            train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12)
        ).fit(disp=False),
        'Exponential Smoothing': lambda: ExponentialSmoothing(
            train, seasonal='add', seasonal_periods=12
        ).fit(),
        "Holt's Linear Trend": lambda: Holt(train).fit(),
    }

    return {
        label: _mae_or_inf(label, fit_model, test)
        for label, fit_model in candidates.items()
    }

In [8]:
# Swap the keys and values of a dictionary
def inverse_dict(d):
    """Return a new dict mapping each value of ``d`` back to its key.

    When several keys share the same value, the key encountered last in
    iteration order is the one kept. Values must be hashable.
    """
    swapped = {}
    for key, value in d.items():
        swapped[value] = key
    return swapped

In [9]:
# For every cluster, score the candidate models on the series flagged as its
# centroid and remember the name of the one with the lowest MAE.
# NOTE(review): presumably `cluster` is indexed by unique_id, so inverting the
# {index: cluster} mapping yields {cluster: unique_id} - confirm.
is_centroid = cluster["centroid"].eq(True)
clusters_centroid = inverse_dict(cluster.loc[is_centroid, "cluster"].to_dict())
clusters_model = {}

for c, centroid_id in clusters_centroid.items():
    # Target values of the centroid series in each split
    train = df_train.loc[df_train["unique_id"] == centroid_id, "y"]
    test = df_test.loc[df_test["unique_id"] == centroid_id, "y"]

    mae = evaluate_models_on_median(train, test)
    # Key with the smallest MAE; the first one wins on ties
    best_model = min(mae, key=mae.get)
    clusters_model[c] = best_model

    print(f"Cluster {c}: Best Model = {best_model} (MAE = {mae[best_model]:.2f})")

Cluster 1: Best Model = Exponential Smoothing (MAE = 8.35)
Cluster 0: Best Model = Holt's Linear Trend (MAE = 22.76)
Cluster 2: Best Model = Holt's Linear Trend (MAE = 24.90)


In [10]:
# Two-column table: one row per cluster with the name of its selected model
cluster_model_pairs = list(clusters_model.items())
clusters_model_df = pd.DataFrame(cluster_model_pairs, columns=["cluster", "model"])

In [11]:
# Persist the cluster -> model table (the row index is written as the first column)
clusters_model_df.to_csv(path_or_buf="data/clusters_model.csv")