# 0. Imports

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import matplotlib.gridspec as gridspec

from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from kneed import KneeLocator
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from collections import Counter
from tabulate import tabulate
from tsfeatures import tsfeatures

In [13]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.holtwinters import Holt
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

ModuleNotFoundError: No module named 'xgboost'

In [10]:
# Configure seaborn plot style: set background color and use dark grid
sns.set(rc={'axes.facecolor':'#E6E6E6'}, style='darkgrid')

In [11]:
df_train = pd.read_csv("data/train.csv", index_col=0)
cluster = pd.read_csv("data/clustered_products.csv", index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: 'data/clustered_products.csv'

In [None]:
df_train.ds = pd.to_datetime(df_train.ds, format="%Y-%m-%d")

In [None]:
nb_clusters = df_train.cluster.nunique()

NameError: name 'df_train' is not defined

# IV. Model selection per cluster

In [2]:
# Find the median time series for each cluster
def median_ts(cluster_indices, data):
    cluster_data = data[cluster_indices]
    median_series = np.median(cluster_data, axis=0)  # Median per timestep
    return median_series

In [3]:
clusters_median = {}

In [None]:
for cluster in range(num_clusters):
    clusters_median[cluster] = median_ts(##Input relevant information##)

In [None]:
# Evaluate the median time series for each cluster
def evaluate_models_on_median(train, test):
    results = {}
    
    # ARIMA
    try:
        arima_model = ARIMA(train, order=(1, 1, 1)).fit()
        arima_forecast = arima_model.forecast(steps=len(test))
        results['ARIMA'] = mean_absolute_error(test, arima_forecast)
    except:
        results['ARIMA'] = float('inf')
    
    # SARIMA
    try:
        sarima_model = SARIMAX(train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12)).fit(disp=False)
        sarima_forecast = sarima_model.forecast(steps=len(test))
        results['SARIMA'] = mean_absolute_error(test, sarima_forecast)
    except:
        results['SARIMA'] = float('inf')
    
    # Exponential Smoothing
    try:
        es_model = ExponentialSmoothing(train, seasonal='add', seasonal_periods=12).fit()
        es_forecast = es_model.forecast(steps=len(test))
        results['Exponential Smoothing'] = mean_absolute_error(test, es_forecast)
    except:
        results['Exponential Smoothing'] = float('inf')

    # Holt's Linear Trend
    try:
        holt_model = Holt(train).fit()
        holt_forecast = holt_model.forecast(steps=len(test))
        results["Holt's Linear Trend"] = mean_absolute_error(test, holt_forecast)
    except:
        results["Holt's Linear Trend"] = float('inf')
    
    # Linear Regression
    try:
        lr_model = LinearRegression()
        X_train = np.arange(len(train)).reshape(-1, 1)
        lr_model.fit(X_train, train)
        X_test = np.arange(len(train), len(train) + len(test)).reshape(-1, 1)
        lr_forecast = lr_model.predict(X_test)
        results['Linear Regression'] = mean_absolute_error(test, lr_forecast)
    except:
        results['Linear Regression'] = float('inf')
    
    # XGBoost
    try:
        xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
        X_train = np.arange(len(train)).reshape(-1, 1)
        xgb_model.fit(X_train, train)
        X_test = np.arange(len(train), len(train) + len(test)).reshape(-1, 1)
        xgb_forecast = xgb_model.predict(X_test)
        results['XGBoost'] = mean_absolute_error(test, xgb_forecast)
    except:
        results['XGBoost'] = float('inf')
    
    # Random Forest
    try:
        rf_model = RandomForestRegressor(random_state=42)
        X_train = np.arange(len(train)).reshape(-1, 1)
        rf_model.fit(X_train, train)
        X_test = np.arange(len(train), len(train) + len(test)).reshape(-1, 1)
        rf_forecast = rf_model.predict(X_test)
        results['Random Forest'] = mean_absolute_error(test, rf_forecast)
    except:
        results['Random Forest'] = float('inf')
    
    # LSTM
    try:
        lstm_model = Sequential([
            LSTM(50, activation='relu', input_shape=(1, 1)),
            Dense(1)
        ])
        lstm_model.compile(optimizer='adam', loss='mae')
        train_reshaped = train.reshape(-1, 1, 1)
        lstm_model.fit(train_reshaped, train, epochs=10, batch_size=1, verbose=0)
        test_reshaped = np.arange(len(train), len(train) + len(test)).reshape(-1, 1, 1)
        lstm_forecast = lstm_model.predict(test_reshaped).flatten()
        results['LSTM'] = mean_absolute_error(test, lstm_forecast)
    except:
        results['LSTM'] = float('inf')
    
    return results

In [14]:
# Find the best model for each cluster
clusters_model = {}

for cluster, median_serie in median_ts.item():
    train = # input median training set
    test = # input median test set

    mae = evaluate_models_on_median(train, test)
    best_model = min(mae, key=mae.get)
    clusters_model[cluster] = best_model

    print(f"Cluster {cluster}: Best Model = {best_model} (MAE = {mae[best_model]:.2f})")

SyntaxError: invalid syntax (3648652720.py, line 5)