# 0. Imports

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import matplotlib.gridspec as gridspec

from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from kneed import KneeLocator
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from collections import Counter
from tabulate import tabulate
from tsfeatures import tsfeatures

In [49]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.holtwinters import Holt
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [50]:
# Seaborn theme: dark grid drawn on a light-grey ('#E6E6E6') axes background.
sns.set(style='darkgrid', rc={'axes.facecolor': '#E6E6E6'})

In [51]:
# Load the four input tables in one pass; each CSV stores its index in the
# first column, hence index_col=0.
df_train, df_test, cluster, clusters_model = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "data/train_clustered.csv",
        "data/test_clustered.csv",
        "data/clustered_products.csv",
        "data/clusters_model.csv",
    )
)

In [52]:
# Parse the `ds` column of both splits from "YYYY-MM-DD" strings to datetimes.
df_train["ds"] = pd.to_datetime(df_train["ds"], format="%Y-%m-%d")
df_test["ds"] = pd.to_datetime(df_test["ds"], format="%Y-%m-%d")

In [53]:
# Number of distinct cluster labels in the training set
# (Series.nunique() leaves missing labels out of the count by default).
nb_clusters = df_train["cluster"].nunique()

In [54]:
# Display the cluster -> model table; the forecasting loop looks up each
# cluster's model name in its `model` column.
clusters_model

Unnamed: 0,cluster,model
0,0,Holt's Linear Trend
1,2,Holt's Linear Trend
2,1,Exponential Smoothing


# V. Forecasting

In [55]:
# Fit, for every series, the model selected for its cluster and forecast as
# many steps as that series has rows in the test set.
#
# Each builder takes a training series and returns a fitted statsmodels
# results object exposing `.forecast(steps=...)`.
# Only these three models are implemented; any other name found in
# `clusters_model` (e.g. ARIMA, Linear Regression, XGBoost, Random Forest,
# LSTM) raises instead of silently reusing a previous item's forecast.
model_builders = {
    'SARIMA': lambda serie: SARIMAX(
        serie, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12)
    ).fit(disp=False),
    'Exponential Smoothing': lambda serie: ExponentialSmoothing(
        serie, seasonal='add', seasonal_periods=12
    ).fit(),
    "Holt's Linear Trend": lambda serie: Holt(serie).fit(),
}

# Maps unique_id -> forecast Series returned by the fitted model.
forecast_results = {}

# Iterate over the labels actually present in the training data rather than
# range(n): labels are not guaranteed to be exactly 0..n-1, and missing
# (NaN) labels have no model assigned.
for c in sorted(df_train.cluster.dropna().unique()):
    items = df_train[df_train.cluster == c]["unique_id"].unique()

    # Exactly one model must be registered per cluster; fail loudly otherwise
    # instead of carrying an empty/multi-row Series into the comparisons.
    cluster_models = clusters_model.loc[clusters_model.cluster == c, 'model']
    if len(cluster_models) != 1:
        raise ValueError(
            f"Expected exactly one model for cluster {c!r} in clusters_model, "
            f"found {len(cluster_models)}."
        )
    best_model = cluster_models.iloc[0]

    if best_model not in model_builders:
        raise ValueError(
            f"Unsupported model {best_model!r} for cluster {c!r}; "
            f"supported models: {sorted(model_builders)}."
        )
    fit_model = model_builders[best_model]

    for item in items:
        train_serie = df_train[df_train.unique_id == item].y
        test_serie = df_test[df_test.unique_id == item].y

        model = fit_model(train_serie)
        forecast = model.forecast(steps=len(test_serie))

        forecast_results[item] = forecast
            

In [56]:
forecast_results
# Next steps:
#   - round the forecasts to the closest integer
#   - save them to a DataFrame
#   - comparison of MAE

{'F00001015_CLR000021': 134    59.014507
 135    62.778754
 136    66.543001
 137    70.307247
 138    74.071494
 139    77.835740
 140    81.599987
 141    85.364234
 dtype: float64,
 'F00001015_CLR000023': 134    21.503639
 135    21.369253
 136    21.234867
 137    21.100482
 138    20.966096
 139    20.831710
 140    20.697324
 141    20.562938
 dtype: float64,
 'F00001111_151304TCX': 134    2.305809
 135    2.305881
 136    2.305952
 137    2.306023
 138    2.306095
 139    2.306166
 140    2.306238
 141    2.306309
 dtype: float64,
 'F00001111_193924TPX': 134    52.286039
 135    53.014136
 136    53.742233
 137    54.470330
 138    55.198428
 139    55.926525
 140    56.654622
 141    57.382719
 dtype: float64,
 'F00001111_CLR000021': 134    69.666534
 135    69.673393
 136    69.680252
 137    69.687111
 138    69.693970
 139    69.700829
 140    69.707688
 141    69.714547
 dtype: float64,
 'F00001111_CLR000391': 134    66.819806
 135    67.209323
 136    67.598839
 137    67.

In [57]:
# Display the test set (columns: unique_id, ds, y, cluster).
df_test

Unnamed: 0,unique_id,ds,y,cluster
0,F00001015_CLR000021,2024-10-06,53.0,0.0
1,F00001015_CLR000023,2024-10-06,20.0,0.0
2,F00001111_151304TCX,2024-10-06,218.0,0.0
3,F00001111_193924TPX,2024-10-06,24.0,0.0
4,F00001111_CLR000021,2024-10-06,52.0,0.0
...,...,...,...,...
5595,M24400088_190414TCX,2024-11-24,0.0,
5596,M24400088_CLR000021,2024-11-24,0.0,
5597,M24900011_180515TCX,2024-11-24,0.0,2.0
5598,S00029_CLR001336,2024-11-24,0.0,0.0


In [58]:
# Wide table of forecasts: one column per unique_id, rows aligned on the
# index of the forecast Series.
df_forecast = pd.DataFrame(data=forecast_results)

In [59]:
# Attach the forecast dates positionally.
# NOTE(review): assumes one forecast row per distinct test date, in the order
# the dates first appear in df_test - confirm.
df_forecast["ds"] = df_test["ds"].unique()

In [60]:
# Reshape wide -> long: one row per (ds, unique_id) pair with its forecast.
df_forecast = pd.melt(
    df_forecast,
    id_vars="ds",
    var_name="unique_id",
    value_name="forecast",
)

In [61]:
# Round the forecasts to the nearest integer.
# The original read from `test`, a name never defined in this notebook
# (leftover kernel state); the values must come from df_forecast itself.
df_forecast["forecast"] = df_forecast["forecast"].round(0).astype(int)

In [65]:
# Keep only the output columns, in their final order.
df_forecast = df_forecast.loc[:, ["unique_id", "ds", "forecast"]]

In [66]:
# Persist the long-format forecasts to disk (the index is written as well,
# which is the to_csv default).
df_forecast.to_csv(path_or_buf="data/forecast.csv")