# 0. Imports

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import matplotlib.gridspec as gridspec

from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from kneed import KneeLocator
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from collections import Counter
from tabulate import tabulate
from tsfeatures import tsfeatures

In [47]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.holtwinters import Holt
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [48]:
# Global seaborn theme: 'darkgrid' style with a light-grey (#E6E6E6) axes background.
axes_rc = {'axes.facecolor': '#E6E6E6'}
sns.set(style='darkgrid', rc=axes_rc)

In [49]:
# Input CSVs, read in this order; each file's first column becomes the DataFrame index.
csv_paths = (
    "data/train_clustered.csv",
    "data/test_clustered.csv",
    "data/clustered_products.csv",
    "data/clusters_model.csv",
)
df_train, df_test, cluster, clusters_model = [
    pd.read_csv(csv_path, index_col=0) for csv_path in csv_paths
]

In [50]:
# Parse the 'ds' column of both splits as datetimes (ISO year-month-day strings).
for frame in (df_train, df_test):
    frame["ds"] = pd.to_datetime(frame["ds"], format="%Y-%m-%d")

In [51]:
# Count of distinct cluster labels found in the training set.
nb_clusters = df_train["cluster"].nunique()

In [52]:
# Display the cluster -> model table loaded from data/clusters_model.csv.
clusters_model

Unnamed: 0,cluster,model
0,1,Exponential Smoothing
1,0,Holt's Linear Trend
2,2,Holt's Linear Trend


# V. Forecasting

In [53]:
# Forecast every product over its test horizon using the model assigned to its cluster.
# forecast_results maps each unique_id to the forecast returned by the fitted model,
# with one step per row that the product has in df_test.
forecast_results = {}

# Seasonal period shared by the seasonal models below.
# NOTE(review): 12 presumably means monthly data with yearly seasonality - confirm.
SEASONAL_PERIODS = 12

# Iterate over the cluster labels actually present rather than assuming they are
# exactly 0..nb_clusters-1; sorted() keeps the same order when they are.
for c in sorted(df_train.cluster.unique()):
    items = df_train.loc[df_train.cluster == c, "unique_id"].unique()

    # Require exactly one model per cluster. A missing or duplicated row would
    # otherwise surface later as an opaque "truth value is ambiguous" error.
    assigned_models = clusters_model.loc[clusters_model.cluster == c, 'model']
    if len(assigned_models) != 1:
        raise ValueError(
            f"Expected exactly one model for cluster {c!r} in clusters_model, "
            f"found {len(assigned_models)}"
        )
    best_model = assigned_models.iloc[0]

    for item in items:
        train_serie = df_train.loc[df_train.unique_id == item, "y"]
        test_serie = df_test.loc[df_test.unique_id == item, "y"]

        # Only the model names below are supported in this cell.
        if best_model == 'SARIMA':
            model = SARIMAX(
                train_serie,
                order=(1, 1, 1),
                seasonal_order=(1, 1, 1, SEASONAL_PERIODS),
            ).fit(disp=False)
        elif best_model == 'Exponential Smoothing':
            model = ExponentialSmoothing(
                train_serie, seasonal='add', seasonal_periods=SEASONAL_PERIODS
            ).fit()
        elif best_model == "Holt's Linear Trend":
            model = Holt(train_serie).fit()
        else:
            # Fail loudly: without this branch an unknown name would either hit a
            # NameError or silently store the previous item's forecast for this item.
            raise ValueError(
                f"Unsupported model {best_model!r} for cluster {c!r} (item {item!r})"
            )

        # Forecast as many steps as the item has test observations.
        forecast = model.forecast(steps=len(test_serie))
        forecast_results[item] = forecast
            