# Sales Forecast Project

## Understanding Data

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [None]:
# Read the semicolon-delimited export and normalise the column labels in a
# single pass: "Posting Date" becomes "date", every other label is lower-cased.
df = pd.read_csv("sales.csv", sep=";").rename(
    columns=lambda label: "date" if label == "Posting Date" else label.lower()
)

df.head()

In [None]:
# Print a summary of the freshly loaded frame before any type conversion.
df.info()

In [None]:
# Data Preparation
# Quantities are cleaned as text first: every "." is removed and "," becomes
# the decimal point.
# NOTE(review): this assumes "." is a thousands separator and "," the decimal
# mark in sales.csv — confirm against the raw file.
quantity_text = (
    df['quantity']
    .astype(str)
    .str.replace('.', '', regex=False)
    .str.replace(',', '.', regex=False)
)

df = df.assign(
    date=pd.to_datetime(df['date'], format='%d/%m/%Y'),
    material=df['material'].astype(str),
    quantity=pd.to_numeric(quantity_text),
)

# Business rule: only strictly positive quantities are kept.
df = df.loc[df['quantity'] > 0]

df.info()

In [None]:
# Sum quantities so there is one row per (date, plant, material), then move
# the date into the index.
df = (
    df.groupby(['date', 'plant', 'material'])['quantity']
      .sum()
      .reset_index()
      .set_index('date')
)

df.head()

In [None]:
# Print a summary of the aggregated, date-indexed frame.
df.info()

In [None]:
# Keep only plant 'loc30' and collapse it to ONE row per date.
# The frame holds one row per (date, plant, material), so once 'material' is
# dropped the date index can contain duplicate labels. Summing per date makes
# the index unique; without it `asfreq('D')` below (and the later
# `pd.concat(..., axis=1)` calls on final_df) fail on duplicate labels.
df = (
    df[df['plant'] == 'loc30']
      .drop(columns=['plant', 'material'])
      .groupby(level='date')
      .sum()
)

# Snapshot of the observed sales only (taken before zero-filling); later cells
# join each model's forecast onto this copy for the final evaluation.
final_df = df.copy()

# Resample to daily frequency; dates without sales get 0 (business rule).
df = df.asfreq('D').fillna(0)

# Chronological split: first 80% of the days for training, the rest for testing.
train_size = int(len(df) * 0.8)
train, test = df.iloc[:train_size], df.iloc[train_size:]

## Functions

In [None]:
def create_dashboard(df, model_name):
    """Draw the real quantity and one model's forecast on the same axes.

    Args:
        df: Frame whose index supplies the x-axis and which holds a
            ``quantity`` column and a ``"<model_name> quantity"`` column.
        model_name: Prefix of the forecast column; also shown in the title.
    """
    series_to_draw = (
        ('quantity', 'Real'),
        (f'{model_name} quantity', 'Forecast'),
    )

    plt.figure(figsize=(12, 6))
    for column, legend_label in series_to_draw:
        sns.lineplot(data=df, x=df.index, y=column, label=legend_label, alpha=0.7)

    plt.title(f'{model_name} - Forecast vs Real')
    plt.legend()
    # Start the x-axis at the first index label of the frame.
    plt.xlim(left=df.index[0])
    plt.tight_layout()
    plt.show()

In [None]:
def evaluate_forecast(df, model_name):
    """Score one model's forecast against the real quantities.

    Args:
        df: Frame holding a ``quantity`` column (actual values) and a
            ``"<model_name> quantity"`` column (forecast values).
        model_name: Prefix of the forecast column; also used in the summary.

    Returns:
        A ``(summary, (mse, mae, mape))`` tuple. ``summary`` is a one-line
        human-readable string. ``mape`` in the inner tuple is the raw value
        from ``mean_absolute_percentage_error``, i.e. a fraction (0.25 means
        25%); only the summary string converts it to a percentage.
    """
    y_true = df["quantity"]
    y_pred = df[f"{model_name} quantity"]

    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)

    # sklearn reports MAPE as a fraction, not a percentage. The ``%`` format
    # spec multiplies by 100 and appends the sign, so 0.25 prints as "25.00%"
    # instead of the misleading "0.25%".
    summary = f"{model_name} - MSE: {mse:.2f} | MAE: {mae:.2f} | MAPE: {mape:.2%}"
    return summary, (mse, mae, mape)

## Holt Winters Model

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [None]:
# Holt-Winters with an additive seasonal component of period 7, fitted on the
# training quantities.
hw_spec = ExponentialSmoothing(train['quantity'], seasonal='add', seasonal_periods=7)
model_hw = hw_spec.fit()

# One forecast step per row of the test set.
forecast_hw = model_hw.forecast(steps=len(test))

In [None]:
# Turn the forecast into a one-column frame ("hw quantity") whose index is
# named "date", so it lines up with final_df.
forecast_hw = forecast_hw.to_frame(name="hw quantity").rename_axis(index="date")

# Inner join: only dates present in both final_df and the forecast survive.
final_df = pd.concat([final_df, forecast_hw], axis=1, join='inner')

final_df.head()

In [None]:
# Plot the Holt-Winters forecast ("hw quantity") against the real quantities.
create_dashboard(final_df, "hw")

In [None]:
# Show only the summary string (element 0 of the returned tuple) for Holt-Winters.
evaluate_forecast(final_df, "hw")[0]

## Prophet Model

In [None]:
from prophet import Prophet

In [None]:
# Build Prophet's input frame: the date index becomes column "ds" and the
# remaining column is relabelled "y".
df_prophet = df.reset_index().set_axis(['ds', 'y'], axis=1)

# Same positional split point as the other models.
train_prophet = df_prophet.iloc[:train_size]
test_prophet = df_prophet.iloc[train_size:]

model_prophet = Prophet(weekly_seasonality=True)
model_prophet.fit(train_prophet)

# Extend the training dates by one daily step per test row and predict over
# the whole range; only the trailing test-length rows are kept as forecast.
prophet_horizon = len(test_prophet)
future = model_prophet.make_future_dataframe(periods=prophet_horizon, freq='D')
forecast = model_prophet.predict(future)

forecast_prophet = forecast[['ds', 'yhat']].tail(prophet_horizon)

forecast_prophet.head()

In [None]:
# Index the Prophet forecast by date and label its value column
# "prophet quantity" so it lines up with final_df.
forecast_prophet = (
    forecast_prophet.set_index("ds")
                    .rename_axis(index="date")
                    .rename(columns={"yhat": "prophet quantity"})
)

# Inner join: only dates present in both frames survive.
final_df = pd.concat([final_df, forecast_prophet], axis=1, join="inner")

final_df.head()

In [None]:
# Plot the Prophet forecast ("prophet quantity") against the real quantities.
create_dashboard(final_df, "prophet")

In [None]:
# Show only the summary string (element 0 of the returned tuple) for Prophet.
evaluate_forecast(final_df, "prophet")[0]

## ARIMA Model

In [None]:
from pmdarima import auto_arima

In [None]:
# Stepwise auto_arima search with the seasonal component switched off, run on
# the training quantities.
arima_search_options = {"seasonal": False, "stepwise": True}
model_arima = auto_arima(train['quantity'], **arima_search_options)

# One predicted period per row of the test set.
forecast_arima = model_arima.predict(n_periods=len(test))

In [None]:
# Turn the forecast into a one-column frame ("arima quantity") whose index is
# named "date", so it lines up with final_df.
forecast_arima = forecast_arima.to_frame(name='arima quantity').rename_axis(index='date')

# Inner join: only dates present in both frames survive.
final_df = pd.concat([final_df, forecast_arima], axis=1, join='inner')

final_df.head()

In [None]:
# Plot the ARIMA forecast ("arima quantity") against the real quantities.
create_dashboard(final_df, "arima")

In [None]:
# Show only the summary string (element 0 of the returned tuple) for ARIMA.
evaluate_forecast(final_df, "arima")[0]

In [None]:
# Verifying Basic Statistics
train_quantity = train['quantity']
print("Series Statistics:")
print(train_quantity.describe())

# Share of exact-zero entries in the training series, as a percentage
zeros_pct = train_quantity.eq(0).sum() / len(train) * 100
print(f"\nZeros Percentage: {zeros_pct:.1f}%")

# Report the order auto_arima selected, plus the seasonal order when the
# fitted model exposes one
print(f"\nSelected Modell: {model_arima.order}")
try:
    seasonal_order = model_arima.seasonal_order
except AttributeError:
    pass
else:
    print(f"Seasonal Order: {seasonal_order}")

## SARIMA Model

In [None]:
from pmdarima import auto_arima

In [None]:
# Stepwise auto_arima search with the seasonal component enabled and a
# seasonal period of 7, run on the training quantities.
sarima_search_options = {"seasonal": True, "m": 7, "stepwise": True}
model_sarima = auto_arima(train['quantity'], **sarima_search_options)

# One predicted period per row of the test set.
forecast_sarima = model_sarima.predict(n_periods=len(test))

In [None]:
# Turn the forecast into a one-column frame ("sarima quantity") whose index is
# named "date", so it lines up with final_df.
forecast_sarima = forecast_sarima.to_frame(name='sarima quantity').rename_axis(index='date')

# Inner join: only dates present in both frames survive.
final_df = pd.concat([final_df, forecast_sarima], axis=1, join='inner')

final_df.head()

In [None]:
# Plot the SARIMA forecast ("sarima quantity") against the real quantities.
create_dashboard(final_df, "sarima")

In [None]:
# Show only the summary string (element 0 of the returned tuple) for SARIMA.
evaluate_forecast(final_df, "sarima")[0]

## Models Comparison

In [None]:
# 2x2 grid with one panel per model: real quantities vs that model's forecast.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

models = ['hw quantity', 'prophet quantity', 'arima quantity', 'sarima quantity']
model_names = ['Holt-Winters', 'Prophet', 'ARIMA', 'SARIMA']

# axes.flat walks the grid row by row, so panels follow the order of `models`.
for ax, model, name in zip(axes.flat, models, model_names):
    dates = final_df.index
    ax.plot(dates, final_df['quantity'], label='Real', alpha=0.7)
    ax.plot(dates, final_df[model], label='Forecast', alpha=0.7)
    ax.set_title(f'{name} - Forecast vs Real')
    ax.set_xlim(left=dates[0])
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side metric table: one row per model, read from final_df.
print(f"{'Model':<15}{'MSE':<12}{'MAE':<12}{'MAPE'}")
print("-"*50)

for model in ["hw", "prophet", "arima", "sarima"]:
    # Element 1 of evaluate_forecast's result is the raw (mse, mae, mape) tuple.
    mse, mae, mape = evaluate_forecast(final_df, model)[1]
    model_name = model.upper().replace("HW", "Holt-Winters")
    # mape comes from sklearn's mean_absolute_percentage_error, which returns a
    # fraction. The `%` format spec scales it by 100 and appends the sign, so
    # 0.25 is shown as "25.0%" rather than "0.2%".
    print(f"{model_name:<15}{mse:<12.1f}{mae:<12.1f}{mape:.1%}")