## This code is to analyze and experiment with the sales data

In [None]:
# Commonly used python functions and display settings
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:,.2f}'.format

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.core.display import display, HTML

import warnings
warnings.filterwarnings("ignore") # specify to ignore warning messages

In [None]:
# Key imports for this code (various ML and Stat Models)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.api import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf
import pmdarima as pm
from pmdarima import model_selection
from pmdarima import auto_arima

In [None]:
# import viz libraries
import matplotlib.pyplot as plt
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot

### Get data and analyze

In [None]:
# Load the sales records from the CSV file, parsing the 'Date' column as datetimes
sales_data = pd.read_csv('prod2store3sales.csv', parse_dates=['Date'])

# Preview the first and the last rows of the loaded frame
sales_data.head()
sales_data.tail()

# Number of rows, followed by the count of missing values in each column
sales_data.shape[0]
sales_data.isnull().sum()

## While analyzing, we use only the first 600 data points and reserve the remainder for testing, since the test period would technically not be observable in real life

In [None]:
# Line plot of units sold against date, restricted to the first 600 rows
first_600 = sales_data.iloc[:600]
units_trace = go.Scatter(x=first_600['Date'], y=first_600['Units Sold'])
layout = go.Layout(
    title='Time Series of Daily Units Sold in Training Data',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Units Sold'},
)
fig = go.Figure(data=[units_trace], layout=layout)

# Render the figure inline in the notebook
plotly.offline.iplot(fig)

In [None]:
# Zoom into the first three weeks (21 rows) of the series plotted above so that
# any within-week pattern is easier to see.
plot_data = []
plot_data.append(go.Scatter(x=sales_data['Date'][0:21], y=sales_data['Units Sold'][0:21]))
# The title names the zoomed window so this figure is not confused with the
# full 600-row plot, which previously carried the identical title.
layout = go.Layout(xaxis=dict(title='Date'), yaxis=dict(title='Units Sold'),
                   title='Daily Units Sold in the First Three Weeks of Training Data')
fig = go.Figure(data=plot_data, layout=layout)

# Render the figure inline in the notebook
plotly.offline.iplot(fig)

In [None]:
# Autocorrelation of units sold (first 600 rows) plotted for lags up to 40
acf_input = sales_data['Units Sold'].iloc[:600].to_numpy().squeeze()
sm.graphics.tsa.plot_acf(acf_input, lags=40)

In [None]:
# Partial autocorrelation of units sold (first 600 rows) plotted for lags up to 40
pacf_input = sales_data['Units Sold'].iloc[:600].to_numpy().squeeze()
sm.graphics.tsa.plot_pacf(pacf_input, lags=40)

## Adding Lag Features

In [None]:
# The series is treated as having no trend, so no detrending is applied.
# A weekly season (7 days) and the previous day's sales are used as lag features.
lag_columns = {'lag1 sales': 1, 'lag7 sales': 7}
for column_name, n_days in lag_columns.items():
    sales_data[column_name] = sales_data['Units Sold'].shift(n_days)
# Shifting leaves NaN in the first 7 rows; dropna removes every row containing a NaN
sales_data.dropna(axis=0, how='any', inplace=True)
sales_data.head(10)

In [None]:
# 'Weather Condition' is categorical: list the distinct categories it contains
set(sales_data['Weather Condition'].unique())

### Initially let us ignore the date and month

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# category of each encoded column (it adds no information here)
sales_data = pd.get_dummies(data=sales_data, drop_first=True)
sales_data.head()

In [None]:
# Rows were dropped earlier, so renumber the index so the dataframe starts at 0 again
sales_data.index = pd.RangeIndex(len(sales_data))
sales_data.head()

## Uncomment the following cell if we wish to add month and day-of-week as features

In [None]:
# # We can use the month and day-of-week as features
# sales_data['month'] = sales_data['Date'].dt.month
# sales_data['day'] = sales_data['Date'].dt.weekday
# sales_data.head()
# sales_data.tail()

In [None]:
# Chronological split: rows dated before 2023-10-01 form the training set,
# rows dated on or after it form the testing set
is_train_row = sales_data['Date'] < '2023-10-01'
is_test_row = sales_data['Date'] >= '2023-10-01'
train_data = sales_data[is_train_row]
test_data = sales_data[is_test_row]

In [None]:
# Features are every column except the date and the ground truth;
# the ground truth (target) is the 'Units Sold' column
target_column = 'Units Sold'
X_train = train_data.drop(['Date', target_column], axis=1)
y_train = train_data[target_column]

## Gradient Boosting

In [None]:
# Gradient boosting regressor: 100 trees, depth at most 6, at least 2 samples per leaf.
# random_state is fixed (same seed as the XGBoost model in this notebook) so that
# rerunning the cell reproduces the same fit and the same error metrics.
gb = GradientBoostingRegressor(n_estimators=100, max_depth=6, min_samples_leaf=2,
                               random_state=42)
# Fit the model on the training features and target
gb = gb.fit(X_train, y_train)
# Display the relative importance of each feature, in the column order of X_train
gb.feature_importances_

In [None]:
# Build the test features and target with the same column selection used for training
X_test = test_data.drop(['Date', 'Units Sold'], axis=1)
y_test = test_data['Units Sold']
# Predict units sold for the test rows with the fitted gradient boosting model
y_preds = gb.predict(X_test)
# Absolute error per test row, then that error as a fraction of the actual value.
# NOTE: a test row with zero units sold makes its percentage error inf (or NaN).
abs_errors = np.abs(y_test - y_preds)
perc_errors = abs_errors / y_test

In [None]:
# Summarise the absolute percentage errors: mean, median, 75th and 90th percentiles
pct_summary = [
    ('Mean absolute percentage error:', np.mean(perc_errors)),
    ('Median absolute percentage error:', np.median(perc_errors)),
    ('75th percentile of absolute percentage error:', np.percentile(perc_errors, 75)),
    ('90th percentile of absolute percentage error:', np.percentile(perc_errors, 90)),
]
for label, value in pct_summary:
    print(label, value)

In [None]:
# Summarise the absolute errors, each expressed as a ratio of the mean test-set sales
avg_test = test_data['Units Sold'].mean()
ratio_summary = [
    ('Mean absolute error ratio:', np.mean(abs_errors) / avg_test),
    ('Median absolute error ratio:', np.median(abs_errors) / avg_test),
    ('75th percentile absolute error ratio:', np.percentile(abs_errors, 75) / avg_test),
    ('90th percentile absolute error ratio:', np.percentile(abs_errors, 90) / avg_test),
]
for label, value in ratio_summary:
    print(label, value)

## XGB

In [None]:
# Hyperparameters for the XGBoost regressor, collected in one place
xgb_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.2,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'objective': 'reg:squarederror',
    'random_state': 42,
}
model = XGBRegressor(**xgb_params)

# Fit the regressor on the training features and target
model.fit(X_train, y_train)

In [None]:
# Predict units sold for the test rows with the fitted XGBoost model
y_preds = model.predict(X_test)
# Absolute error per test row, then that error as a fraction of the actual value.
# NOTE: a test row with zero units sold makes its percentage error inf (or NaN).
abs_errors = np.abs(y_test - y_preds)
perc_errors = abs_errors / y_test

In [None]:
# Report the mean and median absolute percentage error
print('Mean absolute percentage error:', np.mean(perc_errors))
print('Median absolute percentage error:', np.median(perc_errors))
# Report the upper percentiles of the absolute percentage error
for pct in (75, 90):
    if pct == 75:
        print('75th percentile of absolute percentage error:', np.percentile(perc_errors, pct))
    else:
        print('90th percentile of absolute percentage error:', np.percentile(perc_errors, pct))

In [None]:
# Absolute errors reported relative to the average units sold in the test set
avg_test = test_data['Units Sold'].mean()
mean_ratio = np.mean(abs_errors) / avg_test
median_ratio = np.median(abs_errors) / avg_test
p75_ratio = np.percentile(abs_errors, 75) / avg_test
p90_ratio = np.percentile(abs_errors, 90) / avg_test
print('Mean absolute error ratio:', mean_ratio)
print('Median absolute error ratio:', median_ratio)
print('75th percentile absolute error ratio:', p75_ratio)
print('90th percentile absolute error ratio:', p90_ratio)

## Random Forest

In [None]:
# Random forest regressor: depth at most 6, at least 2 samples per leaf, and a
# square-root-sized random subset of features considered at each split.
# random_state is fixed (same seed as the XGBoost model in this notebook) because the
# bootstrap sampling and feature sub-sampling are random; without a seed every rerun
# produces a different forest and different error metrics.
rf = RandomForestRegressor(max_depth=6, min_samples_leaf=2, max_features="sqrt",
                           random_state=42)
# Fit the model on the training features and target
rf = rf.fit(X_train, y_train)
# Display the relative importance of each feature, in the column order of X_train
rf.feature_importances_

In [None]:
# Predict units sold for the test rows with the fitted random forest
y_preds = rf.predict(X_test)
# Absolute error per test row, then that error as a fraction of the actual value.
# NOTE: a test row with zero units sold makes its percentage error inf (or NaN).
abs_errors = np.abs(y_test - y_preds)
perc_errors = abs_errors / y_test

In [None]:
# Summarise the absolute percentage errors: mean, median, 75th and 90th percentiles
pct_summary = [
    ('Mean absolute percentage error:', np.mean(perc_errors)),
    ('Median absolute percentage error:', np.median(perc_errors)),
    ('75th percentile of absolute percentage error:', np.percentile(perc_errors, 75)),
    ('90th percentile of absolute percentage error:', np.percentile(perc_errors, 90)),
]
for label, value in pct_summary:
    print(label, value)

In [None]:
# Summarise the absolute errors, each expressed as a ratio of the mean test-set sales
avg_test = test_data['Units Sold'].mean()
ratio_summary = [
    ('Mean absolute error ratio:', np.mean(abs_errors) / avg_test),
    ('Median absolute error ratio:', np.median(abs_errors) / avg_test),
    ('75th percentile absolute error ratio:', np.percentile(abs_errors, 75) / avg_test),
    ('90th percentile absolute error ratio:', np.percentile(abs_errors, 90) / avg_test),
]
for label, value in ratio_summary:
    print(label, value)