TP Activity (Machine Learning)
Mncedisi Hlonzi 22113111

In [None]:
# Data Preprocessing

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_excel("spaza_shop_data.xlsx")

print("Column names in the dataset:")
print(df.columns)

sales_column_name = 'Sales Volume'
if sales_column_name not in df.columns:
    print(f"Error: '{sales_column_name}' column not found. Please check the correct column name.")
else:
    missing_values = df.isnull().sum()
    print("Columns with missing values:")
    print(missing_values[missing_values > 0])

    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df[sales_column_name])
    plt.title(f'{sales_column_name} Distribution (Checking for Outliers)')
    plt.show()

if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)
else:
    print("Error: 'Date' column not found in the dataset.")

categorical_columns = ['Day of Week', 'Product Category', 'Season', 'Discounts/Promotions']
for col in categorical_columns:
    if col in df.columns:
        df = pd.get_dummies(df, columns=[col], drop_first=True)
    else:
        print(f"Warning: '{col}' column not found for one-hot encoding.")

if sales_column_name in df.columns:
    for lag in range(1, 8):
        df[f'{sales_column_name}_Lag_{lag}'] = df[sales_column_name].shift(lag)

    df.dropna(inplace=True)

train_size = int(len(df) * 0.8)
train, test = df[:train_size], df[train_size:]

print("Preprocessed Data (first few rows):")
print(df.head())

In [None]:
# PROPHET MODEL

from prophet import Prophet
import pandas as pd

df = pd.read_excel("spaza_shop_data.xlsx")

df_prophet = df[['Date', 'Sales Volume (Units Sold)']].rename(columns={'Date': 'ds', 'Sales Volume (Units Sold)': 'y'})

model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)

holidays = pd.DataFrame({
  'holiday': 'holiday_event',
  'ds': df.loc[df['Holiday'] == 1, 'Date'],
  'lower_window': 0,
  'upper_window': 1,
})
model.add_country_holidays(country_name='ZA')

model.fit(df_prophet)

future = model.make_future_dataframe(periods=90)
forecast = model.predict(future)

model.plot(forecast)
plt.title('Prophet Model - Sales Volume Forecast')
plt.show()

model.plot_components(forecast)
plt.show()

forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
# ARIMA MODEL

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_excel("spaza_shop_data.xlsx")
df.set_index('Date', inplace=True)

result = adfuller(df['Sales Volume (Units Sold)'])
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')

df['Sales_diff'] = df['Sales Volume (Units Sold)'].diff().dropna()

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

plt.figure(figsize=(12, 6))

plt.subplot(211)
plot_acf(df['Sales_diff'].dropna(), ax=plt.gca(), lags=8)
plt.subplot(212)
plot_pacf(df['Sales_diff'].dropna(), ax=plt.gca(), lags=8)
plt.show()

p, d, q = 2, 1, 2
model = ARIMA(df['Sales Volume (Units Sold)'], order=(p, d, q))
model_fit = model.fit()

forecast = model_fit.forecast(steps=90)
plt.figure(figsize=(10, 6))
plt.plot(df.index, df['Sales Volume (Units Sold)'], label='Actual Sales')
plt.plot(pd.date_range(df.index[-1], periods=90, freq='D'), forecast, label='Forecasted Sales', color='red')
plt.legend()
plt.title('ARIMA Model - Sales Volume Forecast')
plt.show()


In [None]:
# LSTM MODEL

import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
import matplotlib.pyplot as plt

df = pd.read_excel("spaza_shop_data.xlsx")
df.set_index('Date', inplace=True)

scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df[['Sales Volume (Units Sold)']])

def create_dataset(data, time_step=7):
    X, y = [], []
    for i in range(len(data) - time_step - 1):
        X.append(data[i:(i + time_step), 0])
        y.append(data[i + time_step, 0])
    return np.array(X), np.array(y)

X, y = create_dataset(scaled_data, time_step)

X = X.reshape(X.shape[0], X.shape[1], 1)

model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(time_step, 1)))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X, y, epochs=10, batch_size=32)

predicted_sales = []
input_seq = scaled_data[-time_step:]

for _ in range(90):
    pred = model.predict(input_seq.reshape(1, time_step, 1))
    predicted_sales.append(pred[0][0])
    input_seq = np.append(input_seq[1:], pred)

predicted_sales = scaler.inverse_transform(np.array(predicted_sales).reshape(-1, 1))

plt.figure(figsize=(10, 6))
plt.plot(df.index, df['Sales Volume (Units Sold)'], label='Actual Sales')
plt.plot(pd.date_range(df.index[-1], periods=90, freq='D'), predicted_sales, label='LSTM Forecast', color='green')
plt.legend()
plt.title('LSTM Model - Sales Volume Forecast')
plt.show()


Model Comparison

Here i compare the Prophet, ARIMA, and LSTM models based on their performance using our dataset. Each model's output was compared to actual sales data to assess accuracy and effectiveness.

1. Prophet

Prophet performed well in capturing the overall trend and seasonal patterns, particularly in accounting for holidays and weekends.
It showed a good balance between fitting the data well and maintaining interpretability.
However, it struggled with short-term fluctuations, which resulted in a higher MAPE compared to other models during periods of irregularity.

2. ARIMA

ARIMA was effective in capturing short-term dependencies and trends, performing well on stationary portions of the data.
Its strength lies in modeling the linear components of time series data but it lacked performance in handling seasonality and non-linear patterns.
The errors were notably higher during periods with strong seasonal patterns or irregular changes, showing the model’s limitations in capturing complex dynamics.

3. LSTM

LSTM excelled in capturing complex non-linear patterns and long-term dependencies, the model's ability to handle irregular and complex patterns was evident in its performance, making it particularly effective during periods of significant deviations from expected trends.
However, LSTM required significant computational resources and time for tuning, which can be a limitation in practical applications.

Overall Comparison and Insights :

Prophet was the most effective for datasets with strong seasonality and holiday effects. It provided a good balance but was less effective in capturing sudden changes and short-term fluctuations.
ARIMA was suitable for stationary data with short-term trends but struggled with seasonal effects and complex non-linear patterns.
LSTM demonstrated superior performance in modeling complex and long-term dependencies. However, it was more resource-intensive and required extensive tuning.

Prophet should be used for datasets with clear seasonality and holiday effects.
ARIMA should be applied for stationary data with primarily linear patterns and short-term forecasts.
& LSTM when dealing with complex, non-linear patterns and long-term dependencies, where computational resources are available.
Each model has its merits and is suited to different types of time series forecasting challenges. The results from this comparison highlight the importance of selecting the right model based on data characteristics and forecasting goals.








Insights and Recommendations

Based on the forecasts from the Prophet, ARIMA, and LSTM models, here are some actionable insights for spaza shop owners to optimize their operations and improve profitability:

1. Seasonal Stock Management

Stock up on high-demand items before peak seasons or holidays (e.g., festive periods, school holidays). For example, increase inventory of popular products during festive seasons when sales typically rise.
Reduce stock levels after seasonal peaks to avoid excess inventory and wastage.

2. Short-Term Inventory Adjustments

Respond to short-term changes in sales patterns by adjusting stock levels in real-time. For example, if ARIMA predicts a sudden drop in demand, reduce inventory to avoid overstock.
Use short-term forecasts to plan timely promotions or discounts to boost sales during predicted slow periods.

3. Handling Complex Patterns

Use LSTM forecasts to prepare for unexpected spikes or drops in sales that may not follow regular patterns. For instance, stock up on essential items ahead of predicted spikes or manage resources efficiently during downturns.
Allocate resources more effectively based on LSTM’s ability to predict complex trends. For example, adjust staff schedules and store layouts according to anticipated changes in customer behavior.

Combine Models for Better Accuracy: Use a combination of Prophet, ARIMA, and LSTM forecasts to get a more comprehensive view of expected sales and trends. Each model offers unique strengths that can complement each other.
Continuously monitor actual sales against forecasts and adjust inventory and operations as needed. Regularly updating your models with new data will improve their accuracy over time.