<a href="https://colab.research.google.com/github/nuwanc/ml/blob/main/sarimax.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

# Generate sample data
np.random.seed(42)
date_rng = pd.date_range(start='2023-01-01', end='2023-12-31', freq='H')
data = pd.DataFrame(date_rng, columns=['date'])

# Simulate sales data with some seasonality and trend
data['sales'] = np.random.poisson(lam=20, size=len(date_rng)) + \
                10 * np.sin(2 * np.pi * data['date'].dt.dayofyear / 365) + \
                5 * (data['date'].dt.dayofyear // 100)

# Add weather conditions
weather_conditions = ['sunny', 'rainy', 'cloudy']
data['weather'] = np.random.choice(weather_conditions, size=len(date_rng))

# Add promotions
data['promotion'] = np.random.choice([0, 1], size=len(date_rng))

# Display the first few rows of the dataset
print(data.head())

# Save dataset to a CSV file
data.to_csv('sample_bakery_sales.csv', index=False)


                 date      sales weather  promotion
0 2023-01-01 00:00:00  23.172134   sunny          1
1 2023-01-01 01:00:00  15.172134  cloudy          0
2 2023-01-01 02:00:00  21.172134   sunny          0
3 2023-01-01 03:00:00  25.172134   rainy          0
4 2023-01-01 04:00:00  15.172134   rainy          1


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error

# Load sample data
data = pd.read_csv('sample_bakery_sales.csv', parse_dates=['date'])
data.set_index('date', inplace=True)

# Add time-based features
data['hour'] = data.index.hour
data['day_of_week'] = data.index.dayofweek
data['month'] = data.index.month

# Encode categorical weather data
data = pd.get_dummies(data, columns=['weather'], drop_first=True)

# Print data types to check
print(data.dtypes)

# Ensure all columns are numeric
data = data.apply(pd.to_numeric, errors='coerce')

# Check for missing values and handle them
print(data.isnull().sum())
data = data.fillna(0)

# Print data types again to confirm
print(data.dtypes)

# Explicitly convert boolean columns to integers
data['weather_rainy'] = data['weather_rainy'].astype(int)
data['weather_sunny'] = data['weather_sunny'].astype(int)

# Split data into training and test sets
train_data = data[:'2023-12-15']
test_data = data['2023-12-16':]

# Select exogenous variables
exog_train = train_data[['promotion', 'weather_rainy', 'weather_sunny']].to_numpy()
exog_test = test_data[['promotion', 'weather_rainy', 'weather_sunny']].to_numpy()

# Fit SARIMAX model
model = SARIMAX(train_data['sales'],
                order=(1, 1, 1),
                seasonal_order=(1, 1, 1, 24),
                exog=exog_train)
results = model.fit()

# Forecast
forecast = results.get_forecast(steps=len(test_data), exog=exog_test)
forecast_values = forecast.predicted_mean
confidence_intervals = forecast.conf_int()

# Evaluate the model
mae = mean_absolute_error(test_data['sales'], forecast_values)
print(f'Mean Absolute Error: {mae}')

# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(train_data.index, train_data['sales'], label='Training Data')
plt.plot(test_data.index, test_data['sales'], label='Actual Sales')
plt.plot(test_data.index, forecast_values, label='Forecasted Sales')
plt.fill_between(test_data.index, confidence_intervals.iloc[:, 0], confidence_intervals.iloc[:, 1], color='k', alpha=0.2)
plt.title('SARIMAX Forecast vs Actual Sales')
plt.legend()
plt.show()


sales            float64
promotion          int64
hour               int32
day_of_week        int32
month              int32
weather_rainy       bool
weather_sunny       bool
dtype: object
sales            0
promotion        0
hour             0
day_of_week      0
month            0
weather_rainy    0
weather_sunny    0
dtype: int64
sales            float64
promotion          int64
hour               int32
day_of_week        int32
month              int32
weather_rainy       bool
weather_sunny       bool
dtype: object


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
