In [None]:
# 10/23/24


In [5]:
# Now I have the 10 year simulated data. I will now create lag variables and forecast the data.



In [3]:
%pip install xgboost


Note: you may need to restart the kernel to use updated packages.


In [4]:
import requests
import pandas as pd

import csv
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import os
from pandas import DataFrame, Series

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb


In [5]:
df = pd.read_csv('10_year_data.csv')

len(df)


521

In [6]:
df.dtypes


product     object
store       object
date        object
demand     float64
dtype: object

In [7]:
# Ensure the 'date' column is in datetime formats
df['date'] = pd.to_datetime(df['date'])

df.dtypes


product            object
store              object
date       datetime64[ns]
demand            float64
dtype: object

In [8]:
df


Unnamed: 0,product,store,date,demand
0,cordless screwdriver_1,Store_1,2014-11-03,73.0
1,cordless screwdriver_1,Store_1,2014-11-10,53.0
2,cordless screwdriver_1,Store_1,2014-11-17,104.0
3,cordless screwdriver_1,Store_1,2014-11-24,76.0
4,cordless screwdriver_1,Store_1,2014-12-01,76.0
...,...,...,...,...
516,cordless screwdriver_1,Store_1,2024-09-23,55.0
517,cordless screwdriver_1,Store_1,2024-09-30,82.0
518,cordless screwdriver_1,Store_1,2024-10-07,110.0
519,cordless screwdriver_1,Store_1,2024-10-14,75.0


In [9]:
# Let me add some features. Year, month, quarter, lagged demand up to 4 weeks back, and mean of last 4 weeks demand.

# Assuming your dataframe is already loaded as `df` and the 'date' column is in datetime format
df['date'] = pd.to_datetime(df['date'])

# Add year, month, and quarter features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter

# Sort by date to ensure correct lag and rolling calculations
df = df.sort_values(by='date')

# Add lagged demand features (up to 4 weeks)
for lag in range(1, 5):
    df[f'demand_lag_{lag}w'] = df['demand'].shift(lag)

# Add rolling mean over the previous 4 weeks, excluding the current week
df['demand_rolling_mean_4w'] = df['demand'].shift(1).rolling(window=4).mean()

# Drop rows with NaN values created by lagging/rolling
df = df.dropna().reset_index(drop=True)

# Display the first few rows to verify the new features
df


Unnamed: 0,product,store,date,demand,year,month,quarter,demand_lag_1w,demand_lag_2w,demand_lag_3w,demand_lag_4w,demand_rolling_mean_4w
0,cordless screwdriver_1,Store_1,2014-12-01,76.0,2014,12,4,76.0,104.0,53.0,73.0,76.50
1,cordless screwdriver_1,Store_1,2014-12-08,89.0,2014,12,4,76.0,76.0,104.0,53.0,77.25
2,cordless screwdriver_1,Store_1,2014-12-15,91.0,2014,12,4,89.0,76.0,76.0,104.0,86.25
3,cordless screwdriver_1,Store_1,2014-12-22,44.0,2014,12,4,91.0,89.0,76.0,76.0,83.00
4,cordless screwdriver_1,Store_1,2014-12-29,96.0,2014,12,4,44.0,91.0,89.0,76.0,75.00
...,...,...,...,...,...,...,...,...,...,...,...,...
512,cordless screwdriver_1,Store_1,2024-09-23,55.0,2024,9,3,81.0,67.0,82.0,59.0,72.25
513,cordless screwdriver_1,Store_1,2024-09-30,82.0,2024,9,3,55.0,81.0,67.0,82.0,71.25
514,cordless screwdriver_1,Store_1,2024-10-07,110.0,2024,10,4,82.0,55.0,81.0,67.0,71.25
515,cordless screwdriver_1,Store_1,2024-10-14,75.0,2024,10,4,110.0,82.0,55.0,81.0,82.00


In [10]:
# Now let me drop product and store columns as they are not needed for the model.

df = df.drop(['product', 'store'], axis=1)

df


Unnamed: 0,date,demand,year,month,quarter,demand_lag_1w,demand_lag_2w,demand_lag_3w,demand_lag_4w,demand_rolling_mean_4w
0,2014-12-01,76.0,2014,12,4,76.0,104.0,53.0,73.0,76.50
1,2014-12-08,89.0,2014,12,4,76.0,76.0,104.0,53.0,77.25
2,2014-12-15,91.0,2014,12,4,89.0,76.0,76.0,104.0,86.25
3,2014-12-22,44.0,2014,12,4,91.0,89.0,76.0,76.0,83.00
4,2014-12-29,96.0,2014,12,4,44.0,91.0,89.0,76.0,75.00
...,...,...,...,...,...,...,...,...,...,...
512,2024-09-23,55.0,2024,9,3,81.0,67.0,82.0,59.0,72.25
513,2024-09-30,82.0,2024,9,3,55.0,81.0,67.0,82.0,71.25
514,2024-10-07,110.0,2024,10,4,82.0,55.0,81.0,67.0,71.25
515,2024-10-14,75.0,2024,10,4,110.0,82.0,55.0,81.0,82.00


In [11]:
# now let me save df

# df.to_csv('10_year_data_with_features.csv', index=False)


In [12]:
# Now let me do the linear regression model.

# Define feature columns (excluding 'date' and 'demand' which is the target)
features = ['year', 'month', 'quarter', 'demand_lag_1w', 'demand_lag_2w', 'demand_lag_3w', 'demand_lag_4w', 'demand_rolling_mean_4w']

# Split the data into training and testing sets (first 90% for training, last 10% for testing)
train_size = int(len(df) * 0.8)
df_train = df.iloc[:train_size]
df_test = df.iloc[train_size:]

# Define X (features) and y (target variable) for training and testing
X_train = df_train[features]
y_train = df_train['demand']
X_test = df_test[features]
y_test = df_test['demand']

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

# Optional: Print out model coefficients to see the feature importance
print("Model Coefficients:")
for feature, coef in zip(features, model.coef_):
    print(f"{feature}: {coef}")

Root Mean Squared Error: 15.017879981824413
R-squared: 0.04310897750875975
Model Coefficients:
year: -0.23228557138240927
month: 0.37583903383854866
quarter: -1.0055970559354352
demand_lag_1w: 31810922011740.516
demand_lag_2w: 31810922011740.508
demand_lag_3w: 31810922011740.582
demand_lag_4w: 31810922011740.57
demand_rolling_mean_4w: -127243688046962.0


In [13]:
# Now let me do XGBoost model.

# Drop the date column and separate features and target
df = df.drop(['date'], axis=1)
X = df.drop(['demand'], axis=1)
y = df['demand']

# Split the data chronologically: first 90% for training, last 10% for testing
split_index = int(0.9 * len(df))
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# Create and train the XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse:.2f}')
print(f'R²: {r2:.2f}')


RMSE: 13.66
R²: 0.04
