In [196]:
#pip install scikit-learn
!pip install xgboost



In [197]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error, r2_score

file = r"../data/raw/sales_data.csv"
df = pd.read_csv(file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76000 entries, 0 to 75999
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                76000 non-null  object 
 1   Store ID            76000 non-null  object 
 2   Product ID          76000 non-null  object 
 3   Category            76000 non-null  object 
 4   Region              76000 non-null  object 
 5   Inventory Level     76000 non-null  int64  
 6   Units Sold          76000 non-null  int64  
 7   Units Ordered       76000 non-null  int64  
 8   Price               76000 non-null  float64
 9   Discount            76000 non-null  int64  
 10  Weather Condition   76000 non-null  object 
 11  Promotion           76000 non-null  int64  
 12  Competitor Pricing  76000 non-null  float64
 13  Seasonality         76000 non-null  object 
 14  Epidemic            76000 non-null  int64  
 15  Demand              76000 non-null  int64  
dtypes: f

In [198]:
#Perform date feature extraction
df["Date"] = pd.to_datetime(df["Date"])
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day

#Extract Store ID as int
df["Store ID"] = df["Store ID"].str[-1]
df["Store ID"] = df["Store ID"].astype(int)

#Extract Product ID as int
df["Product ID"] = df["Product ID"].str[-2:]
df["Product ID"] = df["Product ID"].astype(int)

#Drop columns to allow for linear regression
df = df.drop(["Date","Category", "Region", "Weather Condition", "Seasonality"], axis=1)

df.info()
df["Store ID"].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76000 entries, 0 to 75999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Store ID            76000 non-null  int64  
 1   Product ID          76000 non-null  int64  
 2   Inventory Level     76000 non-null  int64  
 3   Units Sold          76000 non-null  int64  
 4   Units Ordered       76000 non-null  int64  
 5   Price               76000 non-null  float64
 6   Discount            76000 non-null  int64  
 7   Promotion           76000 non-null  int64  
 8   Competitor Pricing  76000 non-null  float64
 9   Epidemic            76000 non-null  int64  
 10  Demand              76000 non-null  int64  
 11  Year                76000 non-null  int32  
 12  Month               76000 non-null  int32  
 13  Day                 76000 non-null  int32  
dtypes: float64(2), int32(3), int64(9)
memory usage: 7.2 MB


0    1
1    1
2    1
3    1
4    1
Name: Store ID, dtype: int64

In [199]:
#perform the data splitting into test and train
X = df.drop("Demand", axis=1)
y = df["Demand"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [200]:
#Baseline - guess the mean
meanDemand = y_train.mean()
y_pred_baseline = [meanDemand] * len(y_test)

print("MAE: ", mean_absolute_error(y_test, y_pred_baseline))
print("R2: ", r2_score(y_test, y_pred_baseline))

MAE:  37.04164861495845
R2:  -3.0133924244779564e-08


In [201]:
#Baseline - predict yesterday's demand
dfBaseline = df
dfBaseline["Demand Yesterday"] = dfBaseline["Demand"].shift(1)

dfBaseline = dfBaseline.dropna(subset=["Demand Yesterday"])

baselineMAE = mean_absolute_error(dfBaseline["Demand"], dfBaseline["Demand Yesterday"])
print("Baseline MAE: ", baselineMAE)

Baseline MAE:  47.46949301964499


In [202]:
#Develop linear regression model and test to compare with baseline
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("RMSE: ", root_mean_squared_error(y_test, y_pred))
print("R2: ", r2_score(y_test, y_pred))

MAE:  17.47813879019439
RMSE:  23.434553385433485
R2:  0.7513360313734248


In [203]:
#Try an xgboost regressor and see if this outperforms out linear regression model
modelTwo = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
modelTwo.fit(X_train, y_train)

y_pred = modelTwo.predict(X_test)

print("MAE: ", mean_absolute_error(y_test, y_pred))
print("RMSE: ", root_mean_squared_error(y_test, y_pred))
print("R2: ", r2_score(y_test, y_pred))

MAE:  14.7510347366333
RMSE:  19.265045166015625
R2:  0.8319495916366577


In [204]:
#Untuned xgboost performed better than linear regression, let's see what happens when we tune it.
modelTwo = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=8, random_state=42)
modelTwo.fit(X_train, y_train)

y_pred = modelTwo.predict(X_test)

print("Test Results:")
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("RMSE: ", root_mean_squared_error(y_test, y_pred))
print("R2: ", r2_score(y_test, y_pred))

MAE:  12.460892677307129
RMSE:  16.474903106689453
R2:  0.8771018981933594
