# This file is representing the basic Random Forest and HistGradientBoosting Regressor Models without any hyperparameters and shows their performance

### WARNING - please place the file in the same directory as the dataset csv file

##### importing the necessary modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import time

In [2]:
# importing data
data = pd.read_csv('Products_Information.csv')

# as the date is an 'object', changing it into datetime64(ms) format
data['date'] = pd.to_datetime(data['date'])

# setting the date as index
data.set_index('date', inplace=True)



##### Removing Outliers

In [3]:
data = data[data['sales'] <= 40000]

##### extracting date features from date index and label encoding them

In [4]:
# breaking the date into day_of_week, month and day_of_month
data['day_of_week'] = data.index.dayofweek
data['month'] = data.index.month
data['day_of_month'] = data.index.day

# label encoding the month
label_encoder = LabelEncoder()
data['month'] = label_encoder.fit_transform(data['month'])


In [5]:
# storing our data into a new variable 
data_encoded = data.copy(deep=True)

##### Lagged Feature and Rolling Windows

In [6]:
# lagged feature capturing the sales data of the previous week's same day
data_encoded['sales_lag_7'] = data_encoded.groupby(['store_nbr', 'product_type'])['sales'].shift(7)

# rolling windows 
data_encoded['rolling_window_7_skew'] = data_encoded.groupby(['store_nbr', 'product_type'])['sales'].shift(1).rolling(window=7).skew()

data_encoded['rolling_window_7_std'] = data_encoded.groupby(['store_nbr', 'product_type'])['sales'].shift(1).rolling(window=7).std()


## Label Encoding product_type

In [7]:
label_encoder = LabelEncoder()

data_encoded['product_type'] = label_encoder.fit_transform(data['product_type'])

In [8]:
# removing the id column
data_encoded = data_encoded.drop('id',axis = 1)

## splitting the data

In [9]:
# splitting the dataset into training and predictions

training_data = data_encoded['2016-01-01':'2017-07-30']

prediction_data = data_encoded['2017-07-31':'2017-08-15']


## HISTGRADIENTBOOSTING REGRESSOR MODEL

In [10]:
# Getting the features and target variable
X = training_data.drop(['sales'], axis=1)
y = training_data['sales']

# Initialize the TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3) 

# Initialize the model
# Using the random_state = 20 for everything in our coursework to mentain
# consistency and reproducability.
model = HistGradientBoostingRegressor(random_state=20,
                                      categorical_features = ['day_of_week', 'month', 'day_of_month','store_nbr', 'product_type'])

# Iterate through the splits
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # using the time module to capture the time it takes to run the model
    # Start the timer
    start_time = time.time()

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Stop the timer
    elapsed_time = time.time() - start_time
    print(f"HGBR Model took {elapsed_time:.2f} seconds.")


    # Make predictions on the test set
    predictions = model.predict(X_test)

    # Evaluate the model's performance on validation set
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    print(f"RMSE for this fold's validation set: {rmse}")
    print("\n")


HGBR Model took 1.41 seconds.
RMSE for this fold's validation set: 226.2983536990731


HGBR Model took 1.66 seconds.
RMSE for this fold's validation set: 379.5254662991275


HGBR Model took 1.76 seconds.
RMSE for this fold's validation set: 295.95613014409764




In [11]:
# This is to test on our Predictons Dataset

X_evaluation = prediction_data.drop(['sales'], axis = 1)
y_evaluation = prediction_data['sales'] 
y_predictions = model.predict(X_evaluation)

# Evaluate the model performance on test set
mse_test = mean_squared_error(y_evaluation, y_predictions)
rmse_test = np.sqrt(mse_test)
print(f"RMSE for our test data for HGBR model: {rmse_test}")

RMSE for our test data for HGBR model: 245.79410385450058


## RANDOM FOREST REGRESSOR MODEL

In [12]:
# Getting the features and target variable
X = training_data.drop(['sales'], axis=1)
y = training_data['sales']

# Initialize the TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3) 

# Initialize the model
# Using the random_state = 20 for everything in our coursework to mentain
# consistency and reproducability.
model = RandomForestRegressor(random_state=20, n_jobs=-1)

# Iterate through the splits
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # using the time module to capture the time it takes to run the model
    # Start the timer
    start_time = time.time()

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Stop the timer
    elapsed_time = time.time() - start_time
    print(f"Random Forest Model took {elapsed_time:.2f} seconds.")


    # Make predictions on the test set
    predictions = model.predict(X_test)

    # Evaluate the model's performance on validation set
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    print(f"RMSE for this fold's validation set: {rmse}")
    print("\n")


Random Forest Model took 18.20 seconds.
RMSE for this fold's validation set: 238.3311445333391


Random Forest Model took 40.24 seconds.
RMSE for this fold's validation set: 408.916293306572


Random Forest Model took 66.03 seconds.
RMSE for this fold's validation set: 286.74186146205767




In [13]:
# This is to test on our Predictons Dataset

X_evaluation = prediction_data.drop(['sales'], axis = 1)
y_evaluation = prediction_data['sales'] 
y_predictions = model.predict(X_evaluation)

# Evaluate the model performance on test set
mse = mean_squared_error(y_evaluation, y_predictions)
rmse = np.sqrt(mse)
print(f"RMSE for our test data for Random Forest model: {rmse}")

RMSE for our test data for Random Forest model: 272.4838764974486


## END OF FILE