#  Model 1 Code: This is the model we started with to tune the hyperparamters using RandomizedSearchCV which uses TimesSeriesSplits

### WARNING - please place the file in the same directory as the dataset csv file

##### importing the necessary modules

In [1]:
# necessary imports
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance
import time

## Importing the dataset

In [2]:
# importing data
data = pd.read_csv('Products_Information.csv')

# as the date is an 'object', changing it into datetime64(ms) format
data['date'] = pd.to_datetime(data['date'])

# setting the date as index
data.set_index('date', inplace=True)



##### Removing Outliers

In [3]:
data = data[data['sales'] <= 40000]

##### extracting date features from date index and label encoding the necessary ones

In [4]:
# breaking the date into day_of_week, month and day_of_month
data['day_of_week'] = data.index.dayofweek
data['month'] = data.index.month
data['day_of_month'] = data.index.day


In [5]:
# storing our data into a new variable 
data_encoded = data.copy(deep=True)

##### Lagged Feature and Rolling Windows

In [6]:
# lagged feature capturing the sales data of the previous week's same day
data_encoded['sales_lag_7'] = data_encoded.groupby(['store_nbr', 'product_type'])['sales'].shift(7)

# rolling windows 
data_encoded['rolling_window_7_skew'] = data_encoded.groupby(['store_nbr', 'product_type'])['sales'].shift(1).rolling(window=7).skew()

data_encoded['rolling_window_7_std'] = data_encoded.groupby(['store_nbr', 'product_type'])['sales'].shift(1).rolling(window=7).std()


## Label Encoding product_type

In [7]:
label_encoder = LabelEncoder()

data_encoded['product_type'] = label_encoder.fit_transform(data['product_type'])

## Removing the ID column

In [8]:
# removing the id column for preparation of training and prediction datasets
data_encoded = data_encoded.drop('id',axis = 1)

## splitting the data into train and test

In [9]:
# splitting the dataset into training and predictions

training_data = data_encoded['2016-01-01':'2017-07-30']

prediction_data = data_encoded['2017-07-31':'2017-08-15']


## HISTGRADIENTBOOSTING REGRESSOR MODEL with RandomizedSearchCV

In [10]:
# Separating the dataset into input and output arrays.
X = training_data.drop(['sales'], axis=1)
y = training_data['sales']

# Implementing Time Series Split with 3 splits
tscv = TimeSeriesSplit(n_splits=3)  

# Defining the parameter grid for randomized search
param_grid = {
    'max_depth': [1,2,3],
    'learning_rate': [0.01,0.02, 0.03,0.04,0.05],
    'min_samples_leaf': [30,40,60],
    'l2_regularization': [0.7, 0.8, 0.9, 1.0]
}

# Creating HGBR Model 1 (worst performing model)
model = HistGradientBoostingRegressor(random_state=20, categorical_features = ['day_of_week', 'month', 'day_of_month','store_nbr', 'product_type'])

# Creating RandomizedSearch CV grid
random_search = RandomizedSearchCV(
    model, param_distributions=param_grid,n_iter=10, scoring='neg_root_mean_squared_error',random_state=20,cv=tscv, n_jobs=-1
)

# Create a list to score the scores for each fold
rmse_scores = []

# Measuring run time for Cross Validation
start_time = time.time()

# Performing RandomizedSearchCV using Time Series split to separate training and validation sets.
random_search.fit(X, y)

# Storing the best model from  random search CV
best_model = random_search.best_estimator_


# Print the run time for random search CV
elapsed_time = time.time() - start_time
print(f"Randomized Search took {elapsed_time:.2f} seconds.")
print("\n")

# Print the best hyperparameters from the randomized search CV
print("Best Hyperparameters:", random_search.best_params_)



# Printing the performance of the best model on each TimeSeriesSplit to measure performance across folds
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    
    predictions = best_model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    rmse_scores.append(rmse)

    print(f"RMSE for this fold: {rmse}")

Randomized Search took 10.49 seconds.


Best Hyperparameters: {'min_samples_leaf': 30, 'max_depth': 2, 'learning_rate': 0.05, 'l2_regularization': 0.9}
RMSE for this fold: 264.62214463678424
RMSE for this fold: 436.1347069204234
RMSE for this fold: 342.60561933542044


# END OF FILE