In [1]:
import datetime
import json
import numpy as np
import pandas as pd

from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# Data Exploration

In [2]:
store_data = pd.read_csv("../data/store.csv")

In [3]:
train_data = pd.read_csv("../data/train.csv")

  train_data = pd.read_csv("../data/train.csv")


In [4]:
store_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 914629 entries, 0 to 914628
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Store          914629 non-null  int64 
 1   DayOfWeek      914629 non-null  int64 
 2   Date           914629 non-null  object
 3   Sales          914629 non-null  int64 
 4   Customers      914629 non-null  int64 
 5   Open           914629 non-null  int64 
 6   Promo          914629 non-null  int64 
 7   StateHoliday   914629 non-null  object
 8   SchoolHoliday  914629 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 62.8+ MB


According to our sales prediction problem, the input values are Store, DayOfWeek, Date, Customers, Open, Promo, StateHoliday, SchoolHoliday, and Sales.

Based on the dataframes info, we'll only consider `train_data` df, and assign it as `sales_data`

In [6]:
sales_data = train_data

In [7]:
sales_data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,4,2015-04-30,6228,650,1,1,0,0
1,2,4,2015-04-30,6884,716,1,1,0,0
2,3,4,2015-04-30,9971,979,1,1,0,0
3,4,4,2015-04-30,16106,1854,1,1,0,0
4,5,4,2015-04-30,6598,729,1,1,0,0


# Feature Engineering

Splitting date into Year, Month, Day

In [8]:
import datetime

# convert date column to datetime type
sales_data['Date'] = pd.to_datetime(sales_data['Date'])

# create new columns for year, month, and day of week
sales_data['Year'] = sales_data['Date'].dt.year
sales_data['Month'] = sales_data['Date'].dt.month
sales_data['Day'] = sales_data['Date'].dt.day

In [9]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 914629 entries, 0 to 914628
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Store          914629 non-null  int64         
 1   DayOfWeek      914629 non-null  int64         
 2   Date           914629 non-null  datetime64[ns]
 3   Sales          914629 non-null  int64         
 4   Customers      914629 non-null  int64         
 5   Open           914629 non-null  int64         
 6   Promo          914629 non-null  int64         
 7   StateHoliday   914629 non-null  object        
 8   SchoolHoliday  914629 non-null  int64         
 9   Year           914629 non-null  int64         
 10  Month          914629 non-null  int64         
 11  Day            914629 non-null  int64         
dtypes: datetime64[ns](1), int64(10), object(1)
memory usage: 83.7+ MB


Converting StateHoliday to int

In [10]:
sales_data['StateHoliday'].unique()

array(['0', 'b', 'a', 'c', 0], dtype=object)

In [11]:
sales_data['StateHoliday'] = sales_data['StateHoliday'].replace({'0': 0, 'a': 1, 'b': 2, 'c': 3})
sales_data['StateHoliday'].unique()

array([0, 2, 1, 3])

In [12]:
sales_data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day
0,1,4,2015-04-30,6228,650,1,1,0,0,2015,4,30
1,2,4,2015-04-30,6884,716,1,1,0,0,2015,4,30
2,3,4,2015-04-30,9971,979,1,1,0,0,2015,4,30
3,4,4,2015-04-30,16106,1854,1,1,0,0,2015,4,30
4,5,4,2015-04-30,6598,729,1,1,0,0,2015,4,30


## Feature Selection

Since some stores are closed for refurbishment — and thus will have no sales — OPEN feature will not be considered

In [13]:
features = ['Store', 'DayOfWeek', 'Customers', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Year', 'Month', 'Day']

In [14]:
X = sales_data[features]

In [15]:
y = sales_data.Sales

# Model Building

## Train-Test Split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

## Hyperparameter Tuning

Based on the Grid Search that tested the following combinations:

`grid_space={'max_depth':[3,7,10], 'n_estimators':[5,10,20], 'max_features':[1,5,7], 'min_samples_leaf':[1,2,3], 'min_samples_split':[1,2,3]}`

The model with hyperparameters:

`{'max_depth': 10, 'max_features': 7, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 20}`

yielded the best score (mean squared error)

## Model Evaluation

In [17]:
model = RandomForestRegressor(max_depth=10, max_features=7, min_samples_leaf=2, min_samples_split=3, n_estimators=20)
model.fit(X_train, y_train)
preds = model.predict(X_test)
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
accuracy = model.score(X_test, y_test)
print("MSE: " + str(mse) + ", MAE: " + str(mae) + ", Accuracy: " + str(accuracy))

MSE: 1563896.4657481278, MAE: 839.4276261476954, Accuracy: 0.8940749323057495


# Model Test

In [18]:
# Exporting model
import joblib
joblib.dump(model, 'rf_model.pkl')

['rf_model.pkl']

In [19]:
# Loading the model
test_model = joblib.load('rf_model.pkl')

In [20]:
json_input = {"Store":1111, "DayOfWeek":4, "Date":"2014-07-10", "Customers":410, "Open":1, "Promo":0, "StateHoliday":"0", "SchoolHoliday":1}
json_str = json.dumps(json_input)

In [21]:
def preprocess_data(json_data):
    # Convert JSON to a Python dictionary
    data = json.loads(json_data)

    # Extract the date string
    date_string = data['Date']

    # Convert the date string to a datetime object
    date_obj = datetime.datetime.strptime(date_string, '%Y-%m-%d')

    # Extract the year, month, and day from the datetime object
    data['Year'] = date_obj.year
    data['Month'] = date_obj.month
    data['Day'] = date_obj.day
    
    state_holiday_map = {'0': 0, 'a': 1, 'b': 2, 'c': 3}
    data['StateHoliday'] = state_holiday_map.get(data['StateHoliday'], data['StateHoliday'])
    
    del data['Date']
    del data['Open']

    return data

In [22]:
input_preprocess = preprocess_data(json_str)

In [23]:
input_dataframe = pd.DataFrame.from_records([input_preprocess])

In [24]:
predictions = test_model.predict(input_dataframe)
sales = predictions[0]
print ("sales: ", sales)

sales:  3972.271263615245
