# Feature Engineering

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder

## Load the data

In [2]:
# Load the two pickled tables (categorical and numerical variables,
# judging by the file names).
cat, num = (
    pd.read_pickle(path)
    for path in ("../data/cat_eda.pickle", "../data/num_eda.pickle")
)

## Create new variables

### To make our forecasting model more accurate, it is good practice to include new variables that capture intermittent demand as well as stock outages. To do that we need:

- Calendar variables (year, month, wday and weekday - already created in the dataset)
- Intermittent demand variables
- Lag variables
- Moving averages

#### Let's join the datasets again in a single dataframe to do it.

In [3]:
# Put both tables side by side (column-wise) to work on a single dataframe.
df = pd.concat(objs=[cat, num], axis="columns")
df

Unnamed: 0_level_0,store_id,item_id,d,year,month,wday,weekday,event_name_1,event_type_1,wm_yr_wk,sales,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-01-01,CA_3,FOODS_3_090,d_704,2013,1,4,Tuesday,NewYear,National,11249,0,1.25
2013-01-01,CA_3,FOODS_3_120,d_704,2013,1,4,Tuesday,NewYear,National,11249,33,1.25
2013-01-01,CA_3,FOODS_3_202,d_704,2013,1,4,Tuesday,NewYear,National,11249,0,4.98
2013-01-01,CA_3,FOODS_3_252,d_704,2013,1,4,Tuesday,NewYear,National,11249,0,4.98
2013-01-01,CA_3,FOODS_3_288,d_704,2013,1,4,Tuesday,NewYear,National,11249,20,4.28
...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-30,CA_4,FOODS_3_329,d_1767,2015,11,3,Monday,no_event,no_event,11544,9,1.68
2015-11-30,CA_4,FOODS_3_555,d_1767,2015,11,3,Monday,no_event,no_event,11544,26,2.48
2015-11-30,CA_4,FOODS_3_586,d_1767,2015,11,3,Monday,no_event,no_event,11544,13,2.48
2015-11-30,CA_4,FOODS_3_587,d_1767,2015,11,3,Monday,no_event,no_event,11544,11,1.58


### Intermittent demand variables

#### These variables flag the days on which there have been N days in a row with 0 sales.
#### After N consecutive days without sales we assume there is a stock outage.

In [4]:
def stock_outage(sales, n):
    """
    Flag every record that closes a run of at least ``n`` zero-sales records.

    Takes the sales variable and the number of days ``n`` and returns a NumPy
    array of 0/1 values, one per record. A value is 1 when the current record
    and the ``n - 1`` records before it all have 0 sales; otherwise it is 0
    (including the first ``n - 1`` records, whose window is incomplete).
    """
    is_zero_day = pd.Series(np.where(sales == 0, 1, 0))
    # A window of n indicators sums to n only when all of its days are zero.
    zeros_in_window = is_zero_day.rolling(n).sum()
    return np.where(zeros_in_window == n, 1, 0)

In [5]:
# stock_outage looks at consecutive rows, so each store/item series has to be
# laid out in date order before the function is applied.
df = df.sort_values(["store_id", "item_id", "date"])
df.head(n=7)

Unnamed: 0_level_0,store_id,item_id,d,year,month,wday,weekday,event_name_1,event_type_1,wm_yr_wk,sales,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-01-01,CA_3,FOODS_3_090,d_704,2013,1,4,Tuesday,NewYear,National,11249,0,1.25
2013-01-02,CA_3,FOODS_3_090,d_705,2013,1,5,Wednesday,no_event,no_event,11249,224,1.25
2013-01-03,CA_3,FOODS_3_090,d_706,2013,1,6,Thursday,no_event,no_event,11249,241,1.25
2013-01-04,CA_3,FOODS_3_090,d_707,2013,1,7,Friday,no_event,no_event,11249,232,1.25
2013-01-05,CA_3,FOODS_3_090,d_708,2013,1,1,Saturday,no_event,no_event,11250,301,1.25
2013-01-06,CA_3,FOODS_3_090,d_709,2013,1,2,Sunday,no_event,no_event,11250,270,1.25
2013-01-07,CA_3,FOODS_3_090,d_710,2013,1,3,Monday,OrthodoxChristmas,Religious,11250,188,1.25


#### Let's create a stock outage variable for 3, 7 and 15 days in a row.

In [6]:
# Flag, within each store/item series, the days that complete 3 zero-sales days in a row.
sales_per_series = df.groupby(["store_id", "item_id"])["sales"]
df["stock_outage_3"] = sales_per_series.transform(lambda s: stock_outage(s, 3))

In [7]:
# Flag, within each store/item series, the days that complete 7 zero-sales days in a row.
sales_per_series = df.groupby(["store_id", "item_id"])["sales"]
df["stock_outage_7"] = sales_per_series.transform(lambda s: stock_outage(s, 7))

In [8]:
# Flag, within each store/item series, the days that complete 15 zero-sales days in a row.
sales_per_series = df.groupby(["store_id", "item_id"])["sales"]
df["stock_outage_15"] = sales_per_series.transform(lambda s: stock_outage(s, 15))

### Lag variables

- sales: 15 days lag
- sell_price: 7 days lag
- stock_outage: 1 day lag

In [9]:
def create_lags(df, variable, n_lags):
    """
    Build the lagged copies of one column of ``df``.

    Returns a dataframe with ``n_lags`` columns named
    ``<variable>_lag_<i>`` for i = 1..n_lags. Column i holds
    ``df[variable]`` shifted i rows down, i.e. the value found i records
    earlier; the first i records of that column are NaN.
    """
    lagged_columns = {
        variable + "_lag_" + str(step): df[variable].shift(step)
        for step in range(1, n_lags + 1)
    }
    return pd.DataFrame(lagged_columns)

In [10]:
# group_keys=False keeps the original date index on every result. Without it,
# pandas >= 2.0 prepends store_id/item_id to the index returned by apply(),
# and the lag frames would no longer line up with df when they are
# concatenated with it. Older pandas versions already behaved this way for
# like-indexed results, so the output there is unchanged.
series_groups = df.groupby(["store_id", "item_id"], group_keys=False)

# Lags are computed inside each store/item series so that values never leak
# from one series into the next.
lags_sell_price_df = series_groups.apply(lambda x: create_lags(df=x, variable="sell_price", n_lags=7))
lags_stock_outage_3_df = series_groups.apply(lambda x: create_lags(df=x, variable="stock_outage_3", n_lags=1))
lags_stock_outage_7_df = series_groups.apply(lambda x: create_lags(df=x, variable="stock_outage_7", n_lags=1))
lags_stock_outage_15_df = series_groups.apply(lambda x: create_lags(df=x, variable="stock_outage_15", n_lags=1))
lags_sales_df = series_groups.apply(lambda x: create_lags(df=x, variable="sales", n_lags=15))

### Moving average variables

#### In this case we are going to create the rolling mean, minimum and maximum of the previous days' sales, for every window size from 2 to 15 days.

In [11]:
def moving_minimum(df, variable, n_days):
    """
    Rolling minimum of the previous records for every window size 2..n_days.

    Returns a dataframe with n_days-1 columns named
    ``<variable>_mvgmin_<i>``. For each record of ``df``, column i holds the
    minimum of the i values of ``variable`` that precede that record. The
    current value is left out by shifting the series one position first.
    Records with fewer than i preceding values get NaN.
    """
    # The one-step shift does not depend on the window size: compute it once.
    previous = df[variable].shift(1)
    return pd.DataFrame(
        {
            variable + "_mvgmin_" + str(window): previous.rolling(window).min()
            for window in range(2, n_days + 1)
        }
    )

In [12]:
def moving_average(df, variable, n_days):
    """
    Rolling mean of the previous records for every window size 2..n_days.

    Returns a dataframe with n_days-1 columns named
    ``<variable>_mvgmean_<i>``. For each record of ``df``, column i holds the
    mean of the i values of ``variable`` that precede that record. The
    current value is left out by shifting the series one position first.
    Records with fewer than i preceding values get NaN.
    """
    # The one-step shift does not depend on the window size: compute it once.
    previous = df[variable].shift(1)
    return pd.DataFrame(
        {
            variable + "_mvgmean_" + str(window): previous.rolling(window).mean()
            for window in range(2, n_days + 1)
        }
    )

In [13]:
def moving_maximum(df, variable, n_days):
    """
    Rolling maximum of the previous records for every window size 2..n_days.

    Returns a dataframe with n_days-1 columns named
    ``<variable>_mvgmax_<i>``. For each record of ``df``, column i holds the
    maximum of the i values of ``variable`` that precede that record. The
    current value is left out by shifting the series one position first.
    Records with fewer than i preceding values get NaN.
    """
    # The one-step shift does not depend on the window size: compute it once.
    previous = df[variable].shift(1)
    return pd.DataFrame(
        {
            variable + "_mvgmax_" + str(window): previous.rolling(window).max()
            for window in range(2, n_days + 1)
        }
    )

In [14]:
# group_keys=False keeps the original date index on every result. Without it,
# pandas >= 2.0 prepends store_id/item_id to the index returned by apply(),
# and the rolling frames would no longer line up with df when they are
# concatenated with it. Older pandas versions already behaved this way for
# like-indexed results, so the output there is unchanged.
sales_groups = df.groupby(["store_id", "item_id"], group_keys=False)

# Rolling statistics are computed inside each store/item series, over windows
# of 2 to 15 previous days.
moving_minimum_df = sales_groups.apply(lambda x: moving_minimum(df=x, variable="sales", n_days=15))
moving_average_df = sales_groups.apply(lambda x: moving_average(df=x, variable="sales", n_days=15))
moving_maximum_df = sales_groups.apply(lambda x: moving_maximum(df=x, variable="sales", n_days=15))

#### Join all the new variables into the original df.

In [15]:
# Collect every block of engineered features and place them side by side.
new_feature_frames = [
    lags_sell_price_df,
    lags_stock_outage_3_df,
    lags_stock_outage_7_df,
    lags_stock_outage_15_df,
    lags_sales_df,
    moving_minimum_df,
    moving_average_df,
    moving_maximum_df,
]
temp = pd.concat(new_feature_frames, axis=1)
temp

Unnamed: 0_level_0,sell_price_lag_1,sell_price_lag_2,sell_price_lag_3,sell_price_lag_4,sell_price_lag_5,sell_price_lag_6,sell_price_lag_7,stock_outage_3_lag_1,stock_outage_7_lag_1,stock_outage_15_lag_1,...,sales_mvgmax_6,sales_mvgmax_7,sales_mvgmax_8,sales_mvgmax_9,sales_mvgmax_10,sales_mvgmax_11,sales_mvgmax_12,sales_mvgmax_13,sales_mvgmax_14,sales_mvgmax_15
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,,,,,,,,,,,...,,,,,,,,,,
2013-01-02,1.25,,,,,,,0.0,0.0,0.0,...,,,,,,,,,,
2013-01-03,1.25,1.25,,,,,,0.0,0.0,0.0,...,,,,,,,,,,
2013-01-04,1.25,1.25,1.25,,,,,0.0,0.0,0.0,...,,,,,,,,,,
2013-01-05,1.25,1.25,1.25,1.25,,,,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-26,1.58,1.58,1.58,1.58,1.58,1.58,1.58,0.0,0.0,0.0,...,15.0,15.0,15.0,15.0,15.0,17.0,17.0,17.0,17.0,17.0
2015-11-27,1.58,1.58,1.58,1.58,1.58,1.58,1.58,0.0,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
2015-11-28,1.58,1.58,1.58,1.58,1.58,1.58,1.58,0.0,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
2015-11-29,1.58,1.58,1.58,1.58,1.58,1.58,1.58,0.0,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0


In [16]:
# Append the engineered features to the original columns; pd.concat matches
# the rows of both frames on their index.
df_concat = pd.concat(objs=[df, temp], axis="columns")
df_concat

Unnamed: 0_level_0,store_id,item_id,d,year,month,wday,weekday,event_name_1,event_type_1,wm_yr_wk,...,sales_mvgmax_6,sales_mvgmax_7,sales_mvgmax_8,sales_mvgmax_9,sales_mvgmax_10,sales_mvgmax_11,sales_mvgmax_12,sales_mvgmax_13,sales_mvgmax_14,sales_mvgmax_15
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,CA_3,FOODS_3_090,d_704,2013,1,4,Tuesday,NewYear,National,11249,...,,,,,,,,,,
2013-01-02,CA_3,FOODS_3_090,d_705,2013,1,5,Wednesday,no_event,no_event,11249,...,,,,,,,,,,
2013-01-03,CA_3,FOODS_3_090,d_706,2013,1,6,Thursday,no_event,no_event,11249,...,,,,,,,,,,
2013-01-04,CA_3,FOODS_3_090,d_707,2013,1,7,Friday,no_event,no_event,11249,...,,,,,,,,,,
2013-01-05,CA_3,FOODS_3_090,d_708,2013,1,1,Saturday,no_event,no_event,11250,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-26,CA_4,FOODS_3_714,d_1763,2015,11,6,Thursday,Thanksgiving,National,11543,...,15.0,15.0,15.0,15.0,15.0,17.0,17.0,17.0,17.0,17.0
2015-11-27,CA_4,FOODS_3_714,d_1764,2015,11,7,Friday,no_event,no_event,11543,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
2015-11-28,CA_4,FOODS_3_714,d_1765,2015,11,1,Saturday,no_event,no_event,11544,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
2015-11-29,CA_4,FOODS_3_714,d_1766,2015,11,2,Sunday,no_event,no_event,11544,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0


#### It is also necessary to get rid of the null values generated by the new variables.

#### Since the longest lags and rolling windows span 15 days, the first 15 days of each store/item series have no complete history (including the lagged stock outage information). That means there are going to be 15 fewer days per series in the dataframe used to train the model as well.

In [17]:
# Discard, in place, every row that has a missing value in any column.
df_concat.dropna(axis=0, how="any", inplace=True)

#### Remove variables that are not needed for modelling.

In [18]:
# Columns left out of the modelling table: identifiers of the day/week and
# the same-day sell_price and stock_outage_* values (their lagged versions
# stay in the dataframe).
to_remove = [
    "d",
    "wm_yr_wk",
    "sell_price",
    "stock_outage_3",
    "stock_outage_7",
    "stock_outage_15",
]

df_concat.drop(labels=to_remove, axis="columns", inplace=True)

### Separate target variable, as well as categorical and numerical data again.

In [19]:
# The variable to forecast.
target = df_concat.loc[:, "sales"]

In [20]:
# Object-typed columns are treated as categorical, everything else as numerical.
cat = df_concat.select_dtypes(include=["object"])
num = df_concat.select_dtypes(exclude=["object"])

## Categorical data transformation

### One Hot Encoding

In [21]:
var_ohe = ["year",
           "month",
           "wday",
           "weekday",
           "event_name_1",
           "event_type_1"]

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
# name was removed in 1.4. Try the current keyword first and fall back to the
# old one on earlier releases; either way the encoder returns a dense array.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
cat_ohe = ohe.fit_transform(cat[var_ohe])

In [22]:
# Wrap the encoded array in a dataframe, labelling the columns with the
# names generated by the fitted encoder
ohe_columns = ohe.get_feature_names_out()
cat_ohe = pd.DataFrame(cat_ohe, columns=ohe_columns)
cat_ohe.head()

Unnamed: 0,year_2013,year_2014,year_2015,month_1,month_2,month_3,month_4,month_5,month_6,month_7,...,event_name_1_SuperBowl,event_name_1_Thanksgiving,event_name_1_ValentinesDay,event_name_1_VeteransDay,event_name_1_no_event,event_type_1_Cultural,event_type_1_National,event_type_1_Religious,event_type_1_Sporting,event_type_1_no_event
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Target Encoding

In [23]:
var_te = [
    "year",
    "month",
    "wday",
    "weekday",
    "event_name_1",
    "event_type_1",
]

# NOTE(review): the encoder is fitted on every row available here together
# with its target value; no train/validation split is visible in this
# notebook - confirm the encoded columns do not leak the target downstream.
te = TargetEncoder(min_samples_leaf=100, return_df=False)
te_input = cat[var_te]
cat_te = te.fit_transform(te_input, y=target)

In [24]:
# Wrap the encoded array in a dataframe; every column takes the name of its
# source variable followed by "_te"
te_variables = []
for source_name in var_te:
    te_variables.append(source_name + "_te")
cat_te = pd.DataFrame(cat_te, columns=te_variables)

## Join all the transformed datasets

In [25]:
# The final dataframe must still identify each series, so keep store_id and
# item_id; reset_index also turns the date index into a regular column
id_columns = ["store_id", "item_id"]
temp = df_concat[id_columns].reset_index()
temp.head()

Unnamed: 0,date,store_id,item_id
0,2013-01-16,CA_3,FOODS_3_090
1,2013-01-17,CA_3,FOODS_3_090
2,2013-01-18,CA_3,FOODS_3_090
3,2013-01-19,CA_3,FOODS_3_090
4,2013-01-20,CA_3,FOODS_3_090


In [26]:
# Identifiers, one-hot columns, target-encoded columns and numerical columns,
# placed side by side. The numerical block gets a fresh positional index
# before being joined.
final_parts = [temp, cat_ohe, cat_te, num.reset_index(drop=True)]
df_final = pd.concat(final_parts, axis=1)

df_final

Unnamed: 0,date,store_id,item_id,year_2013,year_2014,year_2015,month_1,month_2,month_3,month_4,...,sales_mvgmax_6,sales_mvgmax_7,sales_mvgmax_8,sales_mvgmax_9,sales_mvgmax_10,sales_mvgmax_11,sales_mvgmax_12,sales_mvgmax_13,sales_mvgmax_14,sales_mvgmax_15
0,2013-01-16,CA_3,FOODS_3_090,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
1,2013-01-17,CA_3,FOODS_3_090,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
2,2013-01-18,CA_3,FOODS_3_090,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
3,2013-01-19,CA_3,FOODS_3_090,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,281.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
4,2013-01-20,CA_3,FOODS_3_090,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20975,2015-11-26,CA_4,FOODS_3_714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,15.0,15.0,15.0,15.0,15.0,17.0,17.0,17.0,17.0,17.0
20976,2015-11-27,CA_4,FOODS_3_714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
20977,2015-11-28,CA_4,FOODS_3_714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
20978,2015-11-29,CA_4,FOODS_3_714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0


## Save dataframe after feature engineering

In [None]:
# Persist the engineered dataset as a pickle file.
df_final.to_pickle(path="../data/df_feature_engineering.pickle")