# Feature Engineering

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder

## Load the data

In [2]:
# Read the two pickled objects that feed the rest of this notebook.
# NOTE(review): presumably `cat` holds the categorical columns and `num` the
# numeric ones, both written by an earlier EDA step — confirm against the
# notebook that produced these files.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
cat = pd.read_pickle("../data/cat_eda.pickle")
num = pd.read_pickle("../data/num_eda.pickle")

## Create new variables

### To make our forecasting model more accurate, it is good practice to add new variables that capture intermittent demand as well as stock outages. To do that we need:

- Calendar variables (year, month, wday and weekday - already created in the dataset)
- Intermittent demand variables
- Lag variables
- Moving averages

#### Let's join the two datasets back into a single DataFrame so we can build these variables.

In [4]:
# Place the two frames side by side (column-wise); pandas lines the rows up on the index.
df = pd.concat(objs=[cat, num], axis="columns")
df

Unnamed: 0_level_0,store_id,item_id,d,year,month,wday,weekday,event_name_1,event_type_1,wm_yr_wk,sales,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-01-01,CA_3,FOODS_3_090,d_704,2013,1,4,Tuesday,NewYear,National,11249,0,1.25
2013-01-01,CA_3,FOODS_3_120,d_704,2013,1,4,Tuesday,NewYear,National,11249,33,1.25
2013-01-01,CA_3,FOODS_3_202,d_704,2013,1,4,Tuesday,NewYear,National,11249,0,4.98
2013-01-01,CA_3,FOODS_3_252,d_704,2013,1,4,Tuesday,NewYear,National,11249,0,4.98
2013-01-01,CA_3,FOODS_3_288,d_704,2013,1,4,Tuesday,NewYear,National,11249,20,4.28
...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-30,CA_4,FOODS_3_329,d_1767,2015,11,3,Monday,no_event,no_event,11544,9,1.68
2015-11-30,CA_4,FOODS_3_555,d_1767,2015,11,3,Monday,no_event,no_event,11544,26,2.48
2015-11-30,CA_4,FOODS_3_586,d_1767,2015,11,3,Monday,no_event,no_event,11544,13,2.48
2015-11-30,CA_4,FOODS_3_587,d_1767,2015,11,3,Monday,no_event,no_event,11544,11,1.58


### Intermittent demand variables

#### This variable flags runs of consecutive days with zero sales.
#### If there are N days in a row without sales, we assume there is a stock outage.

In [None]:
def stock_outage(sales, n):
    """Flag observations that complete a run of ``n`` consecutive zero-sales days.

    Parameters
    ----------
    sales : array-like
        One-dimensional sales values (Series, NumPy array or list). The values
        are expected to be in chronological order for a single series. The
        window is purely positional, so the index of a Series is ignored.
    n : int
        Length of the zero-sales run that counts as a stock outage. Must be
        at least 1.

    Returns
    -------
    numpy.ndarray
        Integer array with the same length as ``sales``: 1 where the current
        observation and the ``n - 1`` observations before it are all zero,
        0 otherwise. The first ``n - 1`` positions are always 0 because their
        window is incomplete. Missing values do not compare equal to zero, so
        they break a run.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # Wrapping in a Series first guarantees an element-wise comparison even
    # when a plain list is passed (``[0, 1] == 0`` alone is a single False).
    is_zero = np.where(pd.Series(sales) == 0, 1, 0)

    # Re-wrap on a fresh positional index and count the zeros in each window.
    zeros_in_window = pd.Series(is_zero).rolling(n).sum()

    # Incomplete windows are NaN, which never equals n, so they map to 0.
    return np.where(zeros_in_window == n, 1, 0)