# Data preparation

In this notebook we preprocess our data for ML algorithms and create new features.

## Load and reshape data

Import libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime  
from datetime import timedelta  
import gc
import pickle
from sklearn import preprocessing, metrics
import pandas as pd
import matplotlib.pyplot as plt
#from sagemaker import get_execution_role
#import boto3
import seaborn as sns

  import pandas.util.testing as tm


In [2]:
# Mount Google Drive under /content/drive so the data folder used below is reachable.
# This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Helper functions

In [0]:
# this function is taken from https://www.kaggle.com/ragnar123/very-fst-model
def reduce_mem_usage(df, verbose=True, use_float16=True):
    '''Downcast numeric columns to the narrowest dtype that holds their value range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column is left as is.

    Args:
        df: dataframe. Its columns are reassigned in place and the same object
            is returned.
        verbose: when True, print the final memory footprint and the reduction.
        use_float16: when True (default, the historical behaviour) float columns
            whose range fits float16 are stored as float16. float16 keeps only
            about three significant decimal digits (2.98 is stored as 2.980469),
            so pass False to use float32 as the narrowest float type.

    Returns:
        dataframe with converted columns
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly [-128, 127] fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit (or min/max are NaN/inf): keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


Read data.

In [4]:
# Folder on the mounted Drive holding the preprocessed inputs; later cells
# also write their outputs here.
data_folder = '/content/drive/My Drive/Kaggle_M5/01_preprocessed_data/'

print('Reading files...')
X = pd.read_pickle(data_folder + 'X.pkl')

Reading files...


In [5]:
X.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,demand,date,wm_yr_wk,weekday,month,event_name_1,event_type_1,year,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,start_date,days_from_start,start_date_from_start,sell_price
46881672,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,Sunday,5,no event,no event,2016,no event,no event,0,0,0,2011-01-29,1940,0,2.980469
46881673,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,Sunday,5,no event,no event,2016,no event,no event,0,0,0,2011-01-29,1940,0,2.480469
46881674,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,Sunday,5,no event,no event,2016,no event,no event,0,0,0,2011-01-29,1940,0,3.980469
46881675,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,Sunday,5,no event,no event,2016,no event,no event,0,0,0,2013-08-24,1940,938,1.280273
46881676,FOODS_3_827_WI_3_validation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,Sunday,5,no event,no event,2016,no event,no event,0,0,0,2014-03-01,1940,1127,1.0


## Lag features

In [6]:
# Demand is shifted by 28..35 rows within each id; rolling means are built on
# top of the lags listed in `rolling_lags` only.
lags = list(range(28, 28 + 8))
windows = [3, 7]
rolling_lags = [28]

lag_cols = [f"lag_{lag}" for lag in lags]
for lag, lag_col in zip(lags, lag_cols):
    print('Lag {}'.format(lag))
    # NOTE(review): shift() follows row order inside each id, so X is assumed
    # to be sorted by date within id - confirm against the upstream notebook.
    X[lag_col] = X.groupby("id")["demand"].shift(lag)

X = reduce_mem_usage(X)

for window in windows:
    for lag in rolling_lags:
        # Derive the column name from the lag itself instead of zipping a
        # shortened `lags` list against the full `lag_cols` list.
        lag_col = f"lag_{lag}"
        print("Lag {}, window {}".format(lag, window))
        X[f"rmean_{lag}_{window}"] = X.groupby("id")[lag_col].transform(
            lambda x: x.rolling(window).mean())
X = reduce_mem_usage(X)
# The leading rows of each id have NaN lag / rolling features; they are
# removed by the `X = X.dropna()` cell further down.


Lag 28
Lag 29
Lag 30
Lag 31
Lag 32
Lag 33
Lag 34
Lag 35
Mem. usage decreased to 4025.37 Mb (40.0% reduction)
Lag 28, window 3
Lag 28, window 7
Mem. usage decreased to 4204.21 Mb (0.0% reduction)


In [7]:
X.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'demand', 'date', 'wm_yr_wk', 'weekday', 'month', 'event_name_1',
       'event_type_1', 'year', 'event_name_2', 'event_type_2', 'snap_CA',
       'snap_TX', 'snap_WI', 'start_date', 'days_from_start',
       'start_date_from_start', 'sell_price', 'lag_28', 'lag_29', 'lag_30',
       'lag_31', 'lag_32', 'lag_33', 'lag_34', 'lag_35', 'rmean_28_3',
       'rmean_28_7'],
      dtype='object')

In [0]:
# Drop every row that has a NaN in any column. For each id this removes the
# leading rows whose lag / rolling-mean features could not be computed.
X = X.dropna()

In [0]:
# Checkpoint the frame with the lag and rolling-mean columns added above.
X.to_pickle(data_folder + 'X_lags.pkl')

In [0]:
#X = pd.read_pickle("../01_preprocessed_data/X_lags.pkl")

# Form train, validation, and test datasets

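Split X into X_train, X_val and X_test by date: the last 28 days form the test set and the 28 days before them form the validation set.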

In [0]:
# Length, in days, of the validation and test windows.
n_val_days = 28
n_test_days = 28

# Date range covered by X.
first_day = pd.Timestamp(X['date'].values.min())
last_day = pd.Timestamp(X['date'].values.max())

# The test window is the last n_test_days days (inclusive of last_day); the
# validation window is the n_val_days days immediately before it.
first_test_day = last_day - timedelta(days=n_test_days - 1)
first_val_day = first_test_day - timedelta(days=n_val_days)


In [12]:
print("first_val_day {}".format(first_val_day))
print("first_test_day {}".format(first_test_day))


first_val_day 2016-03-28 00:00:00
first_test_day 2016-04-25 00:00:00


In [0]:
# Train: everything strictly before the validation window.
X_train = X.loc[X['date'] < first_val_day]
# Validation: from first_val_day up to, but excluding, first_test_day.
X_val = X.loc[(X['date'] >= first_val_day) & (X['date'] < first_test_day)]
# Test: first_test_day onwards.
X_test = X.loc[X['date'] >= first_test_day]

# Targets kept as single-column frames.
y_train = X_train[['demand']]
y_val = X_val[['demand']]

In [14]:
def _print_date_span(title, frame):
    """Print the title, the earliest and latest date of `frame`, and the span between them in days."""
    dates = frame['date'].values
    earliest = dates.min()
    latest = dates.max()
    print(title)
    print(earliest)
    print(latest)
    print((latest - earliest)/np.timedelta64(1, 'D'))


for title, frame in (("X dates:", X),
                     ("X_train dates:", X_train),
                     ("X_val dates:", X_val),
                     ("X_test dates:", X_test)):
    _print_date_span(title, frame)
    # A blank line separates the groups, as in the recorded output.
    if title != "X_test dates:":
        pass

X dates:
2011-03-05T00:00:00.000000000
2016-05-22T00:00:00.000000000
1905.0
X_train dates:
2011-03-05T00:00:00.000000000
2016-03-27T00:00:00.000000000
1849.0
X_val dates:
2016-03-28T00:00:00.000000000
2016-04-24T00:00:00.000000000
27.0
X_test dates:
2016-04-25T00:00:00.000000000
2016-05-22T00:00:00.000000000
27.0


In [0]:
# Drop the reference to the full frame; the cells below only use X_train, X_val and X_test.
del X

### Sales pattern

In this part we determine the `demand_type` of each time series, following the approach used in https://github.com/Mcompetitions/M5-methods.

Determine average demand interval.

In [16]:
# Total demand per id over the training rows; show the ids whose total is zero.
temp = X_train.groupby('id')['demand'].sum()
temp.loc[temp == 0]

id
FOODS_3_135_WI_2_validation        0.0
HOUSEHOLD_1_032_TX_1_validation    0.0
HOUSEHOLD_1_400_CA_4_validation    0.0
HOUSEHOLD_1_518_CA_2_validation    0.0
Name: demand, dtype: float64

In [0]:
def _average_demand_interval(demand):
    """Return len(demand) divided by the number of positive observations, or 0 when there are none."""
    # Vectorised count; the built-in sum() would iterate over every element in Python.
    n_positive = (demand > 0).sum()
    return len(demand) / n_positive if n_positive else 0


# The per-id value is broadcast to every training row of that id.
X_train['ADI'] = X_train.groupby(['id'])['demand'].transform(_average_demand_interval)

Compute square of the Coefficient of Variation (CV²).

In [0]:
def _squared_cv(demand):
    """Return (std / mean) ** 2 of the strictly positive observations of one id."""
    positive = demand[demand > 0]
    # np.std is called with its default ddof=0; an id with no positive
    # demand yields NaN here.
    return (np.std(positive) / np.mean(positive)) ** 2


X_train['CV2'] = X_train.groupby(['id'])['demand'].transform(_squared_cv)

Calculate demand_type.

In [0]:
# Classify every row by the ADI / CV2 of its series, with cut-offs 1.32 and 0.5.
# Each quadrant is encoded as a distinct power of ten and then mapped to its label.
# The CV2 test is `< 0.5` versus `>= 0.5` in both ADI halves, so a series with
# CV2 exactly 0.5 always falls into a class. Rows whose CV2 is NaN match no
# quadrant and keep the code 0.
X_train['demand_type'] = ((X_train['ADI'] <= 1.32) & (X_train['CV2'] < 0.5))*1 + \
    ((X_train['ADI'] > 1.32) & (X_train['CV2'] < 0.5))*10 + \
    ((X_train['ADI'] <= 1.32) & (X_train['CV2'] >= 0.5))*100 + \
    ((X_train['ADI'] > 1.32) & (X_train['CV2'] >= 0.5))*1000
X_train['demand_type'] = X_train['demand_type'].replace({1:'smooth', 10:'intermittent', 100:'erratic', 1000:'lumpy'})

In [0]:
# Store the demand-type labels as a pandas categorical.
c = 'demand_type'
col_type = X_train[c].dtype
needs_category_cast = col_type == 'object' or col_type.name == 'category'
if needs_category_cast:
    X_train[c] = X_train[c].astype('category')


In [0]:
# Lookup table of the distinct (id, ADI, CV2, demand_type) combinations seen in
# training; it is merged into X_val and X_test in the next cell.
temp = X_train[['id', 'ADI', 'CV2', 'demand_type']].drop_duplicates()

In [0]:
# Attach the training-derived ADI / CV2 / demand_type to the hold-out frames.
# A left join keeps every hold-out row; ids absent from `temp` get NaN.
X_val = pd.merge(X_val, temp, how='left', on=['id'])
X_test = pd.merge(X_test, temp, how='left', on=['id'])
del temp

### Aggregations

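Add demand statistics computed on the training split: the average, maximum and 25% quantile of demand per weekday for each series (`id`), and the overall average, maximum and standard deviation of demand for each series.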

In [0]:
# Mean demand of each id on each weekday, broadcast to the matching rows.
X_train['avg_weekday_demand'] = X_train.groupby(['id', 'weekday'])['demand'].transform('mean')

# Per-id statistics over all training rows of that id.
for stat_column, stat_name in (('avg_demand', 'mean'),
                               ('max_demand', 'max'),
                               ('std_demand', 'std')):
    X_train[stat_column] = X_train.groupby(['id'])['demand'].transform(stat_name)


In [0]:
def _lower_quartile(demand):
    """Return the 0.25 quantile of one (id, weekday) group of demand values."""
    return demand.quantile(0.25)


X_train['quantile025_week'] = X_train.groupby(['id', 'weekday'])['demand'].transform(_lower_quartile)

In [0]:
# Largest demand observed for each id on each weekday, broadcast to the matching rows.
X_train['max_weekday_demand'] = X_train.groupby(['id', 'weekday'])['demand'].transform('max') 

Add average price per item

In [0]:
# Mean sell_price of each id over its training rows, broadcast to every row of that id.
X_train['avg_price'] = X_train.groupby(['id'])['sell_price'].transform('mean') 

In [0]:
# Lookup table of the distinct per-id statistics from training; merged into
# X_val and X_test in the next cell.
temp = X_train[['id', 'avg_price', 'avg_demand', 'max_demand', 'std_demand']].drop_duplicates()

In [0]:
# Attach the per-id training statistics to the hold-out frames (left join on id).
X_val = pd.merge(X_val, temp, how='left', on=['id'])
X_test = pd.merge(X_test, temp, how='left', on=['id'])


In [0]:
# Lookup table of the distinct per-(id, weekday) statistics from training;
# merged into X_val and X_test in the next cell.
temp = X_train[['id', 'weekday', 'avg_weekday_demand', 'quantile025_week', 
                'max_weekday_demand']].drop_duplicates()

In [0]:
# Attach the per-(id, weekday) training statistics to the hold-out frames.
X_val = pd.merge(X_val, temp, how='left', on=['id', 'weekday'])
X_test = pd.merge(X_test, temp, how='left', on=['id', 'weekday'])


In [0]:
# Flag rows whose price is below the training-period average price of the same id.
for frame in (X_train, X_val, X_test):
    frame['cheaper_than_usual'] = frame['sell_price'] < frame['avg_price']

In [0]:
# Remove the wm_yr_wk column from all three splits.
X_train = X_train.drop(labels=['wm_yr_wk'], axis='columns')
X_val = X_val.drop(labels=['wm_yr_wk'], axis='columns')
X_test = X_test.drop(labels=['wm_yr_wk'], axis='columns')

In [34]:
# Downcast the numeric columns of each split, including the feature columns
# added since the last downcast.
X_train = reduce_mem_usage(X_train)
X_val = reduce_mem_usage(X_val)
X_test = reduce_mem_usage(X_test)


Mem. usage decreased to 4712.63 Mb (0.0% reduction)
Mem. usage decreased to 92.67 Mb (24.0% reduction)
Mem. usage decreased to 91.86 Mb (24.7% reduction)


In [0]:
# Persist the three splits. The 'demand' column is deliberately kept in every
# frame. data_folder already ends with '/', so file names are appended directly.
X_val.to_pickle(data_folder + "X_val.pkl")
X_test.to_pickle(data_folder + "X_test.pkl")
X_train.to_pickle(data_folder + "X_train.pkl")


In [0]:
#del X_test
#del X_val, y_val
#del X_train, y_train

In [36]:
X_val.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'demand', 'date', 'weekday', 'month', 'event_name_1', 'event_type_1',
       'year', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI',
       'start_date', 'days_from_start', 'start_date_from_start', 'sell_price',
       'lag_28', 'lag_29', 'lag_30', 'lag_31', 'lag_32', 'lag_33', 'lag_34',
       'lag_35', 'rmean_28_3', 'rmean_28_7', 'ADI', 'CV2', 'demand_type',
       'avg_price', 'avg_demand', 'max_demand', 'std_demand',
       'avg_weekday_demand', 'quantile025_week', 'max_weekday_demand',
       'cheaper_than_usual'],
      dtype='object')

## Correlations with demand

In [0]:
X_train.columns

In [39]:
# Numeric columns whose Pearson correlation with demand is reported below.
numerical_features = [
    'days_from_start', 'sell_price',
    'lag_28', 'lag_29', 'lag_30', 'lag_31', 'lag_32', 'lag_33', 'lag_34',
    'rmean_28_7', 'rmean_28_3',
    'ADI', 'CV2', 'avg_weekday_demand', 'avg_demand', 'max_demand',
    'std_demand', 'quantile025_week', 'max_weekday_demand', 'avg_price',
]
for feature in numerical_features:
    correlation = X_train[feature].corr(X_train['demand'])
    print("{}: {}".format(feature, correlation))

days_from_start: -0.03996247651158752
sell_price: -0.1513283127030617
lag_28: 0.668002419750039
lag_29: 0.6527569517440468
lag_30: 0.634829192352799
lag_31: 0.6247735505052979
lag_32: 0.6209171779489195
lag_33: 0.6268593469844339
lag_34: 0.6413506181185805
rmean_28_7: 0.7322895290054409
rmean_28_3: 0.7144788129636607
ADI: -0.14116904656842705
CV2: 0.04408657325472014
avg_weekday_demand: 0.7378555158643114
avg_demand: 0.7247737532318004
max_demand: 0.5382724257803655
std_demand: 0.65335088072252
quantile025_week: 0.617229443522539
max_weekday_demand: 0.5922586124354491
avg_price: -0.15094942769833902


In [41]:
# Pairwise correlations among the lag and rolling-mean features
# (lag_35 is not part of this list).
numerical_features = ['lag_28', 'lag_29', 'lag_30', 'lag_31', 'lag_32', 'lag_33', 'lag_34',
       'rmean_28_7', 'rmean_28_3']
X_train[numerical_features].corr()


Unnamed: 0,lag_28,lag_29,lag_30,lag_31,lag_32,lag_33,lag_34,rmean_28_7,rmean_28_3
lag_28,1.0,0.763601,0.718587,0.698114,0.690188,0.691977,0.70469,0.862954,0.906853
lag_29,0.763601,1.0,0.763601,0.718518,0.698127,0.690134,0.691982,0.872596,0.923324
lag_30,0.718587,0.763601,1.0,0.763567,0.71856,0.698142,0.690186,0.876978,0.906897
lag_31,0.698114,0.718518,0.763567,1.0,0.76356,0.718557,0.698114,0.878264,0.796548
lag_32,0.690188,0.698127,0.71856,0.76356,1.0,0.763536,0.718575,0.876988,0.769756
lag_33,0.691977,0.690134,0.698142,0.718557,0.763536,1.0,0.763513,0.872627,0.760028
lag_34,0.70469,0.691982,0.690186,0.698114,0.718575,0.763513,1.0,0.863007,0.762439
rmean_28_7,0.862954,0.872596,0.876978,0.878264,0.876988,0.872627,0.863007,1.0,0.954497
rmean_28_3,0.906853,0.923324,0.906897,0.796548,0.769756,0.760028,0.762439,0.954497,1.0


### Save by demand type

In [44]:
# Remove training rows with a NaN in any column before writing the per-type files.
X_train = X_train.dropna()

# One pickle per (split, demand type); written in the order val, train, test.
splits_to_save = (
    ("X_val_{}.pkl", X_val),
    ("X_train_{}.pkl", X_train),
    ("X_test_{}.pkl", X_test),
)
for demand_type in ['smooth', 'erratic', 'lumpy', 'intermittent']:
    print(demand_type)
    for file_template, frame in splits_to_save:
        subset = frame[frame['demand_type'] == demand_type]
        subset.to_pickle(data_folder + file_template.format(demand_type))
    print()

smooth

erratic

lumpy

intermittent



In [45]:
X_test.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'demand', 'date', 'weekday', 'month', 'event_name_1', 'event_type_1',
       'year', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI',
       'start_date', 'days_from_start', 'start_date_from_start', 'sell_price',
       'lag_28', 'lag_29', 'lag_30', 'lag_31', 'lag_32', 'lag_33', 'lag_34',
       'lag_35', 'rmean_28_3', 'rmean_28_7', 'ADI', 'CV2', 'demand_type',
       'avg_price', 'avg_demand', 'max_demand', 'std_demand',
       'avg_weekday_demand', 'quantile025_week', 'max_weekday_demand',
       'cheaper_than_usual'],
      dtype='object')