Import libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime  
from datetime import timedelta  
import gc
import pickle
from sklearn import preprocessing, metrics


Helper functions

In [2]:
# this function is taken from https://www.kaggle.com/ragnar123/very-fst-model
def reduce_mem_usage(df, verbose=True):
    '''Reduce memory usage of dataframe by converting ints and floats 
    Args:
        df: dataframe
            
    Returns:
        dataframe with converted columns
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


Read data

In [3]:
print('Reading files...')
calendar = pd.read_csv('../00_data/calendar.csv')
calendar = reduce_mem_usage(calendar)
print('Calendar has {} rows and {} columns'.format(calendar.shape[0], calendar.shape[1]))
sell_prices = pd.read_csv('../00_data/sell_prices.csv')
sell_prices = reduce_mem_usage(sell_prices)
print('Sell prices has {} rows and {} columns'.format(sell_prices.shape[0], sell_prices.shape[1]))
sales_train_validation = pd.read_csv('../00_data/sales_train_validation.csv')
sales_train_validation = reduce_mem_usage(sales_train_validation)
print('Sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
submission = pd.read_csv('../00_data/sample_submission.csv')
submission = reduce_mem_usage(submission)

print('Sample submisson has {} rows and {} columns'.format(submission.shape[0], submission.shape[1]))


Reading files...
Mem. usage decreased to  0.12 Mb (41.9% reduction)
Calendar has 1969 rows and 14 columns
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Sell prices has 6841121 rows and 4 columns
Mem. usage decreased to 95.00 Mb (78.7% reduction)
Sales train validation has 30490 rows and 1919 columns
Mem. usage decreased to  2.09 Mb (84.5% reduction)
Sample submisson has 60980 rows and 29 columns


In [4]:
#sales_train_validation = pd.read_csv('../00_data/sales_train_validation.csv')
#sales_train_validation = reduce_mem_usage(sales_train_validation)


In [5]:
calendar.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [6]:
sell_prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.578125
1,CA_1,HOBBIES_1_001,11326,9.578125
2,CA_1,HOBBIES_1_001,11327,8.257812
3,CA_1,HOBBIES_1_001,11328,8.257812
4,CA_1,HOBBIES_1_001,11329,8.257812


In [7]:
sales_train_validation.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [8]:
submission.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We will work with data in long format, for that we reshape sales_train_validation. The reshaped variable is called X. This is our feature matrix.

In [9]:
X = pd.melt(sales_train_validation, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'd', value_name = 'items_sold')
print('Melted sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
X = reduce_mem_usage(X)
X.head()

Melted sales train validation has 30490 rows and 1919 columns
Mem. usage decreased to 3226.27 Mb (0.0% reduction)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,items_sold
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


In [10]:
del sales_train_validation

We want to determine the start of sales date for every product and remove from the X all rows before a product release

Determine  the start of sales date for each item.

In [11]:
start_sales_df = sell_prices.groupby(['store_id','item_id'])['wm_yr_wk'].agg(['min']).reset_index()
start_sales_df = start_sales_df.rename(columns={"min": "wm_yr_wk"})
start_sales_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk
0,CA_1,FOODS_1_001,11101
1,CA_1,FOODS_1_002,11101
2,CA_1,FOODS_1_003,11101
3,CA_1,FOODS_1_004,11206
4,CA_1,FOODS_1_005,11101


For each week we calculate the first day of the week.

In [12]:
start_of_week = calendar.groupby(['wm_yr_wk'])['d','date'].agg(['min']).reset_index()
start_of_week.columns = start_of_week.columns.get_level_values(0)
start_of_week = start_of_week.rename(columns={"d": "start_date_d", "date":"start_date"})


In [13]:
start_of_week.head()

Unnamed: 0,wm_yr_wk,start_date_d,start_date
0,11101,d_1,2011-01-29
1,11102,d_10,2011-02-05
2,11103,d_15,2011-02-12
3,11104,d_22,2011-02-19
4,11105,d_29,2011-02-26


In [14]:
start_sales_df = start_sales_df.merge(start_of_week, on=['wm_yr_wk'], how='left')
start_sales_df.head()


Unnamed: 0,store_id,item_id,wm_yr_wk,start_date_d,start_date
0,CA_1,FOODS_1_001,11101,d_1,2011-01-29
1,CA_1,FOODS_1_002,11101,d_1,2011-01-29
2,CA_1,FOODS_1_003,11101,d_1,2011-01-29
3,CA_1,FOODS_1_004,11206,d_400,2012-03-03
4,CA_1,FOODS_1_005,11101,d_1,2011-01-29


Add date to X.

In [15]:
#X = X.set_index('d').join(calendar[['date', 'd']].set_index('d')).reset_index()
calendar_features = ['date', 'wm_yr_wk', 'weekday', 'month', 'event_name_1', 'event_type_1', 
                      'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'd']
X = X.merge(calendar[calendar_features], on = 'd', how = 'left')

In [16]:
X.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,items_sold,date,wm_yr_wk,weekday,month,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,,,,,0,0,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,,,,,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,,,,,0,0,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,,,,,0,0,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,,,,,0,0,0


In [17]:
calendar.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


Add start date

In [None]:
X = X.merge(start_sales_df[['store_id', 'item_id', 'start_date']], 
                          on=['store_id', 'item_id'], how='left')


In [None]:
#del start_sales_df
X.head()

Convert strings to datetime

In [None]:
X['date']= pd.to_datetime(X['date']) 
X['start_date']= pd.to_datetime(X['start_date']) 
calendar['date'] =  pd.to_datetime(calendar['date']) 

Delete rows before start date itemwise

In [None]:
X = X.drop(X[X.date < X.start_date].index)

# Feature engineering

## Trends

Add weekday, month and SNAP days to X. Add the number of days from the begining of sales and from the first date in the dataset.

In [None]:
X.columns

In [None]:
first_day = pd.Timestamp(X['date'].values.min())
last_day =  pd.Timestamp(X['date'].values.max())

In [None]:
X['days_from_start'] = X['date'] - first_day
X['days_from_start'] = X['days_from_start'].dt.days

In [None]:
X['start_date_from_start'] = X['start_date'] - first_day
X['start_date_from_start'] = X['start_date_from_start'].dt.days

In [None]:
X = X.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')

In [None]:
X.head()

## Min, max, etc..

# Sales pattern

## Aggregations

Add everage sales per week per item, per month per item and simply average sales per item. Add maximum items sold.

In [None]:
X['avg_weekday_demand'] = X.groupby(['id', 'weekday'])['items_sold'].transform('mean') 
X['avg_month_demand'] = X.groupby(['id', 'month'])['items_sold'].transform('mean') 
X['avg_demand'] =  X.groupby(['id'])['items_sold'].transform('mean') 
X['median_weekday_demand'] = X.groupby(['id', 'weekday'])['items_sold'].transform('median') 
X['median_month_demand'] = X.groupby(['id', 'month'])['items_sold'].transform('median') 
X['median_demand'] =  X.groupby(['id'])['items_sold'].transform('mean') 
X['max_demand'] =  X.groupby(['id'])['items_sold'].transform('max') 


Add average price per item

In [None]:
X['avg_price'] = X.groupby(['id'])['sell_price'].transform('mean') 
X['cheaper_than_usual'] = ( X['sell_price'] < X['avg_price'])

In [None]:
X = X.drop(columns = ['d'])

In [None]:
X.head()

Prepare test set.

In [None]:
test1_rows = [row for row in submission['id'] if 'validation' in row]
test2_rows = [row for row in submission['id'] if 'evaluation' in row]
test1 = submission[submission['id'].isin(test1_rows)]
test2 = submission[submission['id'].isin(test2_rows)]

In [None]:
test_dates = calendar.loc[1913:1940, 'date']
column_names = test_dates.dt.strftime('%Y-%m-%d').to_list()
column_names.insert(0,'id' )
test1.columns = column_names

In [None]:
test1 = pd.melt(test1, id_vars = ['id'], var_name = 'date')
test1 = test1.drop(columns = 'value')
test1['date'] =  pd.to_datetime(test1['date']) 
test1.head()

In [None]:
test1 = test1.merge(calendar[calendar_features], on = 'date', how = 'left')
test1 = test1.fillna("no event")
test1.head()

In [None]:
X = reduce_mem_usage(X)
#feather.write_dataframe(X, "../01_preprocessed_data/X.feather")
#del X
gc.collect()

In [None]:
#temp = X[['id',   'weekday', 'month', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'start_date', 'start_date_from_start',
#             'avg_weekday', 'avg_month', 'avg_price']].drop_duplicates()
temp = X[['id',  'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'median_demand', 'max_demand'
          'start_date', 'start_date_from_start', 'avg_price']].drop_duplicates()
test1 = test1.merge(temp, on = ['id'], how = 'left')
test1['item_id'].unique()

In [None]:
test1 = test1.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')

In [None]:
test1['days_from_start'] = test1['date'] - first_day
test1['days_from_start'] = test1['days_from_start'].dt.days

In [None]:
test1['items_sold'] = 0

In [None]:
temp = X[['id',  'weekday', 'avg_weekday_demand', 'median_weekday_demand']].drop_duplicates()
test1 = test1.merge(temp, on = ['id', 'weekday'], how = 'left')
test1['item_id'].unique()

In [None]:
temp = X[['id',   'month', 'avg_month_demand', 'median_month_demand']].drop_duplicates()
test1 = test1.merge(temp, on = ['id', 'month'], how = 'left')


In [None]:
test1.loc[test1['avg_month_demand'].isna(), 'avg_month_demand'] = test1['avg_weekday_demand'][test1['avg_month_demand'].isna()]

In [None]:
X.columns[~X.columns.isin(test1.columns)]

In [None]:
test1 = test1.drop(columns='d')
test1 = test1[X.columns]

In [None]:
X.columns[~X.columns.isin(test1.columns)]

In [None]:
last_X_day = X.date.values.max()

In [None]:
X = pd.concat([X, test1], axis = 0)

In [None]:
events = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
X[events] = X[events].fillna(value='no event')

In [None]:
X['lag_t28'] = X.groupby(['id'])['items_sold'].transform(lambda x: x.shift(28))
X['lag_t29'] = X.groupby(['id'])['items_sold'].transform(lambda x: x.shift(29))
X['lag_t30'] = X.groupby(['id'])['items_sold'].transform(lambda x: x.shift(30))

In [None]:
del test1

Encode labels

In [None]:
X['event_name_1'].unique()

In [None]:
X = X.drop(columns = ['wm_yr_wk'])

In [None]:
cat = ['weekday', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1',
       'event_name_2', 'event_type_2']
for feature in cat:
    print(feature)
    encoder = preprocessing.LabelEncoder()
    X[feature] = encoder.fit_transform(X[feature])

# From train, validation, and test datasets

Split X into X_train and X_val

In [None]:
n_days = last_day-first_day
n_days = n_days.days
val_size = 0.05
n_val_days = 2*28# round(n_days*val_size)

In [None]:
print("n_val_days {}".format(n_val_days))

In [None]:
first_val_day = first_day + timedelta(days = n_days - n_val_days + 1)
first_val_day

In [None]:
X_train = X[X['date'] < first_val_day]
y_train = X_train[['items_sold']]
X_val = X[(X['date'] >= first_val_day) & (X['date'] <= last_X_day)]
y_val = X_val[['items_sold']]
X_test = X[X['date'] > last_X_day]

In [None]:
X_train = X_train.drop(columns = ['items_sold'])
X_val = X_val.drop(columns = ['items_sold'])


In [None]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)


In [None]:
#X_train = X_train.drop(columns = ['wm_yr_wk'])
#X_val = X_val.drop(columns = ['wm_yr_wk'])
#X_test = X_test.drop(columns = ['wm_yr_wk'])

Form test dataset

In [None]:
X_test = reduce_mem_usage(X_test)
X_train = reduce_mem_usage(X_train)
X_val = reduce_mem_usage(X_val)


Save datasets

In [None]:
X_val.to_pickle("../01_preprocessed_data/X_val.pkl")
y_val.to_pickle("../01_preprocessed_data/y_val.pkl")
del X_val, y_val

In [None]:
X_test.to_pickle("../01_preprocessed_data/X_test.pkl")
del X_test

In [None]:
calendar.to_pickle("../01_preprocessed_data/calendar.pkl")

In [None]:
X_train.to_pickle("../01_preprocessed_data/X_train.pkl")
y_train.to_pickle("../01_preprocessed_data/y_train.pkl")
del X_train, y_train