Import libraries

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime  
from datetime import timedelta  
import feather
import gc
import pickle
from sklearn import preprocessing, metrics


Helper functions

In [None]:
# this function is taken from https://www.kaggle.com/ragnar123/very-fst-model
def reduce_mem_usage(df, verbose=True):
    '''Downcast int and float columns to the smallest dtype that can hold their value range.

    The dataframe is modified in place and the same object is returned.
    Only columns whose dtype is listed in `numerics` are touched; every
    other column (object, datetime, int8, unsigned, ...) is left as is.

    Note: floats are narrowed by value *range* only, so converting to
    float16/float32 can lose precision.

    Args:
        df: dataframe whose numeric columns should be downcast
        verbose: if True, print the final memory usage and the relative reduction

    Returns:
        the input dataframe with converted columns
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Integers: no fallback, a column that fits nowhere keeps its dtype.
            candidates, limits_of, target = (np.int8, np.int16, np.int32, np.int64), np.iinfo, None
        else:
            # Floats: anything that fits neither float16 nor float32 becomes float64.
            candidates, limits_of, target = (np.float16, np.float32), np.finfo, np.float64
        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's min/max is representable.
            # NaN min/max (empty or all-NaN column) compares False and is skipped.
            if limits.min <= c_min and c_max <= limits.max:
                target = candidate
                break
        if target is not None:
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


Read data

In [None]:
# Load the four input CSVs. Each frame is passed through reduce_mem_usage
# right after reading, then its shape is reported.
print('Reading files...')

calendar = reduce_mem_usage(pd.read_csv('../00_data/calendar.csv'))
print('Calendar has {} rows and {} columns'.format(*calendar.shape))

sell_prices = reduce_mem_usage(pd.read_csv('../00_data/sell_prices.csv'))
print('Sell prices has {} rows and {} columns'.format(*sell_prices.shape))

sales_train_validation = reduce_mem_usage(pd.read_csv('../00_data/sales_train_validation.csv'))
print('Sales train validation has {} rows and {} columns'.format(*sales_train_validation.shape))

submission = reduce_mem_usage(pd.read_csv('../00_data/sample_submission.csv'))
print('Sample submisson has {} rows and {} columns'.format(*submission.shape))


In [None]:
#sales_train_validation = pd.read_csv('../00_data/sales_train_validation.csv')
#sales_train_validation = reduce_mem_usage(sales_train_validation)


In [None]:
# Preview the first rows of the calendar table.
calendar.head()

In [None]:
# Preview the first rows of the sell-price table.
sell_prices.head()

In [None]:
# Preview the first rows of the wide sales table.
sales_train_validation.head()

In [None]:
# Preview the first rows of the sample submission.
submission.head()

We will work with the data in long format, so we reshape `sales_train_validation` with `pd.melt`. The reshaped frame is called `X`; it is our feature matrix.

In [None]:
# Reshape the wide sales table into long format: the six id columns are kept,
# every remaining column becomes one row with its column name in 'd' and its
# value in 'items_sold'.
X = pd.melt(
    sales_train_validation,
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='d',
    value_name='items_sold',
)
# Report the shape of the melted frame X, not of the wide input.
print('Melted sales train validation has {} rows and {} columns'.format(X.shape[0], X.shape[1]))
X = reduce_mem_usage(X)
X.head()

In [None]:
# The wide table has been melted into X; drop the name so its memory can be reclaimed.
del sales_train_validation

We want to determine the date on which sales start for every product and remove from `X` all rows dated before that product's release.

Determine the first sales week of each item in each store (the earliest `wm_yr_wk` that has a price).

In [None]:
# Earliest week (wm_yr_wk) with a price entry for every (store, item) pair.
start_sales_df = (
    sell_prices
    .groupby(['store_id', 'item_id'])['wm_yr_wk']
    .min()
    .reset_index()
)
start_sales_df.head()

For each week we calculate the first day of the week.

In [None]:
# For every week (wm_yr_wk) keep the calendar row with the earliest date.
# Taking the whole row keeps the 'd' label paired with that date: 'd' labels
# are strings, so an independent min() over them would compare
# lexicographically and could pick a label from a different day.
# Selecting columns by list (not by a bare tuple of keys) also keeps this
# working on current pandas.
start_of_week = (
    calendar[['wm_yr_wk', 'd', 'date']]
    .sort_values('date', kind='mergesort')
    .drop_duplicates(subset='wm_yr_wk', keep='first')
    .sort_values('wm_yr_wk', kind='mergesort')
    .reset_index(drop=True)
    .rename(columns={"d": "start_date_d", "date": "start_date"})
)


In [None]:
# Preview the per-week start dates.
start_of_week.head()

In [None]:
# Attach the first day of the starting week to every (store, item) pair.
start_sales_df = pd.merge(start_sales_df, start_of_week, how='left', on='wm_yr_wk')
start_sales_df.head()


Add date to X.

In [None]:
# Calendar columns attached to every sales row; 'd' is the join key.
calendar_features = [
    'date', 'wm_yr_wk', 'weekday', 'month',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
    'd',
]
X = pd.merge(X, calendar[calendar_features], how='left', on='d')

In [None]:
# Preview X with the calendar columns attached.
X.head()

In [None]:
# Preview the calendar table again for comparison with the merged columns.
calendar.head()

Add start date

In [None]:
# Attach each (store, item) pair's sales start date to its rows in X.
start_dates = start_sales_df[['store_id', 'item_id', 'start_date']]
X = pd.merge(X, start_dates, how='left', on=['store_id', 'item_id'])


In [None]:
# Preview X with start_date attached (start_sales_df is kept; its deletion is disabled).
#del start_sales_df
X.head()

Convert strings to datetime

In [None]:
# Parse the date columns so they can be compared and subtracted.
# calendar comes last so the loop variable does not keep X alive afterwards.
for frame, column in ((X, 'date'), (X, 'start_date'), (calendar, 'date')):
    frame[column] = pd.to_datetime(frame[column])

For each item, delete the rows dated before that item's start date.

In [None]:
# Drop rows dated before the item's start date. Rows are removed by index
# label, which relies on X having unique index labels at this point.
X = X.drop(X[X.date < X.start_date].index)

# Feature engineering

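Weekday, month and SNAP flags were already merged in from the calendar above. Here we add day offsets relative to the first date in `X` and the weekly sell price.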

In [None]:
# List the columns currently in X.
X.columns

In [None]:
# Earliest and latest date present in X, as pandas Timestamps.
first_day, last_day = (
    pd.Timestamp(X['date'].to_numpy().min()),
    pd.Timestamp(X['date'].to_numpy().max()),
)

In [None]:
# Whole days elapsed between first_day and each row's date.
X['days_from_start'] = (X['date'] - first_day).dt.days

In [None]:
# Whole days between first_day and the item's sales start date.
X['start_date_from_start'] = (X['start_date'] - first_day).dt.days

In [None]:
# Attach the sell-price columns for each (store, item, week); the left join keeps rows without a price.
X = X.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')

In [None]:
# Preview X with prices attached.
X.head()

# Add aggregations

Add average sales per weekday per item and per month per item.

In [None]:
# Mean items_sold for each (id, weekday) group, broadcast back to every row of that group.
# NOTE(review): the mean is taken over every row currently in X, and the
# validation split is later cut from these same rows, so this feature is
# partly computed from validation targets. Confirm this is intended.
X['avg_weekday'] = X.groupby(['id', 'weekday'])['items_sold'].transform('mean') 

In [None]:
# Mean items_sold for each (id, month) group, broadcast back to every row of that group.
X['avg_month'] = X.groupby(['id', 'month'])['items_sold'].transform('mean') 

Add average price per item

In [None]:
# Mean sell_price per id, broadcast back to every row of that id.
X['avg_price'] = X.groupby(['id'])['sell_price'].transform('mean') 

In [None]:
# The 'd' label was only needed as the join key to the calendar.
X = X.drop(columns = ['d'])

In [None]:
# Preview X with the aggregate features.
X.head()

Prepare test set.

In [None]:
# Split the sample submission by id suffix: ids containing 'validation'
# go to test1, ids containing 'evaluation' go to test2.
test1_rows = [sid for sid in submission['id'] if 'validation' in sid]
test2_rows = [sid for sid in submission['id'] if 'evaluation' in sid]
test1 = submission.loc[submission['id'].isin(test1_rows)]
test2 = submission.loc[submission['id'].isin(test2_rows)]

In [None]:
# Rename test1's columns to 'id' followed by the calendar dates at row
# labels 1913..1940 (label slice, both ends included), formatted YYYY-MM-DD.
test_dates = calendar.loc[1913:1940, 'date']
column_names = ['id'] + test_dates.dt.strftime('%Y-%m-%d').to_list()
test1.columns = column_names

In [None]:
# Long format for the test rows: one row per (id, date). The melted
# 'value' column is discarded and the date labels are parsed to datetimes.
test1 = test1.melt(id_vars=['id'], var_name='date').drop(columns='value')
test1['date'] = pd.to_datetime(test1['date'])
test1.head()

In [None]:
# Attach the calendar features by date, then replace every missing value
# in test1 with the "no event" label.
test1 = pd.merge(test1, calendar[calendar_features], how='left', on='date')
test1 = test1.fillna("no event")
test1.head()

In [None]:
# Downcast X's numeric columns, then run the garbage collector.
# Writing X to feather and deleting it is currently disabled.
X = reduce_mem_usage(X)
#feather.write_dataframe(X, "../01_preprocessed_data/X.feather")
#del X
gc.collect()

In [None]:
# Per-id columns copied from X onto the test rows, joined on 'id'.
# presumably each id has a single combination of these values — verify.
static_columns = [
    'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'start_date', 'start_date_from_start', 'avg_price',
]
temp = X[static_columns].drop_duplicates()
test1 = pd.merge(test1, temp, how='left', on='id')
test1['item_id'].unique()

In [None]:
# Inspect the key columns used for the price merge.
test1[['store_id', 'item_id', 'wm_yr_wk']]

In [None]:
# Attach the sell-price columns to the test rows by (store, item, week).
test1 = test1.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')

In [None]:
# Whole days between first_day and each test date.
test1['days_from_start'] = (test1['date'] - first_day).dt.days

In [None]:
# Placeholder target so test1 has every column of X when it is reordered and concatenated later.
test1['items_sold'] = 0

In [None]:
# Copy the per-(id, weekday) average sales from X onto the test rows.
temp = X[['id', 'weekday', 'avg_weekday']].drop_duplicates()
test1 = pd.merge(test1, temp, how='left', on=['id', 'weekday'])
test1['item_id'].unique()

In [None]:
# Copy the per-(id, month) average sales from X onto the test rows.
temp = X[['id', 'month', 'avg_month']].drop_duplicates()
test1 = pd.merge(test1, temp, how='left', on=['id', 'month'])


In [None]:
# Where no monthly average was found for a test row, fall back to that row's weekday average.
test1['avg_month'] = test1['avg_month'].fillna(test1['avg_weekday'])

In [None]:
# Columns of X that test1 does not have yet.
X.columns[~X.columns.isin(test1.columns)]

In [None]:
# Preview the assembled test rows.
test1.head()

In [None]:
# Remove the 'd' label and put test1's columns in the same order as X's.
test1 = test1.drop(columns='d').loc[:, X.columns]

In [None]:
# Latest date in X before the test rows are appended; used later to separate validation rows from test rows.
last_X_day = X.date.values.max()

In [None]:
# Append the test rows below the existing rows of X.
# NOTE(review): ignore_index is not set, so both frames keep their own index labels and labels may repeat.
X = pd.concat([X, test1], axis = 0)

In [None]:
# Replace missing event names/types with an explicit 'no event' label.
events = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
X[events] = X[events].fillna(value='no event')

In [None]:
# test1 is now part of X; drop the separate name.
del test1

Encode labels

In [None]:
# Inspect the distinct event names before encoding.
X['event_name_1'].unique()

In [None]:
# The week key was only needed for the price merges.
X = X.drop(columns = ['wm_yr_wk'])

In [None]:
# Replace each categorical column with integer codes, using a fresh
# LabelEncoder per column. The fitted encoders are not kept.
cat = [
    'weekday', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
]
for feature in cat:
    print(feature)
    X[feature] = preprocessing.LabelEncoder().fit_transform(X[feature])

# Form train, validation, and test datasets

Split X into X_train, X_val and X_test (with targets y_train and y_val)

In [None]:
# Number of days between the first and last date recorded earlier.
n_days = (last_day - first_day).days
val_size = 0.05
# Validation window length in days (the proportional alternative is disabled).
n_val_days = 2 * 28  # round(n_days*val_size)

In [None]:
# Report the validation window length.
print("n_val_days {}".format(n_val_days))

In [None]:
# First date of the validation window: n_val_days - 1 days before the date n_days after first_day.
first_val_day = first_day + timedelta(days = n_days - n_val_days + 1)
first_val_day

In [None]:
# Time-based split:
#   train: dates before first_val_day
#   val:   first_val_day .. last_X_day (both ends included)
#   test:  dates after last_X_day
# The targets are kept as single-column frames.
X_train = X.loc[X['date'] < first_val_day]
y_train = X_train.loc[:, ['items_sold']]
X_val = X.loc[X['date'].between(first_val_day, last_X_day)]
y_val = X_val.loc[:, ['items_sold']]
X_test = X.loc[X['date'] > last_X_day]

In [None]:
# Remove the target from the train and validation feature matrices.
# NOTE(review): X_test is not touched here, so it still has 'items_sold'. Confirm downstream code expects that.
X_train = X_train.drop(columns = ['items_sold'])
X_val = X_val.drop(columns = ['items_sold'])


In [None]:
# Report the shape of each split, one per line.
print(X_train.shape, X_val.shape, X_test.shape, sep='\n')


In [None]:
#X_train = X_train.drop(columns = ['wm_yr_wk'])
#X_val = X_val.drop(columns = ['wm_yr_wk'])
#X_test = X_test.drop(columns = ['wm_yr_wk'])

Reduce the memory usage of the test, train and validation feature sets

In [None]:
# Downcast the numeric columns of each split, in the order test, train, val.
X_test, X_train, X_val = [
    reduce_mem_usage(split) for split in (X_test, X_train, X_val)
]


Save datasets

In [None]:
# Write the validation features and target to disk, then drop the names.
X_val.to_pickle("../01_preprocessed_data/X_val.pkl")
y_val.to_pickle("../01_preprocessed_data/y_val.pkl")
del X_val, y_val

In [None]:
# Write the test features to disk, then drop the name.
X_test.to_pickle("../01_preprocessed_data/X_test.pkl")
del X_test

In [None]:
# Write the calendar (with its parsed 'date' column) to disk.
calendar.to_pickle("../01_preprocessed_data/calendar.pkl")

In [None]:
# Write the training features and target to disk, then drop the names.
X_train.to_pickle("../01_preprocessed_data/X_train.pkl")
y_train.to_pickle("../01_preprocessed_data/y_train.pkl")
del X_train, y_train