# Sales Prediction Challenge

This notebook is based on the following Kaggle kernel:

https://www.kaggle.com/minhtriet/a-beginner-guide-for-sale-data-prediction

## Imports

In [1]:
from salesprediction.config import config
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, date
from dateutil.relativedelta import relativedelta

from sklearn.preprocessing import StandardScaler

from math import ceil

from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop

%matplotlib inline

  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)
Using TensorFlow backend.


In [2]:
train = pd.read_csv(os.path.join(config.DATA_PATH, './sales_train_v2.csv'))
test = pd.read_csv(os.path.join(config.DATA_PATH, './test.csv'))
submission = pd.read_csv(os.path.join(config.DATA_PATH, './sample_submission.csv'))
items = pd.read_csv(os.path.join(config.DATA_PATH,'./items.csv'))
item_cats = pd.read_csv(os.path.join(config.DATA_PATH,'./item_categories.csv'))
shops = pd.read_csv(os.path.join(config.DATA_PATH,'./shops.csv'))

In [4]:
test_shops = test.shop_id.unique()
train = train[train.shop_id.isin(test_shops)]
test_items = test.item_id.unique()
train = train[train.item_id.isin(test_items)]

In [5]:
MAX_BLOCK_NUM = train.date_block_num.max()
MAX_ITEM = len(test_items)
MAX_CAT = len(item_cats)
MAX_YEAR = 3
MAX_MONTH = 4 # 7 8 9 10
MAX_SHOP = len(test_shops)

## Data Exploration

Data exploration has been done with Tableau. A Tableau project can be found in the root/exploration/. If you don't own a Tableau license, check the Kaggle kernel linked above. Similar charts can be found there.

In [6]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
10,03.01.2013,0,25,2574,399.0,2.0
11,05.01.2013,0,25,2574,399.0,1.0
12,07.01.2013,0,25,2574,399.0,1.0
13,08.01.2013,0,25,2574,399.0,2.0


From the data explortation we know that there is a peak in shop shop at the end of the year, probably due to christmas. Therefore, we add month and year, so that the network can pick up this pattern. Furthermore, product category may add some additional information to the prediction. Thus we include product category in the training set.

In [7]:
# add categories
train = train\
    .set_index('item_id')\
    .join(items.set_index('item_id'))\
    .drop('item_name', axis=1)\
    .reset_index()

In [8]:
# add month and year
train['month'] = train.date.apply(lambda x: datetime.strptime(x, '%d.%m.%Y').strftime('%m'))
train['year'] = train.date.apply(lambda x: datetime.strptime(x, '%d.%m.%Y').strftime('%Y'))

In [9]:
train.head()

Unnamed: 0,item_id,date,date_block_num,shop_id,item_price,item_cnt_day,item_category_id,month,year
0,30,28.02.2013,1,50,399.0,1.0,40,2,2013
1,30,26.02.2013,1,50,399.0,1.0,40,2,2013
2,30,12.02.2013,1,50,399.0,1.0,40,2,2013
3,30,14.02.2013,1,50,399.0,2.0,40,2,2013
4,30,15.02.2013,1,50,399.0,3.0,40,2,2013


In [10]:
# remove the date column
# train = train.drop('date', axis=1)
train = train.drop('item_category_id', axis=1)
train = train.groupby(['shop_id', 'item_id', 'date_block_num', 'month', 'year']).sum()
train = train.sort_index()

In [13]:
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,item_price,item_cnt_day
shop_id,item_id,date_block_num,month,year,Unnamed: 5_level_1,Unnamed: 6_level_1
2,30,2,3,2013,-0.248665,-0.150189
2,30,5,6,2013,-0.242961,-0.150189
2,30,15,4,2014,-0.27576,-0.150189
2,30,16,5,2014,-0.27576,-0.150189
2,31,1,2,2013,0.098859,0.115543


## Training

In [11]:
scaler = StandardScaler()
cnt_scaler = StandardScaler()

scaler.fit(train.item_price.values.reshape(-1, 1))
cnt_scaler.fit(train.item_cnt_day.values.reshape(-1, 1))

train.item_price = scaler.transform(train.item_price.values.reshape(-1, 1))
train.item_cnt_day = cnt_scaler.transform(train.item_cnt_day.values.reshape(-1, 1))

### Missing Data

Not every item is saled in the above time period, we will add a record for them with item_cnt_day as 0. The price is a little bit tricky. As can be seen at the code below, price of an item depends on shop and point of time. We will fill in empty values with the closest past record.

In [14]:
price = train.reset_index().set_index(['item_id', 'shop_id', 'date_block_num'])
price = price.sort_index()

In [15]:
price.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,month,year,item_price,item_cnt_day
item_id,shop_id,date_block_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
30,2,2,3,2013,-0.248665,-0.150189
30,2,5,6,2013,-0.242961,-0.150189
30,2,15,4,2014,-0.27576,-0.150189
30,2,16,5,2014,-0.27576,-0.150189
30,3,1,2,2013,-0.072265,0.115543


Helper functions to fill data gaps:

In [16]:
def convert(date_block):
    date = datetime(2013, 1, 1)
    date += relativedelta(months = date_block)
    return (date.month, date.year)

def closest_date_block(current_day, item_id, shop_id):
    """Find the block_date which is closest to the current_day, given item_id and shop_id. Returns index integer"""
    if (item_id, shop_id) in price.index:
        search_lst = np.array(price.loc[(item_id, shop_id)].index)        
        return search_lst[np.abs(current_day - search_lst).argmin()]
    return -1
                
def closest_price(current_day, item_id, shop_id):
    closest_date = closest_date_block(current_day, item_id, shop_id)
    if closest_date != -1:
        return price.loc[( item_id, shop_id, closest_date )]['item_price']
    return np.nan

def closest_price_lambda(x):
    return closest_price(34, x.item_id, x.shop_id)

Math to find out, which block to train on. **Don't run this. Resulting data has been pickled and can be loaded with the code blocks below.**

In [17]:
maxlen = 4 # 4 months
step = 1
# 0: train, 1: val, 2:test
sentences = [[],[],[]]
next_chars = [[], []]
BLOCKS = [6, 18, 30]

for s in test_shops:
    shop_items = list(train.loc[s].index.get_level_values(0).unique())
    for it in shop_items:        
        for i_index, i in enumerate(BLOCKS):
            sentence = []
            closest_pc = closest_price(i, it, s)            
            for j in range(maxlen+1):
                if j < maxlen:
                    if (s, it, i+j) in train.index:
                        r = train.loc[(s, it, i + j)].to_dict(orient='list')                    
                        closest_pc = r['item_price'][0]
                        item_cnt_day = r['item_cnt_day'][0]
                        row = {'shop_id': s, 'date_block_num': i+j, 'item_cnt_day': item_cnt_day, 
                               'month': month, 'item_id': it, 'item_price': closest_pc, 'year': year}
                    else:
                        month, year = convert(i+j)                    
                        row = {'shop_id': s, 'date_block_num': i+j, 'item_cnt_day': 0, 
                               'month': month, 'item_id': it, 'item_price': closest_pc, 'year': year}
                    sentence.append(row)
                elif i_index < 2:   # not in test set
                    next_chars[i_index].append(row)
            sentences[i_index].append(sentence)


In [43]:
x_train_o = np.array(sentences[0])
x_val_o = np.array(sentences[1])
x_test_o = np.array(sentences[2])
y_train = np.array([x['item_cnt_day'] for x in next_chars[0]])
y_val = np.array([x['item_cnt_day'] for x in next_chars[1]])

In [19]:
length = MAX_SHOP + MAX_ITEM + MAX_MONTH + 1 + 1 + 1

In [44]:
import pickle

_dumps = {
    'x_train_o': x_train_o,
    'x_val_o':x_val_o,
    'x_test_o':x_test_o,
    'y_train': y_train,
    'y_val':y_val}

for key, value in _dumps.items():
    with open(os.path.join(config.DATA_PATH, key), 'wb') as file:
        pickle.dump(value, file)

In [49]:
import pickle

with open(os.path.join(config.DATA_PATH, 'x_val_o'), 'rb') as file:
    x_val_o = pickle.load(file)
    
with open(os.path.join(config.DATA_PATH, './x_test_o'), 'rb') as file:
    x_test_o = pickle.load(file)
    
with open(os.path.join(config.DATA_PATH, './y_train'), 'rb') as file:
    y_train = pickle.load(file)
    
with open(os.path.join(config.DATA_PATH, './y_val'), 'rb') as file:
    y_val = pickle.load(file)
    
with open(os.path.join(config.DATA_PATH, './x_train_o'), 'rb') as file:
    x_train_o = pickle.load(file)

### Transform to one-hot encoding

In [50]:
from sklearn import preprocessing

shop_le = preprocessing.LabelEncoder()
shop_le.fit(test_shops)
shop_dm = dict(zip(test_shops, shop_le.transform(test_shops)))

item_le = preprocessing.LabelEncoder()
item_le.fit(test_items)
item_dm = dict(zip(test_items, item_le.transform(test_items)))

month_le = preprocessing.LabelEncoder()
month_le.fit(range(7,11))
month_dm = dict(zip(range(7,11), month_le.transform(range(7,11))))

#cat_le = preprocessing.LabelEncoder()
#cat_le.fit(item_cats.item_category_id)
#cat_dm = dict(zip(item_cats.item_category_id.unique(), cat_le.transform(item_cats.item_category_id.unique())))

### Vectorize

In [51]:
def vectorize(inp):
    print('Vectorization...')   
    x = np.zeros((len(inp), maxlen, length), dtype=np.float32)
    for i, sentence in enumerate(inp):
        for t, char in enumerate(sentence):            
            x[i][t][ shop_dm[char['shop_id']] ] = 1        
            x[i][t][ MAX_SHOP + item_dm[char['item_id']] ] = 1
            x[i][t][ MAX_SHOP + MAX_ITEM + month_dm[char['month']] ] = 1
            x[i][t][ MAX_SHOP + MAX_ITEM + MAX_MONTH + 1 ] = char['item_price']
            x[i][t][ MAX_SHOP + MAX_ITEM + MAX_MONTH + 1 + 1] = char['item_cnt_day']    
    return x

In [54]:
x_train = vectorize(x_train_o)
x_val = vectorize(x_val_o)
x_test = vectorize(x_test_o)

Vectorization...
Vectorization...
Vectorization...


In [55]:
import pickle

_dumps = {
    'x_train': x_train,
    'x_val':x_val,
    'x_test':x_test
}

for key, value in _dumps.items():
    with open(os.path.join(config.DATA_PATH, key), 'wb') as file:
        pickle.dump(value, file)

OverflowError: cannot serialize a bytes object larger than 4 GiB

### Build a model

In [25]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(32, input_shape=(maxlen, length)))
model.add(Dense(1, activation='relu'))

optimizer = RMSprop(lr=0.005)
model.compile(loss='mean_squared_error', optimizer=optimizer)

model.fit(x_train, y_train, batch_size=128, epochs=13)

Build model...


TypeError: while_loop() got an unexpected keyword argument 'maximum_iterations'