https://www.kaggle.com/minhtriet/a-beginner-guide-for-sale-data-prediction

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, date
from dateutil.relativedelta import relativedelta

from sklearn.preprocessing import StandardScaler

from math import ceil

from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop

In [None]:
# Sales history and the test set to predict for.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/sales_train.csv', '../input/test.csv')
)

# Lookup tables for items, item categories and shops.
items, item_cats, shops = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/items.csv', '../input/item_categories.csv', '../input/shops.csv')
)

In [None]:
# Keep only the sales rows whose shop AND item both occur in the test set.
test_shops = test.shop_id.unique()
test_items = test.item_id.unique()
train = train[train.shop_id.isin(test_shops) & train.item_id.isin(test_items)]

In [None]:
# Sizes used later to lay out the one-hot feature vector.
MAX_BLOCK_NUM = train['date_block_num'].max()  # 33 (per the original author's note)
MAX_ITEM = len(test_items)   # number of distinct items in the test set
MAX_CAT = len(item_cats)     # number of rows in the item-category table
MAX_YEAR = 3
MAX_MONTH = 4                # width reserved for the one-hot month feature
MAX_SHOP = len(test_shops)   # number of distinct shops in the test set

# Data Exploration
shop과 item category, 두 가지 관점에서 데이터를 살펴본다.

## shop_id

In [None]:
# Monthly sales total per shop: one row per (shop_id, date_block_num).
monthly_shop_sales = train.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].sum()
grouped = pd.DataFrame(monthly_shop_sales.reset_index())
grouped.head(3)

In [None]:
# Plot monthly sales per shop on a 5x2 grid; each panel shows one contiguous
# range of shop ids so that the hue legend stays readable.
fig, axes = plt.subplots(nrows=5, ncols=2, sharex=True, sharey=True, figsize=(16, 20))  # shared axes
num_graph = 10
# Ids are binned into half-open ranges [k * width, (k + 1) * width). The "+ 1"
# counts id 0 as well; without it the largest id falls outside the last bin
# whenever it is an exact multiple of num_graph.
id_per_graph = ceil((grouped.shop_id.max() + 1) / num_graph)
count = 0

for ax in axes.flat:  # row-major order, same panel order as nested i/j loops
    lower = count * id_per_graph
    upper = lower + id_per_graph
    in_bin = (grouped['shop_id'] >= lower) & (grouped['shop_id'] < upper)
    sns.pointplot(x='date_block_num', y='item_cnt_day', hue='shop_id',
                  data=grouped[in_bin], ax=ax)
    count += 1

데이터는 2013년 1월부터 시작한다. 확실히 연말에 피크가 생긴다. 그러므로 월과 연도를 추가하여 패턴을 파악하는 것이 좋겠다. 각각의 아이템 판매량이 어떻게 되는지도 보는 것이 좋겠다. item category도 함께!

## item categories

In [None]:
# Attach each item's columns from `items` (minus the name) to the sales rows,
# joining on item_id, then restore item_id as a regular column.
train = (
    train.set_index('item_id')
    .join(items.set_index('item_id'))
    .drop('item_name', axis=1)
    .reset_index()
)
train.head(3)

In [None]:
# Derive zero-padded month ('01'..'12') and four-digit year strings from the
# 'dd.mm.YYYY' date column. The dates are parsed once (vectorized) instead of
# calling strptime twice per row.
parsed_dates = pd.to_datetime(train.date, format='%d.%m.%Y')
train['month'] = parsed_dates.dt.strftime('%m')
train['year'] = parsed_dates.dt.strftime('%Y')
del parsed_dates  # temporary; free it before the memory-heavy steps below

In [None]:
# Plot item_cnt_day against calendar month on a 5x2 grid; each panel shows one
# contiguous range of item_category_id values.
fig, axes = plt.subplots(nrows=5, ncols=2, sharex=True, sharey=True, figsize=(16, 20))
num_graph = 10
# Half-open bins [k * width, (k + 1) * width): the "+ 1" keeps the largest
# category id inside the last bin even when it is a multiple of num_graph.
id_per_graph = ceil((train.item_category_id.max() + 1) / num_graph)
count = 0

for ax in axes.flat:  # row-major order, same panel order as nested i/j loops
    lower = count * id_per_graph
    upper = lower + id_per_graph
    in_bin = (train['item_category_id'] >= lower) & (train['item_category_id'] < upper)
    sns.pointplot(x='month', y='item_cnt_day', hue='item_category_id',
                  data=train[in_bin], ax=ax)
    count += 1

In [None]:
# Plot item_cnt_day against date_block_num on a 5x2 grid; each panel shows one
# contiguous range of item_category_id values.
fig, axes = plt.subplots(nrows=5, ncols=2, sharex=True, sharey=True, figsize=(16, 20))
num_graph = 10
# Half-open bins [k * width, (k + 1) * width): the "+ 1" keeps the largest
# category id inside the last bin even when it is a multiple of num_graph.
id_per_graph = ceil((train.item_category_id.max() + 1) / num_graph)
count = 0

for ax in axes.flat:  # row-major order, same panel order as nested i/j loops
    lower = count * id_per_graph
    upper = lower + id_per_graph
    in_bin = (train['item_category_id'] >= lower) & (train['item_category_id'] < upper)
    sns.pointplot(x='date_block_num', y='item_cnt_day', hue='item_category_id',
                  data=train[in_bin], ax=ax)
    count += 1

In [None]:
# Preview the first three rows of `train` before it is aggregated.
train.head(3)

In [None]:
# Collapse the daily rows into one row per (shop, item, month).
train = train.drop(['date', 'item_category_id'], axis=1)
# Counts are summed, but the price is averaged: summing item_price would make
# the "price" grow with the number of sale days in the month rather than
# reflect what the item cost.
train = train.groupby(['shop_id', 'item_id', 'date_block_num', 'month', 'year']).agg(
    {'item_price': 'mean', 'item_cnt_day': 'sum'})
# A sorted index is needed for the tuple lookups performed later on.
train = train.sort_index()
train.head(3)

# Training

In [None]:
# Standardize price and sales count independently, each with its own scaler,
# so that either can later be inverted with the matching scaler.
scaler = StandardScaler()      # fitted on item_price
cnt_scaler = StandardScaler()  # fitted on item_cnt_day

# `.values` replaces `as_matrix()`, which was removed from pandas.
price_column = train.item_price.values.reshape(-1, 1)
count_column = train.item_cnt_day.values.reshape(-1, 1)

train['item_price'] = scaler.fit_transform(price_column).ravel()
# Use cnt_scaler here: transforming the counts with the price scaler would
# shift and scale them by the price mean and standard deviation.
train['item_cnt_day'] = cnt_scaler.fit_transform(count_column).ravel()

모든 데이터를 training하는 것이 자연스럽지만, 두가지 결점이 있다
- 2013년 1월 데이터(date_block_num = 0)는 예측해야 할 데이터(2015년 11월)에 미치는 영향이 미미하다
- 메모리 에러 ㅜㅜ

**2013,2014 7,8,9,10,11 데이터를 활용**

# Missing Data
- 모든 아이템들이 위 기간에 팔리지는 않았기 때문에 빈 값은 item_cnt_day = 0으로 채운다.
- item의 가격은 shop과 산 시점에 따라 다르다. 가격은 가장 가까운 과거의 데이터로 채운다.

In [None]:
# Display-only: mean price and count per (item, month, shop). The numeric
# columns are selected explicitly because `month` and `year` are strings and
# cannot be averaged.
train.reset_index().groupby(['item_id', 'date_block_num', 'shop_id'])[['item_price', 'item_cnt_day']].mean()

In [None]:
# Re-key the aggregated sales by (item, shop, month) for price lookups; the
# index is sorted so that tuple-based .loc lookups work efficiently.
price = (
    train.reset_index()
    .set_index(['item_id', 'shop_id', 'date_block_num'])
    .sort_index()
)
price.head(3)

# Some functions

In [None]:
def convert(date_block) :
    """Translate a date_block number into a ``(month, year)`` tuple.

    Block 0 corresponds to January 2013; each further block is one calendar
    month later.
    """
    first_month = datetime(2013, 1, 1)
    shifted = first_month + relativedelta(months=date_block)
    return (shifted.month, shifted.year)

# Given a date_block value, show which (month, year) it corresponds to.
for sample_block in (6, 18, 30):
    print(convert(sample_block))

In [None]:
# Preview the first three rows of the (item_id, shop_id, date_block_num)-indexed price table.
price.head(3)

In [None]:
def closest_date_block(current_day, item_id, shop_id) :
    """Return the recorded date block nearest to ``current_day`` for an item/shop pair.

    Looks the pair up in the module-level ``price`` table. "Nearest" is by
    absolute difference, so the result may lie before or after
    ``current_day``; on a tie the first candidate in index order wins.
    Returns -1 when the (item_id, shop_id) pair is not present in ``price``.
    """
    if (item_id, shop_id) not in price.index:
        return -1  # no record for this item/shop combination

    candidate_blocks = np.array(price.loc[(item_id, shop_id)].index)
    nearest_pos = np.abs(current_day - candidate_blocks).argmin()
    return candidate_blocks[nearest_pos]

def closest_price(current_day, item_id, shop_id) :
    """Return the ``item_price`` recorded at the date block nearest to ``current_day``.

    Uses ``closest_date_block`` to pick the block, then reads the price from
    the module-level ``price`` table. Returns ``np.nan`` when the item/shop
    pair has no record at all.
    """
    nearest_block = closest_date_block(current_day, item_id, shop_id)
    if nearest_block == -1:
        return np.nan

    record = price.loc[(item_id, shop_id, nearest_block)]
    return record['item_price']

def closest_price_lambda(x) :
    """Row-wise helper: price of ``x.item_id`` in ``x.shop_id`` nearest to block 34.

    ``x`` only needs ``item_id`` and ``shop_id`` attributes (e.g. a DataFrame
    row passed by ``apply(..., axis=1)``). Returns ``np.nan`` when the pair has
    no recorded price.
    """
    # The original called `closest_date_price`, which is not defined anywhere
    # in this notebook and raised NameError; `closest_price` is the helper
    # with the matching (current_day, item_id, shop_id) signature.
    return closest_price(34, x.item_id, x.shop_id)

In [None]:
# Sanity check of closest_date_block(current_day=18, item_id=30, shop_id=5).
# Data-dependent: presumably item 30 has a record in shop 5 at date block 18 — verify against the dataset.
assert closest_date_block(18,30,5) == 18

# make training dataset

In [None]:
# IPython magic (not plain Python): lists the variables defined in the interactive namespace.
%who

In [None]:
# Drop the lookup tables to free memory before building the samples.
del items, item_cats

In [None]:
maxlen = 4  # input window length: 4 consecutive months
step = 1
# Split index used below: 0 = train, 1 = validation, 2 = test.

sentences = [[], [], []]  # per split: list of maxlen-row input windows
next_chars = [[], []]     # per labelled split (train, val): the target row
BLOCKS = [6, 18, 30]      # first date_block_num of each split's window


def _build_row(shop_id, item_id, block, fallback_price):
    """Build the feature dict for one (shop, item, date block).

    Uses the values recorded in ``train`` when the combination exists there;
    otherwise the count is 0 and the price is ``fallback_price``.
    """
    # Always derive month/year from the block number. Previously they were
    # only assigned in the "missing" branch, so rows found in `train` reused
    # stale values from an earlier iteration (or raised NameError).
    month, year = convert(block)
    if (shop_id, item_id, block) in train.index:
        r = train.loc[(shop_id, item_id, block)].to_dict(orient='list')
        item_price = r['item_price'][0]
        item_cnt_day = r['item_cnt_day'][0]
    else:
        item_price = fallback_price
        # NOTE(review): item_cnt_day in `train` was standardized earlier, so a
        # literal 0 here is presumably not "zero sales" in scaled units —
        # confirm whether it should be the scaled value of 0 instead.
        item_cnt_day = 0
    return {'shop_id': shop_id, 'date_block_num': block, 'item_cnt_day': item_cnt_day,
            'month': month, 'item_id': item_id, 'item_price': item_price, 'year': year}


for s in test_shops:
    shop_items = list(train.loc[s].index.get_level_values(0).unique())
    for it in shop_items:
        for i_index, i in enumerate(BLOCKS):
            has_label = i_index < len(next_chars)  # the test split has no target
            sentence = []
            closest_pc = closest_price(i, it, s)
            for j in range(maxlen + 1):
                if j == maxlen and not has_label:
                    break
                row = _build_row(s, it, i + j, closest_pc)
                closest_pc = row['item_price']  # carry the latest known price forward
                if j < maxlen:
                    sentence.append(row)
                else:
                    # The target is the month right after the window (block
                    # i + maxlen). Previously the last input row was appended
                    # again here, so the label was already part of the inputs.
                    next_chars[i_index].append(row)
            sentences[i_index].append(sentence)

In [None]:
# Report how many sample windows each split received.
for caption, split_windows in zip(('train length : ', 'val length : ', 'test length : '), sentences[:3]):
    print(caption, len(split_windows))

In [None]:
# Raw (un-vectorized) inputs: one array of row dicts per split.
x_train_o, x_val_o, x_test_o = (np.array(sentences[k]) for k in (0, 1, 2))
# Targets: the item_cnt_day of the row stored for each labelled sample.
y_train, y_val = (
    np.array([target['item_cnt_day'] for target in next_chars[k]])
    for k in (0, 1)
)

In [None]:
# Width of one time-step vector: one-hot shop + one-hot item + one-hot month,
# followed by three extra slots (vectorize() writes the price and the count
# into the last two of them).
length = MAX_SHOP + MAX_ITEM + MAX_MONTH + 3

In [None]:
# IPython magic (not plain Python): lists the variables defined in the interactive namespace.
%who

In [None]:
# Release the large frames and intermediate lists; only the x_*_o / y_* arrays are needed from here on.
del train, test, sentences, sentence, next_chars

## categorical values

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Map each raw id to a dense 0-based code for one-hot encoding.
# The same encoder is re-fitted for every feature, in this order.
shop_dm = {raw: code for raw, code in zip(test_shops, le.fit_transform(test_shops))}
item_dm = {raw: code for raw, code in zip(test_items, le.fit_transform(test_items))}
# Only months 7..10 are encoded.
month_dm = {raw: code for raw, code in zip(range(7,11), le.fit_transform(range(7,11)))}

del test_shops, test_items

In [None]:
# Force a garbage-collection pass after the deletions above.
import gc
gc.collect()

In [None]:
def vectorize(inp) :
    """Turn windows of row dicts into a float32 array of shape (len(inp), maxlen, length).

    Per time step the vector holds a one-hot shop, a one-hot item and a
    one-hot month (via ``shop_dm`` / ``item_dm`` / ``month_dm``), followed by
    the row's ``item_price`` and ``item_cnt_day`` in the last two slots.
    Relies on the module-level ``maxlen``, ``length`` and ``MAX_*`` constants.
    """
    print('Vectorization ...')

    item_offset = MAX_SHOP
    month_offset = item_offset + MAX_ITEM
    price_slot = month_offset + MAX_MONTH + 1
    count_slot = price_slot + 1

    x = np.zeros((len(inp), maxlen, length), dtype=np.float32)

    for sample_idx, window in enumerate(inp):
        for step_idx, record in enumerate(window):
            features = x[sample_idx, step_idx]  # view: writes land in x
            features[shop_dm[record['shop_id']]] = 1
            features[item_offset + item_dm[record['item_id']]] = 1
            features[month_offset + month_dm[record['month']]] = 1
            features[price_slot] = record['item_price']
            features[count_slot] = record['item_cnt_day']
    return x

In [None]:
# Vectorize the three splits in train / val / test order.
x_train, x_val, x_test = (
    vectorize(raw_split) for raw_split in (x_train_o, x_val_o, x_test_o)
)

In [None]:
# The raw dict arrays are no longer needed once vectorized; free them.
del x_train_o, x_val_o, x_test_o
gc.collect()

In [None]:
# Report the shape of each vectorized split.
for caption, split_array in (('X_train shape : ', x_train),
                             ('X_val shape : ', x_val),
                             ('X_test shape : ', x_test)):
    print(caption, split_array.shape)

# Modeling

In [None]:
print('Build Model...')
# A single LSTM layer over the maxlen-step window, then one regression output.
model = Sequential()
model.add(LSTM(32, input_shape=(maxlen, length)))
# Linear output: item_cnt_day is standardized earlier in this notebook, so the
# targets can be negative, which a ReLU output could never produce.
model.add(Dense(1, activation='linear'))

optimizer = RMSprop(lr=0.005)
model.compile(loss='mean_squared_error', optimizer=optimizer)

In [None]:
# Train on the train split and evaluate on the validation split after every epoch.
validation_pair = (x_val, y_val)
history = model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=13,
    validation_data=validation_pair,
)

In [None]:
# Plot per-epoch training and validation loss of the first fit.
history_dict = history.history
loss, val_loss = history_dict['loss'], history_dict['val_loss']

epochs = range(1, len(loss) + 1)

for curve, line_format, curve_label in ((loss, 'bo', 'Training Loss'),
                                        (val_loss, 'b', 'Validation Loss')):
    plt.plot(epochs, curve, line_format, label=curve_label)

plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Continue training the same model on the validation split; no validation_data is passed to this fit.
history2 = model.fit(x_val, y_val, batch_size=128, epochs=13)

In [None]:
# Plot the per-epoch loss of the second fit.
history_dict = history2.history
loss = history_dict['loss']
# 'val_loss' is only recorded when fit() is given validation data; history2
# was fit without any, so indexing it directly raises KeyError.
val_loss = history_dict.get('val_loss')

epochs = range(1, len(loss) + 1)

plt.plot(epochs, loss, 'bo', label='Training Loss')
if val_loss is not None:
    plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()