In [1]:
import numpy as np
import pandas as pd
from sklearn import *
import nltk, datetime
from matplotlib import pylab as plt

directory = '../data/'


def _load_csv(filename, **read_options):
    """Read one CSV file from the data directory and return it as a DataFrame."""
    return pd.read_csv(directory + filename, **read_options)


# The sales history is the only file with a date column; it is parsed day-first.
train = _load_csv('sales_train_v2.csv', parse_dates=['date'], infer_datetime_format=True, dayfirst=True)
test = _load_csv('test.csv')
submission = _load_csv('sample_submission.csv')
items = _load_csv('items.csv')
item_cats = _load_csv('item_categories.csv')
shops = _load_csv('shops.csv')

# Quick sanity check on the sizes of the two main tables.
print('train:', train.shape, 'test:', test.shape)



train: (2935849, 6) test: (214200, 3)


In [2]:
# Convert the raw daily sales into monthly totals, broken out by item & shop.
# The resulting wide table (one column per 'YYYY-MM' month) is a placeholder
# that is used later to create the actual training set.

# Build the month key with the vectorised .dt accessor instead of a per-row
# Python lambda. The Series keeps the name 'date', so the grouped index level
# and the pivoted column axis are still called 'date'.
month_key = train['date'].dt.strftime('%Y-%m')

# Sum only item_cnt_day. Summing the whole frame would also try to add up the
# raw datetime 'date' column (older pandas silently dropped it, newer releases
# reject it) and would aggregate columns that are discarded right afterwards.
monthly_sales = (
    train.groupby([month_key, 'item_id', 'shop_id'])['item_cnt_day']
    .sum()
    .reset_index()
)

# One row per (item, shop) pair, one column per month; months without sales are 0.
horizontal = monthly_sales.pivot_table(
    index=['item_id', 'shop_id'],
    columns='date',
    values='item_cnt_day',
    fill_value=0,
).reset_index()

In [3]:
# Left-join the monthly sales table onto the test rows so that every test
# (shop, item) pair gets one sales column per month. Pairs without any sales
# history come back as NaN and are filled with 0. The identifier columns are
# dropped afterwards, so the frame has the same layout as the training data.
key_columns = ['item_id', 'shop_id']
df_test = (
    test.merge(horizontal, on=key_columns, how='left')
    .fillna(0)
    .drop(labels=['ID', 'shop_id', 'item_id'], axis=1)
)

In [4]:
# Build the actual training set from the wide monthly table:
# the '2015-10' sales column is the target, every other column is an input.
TARGET = '2015-10'
X_train = df_test.drop(columns=[TARGET])
y_train = df_test[TARGET]

In [5]:
# To make the training set friendly for keras, convert it to numpy arrays:
# inputs shaped (samples, timesteps, 1) and targets shaped (samples, 1).
# np.asarray is used instead of DataFrame.as_matrix(), which was deprecated and
# later removed from pandas. It also accepts an ndarray, so re-running this
# cell after X_train / y_train have already been converted does not fail.
X_train = np.asarray(X_train)
# Append the trailing feature axis using the array's own dimensions rather than
# hard-coding the row and month counts.
X_train = X_train.reshape(X_train.shape[:2] + (1,))

y_train = np.asarray(y_train).reshape(-1, 1)

print(y_train.shape)
print(X_train.shape)

(214200, 1)
(214200, 33, 1)


In [6]:
# Lastly, create the test inputs from the same wide monthly table.
# Dropping the first month ('2013-01') while keeping the latest one shifts the
# input window forward by one month, so the trained LSTM can output predictions
# beyond the known time range with sequences as long as the training ones.
X_test = df_test.drop(labels=['2013-01'], axis=1)
# np.asarray replaces the deprecated/removed DataFrame.as_matrix().
X_test = np.asarray(X_test)
# Append the trailing feature axis using the array's own dimensions rather than
# hard-coding the row and month counts.
X_test = X_test.reshape(X_test.shape[:2] + (1,))
print(X_test.shape)

(214200, 33, 1)


In [7]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation
from keras.optimizers import Adam, SGD, RMSprop
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
# Create the model: a single LSTM layer with 15 units followed by a one-unit
# Dense layer that produces the predicted value - a simple starting point.
# Feel free to play around with the number of units & other model parameters
model = Sequential()
# Take (timesteps, features) from the training array instead of hard-coding it,
# so the model follows the data if the number of monthly columns changes.
model.add(LSTM(15, input_shape=X_train.shape[1:]))
model.add(Dense(1))

# The adam optimizer works pretty well, although you might try RMSProp as well
model.compile(loss='mse',
              optimizer='adam',
              metrics=['mean_squared_error'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 15)                1020      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 16        
Total params: 1,036
Trainable params: 1,036
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Training hyper-parameters: mini-batch size and number of passes over the data.
BATCH_SIZE = 128
number_of_epochs = 5

fit_options = {'batch_size': BATCH_SIZE, 'epochs': number_of_epochs}

# Fit the model on the training sequences; the call is the last expression of
# the cell, so the returned History object is shown as the cell output.
print('Training time, it is...')
model.fit(X_train, y_train, **fit_options)

Training time, it is...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2758a25aef0>

In [10]:
# Predict on the test sequences, then clamp the values to the [0, 20] range.
raw_predictions = model.predict(X_test)
y_pred = raw_predictions.clip(0., 20.)

# Start the submission: wrap the clipped predictions in a frame with a single
# 'item_cnt_month' column.
preds = pd.DataFrame(data=y_pred, columns=['item_cnt_month'])