In [1]:
import numpy as np
import pandas as pd
from sklearn import *
import nltk, datetime
from matplotlib import pylab as plt
import timeit

# Loading all data files
directory = '../data/'
train = pd.read_csv(directory + 'sales_train_v2.csv', parse_dates=['date'], infer_datetime_format=True, dayfirst=True)
test = pd.read_csv(directory + 'test.csv')
submission = pd.read_csv(directory + 'sample_submission.csv')
items = pd.read_csv(directory + 'items.csv')
item_cats = pd.read_csv(directory + 'item_categories.csv')
shops = pd.read_csv(directory + 'shops.csv')



In [2]:
# Removing unnecesary features for this model 
train_clean = train.drop(labels = ['date', 'item_price'], axis = 1)

# Change the item count per day to item count per month by using grouping
train_clean = train_clean.groupby(["item_id","shop_id","date_block_num"]).sum().reset_index()
train_clean = train_clean.rename(index=str, columns = {"item_cnt_day":"item_cnt_month"})

In [3]:
# Data preprocessing cell
num_month = train['date_block_num'].max()
month_list = [i for i in range(num_month + 1)] 

# Shop and item selection
shops = [54] * (num_month + 1) 
items = [22167] * (num_month + 1)

check = train_clean[["shop_id","item_id","date_block_num","item_cnt_month"]]
check = check.loc[check['shop_id'] == 54]
check = check.loc[check['item_id'] == 22167]

months_full = pd.DataFrame({'shop_id': shops, 'item_id': items, 'date_block_num':month_list})

sales_33month = pd.merge(check, months_full, how='right', on=['shop_id','item_id','date_block_num'])
sales_33month = sales_33month.sort_values(by=['date_block_num'])
sales_33month.fillna(0.00, inplace=True)

df = sales_33month[['shop_id','item_id','date_block_num','item_cnt_month']].reset_index()
df = df.drop(labels = ['index'], axis = 1)

x, y = df, df.item_cnt_month

In [4]:
from sklearn.cross_validation import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

x_train = x_train.drop(["item_cnt_month"], axis=1)
x_test = x_test.drop(["item_cnt_month"], axis=1)

In [5]:
x_train

Unnamed: 0,shop_id,item_id,date_block_num
21,54,22167,21
12,54,22167,12
32,54,22167,32
9,54,22167,9
0,54,22167,0
4,54,22167,4
16,54,22167,16
17,54,22167,17
5,54,22167,5
13,54,22167,13


In [6]:
# Reshape the data between -1 and 1 and to 3D
from sklearn.preprocessing import RobustScaler, MinMaxScaler
scaler = RobustScaler()
x_train_scaled = scaler.fit_transform(x_train)

x_train_reshaped = x_train_scaled.reshape((x_train_scaled.shape[0], 1, x_train_scaled.shape[1]))
y_train_reshaped = y_train.as_matrix().reshape(y_train.shape[0], )

x_test_scaled = scaler.fit_transform(x_test)
x_test_reshaped = x_test_scaled.reshape((x_test_scaled.shape[0], 1, x_test_scaled.shape[1]))

  import sys


In [7]:
# Model instantiation

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.core import Dropout, Activation

model = Sequential()

model.add(LSTM(15, input_shape=(1, 3), return_sequences=True, activation='tanh'))
model.add(Dense(1))

model.add(Dropout(0.1))
  
model.add(LSTM(33)) 
model.add(Dropout(0.1))

model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adagrad', metrics=['mean_squared_error', 'accuracy'])

model.summary()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 1, 15)             1140      
_________________________________________________________________
dense_1 (Dense)              (None, 1, 1)              16        
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 1)              0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 33)                4620      
_________________________________________________________________
dropout_2 (Dropout)          (None, 33)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 34        
Total params: 5,810
Trainable params: 5,810
Non-trainable params: 0
_________________________________________________________________


In [8]:
start = timeit.default_timer()
history = model.fit(x_train_reshaped, y_train_reshaped,  epochs = 100, batch_size = 33, verbose=1, shuffle = False, validation_split = 0.95)
stop = timeit.default_timer()

# Duration
print (stop - start)

Train on 1 samples, validate on 27 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100


Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100


Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
3.927233263085267


In [9]:
start = timeit.default_timer()
y_predicted = model.predict(x_test_reshaped)
stop = timeit.default_timer()

print (stop - start)

0.14003093151561163


In [10]:
from sklearn.metrics import mean_squared_error
from numpy import sqrt
rmse = sqrt(mean_squared_error(y_test, y_predicted))
print('Val RMSE: %.3f' % rmse)

Val RMSE: 1.309
