In [1]:
# Core
import time
notebookstart= time.time()
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Keras Neural Net / LSTM (RNN)
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation, ThresholdedReLU, MaxPooling2D, Embedding, Dropout
from keras.optimizers import Adam, SGD, RMSprop
from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras import optimizers


# Sklearn Support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
import gc

# Viz
import matplotlib.pyplot as plt
%matplotlib inline

# Utility
def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error over the whole batch, for use as a Keras metric.

    The mean is taken over every element before the square root. Reducing
    over ``axis=-1`` first (as was done previously) collapses a size-1 output
    axis, so the square root was applied per sample and the reported value
    was the mean *absolute* error rather than an RMSE.

    Args:
        y_true: backend tensor of ground-truth targets.
        y_pred: backend tensor of predictions, same shape as ``y_true``.

    Returns:
        Scalar backend tensor: sqrt(mean((y_pred - y_true) ** 2)).
    """
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# Data locations. Alternative local layout:
# IN_PATH = "Large Data/Predict Future Sales/"
# OUT_PATH = "Large Data/Predict Future Sales/"
IN_PATH = "../input/"
OUT_PATH = ""

# Import data
sales = pd.read_csv(IN_PATH + 'sales_train.csv', parse_dates=['date'], infer_datetime_format=True, dayfirst=True)
shops = pd.read_csv(IN_PATH + 'shops.csv')
items = pd.read_csv(IN_PATH + 'items.csv')
cats = pd.read_csv(IN_PATH + 'item_categories.csv')
val = pd.read_csv(IN_PATH + 'test.csv')
#"sample_submission.csv.gz"

# Scale prices into [0, 1]; daily counts are left unscaled and clipped to [0, 20].
# scaler = preprocessing.StandardScaler()
scaler = MinMaxScaler(feature_range=(0, 1))
sales["item_price"] = scaler.fit_transform(sales["item_price"].values.reshape(-1, 1)).ravel()
# sales["item_cnt_day"] = scaler.fit_transform(sales["item_cnt_day"].values.reshape(-1,1))
# sales["item_cnt_day"] = sales["item_cnt_day"].astype(int)
# Assign the clipped column back instead of clipping a column selection with
# inplace=True: that form is chained assignment and does not reliably write
# through to `sales` (it is a no-op under pandas Copy-on-Write).
sales["item_cnt_day"] = sales["item_cnt_day"].clip(0., 20.)

# Remove the items/shops outside of forecast range: left-joining onto the
# test pairs keeps only (shop_id, item_id) combinations present in `val`.
# Pairs without any sales history come back as NaN rows and are zero-filled.
sales = pd.merge(val, sales, on=['item_id', 'shop_id'], how='left')
sales = sales.fillna(0)

# Represents the submission set: block 33 is duplicated and relabelled as
# block 34. The explicit .copy() makes `expand` an independent frame, so the
# relabelling is not a write into a slice of `sales` (SettingWithCopyWarning).
expand = sales.loc[sales.date_block_num == 33, :].copy()
expand["date_block_num"] = 34.0
sales = pd.concat([sales, expand])

In [2]:
# Clean
# Sum price and count per (date_block_num, shop_id, item_id), then
# unstack/stack the shop and item levels so every shop/item pair seen in
# `sales` gets a row in every date block; combinations without sales are 0.
df = (sales.groupby(["date_block_num", 'shop_id', "item_id"])[["item_price", "item_cnt_day"]].sum()
                .unstack(level=[1, 2]).fillna(0)
                .stack([1, 2]).fillna(0).reset_index())
# Clip the monthly totals to [0, 20]. The result is assigned back rather than
# clipping a column selection with inplace=True, which is chained assignment
# and does not reliably modify `df` (it is a no-op under pandas Copy-on-Write).
df["item_cnt_day"] = df["item_cnt_day"].clip(0., 20.)

In [3]:
def _add_group_stats(frame, key, prefix):
    """Attach per-(date_block_num, key) sum/mean/max/std of count and price.

    The new columns are named ``<prefix>_<column>_<stat>`` and are appended
    to the right of ``frame``; row order of ``frame`` is preserved.
    """
    grouped = frame.groupby(["date_block_num", key])[["item_cnt_day", "item_price"]]
    stats = grouped.agg({
        "item_cnt_day": ["sum", "mean", "max", "std"],
        "item_price": ["sum", "mean", "max", "std"],
    }).reset_index()
    # Flatten the two-level column index; the group keys have an empty second
    # level, so they end up as e.g. "<prefix>_date_block_num_".
    stats.columns = pd.Index([prefix + "_" + top + "_" + bottom
                              for top, bottom in stats.columns.tolist()])
    stat_keys = [prefix + "_date_block_num_", prefix + "_" + key + "_"]
    joined = pd.merge(frame, stats, how="left",
                      left_on=["date_block_num", key], right_on=stat_keys)
    return joined.drop(stat_keys, axis=1)


# By Item / Date
df = _add_group_stats(df, "item_id", "item")

# By Shop / Date
df = _add_group_stats(df, "shop_id", "shop")

In [4]:
# Attach the category name to every item, then split it once on '-' into a
# coarse part (category1) and a remainder (category2).
items = items.merge(cats, on="item_category_id", how='left')
name_parts = items["item_category_name"].str.split('-', n=1, expand=True)
items = pd.concat([items, name_parts], axis=1)
items = items.rename(columns={0: "category1", 1: "category2"})
items = items[["item_id", "category1", "category2"]]

# Encode the Russian category strings as categorical integers.
# astype(str) also turns a missing second part into a string label.
lbl = preprocessing.LabelEncoder()
for cat_col in ("category1", "category2"):
    as_text = items[cat_col].astype(str)
    items[cat_col] = lbl.fit_transform(as_text)

# Bring the encoded categories onto the modelling frame.
df = df.merge(items, on="item_id", how="left")

# Additional Ideas:
"""
Stochastic Gradient Descent
Batch Normalization
Less Dropout?
"""

In [5]:
# Targets: the count column of every block except block 0.
after_first_block = df["date_block_num"] != 0
y_var = df.loc[after_first_block, "item_cnt_day"].copy()

# Shift the block label by one; it becomes the index below and is therefore
# not part of the feature matrix.
df["date_block_num"] = df["date_block_num"] + 1

# Rows per block, computed as (#distinct shops) x (#distinct items).
# NOTE(review): this equals the real rows-per-block only when every shop/item
# combination is present in the frame - confirm for the data in use.
shop_count = df["shop_id"].nunique()
item_count = df["item_id"].nunique()
n_samples = shop_count * item_count

# Identifiers are not used as features.
df = df.drop(["shop_id", "item_id"], axis=1)
df = df.set_index("date_block_num")

In [7]:
# Input / Output
# `df` holds one feature row per shop/item pair for every block, in block
# order; `y_var` holds the targets of every block except the first. Row i of
# X (block t) is therefore paired with row i of y (block t + 1).
y = y_var.values.reshape(-1, 1)
# Shape (rows, timesteps=1, features). -1 lets numpy derive the row count
# instead of hard-coding the number of blocks.
X = df.values.reshape(-1, 1, df.shape[1])
assert X.shape[0] % n_samples == 0, "row count is not a whole number of blocks"

# Submission features: the final block (the duplicated last real month).
testing = X[-n_samples:, :, :]
X = X[:-n_samples, :, :]

# The final target block is that same duplicate, so pairing it with the last
# real feature block would ask the model to predict a value it is given as an
# input (target leakage, and it would be the whole validation set). Drop that
# synthetic pair from both X and y.
X = X[:-n_samples, :, :]
y = y[:-n_samples, :]
assert X.shape[0] == y.shape[0], "features and targets are misaligned"

print("X for Submission Shape: ", testing.shape)
print("y Shape: ", y.shape)
print("X Shape: ", X.shape)

# Validation Set: hold out the last `val_size` blocks.
val_size = 1
n_valid_rows = val_size * n_samples
y_train = y[:-n_valid_rows, :]
y_valid = y[-n_valid_rows:, :]
X_train = X[:-n_valid_rows, :, :]
X_valid = X[-n_valid_rows:, :, :]
print("y Train Shape: ", y_train.shape)
print("X Train Shape: ", X_train.shape)
print("y Valid Shape: ", y_valid.shape)
print("X Valid Shape: ", X_valid.shape)

In [None]:
# VALID=True  -> train on X_train, monitor X_valid, report the best epoch.
# VALID=False -> train on all of X/y with no hold-out.
VALID = True
if VALID:
    inputshape = (X_train.shape[1], X_train.shape[2])
else:
    inputshape = (X.shape[1], X.shape[2])

LSTM_PARAM = {"batch_size": 512,
              "verbose": 1,
              "epochs": 3}

print("Modeling Stage")
# Define the model layers: two stacked LSTMs, each followed by batch
# normalisation and dropout, then a single linear output unit.
model_lstm = Sequential()
model_lstm.add(LSTM(200, input_shape=inputshape, return_sequences=True))
model_lstm.add(BatchNormalization())
model_lstm.add(Dropout(0.4))

model_lstm.add(LSTM(64))
model_lstm.add(PReLU())
model_lstm.add(BatchNormalization())
model_lstm.add(Dropout(0.2))

model_lstm.add(Dense(1))

# Alternatives previously kept here as commented-out experiments:
#   * a third LSTM(64) + PReLU + BatchNormalization + Dropout(0.2) stage;
#   * a single LSTM(512) followed by Dense 256/256/128/64/32/16 stages, each
#     with PReLU + BatchNormalization + Dropout(0.05-0.2), and Dense(1).

# Compile
opt = optimizers.Adam(lr=0.02)
#opt = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model_lstm.compile(optimizer=opt, loss='mse', metrics=["mse", root_mean_squared_error])
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model_lstm.summary()

# Train Model
print("\nFit Model")
modelstart = time.time()
if VALID:
    # With patience=5 and only 3 epochs configured above, early stopping
    # cannot trigger unless "epochs" is raised.
    callbacks_list = [EarlyStopping(monitor="val_loss", min_delta=.001, patience=5, mode='auto')]
    hist = model_lstm.fit(X_train, y_train,
                          validation_data=(X_valid, y_valid),
                          callbacks=callbacks_list,
                          **LSTM_PARAM)

    # Model Evaluation
    best = np.argmin(hist.history["val_loss"])
    print("Optimal Epoch: ", best + 1)
    print("Train Score: {}, Validation Score: {}".format(hist.history["loss"][best],
                                                         hist.history["val_loss"][best]))

    plt.plot(hist.history['loss'], label='train')
    plt.plot(hist.history['val_loss'], label='validation')
    plt.xlabel("Epochs")
    plt.ylabel("Mean Square Error")
    plt.legend()
    # Save before show(): once the figure has been shown it is released, and
    # a later savefig() writes an empty canvas.
    plt.savefig("Train and Validation MSE Progression.png")
    plt.show()
else:
    hist = model_lstm.fit(X, y, **LSTM_PARAM)

    plt.plot(hist.history['loss'], label='Training Loss')
    plt.xlabel("Epochs")
    plt.ylabel("Mean Square Error")
    plt.legend()
    # Save before show() for the same reason as above.
    plt.savefig("Training Loss Progression.png")
    plt.show()

In [17]:
# Predict the submission block and clamp to the same [0, 20] range that was
# applied to the training counts.
pred = model_lstm.predict(testing)
clipped_pred = np.clip(pred, 0., 20.)

print("Output Submission")
# NOTE(review): the ID written below is just the positional row index of
# `testing`, whose order comes from the stacked shop/item grid built earlier.
# Confirm that this order matches the ID order in test.csv.
submission = pd.DataFrame(clipped_pred, columns=['item_cnt_month'])
submission.to_csv(OUT_PATH + 'submission.csv', index_label='ID')

print(submission.shape)
print(submission.head())

# Wall-clock timings, in minutes.
model_minutes = (time.time() - modelstart) / 60
print("\nModel Runtime: %0.2f Minutes" % model_minutes)
notebook_minutes = (time.time() - notebookstart) / 60
print("Notebook Runtime: %0.2f Minutes" % notebook_minutes)