In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from itertools import product
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import time
import sys
import gc
import pickle
sys.version_info

sys.version_info(major=3, minor=7, micro=6, releaselevel='final', serial=0)

In [2]:
from module.data.read_data import *

In [3]:
train = sales_file_processing()

In [4]:
test = test_file_processing().set_index('ID')

items = pd.read_csv(ITEMS_FILE)
shops = shops_file_processing()
cats = categories_file_processing()

In [5]:
train = train[train.item_price<100000]
train = train[train.item_cnt_day<1001]

In [6]:
median = train[(train.shop_id==32)&(train.item_id==2973)&(train.date_block_num==4)&(train.item_price>0)].item_price.median()
train.loc[train.item_price<0, 'item_price'] = median

In [7]:
ts = time.time()
cols = ['date_block_num','shop_id','item_id']
group = train.groupby(cols).agg({'item_cnt_day': ['sum']})

new_columns = ['item_cnt_month']
group.columns = new_columns
group.reset_index(inplace=True)
group[new_columns] = group[new_columns].astype(np.float16)

time.time() - ts

1.1100032329559326

In [8]:
date_block_nums = [i for i in range(35)]
shop_ids = test['shop_id'].unique()
item_ids = test['item_id'].unique()
columns = ["date_block_num", "shop_id", "item_id"]

index = pd.MultiIndex.from_product([date_block_nums, shop_ids, item_ids], names=columns)
matrix = pd.DataFrame(index = index).reset_index()

In [9]:
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497000 entries, 0 to 7496999
Data columns (total 3 columns):
date_block_num    int8
shop_id           int8
item_id           int16
dtypes: int16(1), int8(2)
memory usage: 28.6 MB


In [13]:
matrix.sort_values(cols, inplace=True)
matrix.reset_index(drop=True, inplace=True)

In [37]:
matrix.fillna(0, inplace=True)

In [15]:
matrix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497000 entries, 0 to 7496999
Data columns (total 3 columns):
date_block_num    int8
shop_id           int8
item_id           int16
dtypes: int16(1), int8(2)
memory usage: 28.6 MB


In [16]:
matrix = pd.merge(matrix, group,  how='left', on=columns)

In [18]:
matrix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7497000 entries, 0 to 7496999
Data columns (total 4 columns):
date_block_num    int8
shop_id           int8
item_id           int16
item_cnt_month    float16
dtypes: float16(1), int16(1), int8(2)
memory usage: 100.1 MB


In [19]:
from tqdm.notebook import tqdm
tqdm.pandas()

  from pandas import Panel


In [47]:
def get_sets(matrix):
    train_X = matrix[matrix.date_block_num < 32]
    train_X = np.array(list(train_X.groupby(['shop_id', 'item_id'])['item_cnt_month'].progress_apply(pd.DataFrame.as_matrix)))
    train_X = train_X.reshape((train_X.shape[0], train_X.shape[1], 1))
    train_y = matrix[matrix.date_block_num == 32]['item_cnt_month']
    
    valid_X = matrix[(matrix.date_block_num > 0) & (matrix.date_block_num < 33)]
    valid_X = np.array(list(valid_X.groupby(['shop_id', 'item_id'])['item_cnt_month'].progress_apply(pd.DataFrame.as_matrix)))
    valid_X = valid_X.reshape((valid_X.shape[0], valid_X.shape[1], 1))
    valid_y = matrix[matrix.date_block_num == 33]['item_cnt_month']
   
    test_X = matrix[(matrix.date_block_num > 1) & (matrix.date_block_num < 34)]
    test_X = np.array(list(test_X.groupby(['shop_id', 'item_id'])['item_cnt_month'].progress_apply(pd.DataFrame.as_matrix)))
    test_X = test_X.reshape((test_X.shape[0], test_X.shape[1], 1))
    test_y = matrix[matrix.date_block_num == 34]

    return (train_X, train_y), (valid_X, valid_y), (test_X, test_y)


In [54]:
train, valid, test_dataset = get_sets(matrix)

HBox(children=(FloatProgress(value=0.0, max=214200.0), HTML(value='')))

  return func(*args, **kwargs)





HBox(children=(FloatProgress(value=0.0, max=214200.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=214200.0), HTML(value='')))




In [23]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

Using TensorFlow backend.


In [24]:
import keras.backend as K

def rmse_keras(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

In [25]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse_sklearn(y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))

In [39]:
# our defining our model 
my_model = Sequential()
my_model.add(LSTM(units = 64,input_shape = (32, 1)))
my_model.add(Dropout(0.4))
my_model.add(Dense(1))

my_model.compile(loss=rmse_keras, optimizer='adam', metrics=[rmse_keras])
my_model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 64)                16896     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 16,961
Trainable params: 16,961
Non-trainable params: 0
_________________________________________________________________


In [40]:
my_model.fit(train[0], train[1], batch_size = 4096, epochs = 10, validation_data=valid)

Train on 214200 samples, validate on 214200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x27c23929848>

In [60]:
train_pred = my_model.predict(train[0])
valid_pred = my_model.predict(valid[0])
test_pred = my_model.predict(test_dataset[0])

In [63]:
mean_cv_scores = [
    rmse_sklearn(train[1], train_pred),
    rmse_sklearn(valid[1], valid_pred),
    rmse_sklearn(test_dataset[1]['item_cnt_month'], test_pred),
]

In [64]:
mean_cv_scores

[2.240683552975836, 2.4549468254965574, 0.04798060650407035]

In [57]:
import pickle

model_name = 'lstm'
with open('{}_train_pred.pickle'.format(model_name), 'wb') as f:
    pickle.dump(train_pred, f)

with open('{}_valid_pred.pickle'.format(model_name), 'wb') as f:
    pickle.dump(valid_pred, f)

with open('{}_test_pred.pickle'.format(model_name), 'wb') as f:
    pickle.dump(test_pred, f)

In [65]:
with open('{}_train.pickle'.format(model_name), 'wb') as f:
    pickle.dump(train[1], f)

with open('{}_valid.pickle'.format(model_name), 'wb') as f:
    pickle.dump(valid[1], f)

In [59]:
import pickle


test = test_file_processing().set_index('ID')
test = test[['shop_id', 'item_id']]

test_y = test_dataset[1]
test_y['item_cnt_month'] = test_pred.clip(0, 20)

answer = pd.merge(test, test_y,  how='left', on=['shop_id', 'item_id'])

submission = pd.DataFrame({
    "ID": answer.index, 
    "item_cnt_month": answer['item_cnt_month'],
})

submission.to_csv('{}_submission_filtered.csv'.format(model_name), index=False)
pickle.dump(answer['item_cnt_month'], open('{}_test_filtered.pickle'.format(model_name), 'wb'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
