In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv
/kaggle/input/competitive-data-science-predict-future-sales/shops.csv
/kaggle/input/competitive-data-science-predict-future-sales/items.csv
/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv
/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv
/kaggle/input/competitive-data-science-predict-future-sales/test.csv


In [2]:
import random
import tensorflow as tf

from tqdm.notebook import tqdm, trange

In [3]:
random_seed = 1234
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

In [4]:
data_dir = '/kaggle/input/competitive-data-science-predict-future-sales'

In [5]:
df_submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
df_shops = pd.read_csv(os.path.join(data_dir, 'shops.csv'))
df_item_categories = pd.read_csv(os.path.join(data_dir, 'item_categories.csv'))
df_items = pd.read_csv(os.path.join(data_dir, 'items.csv'))
df_train = pd.read_csv(os.path.join(data_dir, 'sales_train.csv'))

In [6]:
df_train['date'] = pd.to_datetime(df_train['date'], format='%d.%m.%Y')
df_train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.00,1.0
1,2013-01-03,0,25,2552,899.00,1.0
2,2013-01-05,0,25,2552,899.00,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.00,1.0
...,...,...,...,...,...,...
2935844,2015-10-10,33,25,7409,299.00,1.0
2935845,2015-10-09,33,25,7460,299.00,1.0
2935846,2015-10-14,33,25,7459,349.00,1.0
2935847,2015-10-22,33,25,7440,299.00,1.0


In [7]:
test_keys = df_test[['shop_id', 'item_id']].values
test_keys = [tuple(x) for x in test_keys]
len(test_keys)

214200

In [8]:
df_group = df_train.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_price': 'mean', 'item_cnt_day': 'sum'})
df_group = df_group.reset_index()
df_group

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,0,0,32,221.0,6.0
1,0,0,33,347.0,3.0
2,0,0,35,247.0,1.0
3,0,0,43,221.0,1.0
4,0,0,51,128.5,2.0
...,...,...,...,...,...
1609119,33,59,22087,119.0,6.0
1609120,33,59,22088,119.0,2.0
1609121,33,59,22091,179.0,1.0
1609122,33,59,22100,629.0,1.0


In [9]:
df_pivot_cnt = df_group.pivot_table(index = ['shop_id', 'item_id'], values = ['item_cnt_day'], columns = ['date_block_num'], fill_value = 0, aggfunc='sum')
df_pivot_cnt = df_pivot_cnt.reset_index()
df_pivot_cnt

Unnamed: 0_level_0,shop_id,item_id,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424119,59,22154,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
424120,59,22155,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
424121,59,22162,0,0,0,0,0,0,0,0,...,0,9,4,1,1,0,0,1,0,0
424122,59,22164,0,0,0,0,0,0,0,0,...,0,2,1,2,0,0,1,0,0,0


In [10]:
df_train_cnt = df_pivot_cnt
df_train_cnt

Unnamed: 0_level_0,shop_id,item_id,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424119,59,22154,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
424120,59,22155,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
424121,59,22162,0,0,0,0,0,0,0,0,...,0,9,4,1,1,0,0,1,0,0
424122,59,22164,0,0,0,0,0,0,0,0,...,0,2,1,2,0,0,1,0,0,0


In [11]:
df_train_cnt = df_train_cnt.fillna(0)
df_train_cnt

Unnamed: 0_level_0,shop_id,item_id,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424119,59,22154,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
424120,59,22155,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
424121,59,22162,0,0,0,0,0,0,0,0,...,0,9,4,1,1,0,0,1,0,0
424122,59,22164,0,0,0,0,0,0,0,0,...,0,2,1,2,0,0,1,0,0,0


In [12]:
df_test_cnt = pd.merge(df_test, df_pivot_cnt, on = ['item_id','shop_id'], how = 'left')
df_test_cnt

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0,ID,shop_id,item_id,"(item_cnt_day, 0)","(item_cnt_day, 1)","(item_cnt_day, 2)","(item_cnt_day, 3)","(item_cnt_day, 4)","(item_cnt_day, 5)","(item_cnt_day, 6)",...,"(item_cnt_day, 24)","(item_cnt_day, 25)","(item_cnt_day, 26)","(item_cnt_day, 27)","(item_cnt_day, 28)","(item_cnt_day, 29)","(item_cnt_day, 30)","(item_cnt_day, 31)","(item_cnt_day, 32)","(item_cnt_day, 33)"
0,0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,1,5,5320,,,,,,,,...,,,,,,,,,,
2,2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,3,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,5,5268,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,214195,45,18454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
214196,214196,45,16188,,,,,,,,...,,,,,,,,,,
214197,214197,45,15757,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214198,214198,45,19648,,,,,,,,...,,,,,,,,,,


In [13]:
df_test_cnt = df_test_cnt.fillna(0)
df_test_cnt

Unnamed: 0,ID,shop_id,item_id,"(item_cnt_day, 0)","(item_cnt_day, 1)","(item_cnt_day, 2)","(item_cnt_day, 3)","(item_cnt_day, 4)","(item_cnt_day, 5)","(item_cnt_day, 6)",...,"(item_cnt_day, 24)","(item_cnt_day, 25)","(item_cnt_day, 26)","(item_cnt_day, 27)","(item_cnt_day, 28)","(item_cnt_day, 29)","(item_cnt_day, 30)","(item_cnt_day, 31)","(item_cnt_day, 32)","(item_cnt_day, 33)"
0,0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,1,5,5320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,3,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,5,5268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,214195,45,18454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
214196,214196,45,16188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214197,214197,45,15757,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214198,214198,45,19648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
test_inputs_ids = df_test_cnt[['shop_id', 'shop_id']].values
test_inputs_ids

array([[ 5,  5],
       [ 5,  5],
       [ 5,  5],
       ...,
       [45, 45],
       [45, 45],
       [45, 45]])

In [15]:
test_df_ids = df_test[['shop_id', 'shop_id']].values
test_df_ids

array([[ 5,  5],
       [ 5,  5],
       [ 5,  5],
       ...,
       [45, 45],
       [45, 45],
       [45, 45]])

In [16]:
np.array_equal(test_inputs_ids, test_df_ids)

True

In [17]:
train_matrix = df_train_cnt.drop(['shop_id', 'item_id'], axis=1)
train_matrix = np.expand_dims(train_matrix, axis=-1)
train_matrix.shape

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


(424124, 34, 1)

In [18]:
test_matrix = df_test_cnt.drop(['ID', 'shop_id', 'item_id'], axis=1)
test_matrix = np.expand_dims(test_matrix, axis=-1)
test_matrix.shape

(214200, 34, 1)

In [19]:
bs, n_total, d_model = train_matrix.shape
bs, n_total, d_model

(424124, 34, 1)

In [20]:
n_seq = 33
total_inputs, total_labels = [], []
for i in range(n_total - n_seq):
    x = train_matrix[:, i:i + n_seq]
    y = train_matrix[:, i + n_seq, :1]
    total_inputs.append(x)
    total_labels.append(y)

In [21]:
if len(total_inputs) == 1:
    train_inputs = total_inputs[0]
else:
    train_inputs = np.concatenate(total_inputs, axis=0)
train_inputs.shape

(424124, 33, 1)

In [22]:
if len(total_labels) == 1:
    train_labels = total_labels[0]
else:
    train_labels = np.concatenate(total_labels, axis=0)
train_labels.shape

(424124, 1)

In [23]:
test_inputs = test_matrix[:,1:]
test_inputs.shape

(214200, 33, 1)

In [24]:
def build_model_rnn(n_seq, d_model):
    inputs = tf.keras.layers.Input((n_seq, d_model))  # bs, n_seq, d_model

    hidden = tf.keras.layers.LSTM(units=64)(inputs)  # (bs, units)
    hidden = tf.keras.layers.Dropout(0.4)(hidden)

    output_dense = tf.keras.layers.Dense(1)
    outputs = output_dense(hidden)

    model = tf.keras.Model(inputs=(inputs), outputs=outputs)
    return model

In [25]:
model = build_model_rnn(n_seq, d_model)

model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer =tf.keras.optimizers.Adam())
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 33, 1)]           0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                16896     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 16,961
Trainable params: 16,961
Non-trainable params: 0
_________________________________________________________________


In [26]:
# early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
# save weights
save_weights = tf.keras.callbacks.ModelCheckpoint('weights.hdf5', monitor='loss', verbose=1, save_best_only=True, mode='min', save_freq='epoch', save_weights_only=True)

In [27]:
model.fit(train_inputs, train_labels, epochs = 10, batch_size = 4096, callbacks=[early_stopping, save_weights])

Epoch 1/10
Epoch 00001: loss improved from inf to 15.80727, saving model to weights.hdf5
Epoch 2/10
Epoch 00002: loss improved from 15.80727 to 15.64802, saving model to weights.hdf5
Epoch 3/10
Epoch 00003: loss improved from 15.64802 to 15.55693, saving model to weights.hdf5
Epoch 4/10
Epoch 00004: loss improved from 15.55693 to 15.47287, saving model to weights.hdf5
Epoch 5/10
Epoch 00005: loss improved from 15.47287 to 15.44960, saving model to weights.hdf5
Epoch 6/10
Epoch 00006: loss improved from 15.44960 to 15.35402, saving model to weights.hdf5
Epoch 7/10
Epoch 00007: loss improved from 15.35402 to 15.34254, saving model to weights.hdf5
Epoch 8/10
Epoch 00008: loss improved from 15.34254 to 15.31667, saving model to weights.hdf5
Epoch 9/10
Epoch 00009: loss improved from 15.31667 to 15.31440, saving model to weights.hdf5
Epoch 10/10
Epoch 00010: loss improved from 15.31440 to 15.27722, saving model to weights.hdf5


<tensorflow.python.keras.callbacks.History at 0x7f1016d3af50>

In [28]:
model = build_model_rnn(n_seq, d_model)
model.load_weights('weights.hdf5')

In [29]:
y_pred = model.predict(test_inputs)
y_pred = np.squeeze(y_pred, axis=-1)
y_pred

array([0.6317266 , 2.1093674 , 0.9448272 , ..., 0.08102894, 2.1093674 ,
       0.0471254 ], dtype=float32)

In [30]:
ID = df_test['ID'].values
ID

array([     0,      1,      2, ..., 214197, 214198, 214199])

In [31]:
submission = pd.DataFrame({'ID': ID,'item_cnt_month': y_pred})
submission

Unnamed: 0,ID,item_cnt_month
0,0,0.631727
1,1,2.109367
2,2,0.944827
3,3,0.067820
4,4,2.109367
...,...,...
214195,214195,0.203284
214196,214196,2.109367
214197,214197,0.081029
214198,214198,2.109367


In [32]:
submission.to_csv('submission.csv',index = False)