In [1]:
import os
import numpy as np
import pandas as pd
#import matplotlib as mpl
import matplotlib.pyplot as plt
#import matplotlib.dates as mdates
%matplotlib inline 

In [2]:
DATA_FOLDER = './data/'

train = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv.gz'))
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz'))
items = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

date_block_nums = train[['date_block_num']].drop_duplicates().reset_index()
combos = shops.assign(foo=1).merge(items.assign(foo=1)).drop('foo', 1)
combos = combos.assign(foo=1).merge(date_block_nums.assign(foo=1)).drop('foo', 1)
combos.head()

print(len(combos))
print(len(date_block_nums) * len(shops) * len(items))

train.date = pd.to_datetime(train.date, format='%d.%m.%Y')
print(train.date.min(), train.date.max())

In [3]:
max_train_date_block_num = train.date_block_num.max()

In [4]:
train.item_cnt_day = train.item_cnt_day.clip(0, 20)

In [5]:
train_rup = train.groupby(['date_block_num', 'shop_id', 'item_id']).sum().loc[:, 'item_cnt_day'].reset_index().sort_values(['date_block_num', 'shop_id', 'item_id'])
train_rup.rename(columns={'item_cnt_day':'item_cnt_month'}, inplace=True)
train_rup.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,0,0,32,6.0
1,0,0,33,3.0
2,0,0,35,1.0
3,0,0,43,1.0
4,0,0,51,2.0


In [6]:
train_rup = train_rup.assign(prev_date_block_num = train_rup.date_block_num - 1)
train_rup.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,prev_date_block_num
0,0,0,32,6.0,-1
1,0,0,33,3.0,-1
2,0,0,35,1.0,-1
3,0,0,43,1.0,-1
4,0,0,51,2.0,-1


means = train_rup.groupby(['shop_id', 'item_id'])[['item_cnt_month']].mean().reset_index().sort_values(['shop_id', 'item_id'])
means.rename(columns={'item_cnt_month':'item_cnt_month_mean'}, inplace=True)
means.head()

In [7]:
merged = train_rup.merge(train_rup, how='left', left_on=['prev_date_block_num', 'shop_id', 'item_id'], right_on=['date_block_num', 'shop_id', 'item_id'] )
merged.head()

Unnamed: 0,date_block_num_x,shop_id,item_id,item_cnt_month_x,prev_date_block_num_x,date_block_num_y,item_cnt_month_y,prev_date_block_num_y
0,0,0,32,6.0,-1,,,
1,0,0,33,3.0,-1,,,
2,0,0,35,1.0,-1,,,
3,0,0,43,1.0,-1,,,
4,0,0,51,2.0,-1,,,


In [8]:
buf = merged.loc[:, ['date_block_num_x', 'shop_id', 'item_id', 'item_cnt_month_y', 'item_cnt_month_x']]
buf.rename(columns={'date_block_num_x':'date_block_num', 'item_cnt_month_x': 'item_cnt_month', 'item_cnt_month_y':'item_cnt_prev_month'}, inplace=True)
buf.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_prev_month,item_cnt_month
0,0,0,32,,6.0
1,0,0,33,,3.0
2,0,0,35,,1.0
3,0,0,43,,1.0
4,0,0,51,,2.0


In [10]:
#buf = buf.merge(means, how='left', left_on=['shop_id', 'item_id'], right_on=['shop_id', 'item_id'])
buf = buf.assign(valid=pd.Series(~buf.item_cnt_prev_month.isnull(), dtype=int))
buf.item_cnt_prev_month.fillna(-1, inplace=True)
buf.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_prev_month,item_cnt_month,valid
0,0,0,32,-1.0,6.0,0
1,0,0,33,-1.0,3.0,0
2,0,0,35,-1.0,1.0,0
3,0,0,43,-1.0,1.0,0
4,0,0,51,-1.0,2.0,0


In [11]:
buf.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_prev_month,item_cnt_month,valid
count,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0,1609124.0
mean,14.66479,32.80585,10680.99,0.841635,2.22614,0.420563
std,9.542322,16.53701,6238.883,6.508466,6.437858,0.4936496
min,0.0,0.0,0.0,-1.0,0.0,0.0
25%,6.0,21.0,5045.0,-1.0,1.0,0.0
50%,14.0,31.0,10497.0,-1.0,1.0,0.0
75%,23.0,47.0,16060.0,1.0,2.0,1.0
max,33.0,59.0,22169.0,618.0,618.0,1.0


In [12]:
target_col = 'item_cnt_month'
X = buf.loc[:, buf.columns != target_col].values
print(X) #X.head()

[[ 0.0000e+00  0.0000e+00  3.2000e+01 -1.0000e+00  0.0000e+00]
 [ 0.0000e+00  0.0000e+00  3.3000e+01 -1.0000e+00  0.0000e+00]
 [ 0.0000e+00  0.0000e+00  3.5000e+01 -1.0000e+00  0.0000e+00]
 ...
 [ 3.3000e+01  5.9000e+01  2.2091e+04  3.0000e+00  1.0000e+00]
 [ 3.3000e+01  5.9000e+01  2.2100e+04  1.0000e+00  1.0000e+00]
 [ 3.3000e+01  5.9000e+01  2.2102e+04 -1.0000e+00  0.0000e+00]]


In [13]:
y = buf.loc[:, [target_col]].values.ravel()
print(y) #y.head()

[6. 3. 1. ... 1. 1. 1.]


from sklearn.model_selection import KFold
cv = KFold(n_splits=3, shuffle=True)
print(cv)

In [14]:
def gen_time_split(X, n_splits):
    for i in range(n_splits):
        print(i)
        first_vali_date_block_num = max_train_date_block_num - i
        vali_indices = X.loc[:,'date_block_num'] == first_vali_date_block_num
        train_indices = X.loc[:,'date_block_num'] < first_vali_date_block_num
        yield (train_indices[train_indices].index, vali_indices[vali_indices].index)        

In [15]:
cv = gen_time_split(buf, 3)

for (train_indices, vali_indices) in gen_time_split(buf, 3):
    print('train')
    print(X[train_indices])
    print('test')
    print(X[vali_indices])

In [16]:
from sklearn.ensemble import GradientBoostingRegressor
est = GradientBoostingRegressor(n_estimators=100, max_depth=7, loss='ls', verbose=1)

In [17]:
#lr = 1 / np.logspace(0.0, 1.0, num=5)[2:]
#lr = np.array([0.3, 0.45, 0.6])
#lr = np.linspace(0.3, 0.6, 5)
lr = np.array([0.3])
print(lr)

[0.3]


In [18]:
from sklearn.model_selection import GridSearchCV
param_grid = {'learning_rate':lr}
gs = GridSearchCV(est, param_grid, cv=cv, refit=True, n_jobs=4, scoring='r2', verbose=1)
#gs.fit(X, y)

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps=[('Scaling', StandardScaler()), ('Search', gs)])

In [20]:
pipe.fit(X,y)

0
1
2
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:  7.4min finished


      Iter       Train Loss   Remaining Time 
         1          27.7456            4.90m
         2          20.8531            4.82m
         3          17.3259            4.74m
         4          15.3891            4.74m
         5          14.4356            4.61m
         6          13.8540            4.56m
         7          13.4816            4.49m
         8          13.2537            4.44m
         9          12.9656            4.41m
        10          12.8220            4.32m
        20          11.4329            3.83m
        30          10.3394            3.37m
        40           9.7703            2.90m
        50           9.1783            2.43m
        60           8.7514            1.94m
        70           8.3919            1.46m
        80           8.1209           58.43s
        90           7.8940           29.11s
       100           7.7374            0.00s


Pipeline(memory=None,
     steps=[('Scaling', StandardScaler(copy=True, with_mean=True, with_std=True)), ('Search', GridSearchCV(cv=<generator object gen_time_split at 0x7fbbec1763b8>,
       error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
           ...
       refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1))])

from sklearn.model_selection import cross_val_score
scores = cross_val_score(est, X, y, cv=cv)
print(scores)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [21]:
print(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'])
#gs.best_estimator_

[-18.39451776]
[{'learning_rate': 0.3}]


In [22]:
from sklearn.metrics import mean_squared_error
y_pred = pipe.predict(X)
mse = mean_squared_error(y, y_pred)
print(mse)
print(np.sqrt(mse))

7.737353468633712
2.7816098699554743


In [23]:
X_test = test.assign(date_block_num=max_train_date_block_num+1, prev_date_block_num=max_train_date_block_num).drop(columns=['ID'])
X_test.head()

Unnamed: 0,shop_id,item_id,date_block_num,prev_date_block_num
0,5,5037,34,33
1,5,5320,34,33
2,5,5233,34,33
3,5,5232,34,33
4,5,5268,34,33


In [24]:
merged2 = X_test.merge(train_rup, how='left', left_on=['prev_date_block_num', 'shop_id', 'item_id'], right_on=['date_block_num', 'shop_id', 'item_id'] )
merged2.head()

Unnamed: 0,shop_id,item_id,date_block_num_x,prev_date_block_num_x,date_block_num_y,item_cnt_month,prev_date_block_num_y
0,5,5037,34,33,,,
1,5,5320,34,33,,,
2,5,5233,34,33,33.0,1.0,32.0
3,5,5232,34,33,,,
4,5,5268,34,33,,,


In [54]:
buf2 = merged2.loc[:, ['date_block_num_x', 'shop_id', 'item_id', 'item_cnt_month']]
buf2.rename(columns={'date_block_num_x':'date_block_num', 'item_cnt_month':'item_cnt_prev_month'}, inplace=True)
buf2.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_prev_month
0,34,5,5037,
1,34,5,5320,
2,34,5,5233,1.0
3,34,5,5232,
4,34,5,5268,


In [55]:
buf2 = buf2.assign(valid=pd.Series(~buf2.item_cnt_prev_month.isnull(), dtype=int))
buf2.item_cnt_prev_month.fillna(-1, inplace=True)
buf2.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_prev_month,valid
0,34,5,5037,-1.0,0
1,34,5,5320,-1.0,0
2,34,5,5233,1.0,1
3,34,5,5232,-1.0,0
4,34,5,5268,-1.0,0


In [56]:
X_test = buf2.values

In [57]:
y_pred_test = pipe.predict(X_test)
print(y_pred_test)

[1.42306685 1.21112435 1.36891599 ... 1.21063522 1.21994674 0.88613285]


In [29]:
submission = test.assign(item_cnt_month=y_pred_test)[['item_cnt_month']]
submission.describe()

Unnamed: 0,item_cnt_month
count,214200.0
mean,1.484593
std,2.278481
min,-3.027911
25%,1.159105
50%,1.307529
75%,1.515532
max,463.55372


In [30]:
submission.head()

Unnamed: 0,item_cnt_month
0,1.423067
1,1.211124
2,1.368916
3,1.361695
4,1.322851


In [31]:
submission.to_csv('lagged3.csv', index_label='ID') #header=['ID', 'item_cnt_month'])

In [32]:
!gzip lagged3.csv
!ls

Baseline.ipynb	EDA.ipynb	lagged3.csv.gz	Lagged.ipynb
data		lagged2.csv.gz	lagged.csv.gz	submission.csv.gz


0.3 is best learning rate so far.

[
 (split1_train_idxs, split1_test_idxs),
 (split2_train_idxs, split2_test_idxs),
 (split3_train_idxs, split3_test_idxs),
 ...
]

"Submissions are evaluated by root mean squared error (RMSE). True target values are clipped into [0,20] range."

and

"For each id in the test set, you must predict a total number of sales."

and

"Submission is for date_block_num 34"

and

"
My CV strategy is 5-fold moving window:

fold 1: Train on month 0 to 32 and validate on 33
fold 2: Train on month 0 to 31 and validate on 32
…
fold 5: Train on month 0 to 28 and validate on 29
"

and

- mean encodings
- lag
- text extraction on item and category names

In [38]:
import sys
#!conda install --yes --prefix {sys.prefix} xgboost
!{sys.executable} -m pip install xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/54/21/8b2ec99862903a6d3aed62ce156d21d114b8666e669c46d9e54041df9496/xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl (16.6MB)
[K    100% |████████████████████████████████| 16.6MB 2.1MB/s ta 0:00:011
Installing collected packages: xgboost
Successfully installed xgboost-0.81


In [68]:
import xgboost as xgb
dtrain = xgb.DMatrix(buf.loc[:, buf.columns != target_col], label=y)
param = {'max_depth':7, 'eta':0.3, 'silent':0, 'objective':'reg:linear', 'eval_metrix':'rmse' }
num_round = 100
bst = xgb.train(param, dtrain, num_round)

[22:25:46] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 174 extra nodes, 0 pruned nodes, max_depth=7
[22:25:47] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 166 extra nodes, 0 pruned nodes, max_depth=7
[22:25:47] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=7
[22:25:48] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=7
[22:25:48] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 164 extra nodes, 0 pruned nodes, max_depth=7
[22:25:48] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 178 extra nodes, 0 pruned nodes, max_depth=7
[22:25:49] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 194 extra nodes, 0 pruned nodes, max_depth=7
[22:25:49] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 178 extra nodes, 0 pruned nodes, max_depth=7
[22:25:5

[22:26:14] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 110 extra nodes, 0 pruned nodes, max_depth=7
[22:26:15] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 174 extra nodes, 0 pruned nodes, max_depth=7
[22:26:15] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 84 extra nodes, 0 pruned nodes, max_depth=7
[22:26:16] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 226 extra nodes, 0 pruned nodes, max_depth=7
[22:26:16] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 188 extra nodes, 0 pruned nodes, max_depth=7
[22:26:17] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 114 extra nodes, 0 pruned nodes, max_depth=7
[22:26:17] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 188 extra nodes, 0 pruned nodes, max_depth=7
[22:26:17] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 192 extra nodes, 0 pruned nodes, max_depth=7
[22:26:18

In [69]:
y_pred = bst.predict(dtrain)
mse = mean_squared_error(y, y_pred)
print(mse)
print(np.sqrt(mse))

8.016485296205774
2.831339841171627


In [70]:
# make prediction
dtest = xgb.DMatrix(buf2)
y_pred_test = bst.predict(dtest)
y_pred_test

array([1.4377265 , 1.3224547 , 1.4866621 , ..., 1.18872   , 1.1837723 ,
       0.79696286], dtype=float32)