In [1]:
import os
import numpy as np
import pandas as pd
#import matplotlib as mpl
import matplotlib.pyplot as plt
#import matplotlib.dates as mdates
%matplotlib inline 

In [2]:
DATA_FOLDER = './data/'

train = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv.gz'))
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz'))
items = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

In [3]:
max_train_date_block_num = train.date_block_num.max()

In [4]:
from itertools import product

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in train['date_block_num'].unique():
    cur_shops = train[train['date_block_num']==block_num]['shop_id'].unique()
    cur_items = train[train['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = pd.DataFrame(np.vstack(grid), columns = index_cols, dtype=np.int32)

#get aggregated values for (shop_id, item_id, month)
gb = train.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'item_cnt_month':'sum'}})

#fix column names
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
#join aggregated data to the grid
all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
#sort the data
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)
all_data.head()

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
139255,0,19,0,0.0
141495,0,27,0,0.0
144968,0,28,0,0.0
142661,0,29,0,0.0
138947,0,32,0,6.0


In [5]:
all_data.describe()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
count,10913850.0,10913850.0,10913850.0,10913850.0
mean,31.1872,11309.26,14.97334,0.3342731
std,17.34959,6209.978,9.495618,3.417243
min,0.0,0.0,0.0,-22.0
25%,16.0,5976.0,7.0,0.0
50%,30.0,11391.0,14.0,0.0
75%,46.0,16605.0,23.0,0.0
max,59.0,22169.0,33.0,2253.0


In [29]:
train_rup = all_data
train_rup.item_cnt_month = train_rup.item_cnt_month.clip(0, 20)
train_rup.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
139255,0,19,0,0.0
141495,0,27,0,0.0
144968,0,28,0,0.0
142661,0,29,0,0.0
138947,0,32,0,6.0


In [30]:
shop_means = train_rup.groupby(['shop_id', 'date_block_num']).mean()[['item_cnt_month']]
shop_means = shop_means.rename(columns={'item_cnt_month':'mean_shop_cnt_month'}).reset_index()
print(len(shop_means))
print(max_train_date_block_num * len(shops))
shop_means.head()

1586
1980


Unnamed: 0,shop_id,date_block_num,mean_shop_cnt_month
0,0,0,0.66679
1,0,1,0.712537
2,1,0,0.361306
3,1,1,0.40573
4,2,0,0.14122


In [31]:
category_means = train_rup.merge(items, how='left').groupby(['date_block_num', 'item_category_id']).mean()[['item_cnt_month']]
category_means = category_means.rename(columns={'item_cnt_month':'mean_category_cnt_month'}).reset_index()
print(len(category_means))
print(max_train_date_block_num * len(categories))
category_means.head()

2077
2772


Unnamed: 0,date_block_num,item_category_id,mean_category_cnt_month
0,0,0,0.022222
1,0,1,0.022222
2,0,2,0.835435
3,0,3,4.422222
4,0,4,0.513131


In [32]:
train_rup = train_rup.merge(shop_means)
print(len(train_rup))
train_rup.head()

10913850


Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,mean_shop_cnt_month
0,0,19,0,0.0,0.66679
1,0,27,0,0.0,0.66679
2,0,28,0,0.0,0.66679
3,0,29,0,0.0,0.66679
4,0,32,0,6.0,0.66679


In [33]:
train_rup = train_rup.merge(items)
print(len(train_rup))
train_rup.head()

10913850


Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,mean_shop_cnt_month,item_name,item_category_id
0,0,19,0,0.0,0.66679,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40
1,1,19,0,0.0,0.361306,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40
2,2,19,0,0.0,0.14122,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40
3,3,19,0,0.0,0.094516,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40
4,4,19,0,0.0,0.257425,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40


In [34]:
train_rup = train_rup.merge(category_means)
print(len(train_rup))
train_rup.head()

10913850


Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,mean_shop_cnt_month,item_name,item_category_id,mean_category_cnt_month
0,0,19,0,0.0,0.66679,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,0.293846
1,1,19,0,0.0,0.361306,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,0.293846
2,2,19,0,0.0,0.14122,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,0.293846
3,3,19,0,0.0,0.094516,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,0.293846
4,4,19,0,0.0,0.257425,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,0.293846


In [35]:
train_rup = train_rup.loc[:, ['shop_id', 'item_id', 'date_block_num', 'mean_shop_cnt_month', 'mean_category_cnt_month', 'item_cnt_month']]
train_rup = train_rup.assign(prev_date_block_num = train_rup.date_block_num - 1)
train_rup.head()

Unnamed: 0,shop_id,item_id,date_block_num,mean_shop_cnt_month,mean_category_cnt_month,item_cnt_month,prev_date_block_num
0,0,19,0,0.66679,0.293846,0.0,-1
1,1,19,0,0.361306,0.293846,0.0,-1
2,2,19,0,0.14122,0.293846,0.0,-1
3,3,19,0,0.094516,0.293846,0.0,-1
4,4,19,0,0.257425,0.293846,0.0,-1


In [36]:
merged = train_rup.merge(train_rup, how='left', left_on=['prev_date_block_num', 'shop_id', 'item_id'], right_on=['date_block_num', 'shop_id', 'item_id'] )
merged.head()

Unnamed: 0,shop_id,item_id,date_block_num_x,mean_shop_cnt_month_x,mean_category_cnt_month_x,item_cnt_month_x,prev_date_block_num_x,date_block_num_y,mean_shop_cnt_month_y,mean_category_cnt_month_y,item_cnt_month_y,prev_date_block_num_y
0,0,19,0,0.66679,0.293846,0.0,-1,,,,,
1,1,19,0,0.361306,0.293846,0.0,-1,,,,,
2,2,19,0,0.14122,0.293846,0.0,-1,,,,,
3,3,19,0,0.094516,0.293846,0.0,-1,,,,,
4,4,19,0,0.257425,0.293846,0.0,-1,,,,,


In [37]:
buf = merged.loc[:, ['shop_id', 'item_id', 'date_block_num_x', 'mean_shop_cnt_month_y', 'mean_category_cnt_month_y', 'item_cnt_month_y', 'item_cnt_month_x']]
buf.rename(columns={'date_block_num_x':'date_block_num', 'item_cnt_month_x': 'item_cnt_month', 'mean_category_cnt_month_y':'mean_category_cnt_prev_month', 'mean_shop_cnt_month_y':'mean_shop_cnt_prev_month', 'item_cnt_month_y':'item_cnt_prev_month'}, inplace=True)
buf.head()

Unnamed: 0,shop_id,item_id,date_block_num,mean_shop_cnt_prev_month,mean_category_cnt_prev_month,item_cnt_prev_month,item_cnt_month
0,0,19,0,,,,0.0
1,1,19,0,,,,0.0
2,2,19,0,,,,0.0
3,3,19,0,,,,0.0
4,4,19,0,,,,0.0


In [38]:
#buf = buf.merge(means, how='left', left_on=['shop_id', 'item_id'], right_on=['shop_id', 'item_id'])
buf = buf.assign(valid=pd.Series(~buf.item_cnt_prev_month.isnull(), dtype=int))
buf.item_cnt_prev_month.fillna(-1, inplace=True)
buf.mean_shop_cnt_prev_month.fillna(-1, inplace=True)
buf.mean_category_cnt_prev_month.fillna(-1, inplace=True)
buf.head()

Unnamed: 0,shop_id,item_id,date_block_num,mean_shop_cnt_prev_month,mean_category_cnt_prev_month,item_cnt_prev_month,item_cnt_month,valid
0,0,19,0,-1.0,-1.0,-1.0,0.0,0
1,1,19,0,-1.0,-1.0,-1.0,0.0,0
2,2,19,0,-1.0,-1.0,-1.0,0.0,0
3,3,19,0,-1.0,-1.0,-1.0,0.0,0
4,4,19,0,-1.0,-1.0,-1.0,0.0,0


In [39]:
buf.describe()

Unnamed: 0,shop_id,item_id,date_block_num,mean_shop_cnt_prev_month,mean_category_cnt_prev_month,item_cnt_prev_month,item_cnt_month,valid
count,10913850.0,10913850.0,10913850.0,10913850.0,10913850.0,10913850.0,10913850.0,10913850.0
mean,31.1872,11309.26,14.97334,0.00531935,0.01477373,0.0553887,0.2982399,0.7731313
std,17.34959,6209.978,9.495618,0.5778523,0.6634297,1.319458,1.222373,0.418807
min,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.0,0.0
25%,16.0,5976.0,7.0,0.08653093,0.05189063,0.0,0.0,1.0
50%,30.0,11391.0,14.0,0.1986196,0.1965922,0.0,0.0,1.0
75%,46.0,16605.0,23.0,0.2879637,0.2514297,0.0,0.0,1.0
max,59.0,22169.0,33.0,1.799243,19.24,20.0,20.0,1.0


In [40]:
target_col = 'item_cnt_month'
X = buf.loc[:, buf.columns != target_col].values
print(X) #X.head()

[[ 0.000e+00  1.900e+01  0.000e+00 ... -1.000e+00 -1.000e+00  0.000e+00]
 [ 1.000e+00  1.900e+01  0.000e+00 ... -1.000e+00 -1.000e+00  0.000e+00]
 [ 2.000e+00  1.900e+01  0.000e+00 ... -1.000e+00 -1.000e+00  0.000e+00]
 ...
 [ 5.700e+01  4.931e+03  3.300e+01 ... -1.000e+00 -1.000e+00  0.000e+00]
 [ 5.800e+01  4.931e+03  3.300e+01 ... -1.000e+00 -1.000e+00  0.000e+00]
 [ 5.900e+01  4.931e+03  3.300e+01 ... -1.000e+00 -1.000e+00  0.000e+00]]


In [41]:
y = buf.loc[:, [target_col]].values.ravel()
print(y) #y.head()

[0. 0. 0. ... 0. 0. 0.]


In [42]:
def gen_time_split(X, n_splits):
    for i in range(n_splits):
        print(i)
        first_vali_date_block_num = max_train_date_block_num - i
        vali_indices = X.loc[:,'date_block_num'] == first_vali_date_block_num
        train_indices = X.loc[:,'date_block_num'] < first_vali_date_block_num
        yield (train_indices[train_indices].index, vali_indices[vali_indices].index)        

In [66]:
cv = gen_time_split(buf, 3)

In [72]:
from sklearn.ensemble import GradientBoostingRegressor
est = GradientBoostingRegressor(n_estimators=50, max_depth=7, loss='ls', verbose=1)

In [68]:
#lr = 1 / np.logspace(0.0, 1.0, num=5)[2:]
#lr = np.array([0.3, 0.45, 0.6])
#lr = np.linspace(0.3, 0.6, 5)
lr = np.array([0.3])
print(lr)

[0.3]


In [69]:
from sklearn.model_selection import GridSearchCV
param_grid = {'learning_rate':lr}
gs = GridSearchCV(est, param_grid, cv=cv, refit=True, n_jobs=4, scoring='r2', verbose=1)
#gs.fit(X, y)

In [70]:
#from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps=[('Search', gs)]) #('Scaling', StandardScaler()), 

In [73]:
#pipe.fit(X,y)
est.fit(X,y)

      Iter       Train Loss   Remaining Time 
         1           1.3973           36.39m
         2           1.3188           35.37m
         3           1.2541           34.79m
         4           1.2020           33.83m
         5           1.1594           32.84m
         6           1.1243           32.23m
         7           1.0958           31.52m
         8           1.0726           30.81m
         9           1.0536           30.11m
        10           1.0382           29.46m
        20           0.9712           22.21m
        30           0.9499           14.82m
        40           0.9361            7.48m
        50           0.9264            0.00s


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=1, warm_start=False)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(est, X, y, cv=cv)
print(scores)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [47]:
print(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'])
#gs.best_estimator_

NameError: name 'gs' is not defined

In [74]:
from sklearn.metrics import mean_squared_error
y_pred = est.predict(X) #pipe.predict(X)
mse = mean_squared_error(y, y_pred)
print(mse)
print(np.sqrt(mse))

0.926443010735238
0.9625190962963998


In [75]:
X_test = test.assign(date_block_num=max_train_date_block_num+1, prev_date_block_num=max_train_date_block_num).drop(columns=['ID'])
X_test.head()

Unnamed: 0,shop_id,item_id,date_block_num,prev_date_block_num
0,5,5037,34,33
1,5,5320,34,33
2,5,5233,34,33
3,5,5232,34,33
4,5,5268,34,33


In [76]:
merged2 = X_test.merge(train_rup, how='left', left_on=['prev_date_block_num', 'shop_id', 'item_id'], right_on=['date_block_num', 'shop_id', 'item_id'] )
merged2.head()

Unnamed: 0,shop_id,item_id,date_block_num_x,prev_date_block_num_x,date_block_num_y,mean_shop_cnt_month,mean_category_cnt_month,item_cnt_month,prev_date_block_num_y
0,5,5037,34,33,33.0,0.190098,0.379047,0.0,32.0
1,5,5320,34,33,,,,,
2,5,5233,34,33,33.0,0.190098,0.379047,1.0,32.0
3,5,5232,34,33,33.0,0.190098,0.337353,0.0,32.0
4,5,5268,34,33,,,,,


In [77]:
buf2 = merged2.loc[:, ['shop_id', 'item_id', 'date_block_num_x', 'mean_shop_cnt_month', 'mean_category_cnt_month', 'item_cnt_month']]
buf2.rename(columns={'date_block_num_x':'date_block_num', 'mean_shop_cnt_month':'mean_shop_cnt_prev_month', 'mean_category_cnt_month':'mean_category_cnt_prev_month', 'item_cnt_month':'item_cnt_prev_month'}, inplace=True)
buf2.head()

Unnamed: 0,shop_id,item_id,date_block_num,mean_shop_cnt_prev_month,mean_category_cnt_prev_month,item_cnt_prev_month
0,5,5037,34,0.190098,0.379047,0.0
1,5,5320,34,,,
2,5,5233,34,0.190098,0.379047,1.0
3,5,5232,34,0.190098,0.337353,0.0
4,5,5268,34,,,


In [78]:
buf2 = buf2.assign(valid=pd.Series(~buf2.item_cnt_prev_month.isnull(), dtype=int))
buf2.item_cnt_prev_month.fillna(-1, inplace=True)
buf2.mean_shop_cnt_prev_month.fillna(-1, inplace=True)
buf2.mean_category_cnt_prev_month.fillna(-1, inplace=True)
buf2.head()

Unnamed: 0,shop_id,item_id,date_block_num,mean_shop_cnt_prev_month,mean_category_cnt_prev_month,item_cnt_prev_month,valid
0,5,5037,34,0.190098,0.379047,0.0,1
1,5,5320,34,-1.0,-1.0,-1.0,0
2,5,5233,34,0.190098,0.379047,1.0,1
3,5,5232,34,0.190098,0.337353,0.0,1
4,5,5268,34,-1.0,-1.0,-1.0,0


In [79]:
X_test = buf2.values

In [80]:
y_pred_test = est.predict(X_test) #pipe.predict(X_test)
print(y_pred_test)

[0.20531101 0.15155253 0.55934457 ... 0.08583404 0.08857669 0.1006706 ]


In [81]:
submission = test.assign(item_cnt_month=y_pred_test)[['item_cnt_month']]
submission.describe()

Unnamed: 0,item_cnt_month
count,214200.0
mean,0.257611
std,0.642333
min,-0.034788
25%,0.088577
50%,0.128054
75%,0.221238
max,19.73025


In [82]:
submission.head()

Unnamed: 0,item_cnt_month
0,0.205311
1,0.151553
2,0.559345
3,0.178515
4,0.17257


In [83]:
submission.to_csv('ShopAndCategoryMeans_50.csv', index_label='ID') #header=['ID', 'item_cnt_month'])

In [84]:
!gzip ShopAndCategoryMeans_50.csv
!ls

Baseline.ipynb	   EDA.ipynb	   ShopAndCategoryMeans_50.csv.gz
combos.csv.gz	   lagged2.csv.gz  ShopAndCategoryMeans.csv
Combos.ipynb	   lagged3.csv.gz  ShopAndCategoryMeans_xgb.csv.gz
combos_xgb.csv.gz  lagged.csv.gz   Shop and item category means.ipynb
data		   Lagged.ipynb    submission.csv.gz


0.3 is best learning rate so far.

[
 (split1_train_idxs, split1_test_idxs),
 (split2_train_idxs, split2_test_idxs),
 (split3_train_idxs, split3_test_idxs),
 ...
]

"Submissions are evaluated by root mean squared error (RMSE). True target values are clipped into [0,20] range."

and

"For each id in the test set, you must predict a total number of sales."

and

"Submission is for date_block_num 34"

and

"
My CV strategy is 5-fold moving window:

fold 1: Train on month 0 to 32 and validate on 33
fold 2: Train on month 0 to 31 and validate on 32
…
fold 5: Train on month 0 to 28 and validate on 29
"

and

- mean encodings
- lag
- text extraction on item and category names

In [None]:
#import sys
#!conda install --yes --prefix {sys.prefix} xgboost
#{sys.executable} -m pip install xgboost

In [59]:
import xgboost as xgb
dtrain = xgb.DMatrix(buf.loc[:, buf.columns != target_col], label=y)
param = {'max_depth':7, 'eta':0.3, 'silent':0, 'objective':'reg:linear', 'eval_metrix':'rmse' }
num_round = 100
bst = xgb.train(param, dtrain, num_round)

[20:32:31] Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[20:32:38] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[20:32:43] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[20:32:49] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[20:32:55] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[20:33:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 252 extra nodes, 0 pruned nodes, max_depth=7
[20:33:06] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 250 extra nodes, 0 pruned nodes, max_depth=7
[20:33:11] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 252

[20:38:41] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 252 extra nodes, 0 pruned nodes, max_depth=7
[20:38:47] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 192 extra nodes, 0 pruned nodes, max_depth=7
[20:38:53] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[20:38:59] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 220 extra nodes, 0 pruned nodes, max_depth=7
[20:39:04] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[20:39:10] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 246 extra nodes, 0 pruned nodes, max_depth=7
[20:39:15] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[20:39:21] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[20:39:2

In [60]:
y_pred = bst.predict(dtrain)
mse = mean_squared_error(y, y_pred)
print(mse)
print(np.sqrt(mse))

0.889699998879066
0.9432390995283572


In [61]:
# make prediction
dtest = xgb.DMatrix(buf2)
y_pred_test = bst.predict(dtest)
y_pred_test

array([0.15239403, 0.07205728, 0.40105173, ..., 0.07691392, 0.08787793,
       0.04896444], dtype=float32)