In [1]:
import os
import numpy as np
import pandas as pd
#import matplotlib as mpl
import matplotlib.pyplot as plt
#import matplotlib.dates as mdates
%matplotlib inline 

In [2]:
DATA_FOLDER = './data/'

train = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv.gz'))
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz'))
items = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

In [3]:
max_train_date_block_num = train.date_block_num.max()

In [4]:
from itertools import product

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in train['date_block_num'].unique():
    cur_shops = train[train['date_block_num']==block_num]['shop_id'].unique()
    cur_items = train[train['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = pd.DataFrame(np.vstack(grid), columns = index_cols, dtype=np.int32)

#get aggregated values for (shop_id, item_id, month)
gb = train.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'item_cnt_month':'sum'}})

#fix column names
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
#join aggregated data to the grid
all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
#sort the data
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)
all_data.head()

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
139255,0,19,0,0.0
141495,0,27,0,0.0
144968,0,28,0,0.0
142661,0,29,0,0.0
138947,0,32,0,6.0


In [5]:
all_data.describe()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
count,10913850.0,10913850.0,10913850.0,10913850.0
mean,31.1872,11309.26,14.97334,0.3342731
std,17.34959,6209.978,9.495618,3.417243
min,0.0,0.0,0.0,-22.0
25%,16.0,5976.0,7.0,0.0
50%,30.0,11391.0,14.0,0.0
75%,46.0,16605.0,23.0,0.0
max,59.0,22169.0,33.0,2253.0


In [6]:
train_rup = all_data
train_rup.item_cnt_month = train_rup.item_cnt_month.clip(0, 20)
train_rup = train_rup.assign(prev_date_block_num = train_rup.date_block_num - 1)
train_rup.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,prev_date_block_num
139255,0,19,0,0.0,-1
141495,0,27,0,0.0,-1
144968,0,28,0,0.0,-1
142661,0,29,0,0.0,-1
138947,0,32,0,6.0,-1


means = train_rup.groupby(['shop_id', 'item_id'])[['item_cnt_month']].mean().reset_index().sort_values(['shop_id', 'item_id'])
means.rename(columns={'item_cnt_month':'item_cnt_month_mean'}, inplace=True)
means.head()

In [7]:
merged = train_rup.merge(train_rup, how='left', left_on=['prev_date_block_num', 'shop_id', 'item_id'], right_on=['date_block_num', 'shop_id', 'item_id'] )
merged.head()

Unnamed: 0,shop_id,item_id,date_block_num_x,item_cnt_month_x,prev_date_block_num_x,date_block_num_y,item_cnt_month_y,prev_date_block_num_y
0,0,19,0,0.0,-1,,,
1,0,27,0,0.0,-1,,,
2,0,28,0,0.0,-1,,,
3,0,29,0,0.0,-1,,,
4,0,32,0,6.0,-1,,,


In [8]:
buf = merged.loc[:, ['shop_id', 'item_id', 'date_block_num_x', 'item_cnt_month_y', 'item_cnt_month_x']]
buf.rename(columns={'date_block_num_x':'date_block_num', 'item_cnt_month_x': 'item_cnt_month', 'item_cnt_month_y':'item_cnt_prev_month'}, inplace=True)
buf.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_prev_month,item_cnt_month
0,0,19,0,,0.0
1,0,27,0,,0.0
2,0,28,0,,0.0
3,0,29,0,,0.0
4,0,32,0,,6.0


In [9]:
#buf = buf.merge(means, how='left', left_on=['shop_id', 'item_id'], right_on=['shop_id', 'item_id'])
buf = buf.assign(valid=pd.Series(~buf.item_cnt_prev_month.isnull(), dtype=int))
buf.item_cnt_prev_month.fillna(-1, inplace=True)
buf.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_prev_month,item_cnt_month,valid
0,0,19,0,-1.0,0.0,0
1,0,27,0,-1.0,0.0,0
2,0,28,0,-1.0,0.0,0
3,0,29,0,-1.0,0.0,0
4,0,32,0,-1.0,6.0,0


In [10]:
buf.describe()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_prev_month,item_cnt_month,valid
count,10913850.0,10913850.0,10913850.0,10913850.0,10913850.0,10913850.0
mean,31.1872,11309.26,14.97334,0.0553887,0.2982399,0.7731313
std,17.34959,6209.978,9.495618,1.319458,1.222373,0.418807
min,0.0,0.0,0.0,-1.0,0.0,0.0
25%,16.0,5976.0,7.0,0.0,0.0,1.0
50%,30.0,11391.0,14.0,0.0,0.0,1.0
75%,46.0,16605.0,23.0,0.0,0.0,1.0
max,59.0,22169.0,33.0,20.0,20.0,1.0


In [11]:
target_col = 'item_cnt_month'
X = buf.loc[:, buf.columns != target_col].values
print(X) #X.head()

[[ 0.0000e+00  1.9000e+01  0.0000e+00 -1.0000e+00  0.0000e+00]
 [ 0.0000e+00  2.7000e+01  0.0000e+00 -1.0000e+00  0.0000e+00]
 [ 0.0000e+00  2.8000e+01  0.0000e+00 -1.0000e+00  0.0000e+00]
 ...
 [ 5.9000e+01  2.2164e+04  3.3000e+01  0.0000e+00  1.0000e+00]
 [ 5.9000e+01  2.2166e+04  3.3000e+01  0.0000e+00  1.0000e+00]
 [ 5.9000e+01  2.2167e+04  3.3000e+01  0.0000e+00  1.0000e+00]]


In [12]:
y = buf.loc[:, [target_col]].values.ravel()
print(y) #y.head()

[0. 0. 0. ... 0. 0. 0.]


In [13]:
def gen_time_split(X, n_splits):
    for i in range(n_splits):
        print(i)
        first_vali_date_block_num = max_train_date_block_num - i
        vali_indices = X.loc[:,'date_block_num'] == first_vali_date_block_num
        train_indices = X.loc[:,'date_block_num'] < first_vali_date_block_num
        yield (train_indices[train_indices].index, vali_indices[vali_indices].index)        

In [14]:
cv = gen_time_split(buf, 1)

In [15]:
from sklearn.ensemble import GradientBoostingRegressor
est = GradientBoostingRegressor(n_estimators=10, max_depth=7, loss='ls', verbose=1)

In [16]:
#lr = 1 / np.logspace(0.0, 1.0, num=5)[2:]
#lr = np.array([0.3, 0.45, 0.6])
#lr = np.linspace(0.3, 0.6, 5)
lr = np.array([0.3])
print(lr)

[0.3]


In [17]:
from sklearn.model_selection import GridSearchCV
param_grid = {'learning_rate':lr}
gs = GridSearchCV(est, param_grid, cv=cv, refit=True, n_jobs=4, scoring='r2', verbose=1)
#gs.fit(X, y)

In [18]:
#from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps=[('Search', gs)]) #('Scaling', StandardScaler()), 

In [19]:
pipe.fit(X,y)

0
Fitting 1 folds for each of 1 candidates, totalling 1 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:  4.4min finished


      Iter       Train Loss   Remaining Time 
         1           1.2412            3.55m
         2           1.1140            3.21m
         3           1.0507            2.83m
         4           1.0164            2.42m
         5           0.9980            2.05m
         6           0.9872            1.65m
         7           0.9789            1.25m
         8           0.9738           50.12s
         9           0.9664           25.23s
        10           0.9624            0.00s


Pipeline(memory=None,
     steps=[('Search', GridSearchCV(cv=<generator object gen_time_split at 0x7f3da4a81678>,
       error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7, max_features=None,
       ...}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='r2', verbose=1))])

from sklearn.model_selection import cross_val_score
scores = cross_val_score(est, X, y, cv=cv)
print(scores)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [20]:
print(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'])
#gs.best_estimator_

[0.24402449]
[{'learning_rate': 0.3}]


In [21]:
from sklearn.metrics import mean_squared_error
y_pred = pipe.predict(X)
mse = mean_squared_error(y, y_pred)
print(mse)
print(np.sqrt(mse))

0.9623996225623318
0.9810196851043977


In [22]:
X_test = test.assign(date_block_num=max_train_date_block_num+1, prev_date_block_num=max_train_date_block_num).drop(columns=['ID'])
X_test.head()

Unnamed: 0,shop_id,item_id,date_block_num,prev_date_block_num
0,5,5037,34,33
1,5,5320,34,33
2,5,5233,34,33
3,5,5232,34,33
4,5,5268,34,33


In [23]:
merged2 = X_test.merge(train_rup, how='left', left_on=['prev_date_block_num', 'shop_id', 'item_id'], right_on=['date_block_num', 'shop_id', 'item_id'] )
merged2.head()

Unnamed: 0,shop_id,item_id,date_block_num_x,prev_date_block_num_x,date_block_num_y,item_cnt_month,prev_date_block_num_y
0,5,5037,34,33,33.0,0.0,32.0
1,5,5320,34,33,,,
2,5,5233,34,33,33.0,1.0,32.0
3,5,5232,34,33,33.0,0.0,32.0
4,5,5268,34,33,,,


In [24]:
buf2 = merged2.loc[:, ['shop_id', 'item_id', 'date_block_num_x', 'item_cnt_month']]
buf2.rename(columns={'date_block_num_x':'date_block_num', 'item_cnt_month':'item_cnt_prev_month'}, inplace=True)
buf2.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_prev_month
0,5,5037,34,0.0
1,5,5320,34,
2,5,5233,34,1.0
3,5,5232,34,0.0
4,5,5268,34,


In [25]:
buf2 = buf2.assign(valid=pd.Series(~buf2.item_cnt_prev_month.isnull(), dtype=int))
buf2.item_cnt_prev_month.fillna(-1, inplace=True)
buf2.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_prev_month,valid
0,5,5037,34,0.0,1
1,5,5320,34,-1.0,0
2,5,5233,34,1.0,1
3,5,5232,34,0.0,1
4,5,5268,34,-1.0,0


In [26]:
X_test = buf2.values

In [27]:
y_pred_test = pipe.predict(X_test)
print(y_pred_test)

[0.13899542 0.18831306 0.49094366 ... 0.11572538 0.11572538 0.11975792]


In [34]:
submission = test.assign(item_cnt_month=y_pred_test)[['item_cnt_month']]
submission.describe()

Unnamed: 0,item_cnt_month
count,214200.0
mean,0.269337
std,0.695639
min,-0.072007
25%,0.084585
50%,0.129889
75%,0.225853
max,21.851133


In [None]:
submission.head()

In [35]:
submission.to_csv('combos_xgb.csv', index_label='ID') #header=['ID', 'item_cnt_month'])

In [36]:
!gzip combos_xgb.csv
!ls

Baseline.ipynb	combos_xgb.csv.gz  lagged2.csv.gz  Lagged.ipynb
combos.csv.gz	data		   lagged3.csv.gz  submission.csv.gz
Combos.ipynb	EDA.ipynb	   lagged.csv.gz


0.3 is best learning rate so far.

[
 (split1_train_idxs, split1_test_idxs),
 (split2_train_idxs, split2_test_idxs),
 (split3_train_idxs, split3_test_idxs),
 ...
]

"Submissions are evaluated by root mean squared error (RMSE). True target values are clipped into [0,20] range."

and

"For each id in the test set, you must predict a total number of sales."

and

"Submission is for date_block_num 34"

and

"
My CV strategy is 5-fold moving window:

fold 1: Train on month 0 to 32 and validate on 33
fold 2: Train on month 0 to 31 and validate on 32
…
fold 5: Train on month 0 to 28 and validate on 29
"

and

- mean encodings
- lag
- text extraction on item and category names

In [None]:
#import sys
#!conda install --yes --prefix {sys.prefix} xgboost
#{sys.executable} -m pip install xgboost

In [31]:
import xgboost as xgb
dtrain = xgb.DMatrix(buf.loc[:, buf.columns != target_col], label=y)
param = {'max_depth':7, 'eta':0.3, 'silent':0, 'objective':'reg:linear', 'eval_metrix':'rmse' }
num_round = 100
bst = xgb.train(param, dtrain, num_round)

[19:46:33] Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[19:46:39] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 250 extra nodes, 0 pruned nodes, max_depth=7
[19:46:43] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 252 extra nodes, 0 pruned nodes, max_depth=7
[19:46:47] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 252 extra nodes, 0 pruned nodes, max_depth=7
[19:46:51] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[19:46:56] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[19:47:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[19:47:04] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254

[19:51:23] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 238 extra nodes, 0 pruned nodes, max_depth=7
[19:51:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 240 extra nodes, 0 pruned nodes, max_depth=7
[19:51:31] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[19:51:36] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 248 extra nodes, 0 pruned nodes, max_depth=7
[19:51:40] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 244 extra nodes, 0 pruned nodes, max_depth=7
[19:51:44] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 228 extra nodes, 0 pruned nodes, max_depth=7
[19:51:49] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 0 pruned nodes, max_depth=7
[19:51:53] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 250 extra nodes, 0 pruned nodes, max_depth=7
[19:51:5

In [32]:
y_pred = bst.predict(dtrain)
mse = mean_squared_error(y, y_pred)
print(mse)
print(np.sqrt(mse))

0.9126763902918501
0.9553409811642386


In [33]:
# make prediction
dtest = xgb.DMatrix(buf2)
y_pred_test = bst.predict(dtest)
y_pred_test

array([ 0.14166307, -0.01116216,  0.41987526, ...,  0.06655723,
        0.06721783,  0.01776317], dtype=float32)