In [1]:
import os
import numpy as np
import pandas as pd
#import matplotlib as mpl
import matplotlib.pyplot as plt
#import matplotlib.dates as mdates
%matplotlib inline 

In [2]:
DATA_FOLDER = './data/'

train = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv.gz'))
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz'))
items = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

train.date = pd.to_datetime(train.date, format='%d.%m.%Y')
print(train.date.min(), train.date.max())

In [3]:
train.item_cnt_day = train.item_cnt_day.clip(0, 20)

In [4]:
train_rup = train.groupby(['date_block_num', 'shop_id', 'item_id']).sum().loc[:, 'item_cnt_day'].reset_index().sort_values(['date_block_num', 'shop_id', 'item_id'])
train_rup.rename(columns={'item_cnt_day':'item_cnt_month'}, inplace=True)
train_rup.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,0,0,32,6.0
1,0,0,33,3.0
2,0,0,35,1.0
3,0,0,43,1.0
4,0,0,51,2.0


In [5]:
train_rup = train_rup.assign(prev_date_block_num = train_rup.date_block_num - 1)
train_rup.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,prev_date_block_num
0,0,0,32,6.0,-1
1,0,0,33,3.0,-1
2,0,0,35,1.0,-1
3,0,0,43,1.0,-1
4,0,0,51,2.0,-1


In [11]:
means = train_rup.groupby(['shop_id', 'item_id'])[['item_cnt_month']].mean().reset_index().sort_values(['shop_id', 'item_id'])
means.rename(columns={'item_cnt_month':'item_cnt_month_mean'}, inplace=True)
means.head()

Unnamed: 0,shop_id,item_id,item_cnt_month_mean
0,0,30,31.0
1,0,31,11.0
2,0,32,8.0
3,0,33,3.0
4,0,35,7.5


In [6]:
merged = train_rup.merge(train_rup, how='left', left_on=['prev_date_block_num', 'shop_id', 'item_id'], right_on=['date_block_num', 'shop_id', 'item_id'] )
merged.head()

Unnamed: 0,date_block_num_x,shop_id,item_id,item_cnt_month_x,prev_date_block_num_x,date_block_num_y,item_cnt_month_y,prev_date_block_num_y
0,0,0,32,6.0,-1,,,
1,0,0,33,3.0,-1,,,
2,0,0,35,1.0,-1,,,
3,0,0,43,1.0,-1,,,
4,0,0,51,2.0,-1,,,


In [13]:
buf = merged.loc[:, ['shop_id', 'item_id', 'item_cnt_month_y', 'item_cnt_month_x']]
buf.rename(columns={'item_cnt_month_x': 'item_cnt_month', 'item_cnt_month_y':'item_cnt_prev_month'}, inplace=True)
buf.head()

Unnamed: 0,shop_id,item_id,item_cnt_prev_month,item_cnt_month
0,0,32,,6.0
1,0,33,,3.0
2,0,35,,1.0
3,0,43,,1.0
4,0,51,,2.0


In [14]:
#buf = buf.merge(means, how='left', left_on=['shop_id', 'item_id'], right_on=['shop_id', 'item_id'])
buf.item_cnt_prev_month.fillna(train_rup.item_cnt_month.mean(), inplace=True)
buf.head()

Unnamed: 0,shop_id,item_id,item_cnt_prev_month,item_cnt_month
0,0,32,2.22614,6.0
1,0,33,2.22614,3.0
2,0,35,2.22614,1.0
3,0,43,2.22614,1.0
4,0,51,2.22614,2.0


In [15]:
buf.describe()

Unnamed: 0,shop_id,item_id,item_cnt_prev_month,item_cnt_month
count,1609124.0,1609124.0,1609124.0,1609124.0
mean,32.80585,10680.99,2.71098,2.22614
std,16.53701,6238.883,6.165318,6.437858
min,0.0,0.0,0.0,0.0
25%,21.0,5045.0,2.0,1.0
50%,31.0,10497.0,2.22614,1.0
75%,47.0,16060.0,2.22614,2.0
max,59.0,22169.0,618.0,618.0


In [16]:
target_col = 'item_cnt_month'
X = buf.loc[:, buf.columns != target_col].values
print(X) #X.head()

[[0.00000000e+00 3.20000000e+01 2.22614043e+00]
 [0.00000000e+00 3.30000000e+01 2.22614043e+00]
 [0.00000000e+00 3.50000000e+01 2.22614043e+00]
 ...
 [5.90000000e+01 2.20910000e+04 3.00000000e+00]
 [5.90000000e+01 2.21000000e+04 1.00000000e+00]
 [5.90000000e+01 2.21020000e+04 2.22614043e+00]]


In [17]:
y = buf.loc[:, [target_col]].values.ravel()
print(y) #y.head()

[6. 3. 1. ... 1. 1. 1.]


In [18]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=3, shuffle=True)
print(cv)

KFold(n_splits=3, random_state=None, shuffle=True)


In [None]:
cv2 = [(i, i) for i in range(5)]
cv2

In [19]:
from sklearn.ensemble import GradientBoostingRegressor
est = GradientBoostingRegressor(n_estimators=100, max_depth=6, loss='ls', verbose=1)

In [21]:
#lr = 1 / np.logspace(0.0, 1.0, num=5)[2:]
#lr = np.array([0.3, 0.45, 0.6])
#lr = np.linspace(0.3, 0.6, 5)
lr = np.array([0.3])
print(lr)

[0.3]


In [22]:
from sklearn.model_selection import GridSearchCV
param_grid = {'learning_rate':lr}
gs = GridSearchCV(est, param_grid, cv=cv, refit=True)
#gs.fit(X, y)

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps=[('Scaling', StandardScaler()), ('Search', gs)])

In [24]:
pipe.fit(X,y)

      Iter       Train Loss   Remaining Time 
         1          27.4045            1.79m
         2          20.9659            1.76m
         3          17.6976            1.75m
         4          16.0249            1.72m
         5          15.1146            1.69m
         6          14.6576            1.68m
         7          14.2802            1.67m
         8          14.0523            1.62m
         9          13.9351            1.59m
        10          13.8111            1.56m
        20          12.9433            1.31m
        30          12.3243            1.14m
        40          11.8814           58.41s
        50          11.5728           48.98s
        60          11.2397           39.35s
        70          10.9951           29.29s
        80          10.5753           19.66s
        90          10.3986            9.81s
       100          10.2724            0.00s
      Iter       Train Loss   Remaining Time 
         1          28.1006            1.68m
        

Pipeline(memory=None,
     steps=[('Scaling', StandardScaler(copy=True, with_mean=True, with_std=True)), ('Search', GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
            ...}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0))])

from sklearn.model_selection import cross_val_score
scores = cross_val_score(est, X, y, cv=cv)
print(scores)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [25]:
print(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'])
#gs.best_estimator_

[0.647928]
[{'learning_rate': 0.3}]


In [26]:
from sklearn.metrics import mean_squared_error
y_pred = pipe.predict(X)
mse = mean_squared_error(y, y_pred)
print(mse)
print(np.sqrt(mse))

10.75878390077296
3.2800585209372346


In [27]:
X_test = test.assign(prev_date_block_num=train.date_block_num.max()).drop(columns=['ID'])
X_test.head()

Unnamed: 0,shop_id,item_id,prev_date_block_num
0,5,5037,33
1,5,5320,33
2,5,5233,33
3,5,5232,33
4,5,5268,33


In [28]:
merged2 = X_test.merge(train_rup, how='left', left_on=['prev_date_block_num', 'shop_id', 'item_id'], right_on=['date_block_num', 'shop_id', 'item_id'] )
merged2.head()

Unnamed: 0,shop_id,item_id,prev_date_block_num_x,date_block_num,item_cnt_month,prev_date_block_num_y
0,5,5037,33,,,
1,5,5320,33,,,
2,5,5233,33,33.0,1.0,32.0
3,5,5232,33,,,
4,5,5268,33,,,


In [29]:
buf2 = merged2.loc[:, ['shop_id', 'item_id', 'item_cnt_month']]
buf2.rename(columns={'item_cnt_month':'item_cnt_prev_month'}, inplace=True)
buf2.head()

Unnamed: 0,shop_id,item_id,item_cnt_prev_month
0,5,5037,
1,5,5320,
2,5,5233,1.0
3,5,5232,
4,5,5268,


In [30]:
buf2.item_cnt_prev_month.fillna(train_rup.item_cnt_month.mean(), inplace=True)
buf2.head()

Unnamed: 0,shop_id,item_id,item_cnt_prev_month
0,5,5037,2.22614
1,5,5320,2.22614
2,5,5233,1.0
3,5,5232,2.22614
4,5,5268,2.22614


In [31]:
X_test = buf2

In [32]:
y_pred_test = pipe.predict(X_test)
print(y_pred_test)

  Xt = transform.transform(Xt)


[2.2811127  1.47708415 2.19860462 ... 1.2755043  1.24254714 1.26887488]


In [33]:
submission = test.assign(item_cnt_month=y_pred_test)[['item_cnt_month']]
submission.describe()

Unnamed: 0,item_cnt_month
count,214200.0
mean,1.699244
std,2.33379
min,-0.557311
25%,1.304035
50%,1.446261
75%,1.742217
max,467.520296


In [34]:
submission.head()

Unnamed: 0,item_cnt_month
0,2.281113
1,1.477084
2,2.198605
3,2.296455
4,1.727955


In [35]:
submission.to_csv('lagged.csv', index_label='ID') #header=['ID', 'item_cnt_month'])

In [36]:
!head lagged.csv

ID,item_cnt_month
0,2.2811126965340898
1,1.4770841479724601
2,2.1986046177616765
3,2.2964553351865846
4,1.7279547376708146
5,1.7956481967951279
6,2.1707444863297622
7,1.3732766880552278
8,1.4770841479724601


In [37]:
!gzip lagged.csv
!ls

Baseline.ipynb	data  EDA.ipynb  lagged.csv.gz	Lagged.ipynb  submission.csv.gz


0.3 is best learning rate so far.

[
 (split1_train_idxs, split1_test_idxs),
 (split2_train_idxs, split2_test_idxs),
 (split3_train_idxs, split3_test_idxs),
 ...
]

"Submissions are evaluated by root mean squared error (RMSE). True target values are clipped into [0,20] range."

and

"For each id in the test set, you must predict a total number of sales."

and

"Submission is for date_block_num 34"

and

"
My CV strategy is 5-fold moving window:

fold 1: Train on month 0 to 32 and validate on 33
fold 2: Train on month 0 to 31 and validate on 32
…
fold 5: Train on month 0 to 28 and validate on 29
"

and

- mean encodings
- lag
- text extraction on item and category names