In [1]:
import os
import numpy as np
import pandas as pd
#import matplotlib as mpl
import matplotlib.pyplot as plt
#import matplotlib.dates as mdates
%matplotlib inline 

In [2]:
DATA_FOLDER = './data/'

train = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv.gz'))
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz'))
items = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

train.date = pd.to_datetime(train.date, format='%d.%m.%Y')
print(train.date.min(), train.date.max())

In [3]:
train.item_cnt_day = train.item_cnt_day.clip(0, 20)

In [4]:
train_rup = train.groupby(['date_block_num', 'shop_id', 'item_id']).sum().loc[:, 'item_cnt_day'].reset_index().sort_values(['date_block_num', 'shop_id', 'item_id'])
train_rup.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day
0,0,0,32,6.0
1,0,0,33,3.0
2,0,0,35,1.0
3,0,0,43,1.0
4,0,0,51,2.0


In [5]:
target_col = 'item_cnt_day'
X = train_rup.loc[:, train_rup.columns != target_col].values
print(X) #X.head()

[[    0     0    32]
 [    0     0    33]
 [    0     0    35]
 ...
 [   33    59 22091]
 [   33    59 22100]
 [   33    59 22102]]


In [6]:
y = train_rup.loc[:, [target_col]].values.ravel()
print(y) #y.head()

[6. 3. 1. ... 1. 1. 1.]


In [7]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=2, shuffle=True)
print(cv)

KFold(n_splits=2, random_state=None, shuffle=True)


In [8]:
from sklearn.ensemble import GradientBoostingRegressor
est = GradientBoostingRegressor(n_estimators=100, max_depth=5, loss='ls', verbose=1)

In [9]:
lr = 1 / np.logspace(0.0, 1.0, num=5)[2:]
print(lr)

[0.31622777 0.17782794 0.1       ]


In [10]:
from sklearn.model_selection import GridSearchCV
param_grid = {'learning_rate':lr}
gs = GridSearchCV(est, param_grid, cv=cv, refit=True)
#gs.fit(X, y)

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps=[('Scaling', StandardScaler()), ('Search', gs)])

In [12]:
pipe.fit(X,y)



      Iter       Train Loss   Remaining Time 
         1          32.7282           43.12s
         2          27.8949           43.76s
         3          25.5790           44.42s
         4          24.2516           43.80s
         5          23.0785           42.58s
         6          23.0038           41.64s
         7          22.3664           41.05s
         8          22.3174           40.56s
         9          22.1944           40.49s
        10          21.7955           39.82s
        20          19.9044           35.04s
        30          18.7927           30.74s
        40          17.3065           26.55s
        50          16.4488           22.18s
        60          15.8904           17.77s
        70          15.5279           13.37s
        80          14.7190            8.91s
        90          14.3936            4.45s
       100          13.8641            0.00s
      Iter       Train Loss   Remaining Time 
         1          32.9148           45.53s
        



{'mean_fit_time': array([44.973943  , 45.23663175, 44.99232519]),
 'std_fit_time': array([0.45523071, 0.55264199, 0.66651404]),
 'mean_score_time': array([0.91878235, 0.90744555, 0.92343569]),
 'std_score_time': array([0.00355089, 0.01255524, 0.00204515]),
 'param_learning_rate': masked_array(data=[0.31622776601683794, 0.17782794100389226, 0.1],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'learning_rate': 0.31622776601683794},
  {'learning_rate': 0.17782794100389226},
  {'learning_rate': 0.1}],
 'split0_test_score': array([0.64010933, 0.57705732, 0.54332943]),
 'split1_test_score': array([0.62043135, 0.57179562, 0.52488354]),
 'mean_test_score': array([0.63027034, 0.57442647, 0.53410648]),
 'std_test_score': array([0.00983899, 0.00263085, 0.00922294]),
 'rank_test_score': array([1, 2, 3], dtype=int32),
 'split0_train_score': array([0.6715157 , 0.60185301, 0.56486235]),
 'split1_train_score': array([0.6847244 , 0.61593738, 0

from sklearn.model_selection import cross_val_score
scores = cross_val_score(est, X, y, cv=cv)
print(scores)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [15]:
print(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'])
#gs.best_estimator_

[0.63027034 0.57442647 0.53410648]
[{'learning_rate': 0.31622776601683794}, {'learning_rate': 0.17782794100389226}, {'learning_rate': 0.1}]


In [16]:
from sklearn.metrics import mean_squared_error
y_pred = pipe.predict(X)
mse = mean_squared_error(y, y_pred)
print(mse)
print(np.sqrt(mse))



14.202447613074613
3.76861348682438


In [19]:
X_test = test.assign(date_block_num=train.date_block_num.max()+1).drop(columns=['ID'])
X_test.head()

Unnamed: 0,shop_id,item_id,date_block_num
0,5,5037,34
1,5,5320,34
2,5,5233,34
3,5,5232,34
4,5,5268,34


In [23]:
y_pred_test = pipe.predict(X_test)
print(y_pred_test)

  Xt = transform.transform(Xt)


[0.85784778 0.85784778 0.85784778 ... 0.62710467 0.62710467 0.62710467]


In [26]:
submission = test.assign(item_cnt_month=y_pred_test)[['item_cnt_month']]
submission.describe()

Unnamed: 0,item_cnt_month
count,214200.0
mean,0.706271
std,0.125218
min,-0.015541
25%,0.627105
50%,0.627105
75%,0.775692
max,3.242039


In [29]:
submission.head()

Unnamed: 0,item_cnt_month
0,0.857848
1,0.857848
2,0.857848
3,0.857848
4,0.857848


In [35]:
submission.to_csv('submission.csv', index_label='ID') #header=['ID', 'item_cnt_month'])

In [36]:
!head submission.csv

ID,item_cnt_month
0,0.8578477837496452
1,0.8578477837496452
2,0.8578477837496452
3,0.8578477837496452
4,0.8578477837496452
5,0.8578477837496452
6,0.8578477837496452
7,0.8578477837496452
8,0.8578477837496452


In [37]:
!gzip submission.csv
!ls

Baseline.ipynb	data  EDA.ipynb  submission.csv.gz


0.3 is best learning rate so far.

[
 (split1_train_idxs, split1_test_idxs),
 (split2_train_idxs, split2_test_idxs),
 (split3_train_idxs, split3_test_idxs),
 ...
]

"Submissions are evaluated by root mean squared error (RMSE). True target values are clipped into [0,20] range."

and

"For each id in the test set, you must predict a total number of sales."

and

"Submission is for date_block_num 34"

and

"
My CV strategy is 5-fold moving window:

fold 1: Train on month 0 to 32 and validate on 33
fold 2: Train on month 0 to 31 and validate on 32
…
fold 5: Train on month 0 to 28 and validate on 29
"

and

- mean encodings
- lag
- text extraction on item and category names