In [51]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import re
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

from src.sales_data import SalesData
import src.nlp as nlp

In [2]:
for filename in os.listdir('data'):
    if '.csv' in filename:
        df = pd.read_csv(f'data/{filename}')
        print(f'{filename}:\t{df.shape}')

sales_train.csv:	(2935849, 6)
shops.csv:	(60, 2)
test.csv:	(214200, 3)
item_categories.csv:	(84, 2)
items.csv:	(22170, 3)
sample_submission.csv:	(214200, 2)


## Import Data and Format Columns
Also, add year, month, and year_month columns. Then, aggregate to monthly sales.

In [5]:
sd = SalesData()
sd.set()

In [14]:
sd.monthly_agg()
sales = sd.monthly_sales

In [15]:
sales.head()

Unnamed: 0,date_block_num,year,month,year_month,shop_id,item_id,item_price,item_cnt_mth
0,0,2013,1,201301,0,1000,58.0,5.0
1,0,2013,1,201301,0,1001,58.0,2.0
2,0,2013,1,201301,0,10012,76.0,1.0
3,0,2013,1,201301,0,1002,58.0,2.0
4,0,2013,1,201301,0,1003,58.0,2.0


In [16]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1739022 entries, 0 to 1739021
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date_block_num  int64  
 1   year            int64  
 2   month           object 
 3   year_month      object 
 4   shop_id         object 
 5   item_id         object 
 6   item_price      float64
 7   item_cnt_mth    float64
dtypes: float64(2), int64(2), object(4)
memory usage: 106.1+ MB


# Baseline Models
Get a baseline score using Gradient Boosting and Random Forest Regressors. First run used GridSearch to find best parameters... probably overkill for our baseline models.

In [17]:
def ts_train_test_split(X, y, test_periods=1):
    periods = sorted(list(X['year_month'].unique()))
    train_periods = periods[:-test_periods]
    test_periods = periods[-test_periods:]
    
    train_mask = X['year_month'].isin(train_periods)
    X_train = X.loc[train_mask]
    y_train = y.loc[train_mask]
    
    test_mask = X['year_month'].isin(test_periods)
    X_test = X.loc[test_mask]
    y_test = y.loc[test_mask]
    return X_train, X_test, y_train, y_test

In [20]:
feat_cols = ['year_month', 'year', 'month', 'shop_id', 'item_id', 'item_price']
X = sales.loc[:, feat_cols].copy()
y = sales.loc[:, 'item_cnt_mth'].copy()

X_train, X_test, y_train, y_test = ts_train_test_split(X, y, test_periods=1)

In [21]:
X_train.head()

Unnamed: 0,year_month,year,month,shop_id,item_id,item_price
0,201301,2013,1,0,1000,58.0
1,201301,2013,1,0,1001,58.0
2,201301,2013,1,0,10012,76.0
3,201301,2013,1,0,1002,58.0
4,201301,2013,1,0,1003,58.0


In [25]:
for df in [X_train, X_test]:
    df.drop(['year_month', 'year', 'item_price'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [33]:
X_train.head()

Unnamed: 0,month,shop_id,item_id
0,1,0,1000
1,1,0,1001
2,1,0,10012
3,1,0,1002
4,1,0,1003


In [34]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1705824 entries, 0 to 1705823
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   month    object
 1   shop_id  object
 2   item_id  object
dtypes: object(3)
memory usage: 132.1+ MB


### Random Forest Regressor

In [None]:
# rf = RandomForestRegressor(n_jobs=4)
# params = {'n_estimators': [100, 500, 1000], 'max_depth': [2, 3, 4]}
# gs = GridSearchCV(estimator=rf, param_grid=params, verbose=3)
# gs.fit(X_train, y_train.values.ravel())


In [None]:
# for k in gs.cv_results_.keys():
#     print(f'{k}:\n{gs.cv_results_[k]}')

In [29]:
rf = RandomForestRegressor(n_estimators=500, 
                           criterion='mse', 
                           max_depth=2, 
                           min_samples_split=2, 
                           min_samples_leaf=1, 
                           min_weight_fraction_leaf=0.0, 
                           max_features='auto', 
                           max_leaf_nodes=None, 
                           min_impurity_decrease=0.0, 
                           min_impurity_split=None, 
                           bootstrap=True, 
                           oob_score=False, 
                           n_jobs=2, 
                           random_state=123, 
                           verbose=1, 
                           warm_start=False, 
                           ccp_alpha=0.0, 
                           max_samples=None)

rf.fit(X_train, y_train.values.ravel())

y_pred = rf.predict(X_test)

score = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Random Forest Regressor RMSE: {score}')

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   10.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   44.4s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  1.7min


Random Forest Regressor RMSE: 13.235326902255489


[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:  1.9min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished


### Gradient Boosting Regressor

In [None]:
# gb = GradientBoostingRegressor(criterion='mse', n_iter_no_change=100)
# params = {'n_estimators': [100, 200, 400], 
#           'max_depth': [2, 3, 4], 
#           'learning_rate': [0.05, 0.1]}
# gs = GridSearchCV(estimator=gb, param_grid=params, n_jobs=1, verbose=3)
# gs.fit(X_train, y_train.values.ravel())


In [None]:
# for k in gs.cv_results_.keys():
#     print(f'{k}:\n{gs.cv_results_[k]}')

In [None]:
# best_score_idx = np.argmin(gs.cv_results_['rank_test_score'])
# gs.cv_results_['params'][best_score_idx]

In [30]:
gb = GradientBoostingRegressor(loss='ls', 
                               learning_rate=0.1, 
                               n_estimators=200, 
                               subsample=1.0, 
                               criterion='mse', 
                               min_samples_split=2, 
                               min_samples_leaf=1, 
                               min_weight_fraction_leaf=0.0, 
                               max_depth=2, 
                               min_impurity_decrease=0.0, 
                               min_impurity_split=None, 
                               init=None, 
                               random_state=None, 
                               max_features=None, 
                               alpha=0.9, 
                               verbose=1, 
                               max_leaf_nodes=None, 
                               warm_start=False, 
                               presort='deprecated', 
                               validation_fraction=0.1, 
                               n_iter_no_change=100, 
                               tol=0.0001, 
                               ccp_alpha=0.0)

gb.fit(X_train, y_train.values.ravel())
y_pred = gb.predict(X_test)

score = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Gradient Boosting Regressor RMSE: {score}')

      Iter       Train Loss   Remaining Time 
         1          53.8286            3.53m
         2          51.9387            3.45m
         3          50.4079            3.49m
         4          49.1677            3.52m
         5          48.1628            3.54m
         6          47.3479            3.51m
         7          46.6878            3.48m
         8          46.2276            3.46m
         9          45.7798            3.43m
        10          45.4166            3.40m
        20          44.3538            3.23m
        30          44.2013            2.97m
        40          44.0791            2.76m
        50          44.0441            2.56m
        60          43.9574            2.38m
        70          43.8851            2.20m
        80          43.7786            2.03m
        90          43.7529            1.85m
       100          43.6705            1.68m
       200          43.2841            0.00s
Gradient Boosting Regressor RMSE: 13.205525666865636


## Generate Output
Import the test set and get results to upload.

In [36]:
test_set = pd.read_csv('data/test.csv')
test_set.insert(loc=1, column='month', value='11')


In [37]:
test_set.head()

Unnamed: 0,ID,month,shop_id,item_id
0,0,11,5,5037
1,1,11,5,5320
2,2,11,5,5233
3,3,11,5,5232
4,4,11,5,5268


In [39]:
y_pred_out = gb.predict(test_set.iloc[:, 1:])

In [47]:
d = {
    'ID': np.arange(0, test_set.shape[0]), 
    'item_cnt_month': y_pred_out
}
output = pd.DataFrame(data=d)
output.to_csv('output/submission_baseline.csv', index=False)

## Categorical info in shops, potentially
A quick glance into the translations of some of the shop_names in shops indicated that I might be able to break out some categorical info. The first word might be a city or some other location. Also, some of the words which have higher frequencies seem to point to either a shopping center, a mall, a megastore, etc.

In [55]:
# clean up names, get locations, then vectorize the top occurences
shops = sd.shops
shops['clean_name'] = nlp.clean_names(shops['shop_name'])

shops['loc_name'] = shops['clean_name'].apply(lambda x: x.split()[0])
shops = nlp.get_top_words(shops, shops['clean_name'], 10)

# remove top occurences if in loc_name
for col in shops.columns:
    if col.upper() in shops['loc_name'].unique():
        shops.drop(col, axis=1, inplace=True)

In [61]:
shops.head()

Unnamed: 0,shop_name,shop_id,clean_name,loc_name,тц,трц,мега,тк,трк,молл,центральный
0,"!Якутск Орджоникидзе, 56 фран",0,ЯКУТСК ОРДЖОНИКИДЗЕ 56 ФРАН,ЯКУТСК,0,0,0,0,0,0,0
1,"!Якутск ТЦ ""Центральный"" фран",1,ЯКУТСК ТЦ ЦЕНТРАЛЬНЫЙ ФРАН,ЯКУТСК,1,0,0,0,0,0,1
2,"Адыгея ТЦ ""Мега""",2,АДЫГЕЯ ТЦ МЕГА,АДЫГЕЯ,1,0,1,0,0,0,0
3,"Балашиха ТРК ""Октябрь-Киномир""",3,БАЛАШИХА ТРК ОКТЯБРЬ-КИНОМИР,БАЛАШИХА,0,0,0,0,1,0,0
4,"Волжский ТЦ ""Волга Молл""",4,ВОЛЖСКИЙ ТЦ ВОЛГА МОЛЛ,ВОЛЖСКИЙ,1,0,0,0,0,1,0


In [62]:
sales.head()

Unnamed: 0,date_block_num,year,month,year_month,shop_id,item_id,item_price,item_cnt_mth
0,0,2013,1,201301,0,1000,58.0,5.0
1,0,2013,1,201301,0,1001,58.0,2.0
2,0,2013,1,201301,0,10012,76.0,1.0
3,0,2013,1,201301,0,1002,58.0,2.0
4,0,2013,1,201301,0,1003,58.0,2.0


## Model with Additional Shop Info

In [63]:
shops['shop_id'] = shops['shop_id'].astype(str)

In [64]:
shop_sales = pd.merge(sales, shops, on='shop_id', how='inner')

In [65]:
drop_cols = ['shop_name', 'clean_name']
for col in drop_cols:
    shop_sales.drop(col, axis=1, inplace=True)

In [66]:
shop_sales.head()

Unnamed: 0,date_block_num,year,month,year_month,shop_id,item_id,item_price,item_cnt_mth,loc_name,тц,трц,мега,тк,трк,молл,центральный
0,0,2013,1,201301,0,1000,58.0,5.0,ЯКУТСК,0,0,0,0,0,0,0
1,0,2013,1,201301,0,1001,58.0,2.0,ЯКУТСК,0,0,0,0,0,0,0
2,0,2013,1,201301,0,10012,76.0,1.0,ЯКУТСК,0,0,0,0,0,0,0
3,0,2013,1,201301,0,1002,58.0,2.0,ЯКУТСК,0,0,0,0,0,0,0
4,0,2013,1,201301,0,1003,58.0,2.0,ЯКУТСК,0,0,0,0,0,0,0


In [67]:
gb_cols = ['year', 'month', 'year_month', 'shop_id', 
           'item_id', 'loc_name', 
           'тц', 'трц', 'мега', 'тк', 'трк', 'молл', 'центральный']

model_input = shop_sales.groupby(gb_cols)[['item_cnt_mth']].sum().reset_index()

In [76]:
model_input.head()

Unnamed: 0,year,month,year_month,shop_id,item_id,loc_name,тц,трц,мега,тк,трк,молл,центральный,item_cnt_mth
0,2013,1,201301,0,1000,ЯКУТСК,0,0,0,0,0,0,0,5.0
1,2013,1,201301,0,1001,ЯКУТСК,0,0,0,0,0,0,0,2.0
2,2013,1,201301,0,10012,ЯКУТСК,0,0,0,0,0,0,0,1.0
3,2013,1,201301,0,1002,ЯКУТСК,0,0,0,0,0,0,0,2.0
4,2013,1,201301,0,1003,ЯКУТСК,0,0,0,0,0,0,0,2.0


In [77]:
model_input = pd.get_dummies(data=model_input, prefix='loc', prefix_sep='_', 
                             columns=['loc_name'], drop_first=True)

In [78]:
model_input.columns

Index(['year', 'month', 'year_month', 'shop_id', 'item_id', 'тц', 'трц',
       'мега', 'тк', 'трк', 'молл', 'центральный', 'item_cnt_mth',
       'loc_БАЛАШИХА', 'loc_ВОЛЖСКИЙ', 'loc_ВОЛОГДА', 'loc_ВОРОНЕЖ',
       'loc_ВЫЕЗДНАЯ', 'loc_ЖУКОВСКИЙ', 'loc_ИНТЕРНЕТ-МАГАЗИН', 'loc_КАЗАНЬ',
       'loc_КАЛУГА', 'loc_КОЛОМНА', 'loc_КРАСНОЯРСК', 'loc_КУРСК',
       'loc_МОСКВА', 'loc_МЫТИЩИ', 'loc_ННОВГОРОД', 'loc_НОВОСИБИРСК',
       'loc_ОМСК', 'loc_РОСТОВНАДОНУ', 'loc_САМАРА', 'loc_СЕРГИЕВ', 'loc_СПБ',
       'loc_СУРГУТ', 'loc_ТОМСК', 'loc_ТЮМЕНЬ', 'loc_УФА', 'loc_ХИМКИ',
       'loc_ЦИФРОВОЙ', 'loc_ЧЕХОВ', 'loc_ЯКУТСК', 'loc_ЯРОСЛАВЛЬ'],
      dtype='object')

In [79]:
X = model_input.drop('item_cnt_mth', axis=1)
y = model_input.loc[:, 'item_cnt_mth'].copy()
X_train, X_test, y_train, y_test = ts_train_test_split(X, y, test_periods=1)

In [81]:
for df in [X_train, X_test]:
    df.drop(['year', 'year_month'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [82]:
X_train.head()

Unnamed: 0,month,shop_id,item_id,тц,трц,мега,тк,трк,молл,центральный,...,loc_СПБ,loc_СУРГУТ,loc_ТОМСК,loc_ТЮМЕНЬ,loc_УФА,loc_ХИМКИ,loc_ЦИФРОВОЙ,loc_ЧЕХОВ,loc_ЯКУТСК,loc_ЯРОСЛАВЛЬ
0,1,0,1000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,1001,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,10012,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,0,1002,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,1003,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [84]:
rf = RandomForestRegressor(n_estimators=500, 
                           criterion='mse', 
                           max_depth=2, 
                           min_samples_split=2, 
                           min_samples_leaf=1, 
                           min_weight_fraction_leaf=0.0, 
                           max_features='auto', 
                           max_leaf_nodes=None, 
                           min_impurity_decrease=0.0, 
                           min_impurity_split=None, 
                           bootstrap=True, 
                           oob_score=False, 
                           n_jobs=2, 
                           random_state=123, 
                           verbose=1, 
                           warm_start=False, 
                           ccp_alpha=0.0, 
                           max_samples=None)

rf.fit(X_train, y_train.values.ravel())

y_pred = rf.predict(X_test)

score = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Random Forest Regressor RMSE: {score}')

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   20.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:  1.4min
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  3.2min
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:  3.6min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s


Random Forest Regressor RMSE: 14.165455365525284


[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished


In [85]:
gb = GradientBoostingRegressor(loss='ls', 
                               learning_rate=0.1, 
                               n_estimators=200, 
                               subsample=1.0, 
                               criterion='mse', 
                               min_samples_split=2, 
                               min_samples_leaf=1, 
                               min_weight_fraction_leaf=0.0, 
                               max_depth=2, 
                               min_impurity_decrease=0.0, 
                               min_impurity_split=None, 
                               init=None, 
                               random_state=None, 
                               max_features=None, 
                               alpha=0.9, 
                               verbose=1, 
                               max_leaf_nodes=None, 
                               warm_start=False, 
                               presort='deprecated', 
                               validation_fraction=0.1, 
                               n_iter_no_change=100, 
                               tol=0.0001, 
                               ccp_alpha=0.0)

gb.fit(X_train, y_train.values.ravel())
y_pred = gb.predict(X_test)

score = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Gradient Boosting Regressor RMSE: {score}')

      Iter       Train Loss   Remaining Time 
         1          64.6079           11.47m
         2          62.1983           11.19m
         3          60.5140           10.63m
         4          58.8839           10.89m
         5          57.5613           11.11m
         6          56.4899           11.30m
         7          55.6204           10.76m
         8          54.9146           10.73m
         9          54.8826           10.82m
        10          54.8567           10.88m
        20          53.3467           10.39m
        30          52.9010           10.11m
        40          52.7882            9.23m
        50          52.2984            8.36m
        60          52.2607            7.98m
        70          52.0755            7.48m
        80          51.9222            6.94m
        90          51.7929            6.35m
       100          51.6015            5.80m
       200          51.0850            0.00s
Gradient Boosting Regressor RMSE: 14.07817497991133


## Generate Output
Import the test set and get results to upload.

In [104]:
test_set = pd.read_csv('data/test.csv')
test_set.insert(loc=1, column='month', value='11')
test_set['shop_id'] = test_set['shop_id'].astype(str)
print(test_set.shape)

(214200, 4)


In [105]:
test_set = pd.merge(test_set, shops, on='shop_id', how='inner')

test_set = pd.get_dummies(data=test_set, prefix='loc', prefix_sep='_', 
                             columns=['loc_name'], drop_first=True)

In [109]:
# add missing columns
for col in X_train.columns:
    if col not in test_set.columns:
        print(f'Adding columns {col}')
        test_set[col] = 0

Adding columns loc_ВЫЕЗДНАЯ
Adding columns loc_МЫТИЩИ
Adding columns loc_ХИМКИ


In [111]:
test_set = test_set.loc[:, X_train.columns]
y_pred_out = gb.predict(test_set)

y_pred_out.shape

In [115]:
d = {
    'ID': np.arange(0, test_set.shape[0]), 
    'item_cnt_month': y_pred_out
}
output = pd.DataFrame(data=d)
output.to_csv('output/submission_gb_shop_categories.csv', index=False)

# Initial Insights
Decision Tree models not really doing the job we need it to here. The RMSE scores are terrible. Time Series modeling is likely the better approach. Let's see what diffferent aggregations look like. Then, maybe we can pick a particular shop-item combination to use as a template for our TS model...