In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import re
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
# from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

from sales_data import SalesData
# import nlp as nlp

In [None]:
# Inventory of the raw inputs: print the (rows, columns) shape of every CSV
# file found in ../data. Sorted so the listing is stable between runs.
for filename in sorted(os.listdir('../data')):
    # endswith rather than a substring test, so names such as 'sales.csv.bak'
    # are not handed to read_csv.
    if filename.endswith('.csv'):
        df = pd.read_csv(f'../data/{filename}')
        print(f'{filename}:\t{df.shape}')

## Import Data and Format Columns
Also, add year, month, and year_month columns. Then, aggregate to monthly sales.

In [None]:
# Build the project's SalesData helper (imported from the local sales_data
# module) and have it load the sales data.
sd = SalesData()
sd.set_sales_data()

# Judging by the method names these attach shop- and item-level attributes to
# the sales records -- confirm the exact behaviour in sales_data.py.
sd.merge_shop_data_to_sales()
sd.merge_item_data_to_sales()

In [None]:
# Preview the first rows of the monthly sales table held by sd.
sd.monthly_sales.head()

In [None]:
# List the column labels available on the monthly sales table.
sd.monthly_sales.columns

## Get unique shop/items
... and get dummies for categorical data

In [None]:
# Static attributes of each shop/item pair: the two ids, the shop location
# name, the shop-name keyword columns and the item category. De-duplicated so
# each distinct combination of these values appears once.
shop_item_cols = [
    'shop_id',
    'item_id',
    'loc_name',
    'тц',
    'трц',
    'мега',
    'тк',
    'трк',
    'молл',
    'центральный',
    'item_category_name',
]
shop_items = sd.monthly_sales[shop_item_cols].drop_duplicates()

In [None]:
# One-hot encode both categorical columns in a single pass. Each encoded
# column gets its own prefix ('loc_name_' / 'cat_'), and the first level of
# each is dropped as the baseline. The dummy columns are appended after the
# untouched columns in the order listed here: location first, then category.
data = pd.get_dummies(
    shop_items,
    columns=['loc_name', 'item_category_name'],
    prefix={'loc_name': 'loc_name', 'item_category_name': 'cat'},
    prefix_sep='_',
    drop_first=True,
)

### Break out monthly sales data
Create crosstab of item counts

In [None]:
# Keep only the period, calendar month, shop/item keys and the monthly count;
# work on a copy so sd.monthly_sales is left untouched.
sales_cols = ['year_month', 'month', 'shop_id', 'item_id', 'item_cnt_month']
sales = sd.monthly_sales[sales_cols].copy()

In [None]:
# Pivot to wide form: one row per (year_month, shop_id, item_id) and one
# column per value of 'month', each cell holding the mean item_cnt_month.
# reset_index turns the three row keys back into ordinary columns.
row_keys = [sales[key] for key in ('year_month', 'shop_id', 'item_id')]
sales_ct = pd.crosstab(
    index=row_keys,
    columns=sales['month'],
    values=sales['item_cnt_month'],
    aggfunc='mean',
)
sales_ct = sales_ct.reset_index()

### Merge it all

In [None]:
# Left join: keep every shop/item attribute row and attach its monthly
# crosstab rows (pairs with no sales rows keep NaN in the crosstab columns).
model_data = data.merge(sales_ct, how='left', on=['shop_id', 'item_id'])

In [None]:
# Preview the merged modelling table.
model_data.head()

In [None]:
# Print each column label of model_data on its own line.
for col in model_data.columns:
    print(col)

# Baseline Models
Get a baseline score using Gradient Boosting and Random Forest Regressors. First run used GridSearch to find best parameters... probably overkill for our baseline models.

In [None]:
def ts_train_test_split(X, y, test_periods=1):
    """Split features and target chronologically on ``X['year_month']``.

    The last ``test_periods`` distinct ``year_month`` values (in sorted order)
    form the test set; all earlier periods form the training set.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must contain a ``year_month`` column.
    y : pandas.Series or pandas.DataFrame
        Target sharing ``X``'s index (it is filtered with the same boolean
        mask via ``.loc``).
    test_periods : int, default 1
        Number of trailing periods to hold out. ``0`` holds out nothing; a
        value larger than the number of periods holds out everything.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Copies of the selected rows, so callers can modify them freely.

    Raises
    ------
    ValueError
        If ``test_periods`` is negative.
    """
    if test_periods < 0:
        raise ValueError(f'test_periods must be >= 0, got {test_periods}')

    periods = sorted(X['year_month'].unique())
    # Slice with an explicit end index: ``periods[:-0]`` is the empty list, so
    # a negative-index slice would send every row to the test set when
    # test_periods == 0.
    n_train_periods = max(len(periods) - test_periods, 0)
    train_periods = periods[:n_train_periods]

    train_mask = X['year_month'].isin(train_periods)
    X_train = X.loc[train_mask].copy()
    y_train = y.loc[train_mask].copy()

    X_test = X.loc[~train_mask].copy()
    y_test = y.loc[~train_mask].copy()
    return X_train, X_test, y_train, y_test

In [None]:
# Features are every column except the last three; the target is the column
# labelled '10'.
# NOTE(review): presumably the trailing columns are month columns produced by
# the crosstab -- confirm against the printed column list.
feat_cols = model_data.columns[:-3]
X = model_data[feat_cols].copy()
y = model_data['10'].copy()

# Treat missing values as zero in both the features and the target.
X = X.fillna(0)
y = y.fillna(0)

# Hold out the most recent year_month as the test set.
X_train, X_test, y_train, y_test = ts_train_test_split(X, y, test_periods=1)

In [None]:
# Preview the training features produced by the chronological split.
X_train.head()

In [None]:
# drop year_month and year
# 'year_month' is only needed for the chronological split; remove it in place
# from both frames before modelling. ('year' and 'avg_price' were previously
# candidates for removal here but are currently left in.)
for frame in (X_train, X_test):
    frame.drop(columns=['year_month'], inplace=True)

In [None]:
# scale data
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

sc_y = StandardScaler()
y_train = sc_y.fit_transform(np.array(y_train).reshape(1, -1))


### Random Forest Regressor

In [None]:
# rf = RandomForestRegressor(n_jobs=1)
# params = {'n_estimators': [100, 500, 1000], 'max_depth': [2, 3]}
# gs = GridSearchCV(estimator=rf, param_grid=params, verbose=3)
# gs.fit(X_train, y_train.values.ravel())


In [None]:
# for k in gs.cv_results_.keys():
#     print(f'{k}:\n{gs.cv_results_[k]}')

In [None]:
# Baseline random forest: 500 shallow (depth-2) trees, fixed seed.
# Only the non-default settings are passed. The arguments previously spelled
# out here were all library defaults, and several of them (criterion='mse',
# max_features='auto', min_impurity_split) no longer exist in current
# scikit-learn, where passing them raises an error. Relying on the defaults
# gives the same model on old releases and keeps the cell runnable on new ones.
rf = RandomForestRegressor(n_estimators=500,
                           max_depth=2,
                           n_jobs=2,
                           random_state=123,
                           verbose=1)

# fit() wants a 1-D target, hence ravel().
rf.fit(X_train, y_train.ravel())

y_pred = rf.predict(X_test)

# Root mean squared error on the held-out period.
score = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Random Forest Regressor RMSE: {score}')

### Gradient Boosting Regressor

In [None]:
# gb = GradientBoostingRegressor(criterion='mse', n_iter_no_change=100)
# params = {'n_estimators': [100, 200, 400], 
#           'max_depth': [2, 3, 4], 
#           'learning_rate': [0.05, 0.1]}
# gs = GridSearchCV(estimator=gb, param_grid=params, n_jobs=1, verbose=3)
# gs.fit(X_train, y_train.values.ravel())


In [None]:
# for k in gs.cv_results_.keys():
#     print(f'{k}:\n{gs.cv_results_[k]}')

In [None]:
# best_score_idx = np.argmin(gs.cv_results_['rank_test_score'])
# gs.cv_results_['params'][best_score_idx]

In [None]:
# Baseline gradient boosting: 200 depth-2 trees, learning rate 0.1, with early
# stopping after 100 rounds without improvement on an internal validation
# split (the default 10% of the training rows).
# Only non-default settings are passed. The removed/renamed arguments
# (loss='ls', presort, min_impurity_split) were defaults and are dropped;
# criterion='mse' was a deliberate non-default and is kept under its current
# name 'squared_error' (requires scikit-learn >= 1.0).
# random_state is fixed because early stopping draws a random validation
# split; without a seed the score changes from run to run.
gb = GradientBoostingRegressor(learning_rate=0.1,
                               n_estimators=200,
                               criterion='squared_error',
                               max_depth=2,
                               random_state=123,
                               verbose=1,
                               n_iter_no_change=100)

# fit() wants a 1-D target, hence ravel().
gb.fit(X_train, y_train.ravel())
y_pred = gb.predict(X_test)

# Root mean squared error on the held-out period.
score = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Gradient Boosting Regressor RMSE: {score}')

## Generate Output
Import the test set and get results to upload.

In [None]:
# Read the test set from the same ../data directory the first cell of this
# notebook lists its CSV files from (a bare 'data/...' path resolves to a
# different directory).
test_set = pd.read_csv('../data/test.csv')
# Insert a constant 'month' column (string '11') as the second column.
# NOTE(review): presumably the month being forecast -- confirm.
test_set.insert(loc=1, column='month', value='11')


In [None]:
# Preview the test set with the inserted 'month' column.
test_set.head()

In [None]:
# Predict on every test-set column except the first (iloc[:, 1:]).
# NOTE(review): gb was fitted on the standardised baseline matrix (feat_cols
# without 'year_month', passed through sc_X), whereas this frame is the raw
# test file plus the inserted 'month' column. The two feature sets presumably
# do not match in width or scale -- confirm before uploading this output.
y_pred_out = gb.predict(test_set.iloc[:, 1:])

In [None]:
# Assemble the submission file: a sequential ID (0 .. n_rows - 1) alongside
# the predicted monthly count, written without the DataFrame index.
n_rows = test_set.shape[0]
output = pd.DataFrame(data={
    'ID': np.arange(0, n_rows),
    'item_cnt_month': y_pred_out,
})
output.to_csv('output/submission_baseline.csv', index=False)

## Categorical info in shops, potentially
A quick glance into the translations of some of the shop_names in shops indicated that I might be able to break out some categorical info. The first word might be a city or some other location. Also, some of the words which have higher frequencies seem to point to either a shopping center, a mall, a megastore, etc.

In [None]:
# Clean up the shop names, take the first word of each cleaned name as its
# location, then add the top-10 word features from the cleaned names.
# NOTE(review): `nlp` is used here but its import is commented out in the
# import cell at the top of the notebook -- restore it before running this.
# NOTE(review): presumably nlp.get_top_words returns the frame with one extra
# column per frequent word -- confirm in the nlp module.
shops = sd.shops
shops['clean_name'] = nlp.clean_names(shops['shop_name'])
shops['loc_name'] = shops['clean_name'].apply(lambda name: name.split()[0])
shops = nlp.get_top_words(shops, shops['clean_name'], 10)

# Remove any column whose upper-cased label is itself one of the location
# names, since loc_name already carries that information.
known_locations = shops['loc_name'].unique()
location_word_cols = [col for col in shops.columns if col.upper() in known_locations]
shops.drop(location_word_cols, axis=1, inplace=True)

In [None]:
# Preview the shops table with the derived name features.
shops.head()

In [None]:
# Preview the monthly sales rows that the shop features will be joined onto.
sales.head()

## Model with Additional Shop Info

In [None]:
# Cast shop_id to str ahead of the join on shop_id.
# NOTE(review): presumably sales['shop_id'] is also str -- confirm the key
# dtypes match, otherwise the merge will not line up.
shops['shop_id'] = shops['shop_id'].astype(str)

In [None]:
# Inner join: keep only sales rows whose shop_id also appears in shops.
shop_sales = sales.merge(shops, how='inner', on='shop_id')

In [None]:
# The raw and cleaned shop names are free text and are not used as features;
# remove both in place.
drop_cols = ['shop_name', 'clean_name']
shop_sales.drop(columns=drop_cols, inplace=True)

In [None]:
# Preview the joined sales + shop features table.
shop_sales.head()

In [None]:
# Sum the monthly item count for each combination of period, shop, item and
# shop attributes; reset_index turns the group keys back into columns.
# NOTE(review): `sales` as selected earlier in this notebook carries
# 'item_cnt_month' and no 'year' column, so 'year' and 'item_cnt_mth' would
# have to come from `shops` or from an earlier version of `sales` -- confirm
# these labels exist before running.
gb_cols = ['year', 'month', 'year_month', 'shop_id', 
           'item_id', 'loc_name', 
           'тц', 'трц', 'мега', 'тк', 'трк', 'молл', 'центральный']

model_input = shop_sales.groupby(gb_cols)[['item_cnt_mth']].sum().reset_index()

In [None]:
# Preview the aggregated modelling table.
model_input.head()

In [None]:
# One-hot encode the shop location as 'loc_<name>' columns; the first level
# is dropped and acts as the baseline.
model_input = pd.get_dummies(
    model_input,
    columns=['loc_name'],
    prefix='loc',
    prefix_sep='_',
    drop_first=True,
)

In [None]:
# List the columns after one-hot encoding the location.
model_input.columns

In [None]:
# Target is the summed monthly count; every other column is a feature.
target_col = 'item_cnt_mth'
y = model_input[target_col].copy()
X = model_input.drop(columns=target_col)

# Hold out the most recent year_month as the test set.
X_train, X_test, y_train, y_test = ts_train_test_split(X, y, test_periods=1)

In [None]:
# The period columns were only needed for the chronological split; remove
# them in place from both frames before fitting.
for frame in (X_train, X_test):
    frame.drop(columns=['year', 'year_month'], inplace=True)

In [None]:
# Preview the training features for the shop-attribute model.
X_train.head()

In [None]:
# Random forest on the shop-attribute features: 500 shallow (depth-2) trees,
# fixed seed.
# Only the non-default settings are passed. The arguments previously spelled
# out here were all library defaults, and several of them (criterion='mse',
# max_features='auto', min_impurity_split) no longer exist in current
# scikit-learn, where passing them raises an error. Relying on the defaults
# gives the same model on old releases and keeps the cell runnable on new ones.
rf = RandomForestRegressor(n_estimators=500,
                           max_depth=2,
                           n_jobs=2,
                           random_state=123,
                           verbose=1)

# y_train is a pandas object here; fit() wants a 1-D array.
rf.fit(X_train, y_train.values.ravel())

y_pred = rf.predict(X_test)

# Root mean squared error on the held-out period.
score = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Random Forest Regressor RMSE: {score}')

In [None]:
# Gradient boosting on the shop-attribute features: 200 depth-2 trees,
# learning rate 0.1, with early stopping after 100 rounds without improvement
# on an internal validation split (the default 10% of the training rows).
# Only non-default settings are passed. The removed/renamed arguments
# (loss='ls', presort, min_impurity_split) were defaults and are dropped;
# criterion='mse' was a deliberate non-default and is kept under its current
# name 'squared_error' (requires scikit-learn >= 1.0).
# random_state is fixed because early stopping draws a random validation
# split; without a seed the score changes from run to run.
gb = GradientBoostingRegressor(learning_rate=0.1,
                               n_estimators=200,
                               criterion='squared_error',
                               max_depth=2,
                               random_state=123,
                               verbose=1,
                               n_iter_no_change=100)

# y_train is a pandas object here; fit() wants a 1-D array.
gb.fit(X_train, y_train.values.ravel())
y_pred = gb.predict(X_test)

# Root mean squared error on the held-out period.
score = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Gradient Boosting Regressor RMSE: {score}')

## Generate Output
Import the test set and get results to upload.

In [None]:
# Read the test set from the same ../data directory the first cell of this
# notebook lists its CSV files from (a bare 'data/...' path resolves to a
# different directory).
test_set = pd.read_csv('../data/test.csv')
# Insert a constant 'month' column (string '11') as the second column.
# NOTE(review): presumably the month being forecast -- confirm.
test_set.insert(loc=1, column='month', value='11')
# shops['shop_id'] was cast to str, so the join key here must be str as well.
test_set['shop_id'] = test_set['shop_id'].astype(str)
print(test_set.shape)

In [None]:
# Attach the shop attributes to every test row.
# NOTE(review): an inner join silently drops test rows whose shop_id is not
# in `shops`; compare the row count with the shape printed when the test set
# was loaded before building a submission from this frame.
test_set = pd.merge(test_set, shops, on='shop_id', how='inner')

# Encode ALL location levels here (drop_first=False). Dropping "the first"
# level of the test frame is unsafe: if the test set lacks the level that was
# dropped as the baseline in training, a real location column would be removed
# and later re-created as all zeros. The subsequent alignment to the training
# columns discards the baseline column instead.
test_set = pd.get_dummies(data=test_set, prefix='loc', prefix_sep='_',
                          columns=['loc_name'], drop_first=False)

In [None]:
# add missing columns
for col in X_train.columns:
    if col not in test_set.columns:
        print(f'Adding columns {col}')
        test_set[col] = 0

In [None]:
# Keep exactly the training columns, in training order, then predict.
test_set = test_set[X_train.columns]
y_pred_out = gb.predict(test_set)

# Show the number of predictions produced.
y_pred_out.shape

In [None]:
# Assemble the submission file: a sequential ID (0 .. n_rows - 1) alongside
# the predicted monthly count, written without the DataFrame index.
n_rows = test_set.shape[0]
output = pd.DataFrame(data={
    'ID': np.arange(0, n_rows),
    'item_cnt_month': y_pred_out,
})
output.to_csv('output/submission_gb_shop_categories.csv', index=False)

# Initial Insights
The decision-tree models are not doing the job we need here: the RMSE scores are terrible. Time-series modeling is likely the better approach. Let's see what different aggregations look like. Then maybe we can pick a particular shop-item combination to use as a template for our TS model...