In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))
display(HTML("<style>.prompt { display:none !important; }</style>"))

In [2]:
# 
# train = pd.read_csv('data/train.csv', 
#                     parse_dates=['date'], 
#                     index_col='id', 
#                     dtype={
#                         # 'date': np.datetime64, 
#                         'store_nbr': np.short,
#                         'item_nbr': np.int64,
#                         'unit_sales': np.float64
#                     },
#                     converters={'onpromotion': lambda x: 'T' if x == 'True' else ('F' if x == 'False' else 'U')}
#                    )
#
# train.merge(items).to_parquet('data/train_items.parquet')
# train['class'] = train['class'].astype('str')
# train['item_nbr'] = train['item_nbr'].astype('str')
# train.to_parquet('data/train.parquet')

In [5]:
# scikit-based

train = pd.read_parquet('data/train_items.parquet')
train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,family,class,perishable
0,2013-01-01,25,103665,7.0,U,BREAD/BAKERY,2712,1
1,2013-01-02,1,103665,2.0,U,BREAD/BAKERY,2712,1
2,2013-01-02,2,103665,5.0,U,BREAD/BAKERY,2712,1
3,2013-01-02,3,103665,6.0,U,BREAD/BAKERY,2712,1
4,2013-01-02,4,103665,2.0,U,BREAD/BAKERY,2712,1


In [7]:
train.shape

(10000000, 8)

In [3]:
# train['Weekday'] = train['date'].dt.dayofweek_str
train.head()['date'].dt.dayofweek

0    1
1    2
2    2
3    2
4    2
Name: date, dtype: int64

In [6]:
train = train.sample(int(10e6))
train = train.set_index('date', drop=False).sort_index()
train.head()

Unnamed: 0_level_0,date,store_nbr,item_nbr,unit_sales,onpromotion,family,class,perishable
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-01-01,2013-01-01,25,743494,3.0,U,CLEANING,3022,0
2013-01-01,2013-01-01,25,115894,5.0,U,GROCERY I,1016,0
2013-01-01,2013-01-01,25,1085246,2.891,U,MEATS,2302,1
2013-01-01,2013-01-01,25,683722,1.0,U,GROCERY I,1062,0
2013-01-01,2013-01-01,25,848953,2.0,U,CLEANING,3016,0


In [8]:
from sklearn.preprocessing import OrdinalEncoder, KBinsDiscretizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
import time
import cbm
from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.base import BaseEstimator, TransformerMixin
import calendar

class DateEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, feature_name, component = 'month' ):
        self.feature_name = feature_name
        
        if component == 'day':
            self.categories = calendar.day_abbr
            self.column_to_ordinal = lambda col: col.dayofweek.values
        elif component == 'month':
            self.categories = calendar.month_abbr
            self.column_to_ordinal = lambda col: col.month.values
        else:
            raise ValueError('component must be either day or month')
        
        self.component = component
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        return self.column_to_ordinal(X.iloc[:,0].dt)[:,np.newaxis]

# Talk to Ilya about this use-case
cats = make_column_transformer(
    # TODO: pass pipeline to CBM model + inspect pipeline to correlate for plotting
    
    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
    (OrdinalEncoder(dtype='int', handle_unknown='use_encoded_value', unknown_value=-1), # +1 in CBM code
     ['store_nbr', 'item_nbr', 'onpromotion', 'family', 'class', 'perishable']),
    
    (DateEncoder('month', 'month'), ['date']),
    (DateEncoder('day', 'day'),     ['date'])
    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
    # (KBinsDiscretizer(n_bins=10, encode='ordinal', dtype='int'),     [''])
)

cbm = cbm.CBM(learning_rate_step_size=1/64000, min_iterations_early_stopping=2)

pipeline = make_pipeline(
        cats,
        cbm
    )

# model.fit(x_train, train['unit_sales'])
# pipeline.fit(train.head(10), train['unit_sales'].head(10))
# pipeline.fit(train.head(100000), train['unit_sales'].head(100000))


# pipeline.fit(train, train['unit_sales'])
# 

# from sklearn.model_selection import cross_val_score

# start = time.time()
# scores = cross_val_score(pipeline, train, train['unit_sales'], 
#                          scoring=make_scorer(mean_squared_error), 
#                          cv=TemporalSplit(n_splits=3, test_size=90),
#                         # n_jobs=-1
#                         )

# print(f'cross-val { time.time() - start}')

# scores

In [9]:
pipeline.fit(train, train['unit_sales'])
1

1

In [17]:
pipeline_feat = make_pipeline(cats)
pipeline_feat.fit(train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('ordinalencoder',
                                                  OrdinalEncoder(dtype='int',
                                                                 handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  ['store_nbr', 'item_nbr',
                                                   'onpromotion', 'family',
                                                   'class', 'perishable']),
                                                 ('dateencoder-1',
                                                  DateEncoder(feature_name='month'),
                                                  ['date']),
                                                 ('dateencoder-2',
                                                  DateEncoder(component='day',
                                            

In [18]:
1

1

In [10]:
items = pd.read_csv('data/items.csv')

test = pd.read_csv('data/test.csv',
            parse_dates=['date'], 
            index_col='id', 
            dtype={
                'store_nbr': np.short,
                'item_nbr': np.int64,
                'unit_sales': np.float64
            },
).merge(items)


  mask |= (ar1 == a)


In [12]:
test = test.set_index('date', drop=False).sort_index()
test.head()

Unnamed: 0_level_0,date,store_nbr,item_nbr,onpromotion,family,class,perishable
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-08-16,2017-08-16,1,96995,False,GROCERY I,1093,0
2017-08-16,2017-08-16,50,1937083,False,BEVERAGES,1136,0
2017-08-16,2017-08-16,51,1937083,False,BEVERAGES,1136,0
2017-08-16,2017-08-16,52,1937083,False,BEVERAGES,1136,0
2017-08-16,2017-08-16,53,1937083,False,BEVERAGES,1136,0


In [23]:
# pipeline_feat.transform(test)
test.isna().sum(axis=1).sum()

0

In [14]:
test_scores = pipeline.predict(test)
test_scores

  mask &= (ar1 != a)
  mask |= (ar1 == a)


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
test[['unit_sales']].index.rename('id', inplace=True)
test['unit_sales'] = y_pred_test
test[['unit_sales']].to_csv('submission.csv', index=True)

In [95]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils import indexable

from datetime import datetime, timedelta

def create_date_ranges(start, end, **interval):
    start_ = start
    while start_ < end:
        end_ = start_ + timedelta(**interval)
        yield start_
        start_ = end_

# TODO
class TemporalSplit(TimeSeriesSplit):
    def __init__(self, n_splits=5, *, max_train_size=None, test_size=None, gap=0):
        super().__init__(n_splits)
        self.max_train_size = max_train_size
        self.test_size = test_size
        self.gap = gap

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        
        date_range = list(create_date_ranges(X.index.min(), X.index.max(), days=1))
        n_samples =  len(date_range)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        gap = self.gap
        test_size = (
            self.test_size if self.test_size is not None else n_samples // n_folds
        )

        # Make sure we have enough samples for the given split parameters
        if n_folds > n_samples:
            raise ValueError(
                f"Cannot have number of folds={n_folds} greater"
                f" than the number of samples={n_samples}."
            )
        if n_samples - gap - (test_size * n_splits) <= 0:
            raise ValueError(
                f"Too many splits={n_splits} for number of samples"
                f"={n_samples} with test_size={test_size} and gap={gap}."
            )

        # indices = np.arange(n_samples)
        test_starts = range(n_samples - n_splits * test_size, n_samples, test_size)

        for test_start in test_starts:
            train_end = test_start - gap
            if self.max_train_size and self.max_train_size < train_end:
                yield (
                    # TODO: unit test
                    # TODO: not sure why np.where returns a tuple.
                    np.where(np.logical_and(X.index >= date_range[train_end - self.max_train_size], X.index <= date_range[train_end - 1]))[0],
                    np.where(np.logical_and(X.index >= date_range[test_start], X.index <= date_range[test_start + test_size - 1]))[0]
                    # indices[train_end - self.max_train_size : train_end],
                    # indices[test_start : test_start + test_size],
                )
            else:
                yield (
                    np.where(X.index < date_range[train_end])[0],
                    np.where(np.logical_and(X.index >= date_range[test_start], X.index <= date_range[test_start + test_size - 1]))[0]
                    # indices[:train_end],
                    # indices[test_start : test_start + test_size],
                )

cv = list(TemporalSplit(n_splits=3, test_size=100).split(train_idx))
for s in cv:
    print(s[0])
    print(s[0][0])
    print(f'Train: {train_idx.iloc[s[0]].index.min()} - {train_idx.iloc[s[0]].index.max()}')
    print(f'Test:  {train_idx.iloc[s[1]].index.min()} - {train_idx.iloc[s[1]].index.max()}')
    print()
    
cv = list(TimeSeriesSplit(n_splits=3, test_size=100).split(train_idx))
for s in cv:
    print(s[0])
    print(s[0][0])
    print(f'Train: {train_idx.iloc[s[0]].index.min()} - {train_idx.iloc[s[0]].index.max()}')
    print(f'Test:  {train_idx.iloc[s[1]].index.min()} - {train_idx.iloc[s[1]].index.max()}')
    print()

[     0      1      2 ... 751397 751398 751399]
0
Train: 2013-01-01 00:00:00 - 2016-10-18 00:00:00
Test:  2016-10-19 00:00:00 - 2017-01-26 00:00:00

[     0      1      2 ... 830708 830709 830710]
0
Train: 2013-01-01 00:00:00 - 2017-01-26 00:00:00
Test:  2017-01-27 00:00:00 - 2017-05-06 00:00:00

[     0      1      2 ... 914933 914934 914935]
0
Train: 2013-01-01 00:00:00 - 2017-05-06 00:00:00
Test:  2017-05-07 00:00:00 - 2017-08-14 00:00:00

[     0      1      2 ... 999697 999698 999699]
0
Train: 2013-01-01 00:00:00 - 2017-08-15 00:00:00
Test:  2017-08-15 00:00:00 - 2017-08-15 00:00:00

[     0      1      2 ... 999797 999798 999799]
0
Train: 2013-01-01 00:00:00 - 2017-08-15 00:00:00
Test:  2017-08-15 00:00:00 - 2017-08-15 00:00:00

[     0      1      2 ... 999897 999898 999899]
0
Train: 2013-01-01 00:00:00 - 2017-08-15 00:00:00
Test:  2017-08-15 00:00:00 - 2017-08-15 00:00:00



In [32]:
pipeline.predict(train.head())

array([[9.46390226],
       [9.46412469],
       [9.48171461],
       [9.48268984],
       [9.46475154]])

In [18]:
train.shape

(125497040, 8)

In [54]:
from collections import defaultdict

item_store_map = defaultdict(int)

# have the first item as back-off
item_store_map.update({(row['item_nbr'], row['store_nbr']): idx + 1 for idx, row in train[['item_nbr','store_nbr']].value_counts(ascending=True).reset_index(name='count').query('count > 5').iterrows()})

len(item_store_map)

172197

In [55]:
train['item_store'] = train[['item_nbr','store_nbr']].apply(lambda x: item_store_map[tuple(x)], axis=1)

In [3]:
train = pd.read_parquet('data/train_items.parquet')

class_map = {x: i for i, x in enumerate(np.sort(train['class'].unique()))}
item_nbr_map = {x: i for i, x in enumerate(np.sort(train['item_nbr'].unique()))}

train['class']    = train['class'].map(class_map)
train['item_nbr'] = train['item_nbr'].map(item_nbr_map)

train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,family,class,perishable
0,2013-01-01,25,4,7.0,U,BREAD/BAKERY,184,1
1,2013-01-02,1,4,2.0,U,BREAD/BAKERY,184,1
2,2013-01-02,2,4,5.0,U,BREAD/BAKERY,184,1
3,2013-01-02,3,4,6.0,U,BREAD/BAKERY,184,1
4,2013-01-02,4,4,2.0,U,BREAD/BAKERY,184,1


In [4]:

# # train['unit_sales'] = train['unit_sales'].astype(np.int32)
# train[['unit_sales']].to_parquet('data/train_unit_sales.parquet')

# x_train = pd.read_parquet('data/train_items_featurized.parquet')
# x_train_unit_sales = pd.read_parquet('data/train_unit_sales.parquet')

In [5]:
(((train['unit_sales'] * 10) % 10  ) > 0).sum()

8191095

In [6]:
8191095 / len(train)

0.06526922866069192

In [7]:
import cbm
import time
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

def featurize(df):
   return pd.DataFrame({
       'store_nbr'    : df['store_nbr'],
       'item_nbr'     : df['item_nbr'],
       'onpromotion'  : df['onpromotion'],
       'family'       : df['family'],
       'class'        : df['class'],
       'perishable'   : df['perishable'],
       'date'         : df['date'],
    })

start = time.time()

x_train = featurize(train)

print(f'featurize { time.time() - start}')

# enable_bin_count=True) # 
model = cbm.CBM(learning_rate_step_size=1/64000, min_iterations_early_stopping=2)
model.fit(x_train, train['unit_sales'])

print(f'train      {time.time() - start}')
print(f'iterations {model.iterations}')

y_pred_train = model.predict(x_train).flatten()

rmsle = mean_squared_error(train['unit_sales'], y_pred_train, squared=False)
print(rmsle)

# 29sec - for store/item
# 76sec - for store/onprom/family/class/perishable - 612k
# 589k w/  1/2000 learning rate
# 612k w/   1/200
# 132  w/  1/4000
# 37   w/  1/8000
# 25   w/ 1/16000 
# 23   w  1/32000 it>=15
# 23   w  1/32000 it>=5
# 23   w  1/64000 it>=2
# 23.60 w 1/64000 it>=2 + item_nbr
# 23.60 w 1/64000 it>=2 + item_nbr + date

featurize 1.988840103149414
train      59.32568883895874
iterations 3
23.605482539271893


In [9]:
x_train['class'].max()

333

In [10]:
x_train['item_nbr'].max()

4035

In [8]:
# model.plot_importance(figsize=(20, 20))

In [45]:
items = pd.read_csv('data/items.csv')

test = pd.read_csv('data/test.csv',
            parse_dates=['date'], 
            index_col='id', 
            dtype={
                # 'date': np.datetime64, 
                'store_nbr': np.short,
                'item_nbr': np.int64,
                'unit_sales': np.float64
            },
).merge(items)

test['onpromotion'] = test['onpromotion'].map({True: 'T', False: 'F'})
test['class']       = test['class'].map(class_map)
test['item_nbr']    = test['item_nbr'].map(item_nbr_map)

test.head()

  mask |= (ar1 == a)


Unnamed: 0,date,store_nbr,item_nbr,onpromotion,family,class,perishable
0,2017-08-16,1,0.0,F,GROCERY I,63.0,0
1,2017-08-16,2,0.0,F,GROCERY I,63.0,0
2,2017-08-16,3,0.0,F,GROCERY I,63.0,0
3,2017-08-16,4,0.0,F,GROCERY I,63.0,0
4,2017-08-16,5,0.0,F,GROCERY I,63.0,0


In [53]:
# TODO: handle NA by multiplying by 1
test['item_nbr'] = test['item_nbr'].fillna(0).astype(int)
test['class']    = test['class']   .fillna(0).astype(int)
test[test.isna().any(axis=1)]

Unnamed: 0,date,store_nbr,item_nbr,onpromotion,family,class,perishable


In [12]:
# class_cats = train_raw['class'].astype('category').cat.categories.tolist()

# test['class'] = test['class'].astype(pd.CategoricalDtype(categories=class_cats, ordered=True)).cat.codes
# test.head()

In [54]:
x_test = featurize(test)
x_test.head()

Unnamed: 0,store_nbr,item_nbr,onpromotion,family,class,perishable,date
0,1,0,F,GROCERY I,63,0,2017-08-16
1,2,0,F,GROCERY I,63,0,2017-08-16
2,3,0,F,GROCERY I,63,0,2017-08-16
3,4,0,F,GROCERY I,63,0,2017-08-16
4,5,0,F,GROCERY I,63,0,2017-08-16


In [55]:
y_pred_test = model.predict(x_test, explain=True) #.flatten()
y_pred_test

array([[8.67074847, 1.00170424, 1.00180464, ..., 1.00164436, 1.00164009,
        1.00161252],
       [8.67054548, 1.00168079, 1.00180464, ..., 1.00164436, 1.00164009,
        1.00161252],
       [8.67063081, 1.00169065, 1.00180464, ..., 1.00164436, 1.00164009,
        1.00161252],
       ...,
       [8.67042574, 1.00148041, 1.00180464, ..., 1.00164436, 1.0016697 ,
        1.00161252],
       [8.67126881, 1.00157779, 1.00180464, ..., 1.00164436, 1.0016697 ,
        1.00161252],
       [8.67073749, 1.00151642, 1.00180464, ..., 1.00164436, 1.0016697 ,
        1.00161252]])

In [57]:
pd.Series(y_pred_test[:,0].flatten()).value_counts()

8.666806    21
8.666954    21
8.666864    21
8.667392    21
8.666751    21
            ..
8.656977     1
8.659257     1
8.656693     1
8.656806     1
8.641494     1
Length: 1538737, dtype: int64

In [58]:
y_pred_test = model.predict(x_test)

In [64]:
test[['unit_sales']].index.rename('id', inplace=True)
test['unit_sales'] = y_pred_test
test[['unit_sales']].to_csv('submission.csv', index=True)

In [65]:
test['unit_sales'] = y_pred_test
test[['unit_sales']].to_csv('submission.csv', index=True)

In [66]:
!head submission.csv

id,unit_sales
0,8.670748466911578
1,8.670545482030695
2,8.670630814956288
3,8.67052676822758
4,8.66955171527891
5,8.670202952780572
6,8.670841233962294
7,8.670437318412505
8,8.669850978368242


In [20]:
pd.Series(y_pred_train).value_counts()

8.665899    28
8.649263    26
8.649260    26
8.649381    24
8.649342    22
            ..
8.652211     1
8.659009     1
8.664719     1
8.652154     1
8.634132     1
Length: 20537444, dtype: int64

In [67]:
!kaggle competitions submit -c favorita-grocery-sales-forecasting -f submission.csv -m v1

100%|██████████████████████████████████████| 81.9M/81.9M [00:21<00:00, 3.94MB/s]
Successfully submitted to Corporación Favorita Grocery Sales Forecasting