# Kaggle recruit restaurants benchmark

See what the data looks like.

In [None]:
from creme import datasets

# Only the first (features, target) pair is needed to see the data's shape.
first_sample = next(iter(datasets.fetch_restaurants()), None)
if first_sample is not None:
    x, y = first_sample
    print(x, y)

Define a feature extraction pipeline.

In [None]:
import datetime as dt
from creme import compose
from creme import feature_extraction
from creme import preprocessing
from creme import stats


def parse_date(x):
    """Return a copy of ``x`` with its ``'date'`` string parsed to a datetime.

    The input dict is left untouched, so the same raw record can safely be
    passed through this function more than once.

    Parameters:
        x: A dict holding a ``'date'`` entry formatted as ``YYYY-MM-DD``.

    Returns:
        A new dict with the same keys, in the same order, where ``'date'`` is
        a ``datetime.datetime``.

    Raises:
        KeyError: If ``x`` has no ``'date'`` entry.
        ValueError: If the date string does not match ``%Y-%m-%d``.
    """
    # Build a new dict instead of assigning into ``x``: mutating the caller's
    # record would make a second call fail, because ``strptime`` only accepts
    # strings.
    return {**x, 'date': dt.datetime.strptime(x['date'], '%Y-%m-%d')}

    
def get_date_info(x):
    """Derive calendar features from the datetime stored under ``x['date']``.

    Returns a dict with two entries: ``'day_of_week'``, the value of
    ``weekday()``, and ``'is_weekend'``, which is true when that value is
    5 or 6.
    """
    weekday = x['date'].weekday()
    info = {'day_of_week': weekday}
    info['is_weekend'] = weekday in (5, 6)
    return info


def make_feature_extractor():
    """Assemble a fresh feature extraction pipeline.

    The pipeline has three named steps:

    - ``parse_date`` applies ``parse_date`` to each observation;
    - ``features`` is a union of the ``get_date_info`` output and a
      ``stats.EWMean(alpha=0.5)`` of ``visitors`` grouped by ``store_id``;
    - ``scale`` applies a ``StandardScaler``.

    A new pipeline is built on every call, so each caller gets its own
    instance.
    """
    date_parser = preprocessing.FuncTransformer(parse_date)

    date_features = preprocessing.FuncTransformer(get_date_info)
    visitors_per_store = feature_extraction.GroupBy(
        on='visitors',
        by='store_id',
        how=stats.EWMean(alpha=0.5)
    )
    features = compose.TransformerUnion([date_features, visitors_per_store])

    steps = [
        ('parse_date', date_parser),
        ('features', features),
        ('scale', preprocessing.StandardScaler()),
    ]
    return compose.Pipeline(steps)

Check the feature extraction pipeline.

In [None]:
feature_extractor = make_feature_extractor()

# One observation is enough to eyeball what the pipeline produces.
sample = next(iter(datasets.fetch_restaurants()), None)

if sample is not None:
    x, y = sample

    # The target is needed by the feature extraction pipeline
    x['visitors'] = y
    x = feature_extractor.fit_one(x, y)

    print(x, y)

Let's also write a function to benchmark creme models.

In [None]:
import time
from sklearn import exceptions


def benchmark_creme_model(model, metric):
    """Run a creme model over the restaurants stream and score it online.

    Every observation is passed through a freshly built feature extraction
    pipeline, then handed to ``model.fit_one``. The value returned by
    ``fit_one`` is used as the prediction when updating ``metric``.

    NOTE(review): this relies on ``fit_one`` returning the prediction made
    for the observation -- confirm against the installed creme version.

    Parameters:
        model: An object exposing ``fit_one(x, y)``.
        metric: An object exposing ``update(y_true, y_pred)``.

    Returns:
        A ``(metric, duration)`` tuple where ``duration`` is the total number
        of seconds spent inside ``model.fit_one``; feature extraction and the
        metric update are not timed.
    """
    feature_extractor = make_feature_extractor()
    duration = 0

    for x, y in datasets.fetch_restaurants():

        # The target is needed by the feature extraction pipeline
        x['visitors'] = y
        x = feature_extractor.fit_one(x, y)

        # perf_counter is a monotonic, high-resolution clock. time.time() can
        # jump when the system clock is adjusted and may be too coarse for the
        # very short per-observation intervals summed here.
        tic = time.perf_counter()
        y_pred = model.fit_one(x, y)
        duration += time.perf_counter() - tic
        metric.update(y, y_pred)

    return metric, duration

Let's also write a function to benchmark scikit-learn models that have a `partial_fit` method.

In [None]:
from sklearn import exceptions


def benchmark_sklearn_model(model, metric):
    """Run a scikit-learn ``partial_fit`` model over the restaurants stream.

    For each observation the features are extracted, a prediction is made and
    scored, and only then is the model updated with that observation.

    Parameters:
        model: An estimator exposing ``predict`` and ``partial_fit``.
        metric: An object exposing ``update(y_true, y_pred)``.

    Returns:
        A ``(metric, duration)`` tuple where ``duration`` is the total number
        of seconds spent inside ``model.partial_fit``; feature extraction,
        prediction and the metric update are not timed.
    """
    feature_extractor = make_feature_extractor()
    duration = 0

    for x, y in datasets.fetch_restaurants():

        # Extract the features. The target is needed by the pipeline, and the
        # resulting dict is flattened to a list of values because the
        # estimator is fed plain rows.
        x['visitors'] = y
        x = feature_extractor.fit_one(x, y)
        x = list(x.values())

        # Predict the output of the current observation. Until the first
        # partial_fit call the estimator raises NotFittedError, in which case
        # 0 is used as the prediction.
        try:
            y_pred = model.predict([x])[0]
        except exceptions.NotFittedError:
            y_pred = 0
        metric.update(y, y_pred)

        # Update the model. perf_counter is a monotonic, high-resolution
        # clock; time.time() can jump when the system clock is adjusted and
        # may be too coarse for the short intervals summed here.
        tic = time.perf_counter()
        model.partial_fit([x], [y])
        duration += time.perf_counter() - tic

    return metric, duration

## LinearRegression

Reference result from a previous run: `MAE: 4.60943 in 8.418` (duration in seconds).

In [None]:
from creme import linear_model
from creme import metrics
from creme import optim

# creme linear regression driven by a VanillaSGD optimizer
# (0.005 is presumably the learning rate -- confirm against the creme API).
model = linear_model.LinearRegression(optimizer=optim.VanillaSGD(0.005))

# Stream the whole dataset through the model, scoring it with the MAE metric.
metric, duration = benchmark_creme_model(model, metrics.MAE())

# duration is the accumulated time spent in model.fit_one, in seconds.
print(f'{metric} in {duration:.3f}')

In [None]:
model.weights

In [None]:
model.intercept.get()

## sklearn's SGDRegressor

In [None]:
from sklearn import linear_model

# SGD regressor with a constant step size (eta0), no regularisation (alpha=0)
# and a fixed seed so that runs are repeatable.
# NOTE(review): eta0 is 0.01 here while the creme model is built with
# VanillaSGD(0.005) -- confirm the two settings are meant to differ.
model = linear_model.SGDRegressor(
    learning_rate='constant',
    eta0=0.01,
    max_iter=1,
    alpha=0,
    tol=.0,
    random_state=42
)

# Stream the whole dataset through the model, scoring it with the MAE metric.
metric, duration = benchmark_sklearn_model(model, metrics.MAE())

# duration is the accumulated time spent in model.partial_fit, in seconds.
print(f'{metric} in {duration:.3f}')

In [None]:
model.coef_

In [None]:
model.coef_

In [None]:
model.intercept_