In [None]:
import os
import math

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import (
    train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit
)
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor

In [None]:
# Relative folder names; DATASET_PATH is joined with the CSV file name when
# the data is loaded.
DATASET_PATH = 'datasets'
FIGURES_PATH = 'figures'

# Category order applied to the ordered `season` categorical column.
season_order = [
    'winter',
    'spring',
    'summer',
    'autumn',
]

In [None]:
# Load the call log, recode two columns and drop the `was_missing` column.
motorbike_data = (
    pd.read_csv(
        os.path.join(DATASET_PATH, 'cleanted_motorbike_ambulance_calls.csv'),
        parse_dates=['date'],
        dayfirst=False,
    )
    .assign(
        # 2011 becomes 0, any other year becomes 1.
        yr=lambda frame: np.where(frame['yr'] == 2011, 0, 1),
        # Ordered categorical whose levels follow `season_order`.
        season=lambda frame: pd.Categorical(
            frame['season'],
            categories=season_order,
            ordered=True,
        ),
    )
    .drop(columns=['was_missing'])
)
motorbike_data.info()
motorbike_data.head()

In [None]:
# Column roles used when assembling the feature pipelines.
numerical_features = [
    'date',
    'hr',
    'yr',
    'mnth',
    'temp',
    'atemp',
    'hum',
    'windspeed',
]
categorical_features = [
    'season',
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
]
# Name of the column the models predict.
target_variable = 'cnt'

In [None]:
# Separate the predictors from the target column.
X = motorbike_data.drop(columns=[target_variable])
y = motorbike_data[target_variable]

# No time machine: use 'past' data for training, use 'future' data for testing.
# shuffle=False keeps the row order, so the last 30% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
)

# Sanity check: every training row label precedes every test row label.
assert X_train.index.max() < X_test.index.min()

In [None]:
class FeatureMeanStdTransformer(BaseEstimator, TransformerMixin):
    """Replace each row by the per-hour mean and std of one feature.

    ``fit`` groups the fitting frame by ``hr`` and stores the mean and the
    standard deviation of ``feature`` for every hour.  ``transform`` looks
    those two statistics up for every row of ``X`` and, for every non-zero
    lag ``k``, adds a copy of them shifted by ``k`` rows in ``('date', 'hr')``
    order.

    Parameters
    ----------
    feature : str
        Column whose hourly statistics are learned.
    lags : iterable of int, optional
        Row shifts to emit; ``0`` stands for the unshifted statistics.
        ``None`` or an empty collection means ``[0]``.  Repeated values are
        ignored.
    """

    def __init__(self, feature, lags=None):
        self.feature = feature
        self.feature_mean_and_std = None
        # Stored untouched so get_params()/clone() hand back the same object.
        self.lags = lags

    def _get_lags(self):
        """Return the requested lags in ascending order without repeats."""
        return sorted(set(self.lags)) if self.lags else [0]

    def _column_names(self, lag):
        """Return the (mean, std) output column names for one lag."""
        if lag == 0:
            return f'{self.feature}_mean', f'{self.feature}_std'
        return (
            f'{self.feature}_mean_{lag}h_lag',
            f'{self.feature}_std_{lag}h_lag'
        )

    def fit(self, X, y=None):
        """Learn mean and std of ``feature`` for every value of ``hr``.

        ``X`` must contain the columns ``hr`` and ``feature``; ``y`` is
        ignored.  Returns ``self``.
        """
        mean_col, std_col = self._column_names(0)
        self.feature_mean_and_std = (
            X[['hr', self.feature]]
            .groupby('hr')
            [self.feature]
            .agg(['mean', 'std'])
            .rename(columns={'mean': mean_col, 'std': std_col})
        )
        return self

    def transform(self, X):
        """Return the hourly statistics (and their lags) for every row of X.

        ``X`` must contain the columns ``date`` and ``hr``.  The result has
        exactly one row per row of ``X``, in the same order, with a fresh
        0..n-1 index.  Hours that were not seen during ``fit`` yield NaN.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        if self.feature_mean_and_std is None:
            raise RuntimeError('Need to fit() first!')

        lags = self._get_lags()
        mean_col, std_col = self._column_names(0)

        # Work on a positional index so the original row order can be restored
        # by position afterwards.  Re-merging on (date, hr) would multiply rows
        # whenever a key occurs more than once.
        chronological = (
            X[['date', 'hr']]
            .reset_index(drop=True)
            .join(self.feature_mean_and_std, on='hr')
            .sort_values(['date', 'hr'])
        )

        for lag in lags:
            if lag == 0:
                continue
            lag_mean_col, lag_std_col = self._column_names(lag)
            # The first `lag` rows have no predecessor; they are filled with
            # the average of the unshifted statistic over those same rows.
            leading_rows = chronological.iloc[:lag]
            chronological[lag_mean_col] = chronological[mean_col].shift(
                lag,
                fill_value=leading_rows[mean_col].mean()
            )
            chronological[lag_std_col] = chronological[std_col].shift(
                lag,
                fill_value=leading_rows[std_col].mean()
            )

        if 0 not in lags:
            chronological = chronological.drop(columns=[mean_col, std_col])

        # Sorting by the positional index undoes the chronological sort.
        return chronological.sort_index().drop(columns=['date', 'hr'])

    def get_feature_names(self):
        """Return the output column names in the order ``transform`` emits them."""
        lags = self._get_lags()
        names = []
        # transform() always puts the unshifted pair first.
        if 0 in lags:
            names.extend(self._column_names(0))
        for lag in lags:
            if lag != 0:
                names.extend(self._column_names(lag))
        return names
    
# Preview: per-hour humidity statistics with lags 0-3, fitted on the training split.
FeatureMeanStdTransformer('hum', lags=[1, 2, 3, 0]).fit_transform(X_train, y_train).head()

In [None]:
class CntMeanStdTransformer(BaseEstimator, TransformerMixin):
    """Per-hour mean and std of the target, learned from ``y`` during ``fit``.

    Thin wrapper around ``FeatureMeanStdTransformer('cnt', lags)``: ``fit``
    pairs the ``hr`` column of ``X`` with the target values and fits the
    wrapped transformer on that frame; ``transform`` delegates to it.

    Parameters
    ----------
    lags : iterable of int, optional
        Passed through to ``FeatureMeanStdTransformer``.
    """

    def __init__(self, lags=None):
        # Every constructor argument has to be stored under its own name,
        # otherwise BaseEstimator.get_params() and clone() (used by
        # GridSearchCV) cannot rebuild this transformer with the same lags.
        self.lags = lags
        self.feature_transformer = FeatureMeanStdTransformer('cnt', lags)

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``hr`` from ``X`` and the target ``y``.

        Raises
        ------
        RuntimeError
            If ``y`` is not given.
        """
        if y is None:
            raise RuntimeError('Target variable is required for fitting!')
        # Pair rows by position and name the target column explicitly, so the
        # fit does not depend on y being a Series called 'cnt' that shares
        # its index with X.
        data = pd.DataFrame({
            'hr': np.asarray(X['hr']),
            'cnt': np.ravel(y)
        })
        # Rebuilt here so a set_params(lags=...) issued after construction
        # takes effect.
        self.feature_transformer = FeatureMeanStdTransformer('cnt', self.lags)
        self.feature_transformer.fit(data)
        return self

    def transform(self, X):
        """Return the wrapped transformer's output for ``X``."""
        return self.feature_transformer.transform(X)

    def get_feature_names(self):
        """Return the output column names for the current ``lags``."""
        return FeatureMeanStdTransformer('cnt', self.lags).get_feature_names()
    
# Preview: per-hour target statistics with lags 0-3, fitted on the training split.
CntMeanStdTransformer(lags=[1, 2, 3, 0]).fit_transform(X_train, y_train).head()

In [None]:
# Fill missing values with the column mean, then standardise.
numerical_features_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [None]:
# Fill missing values with the most frequent level, then one-hot encode.
# Levels not seen during fit are ignored at transform time instead of raising.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version.
categorical_features_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'onehot',
            OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore'),
        ),
    ]
)

In [None]:
# Route the raw columns to their preprocessing pipeline; 'date' and 'atemp'
# are left out of the numerical branch and every unlisted column is dropped.
basic_features_pipeline = ColumnTransformer(
    transformers=[
        (
            'numerical_features',
            numerical_features_pipeline,
            [
                column
                for column in numerical_features
                if column not in ('date', 'atemp')
            ],
        ),
        (
            'categorical_features',
            categorical_features_pipeline,
            categorical_features,
        ),
    ],
    remainder='drop',
)

In [None]:
# Hourly mean/std features (lags 0-3) for the target, humidity and temperature,
# followed by the same imputing and scaling used for the raw numerical columns.
custom_numerical_features_pipeline = Pipeline(
    steps=[
        (
            'custom_numerical_features',
            FeatureUnion(
                transformer_list=[
                    ('cnt_mean_std', CntMeanStdTransformer(lags=[0, 1, 2, 3])),
                    ('hum_mean_std', FeatureMeanStdTransformer('hum', lags=[0, 1, 2, 3])),
                    ('temp_mean_std', FeatureMeanStdTransformer('temp', lags=[0, 1, 2, 3])),
                ]
            ),
        ),
        ('numerical_features', numerical_features_pipeline),
    ]
)

In [None]:
# Union of all engineered feature groups (currently only the numerical one).
custom_features_pipeline = FeatureUnion(
    transformer_list=[
        ('numerical_features', custom_numerical_features_pipeline),
    ]
)

In [None]:
# Full feature matrix: preprocessed raw columns side by side with the
# engineered ones.
features_pipeline = FeatureUnion(
    transformer_list=[
        ('basic_features', basic_features_pipeline),
        ('custom_features', custom_features_pipeline),
    ]
)

In [None]:
def build_pipeline(model, use_grid_search=True, **grid_search_params):
    """Chain the shared feature pipeline with ``model``.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline, registered under the name ``'model'``.
    use_grid_search : bool, default True
        When true, the pipeline is wrapped in a ``GridSearchCV``.
    **grid_search_params
        Keyword arguments for ``GridSearchCV``.  ``cv`` defaults to a
        5-split ``TimeSeriesSplit`` and may be overridden here.

    Returns
    -------
    The plain ``Pipeline``, or the ``GridSearchCV`` wrapping it.
    """
    estimator = Pipeline(steps=[
        ('features', features_pipeline),
        ('model', model),
    ])
    if not use_grid_search:
        return estimator

    # Start from the default time-ordered CV; caller options win on conflict.
    search_options = {'cv': TimeSeriesSplit(n_splits=5)}
    search_options.update(grid_search_params)
    return GridSearchCV(estimator, **search_options)

In [None]:
def evaluate_prediction(model_name, y_true, y_pred):
    """Print RMSE and R2 for a prediction and plot predicted vs. true values.

    Parameters
    ----------
    model_name : str
        Label used in the printed lines and as the x-axis column name.
    y_true, y_pred
        Observed and predicted target values.
    """
    root_mean_squared_error = math.sqrt(mean_squared_error(y_true, y_pred))
    print(f'{model_name} RMSE: ', root_mean_squared_error)

    coefficient_of_determination = r2_score(y_true, y_pred)
    print(f'{model_name} R2 score: ', coefficient_of_determination)

    # Predictions on the x-axis, observed values on the y-axis.
    comparison = pd.DataFrame({
        'true values': y_true,
        model_name: y_pred,
    })
    sns.relplot(x=model_name, y='true values', data=comparison)

In [None]:
def evaluate_model(model_name, model, **grid_search_params):
    """Fit ``model`` on the training split and report its test-split scores.

    A grid search is used only when ``param_grid`` is among the keyword
    arguments; every keyword argument is forwarded to ``build_pipeline``.
    The best parameters are printed when the fitted object exposes them.

    Returns
    -------
    The fitted pipeline (or grid search).
    """
    wants_grid_search = 'param_grid' in grid_search_params
    estimator = build_pipeline(
        model,
        use_grid_search=wants_grid_search,
        **grid_search_params
    )
    estimator.fit(X_train, y_train)

    test_predictions = estimator.predict(X_test)
    evaluate_prediction(model_name, y_test, test_predictions)

    if hasattr(estimator, 'best_params_'):
        print('Best params: ', estimator.best_params_)
    return estimator

In [None]:
# Fit and score a random forest; the grid is deliberately narrow.
random_forest = evaluate_model(
    model_name='random forest',
    model=RandomForestRegressor(random_state=42),
    param_grid={
        # Already found the best params
        'model__n_estimators': [45, 50],
        'model__max_depth': [15],
    },
)