In [None]:
import math

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (
    train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Configuration

In [None]:
# Relative path of the CSV file that the "Input data" cell loads with pd.read_csv.
DATASET_PATH = 'datasets/motorbike_ambulance_calls.csv'

# Input data

In [None]:
# Load the CSV with `date` parsed as datetimes (dayfirst=False) and the
# `index` column used as the row index.
motorbike_data = pd.read_csv(
    DATASET_PATH,
    index_col='index',
    parse_dates=['date'],
    dayfirst=False,
)

# Column dtypes and non-null counts, followed by a preview of the first rows.
motorbike_data.info()
motorbike_data.head()

# Data analysis

In [None]:
# Column roles used by the analysis cells and by the preprocessing pipeline.
target_feature = 'cnt'

numerical_features = {'temp', 'atemp', 'hum', 'windspeed'}
numerical_and_target_features = numerical_features.union({target_feature})

# `mnth` is listed in both groups below; the pipeline takes their union,
# so the column is still only selected once.
ordinal_features = {'yr', 'mnth', 'hr'}
categorical_features = {
    'season', 'mnth', 'holiday', 'weekday', 'weathersit', 'workingday'
}

# Columns that were not assigned to any role above (shown as the cell output).
leftout_features = set(motorbike_data.columns).difference(
    {target_feature},
    numerical_features,
    ordinal_features,
    categorical_features,
)
leftout_features

## Munging

In [None]:
# Summary statistics for the numerical features and the target.
# The feature collection is a set, so sort it: otherwise the column order of
# the table can change between kernel restarts (string hashing is randomised).
(
    motorbike_data
    .reindex(columns=sorted(numerical_and_target_features))
    .describe()
)

In [None]:
# Minimum and maximum of every ordinal feature.
# Sorting the set gives the table a stable column order across kernel restarts.
(
    motorbike_data
    .reindex(columns=sorted(ordinal_features))
    .apply(['min', 'max'], axis='index')
)

All features are already within their expected domains.
`windspeed` has to be scaled to the [0, 1] interval; ordinal and categorical features have to be one-hot encoded.

In [None]:
# True if at least one cell anywhere in the frame is missing.
motorbike_data.isna().to_numpy().any()

We can see that there are no missing values.

## Analysis

### Numerical features

Compute the pairwise Pearson correlation coefficients to see whether there are linear dependencies between the features:

In [None]:
# Pairwise Pearson correlation of the numerical features and the target.
# Sorting the set keeps rows and columns in the same order on every run.
(
    motorbike_data
    .reindex(columns=sorted(numerical_and_target_features))
    .corr()
)

In [None]:
print('Max correlation for each feature:')
(
    motorbike_data
    # Sorted so the resulting table has a stable row order across kernel restarts.
    .reindex(columns=sorted(numerical_and_target_features))
    .corr()
    # Zero out the diagonal so a feature's self-correlation (1.0) is never
    # selected as its strongest partner.
    .pipe(lambda corr: corr - np.eye(len(corr.columns)))
    # For every row: the partner with the largest absolute correlation and
    # the signed correlation value with that partner.
    .apply(
        lambda row: pd.Series({
            'feature': row.abs().idxmax(),
            'corr': row[row.abs().idxmax()]
        }),
        axis='columns'
    )
)

No strong linear dependencies are observed between the target variable and the numerical features. The target variable `cnt` is most strongly correlated with the `temp` feature. At the same time, the features `temp` and `atemp` are strongly correlated with each other, so the model should not use both of them.

In [None]:
# Scatter-plot matrix of the numerical features and the target, without `temp`.
# Sorting the set keeps the panel order identical on every run.
sns.pairplot(
    motorbike_data
    .reindex(columns=sorted(numerical_and_target_features))
    .drop(columns='temp')
);

### Categorical features: workingday

In [None]:
# Box plot of `cnt` per hour of the day, coloured by the `workingday` flag.
sns.catplot(
    data=motorbike_data,
    kind='box',
    x='hr',
    y='cnt',
    hue='workingday',
    aspect=2,
);

As one can see, on working days more accidents happen during the rush hours: 7:00–8:00 and 16:00–20:00. On non-working days, the accidents are distributed closer to the middle of the day.

For both working and non-working days, the dependency between `hr` and `cnt` doesn't seem to be linear.

**Observation 1: motorbike accidents are distributed differently over time depending on workingday**

### Categorical features: season

In [None]:
# Same hourly box plot as for `workingday`, with one row of panels per `season` value.
sns.catplot(
    data=motorbike_data,
    kind='box',
    x='hr',
    y='cnt',
    hue='workingday',
    row='season',
    aspect=2,
);

### Categorical features: weathersit

In [None]:
# Hourly box plot of `cnt` by `workingday`, with one row of panels per `weathersit` value.
sns.catplot(
    data=motorbike_data,
    kind='box',
    x='hr',
    y='cnt',
    hue='workingday',
    row='weathersit',
    aspect=2,
);

In [None]:
# Individual `cnt` observations grouped by `weathersit` (catplot's default plot kind).
sns.catplot(
    data=motorbike_data,
    x='weathersit',
    y='cnt',
    aspect=2,
);

In [None]:
# Total `cnt` per `weathersit` value, drawn as a bar chart.
(
    motorbike_data
    .groupby('weathersit')['cnt']
    .sum()
    .plot.bar(title='Number of accidents by weathersit')
);

### Categorical features: weekday

In [None]:
# Box plot of `cnt` per `weekday`, coloured by the `workingday` flag.
sns.catplot(
    data=motorbike_data,
    kind='box',
    x='weekday',
    y='cnt',
    hue='workingday',
    aspect=2,
);

# Datasets

In [None]:
# Separate the predictors from the target column.
X = motorbike_data.drop(columns=target_feature)
y = motorbike_data[target_feature]

# No time machine: use 'past' data for training, use 'future' data for testing.
# shuffle=False keeps the row order, so the last 30% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.3
)

# Sanity check: every training index value precedes every test index value.
assert X_train.index.max() < X_test.index.min()

# Pipeline

In [None]:
# Preprocessing applied to the numerical columns.
numerical_transformer = Pipeline([
    # Fill gaps with the column mean in case serving data has missing values
    ('imputer', SimpleImputer(strategy='mean')),
    # Standardise the values (windspeed needs scaling)
    ('scaler', StandardScaler()),
])

In [None]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back
# to the legacy one so the notebook runs on both old and new releases.
onehot_options = {'categories': 'auto', 'handle_unknown': 'ignore'}
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, **onehot_options)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, **onehot_options)

# Preprocessing applied to the ordinal and categorical columns.
categorical_transformer = Pipeline(
    steps=[
        # In case serving data has missing values
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Dense one-hot encoding; categories unseen during fit are ignored
        ('onehot', onehot_encoder)
    ]
)

In [None]:
# Route each group of columns through its transformer; every other column
# is dropped (remainder='drop').
# The feature groups are sets, so they are sorted here: otherwise the order of
# the columns handed to the model could change between kernel restarts, which
# hurts reproducibility even with a fixed `random_state`.
# NOTE(review): the analysis section recommends not using both `temp` and
# `atemp`, yet both are in `numerical_features` and are passed on - confirm intent.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'numerical',
            numerical_transformer,
            sorted(numerical_features)
        ),
        (
            'categorical',
            categorical_transformer,
            sorted(ordinal_features | categorical_features)
        )
    ],
    remainder='drop'
)

In [None]:
def build_pipeline(model, use_grid_search=True, **grid_search_params):
    """Chain the notebook's preprocessing with ``model``, optionally inside a grid search.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline, registered under the step name ``'model'``.
    use_grid_search : bool, default True
        When true, wrap the pipeline in ``GridSearchCV``; otherwise return the
        bare pipeline (``grid_search_params`` are then ignored).
    **grid_search_params
        Forwarded to ``GridSearchCV`` (e.g. ``param_grid``). ``cv`` defaults to
        ``TimeSeriesSplit(n_splits=5)`` unless the caller supplies its own.

    Returns
    -------
    Pipeline or GridSearchCV
        The assembled estimator; it is not fitted by this function.
    """
    pipeline = Pipeline(steps=[
        # Clone so that each pipeline owns its preprocessing state instead of
        # refitting the single module-level `preprocessor` instance in place.
        ('preprocessor', clone(preprocessor)),
        ('model', model)
    ])
    if not use_grid_search:
        return pipeline

    # Caller-supplied options take precedence over the default CV splitter.
    search_params = {
        'cv': TimeSeriesSplit(n_splits=5),
        **grid_search_params
    }
    return GridSearchCV(pipeline, **search_params)

In [None]:
def evaluate_prediction(model_name, prediction):
    """Print the RMSE of ``prediction`` and scatter-plot it against the true targets.

    ``model_name`` labels both the printed score and the x-axis column of the
    plot. The ground truth is the notebook-level ``y_test``.
    """
    rmse = math.sqrt(mean_squared_error(y_test, prediction))
    print(f'{model_name} RMSE: ', rmse)

    # Predictions on the x-axis, true targets on the y-axis.
    predicted_vs_actual = pd.DataFrame({
        model_name: prediction,
        'y_test': y_test
    })
    sns.relplot(data=predicted_vs_actual, x=model_name, y='y_test')

In [None]:
def evaluate_model(model_name, model, **grid_search_params):
    """Build, fit and evaluate an estimator for ``model``; return the fitted estimator.

    A grid search is used only when ``param_grid`` is among
    ``grid_search_params``. Fitting uses the notebook-level ``X_train`` /
    ``y_train``; predictions on ``X_test`` are passed to ``evaluate_prediction``.
    """
    wants_grid_search = 'param_grid' in grid_search_params
    estimator = build_pipeline(
        model,
        use_grid_search=wants_grid_search,
        **grid_search_params
    )
    estimator.fit(X_train, y_train)

    test_prediction = estimator.predict(X_test)
    evaluate_prediction(model_name, test_prediction)
    return estimator

# Modelling

In [None]:
# Baseline: no `param_grid` is passed, so a plain pipeline (no grid search) is fitted.
# `LinearRegression` is imported in the notebook's top import cell.
linear_regression = evaluate_model('linear regression', LinearRegression())
print('R2 score: ', linear_regression.score(X_test, y_test))

In [None]:
# Decision tree with a grid search over the tree depth; `random_state` is fixed
# for reproducibility. `DecisionTreeRegressor` is imported in the top import cell.
decision_tree = evaluate_model(
    'decision tree',
    DecisionTreeRegressor(random_state=42),
    param_grid={
        'model__max_depth': [10, 15, 20, 25, 30]
    }
)

In [None]:
# Random forest with a grid search over the number of trees and the tree depth;
# `random_state` is fixed for reproducibility.
# `RandomForestRegressor` is imported in the top import cell.
random_forest = evaluate_model(
    'random forest',
    RandomForestRegressor(random_state=42),
    param_grid={
        'model__n_estimators': [5, 10, 15, 20, 25],
        'model__max_depth': [None, 10, 15]
    }
)