In [20]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor


# Notebook with EDA and the model-selection process

## 1) EDA

## 2) Model Selection

In [21]:
def _merge_external_data(X):
    """Attach the external covariates to ``X`` with an as-of merge on ``date``.

    The covariates are read from ``data/external_data.csv``. Only the columns
    listed below are kept, and any external row with a missing value in one
    of them is discarded before merging. With ``merge_asof``'s default
    matching, each row of ``X`` receives the latest external record dated at
    or before it.

    Returns a new frame whose rows follow the row order of ``X``; ``X``
    itself is not modified.
    """
    external = pd.read_csv(Path("data") / "external_data.csv", parse_dates=["date"])
    selected = [
        "date", "hol_bank", "hol_scol", "quarantine1", "quarantine2",
        "t", "rr1", "u", "nbas", "raf10",
    ]
    external = external[selected].sort_values("date").dropna()

    # merge_asof needs both frames sorted on the key, so remember where each
    # row of X sat before sorting.
    left = X.copy()
    left["orig_index"] = np.arange(left.shape[0])

    merged = pd.merge_asof(left.sort_values("date"), external, on="date")

    # Undo the sort and discard the bookkeeping column.
    return merged.sort_values("orig_index").drop(columns=["orig_index"])


def _encode_dates(X):
    """Expand the ``date`` column into calendar and cyclical features.

    New columns, in this order: ``year``, ``month``, ``day``, ``weekday``,
    ``hour``, a boolean ``weekend`` flag (weekday index above 4), then the
    sine/cosine of the hour on a 24-step cycle and of the month on a 12-step
    cycle. The result is a new frame without the ``date`` column; ``X`` is
    left untouched.
    """
    stamp = X["date"].dt
    hour, month, weekday = stamp.hour, stamp.month, stamp.weekday

    # Position on the daily / yearly circle, in radians.
    hour_angle = 2 * np.pi * hour / 24
    month_angle = 2 * np.pi * month / 12

    features = {
        "year": stamp.year,
        "month": month,
        "day": stamp.day,
        "weekday": weekday,
        "hour": hour,
        "weekend": weekday > 4,
        "sin_hours": np.sin(hour_angle),
        "cos_hours": np.cos(hour_angle),
        "sin_mnth": np.sin(month_angle),
        "cos_mnth": np.cos(month_angle),
    }
    return X.assign(**features).drop(columns=["date"])


First, we create a function that makes it easy to get the different features we want to test.
We get:
- the features that do not need to be preprocessed
- the categorical features that need one-hot encoding

In [22]:
# function to get the features that do not need to be processed
def get_passthrough(date, list_of_temp):
    """Assemble the list of columns that skip the preprocessing stage.

    Args:
        date (str): which date features to include:
                    "both_date": raw calendar columns plus the sin/cos ones
                    "original_date": raw calendar columns only
                    "transformed_date": sin/cos columns plus year and weekday
                    Any other value contributes no date feature.

        list_of_temp (list): non-date features to append unchanged, in order

    Returns:
        list: a new list with the selected date columns followed by the
              entries of list_of_temp
    """
    presets = (
        ("both_date", ["hour", "day", "weekday", "month",
                       "year", "sin_hours", "cos_hours", "sin_mnth", "cos_mnth"]),
        ("original_date", ["hour", "day", "weekday", "month", "year"]),
        ("transformed_date", ["sin_hours", "cos_hours",
                              "sin_mnth", "cos_mnth", "year", "weekday"]),
    )

    selected = []
    for label, columns in presets:
        if label == date:
            selected = list(columns)

    selected.extend(list_of_temp)
    return selected


In [23]:
def get_estimator(pass_through_cols, categorical_cols, regressor=None):
    """Build the unfitted modelling pipeline.

    The pipeline chains four steps: merge of the external data, date
    encoding, column selection/encoding, and the regressor.

    Args:
        pass_through_cols (list): columns handed to the regressor unchanged.
        categorical_cols (list): columns to one-hot encode.
        regressor (estimator, optional): final step of the pipeline. When
            omitted, a new ``XGBRegressor()`` is created for this call, so
            separate calls never share (and refit) the same estimator object.

    Returns:
        sklearn.pipeline.Pipeline: the assembled, unfitted pipeline.
    """
    # A default written as `regressor=XGBRegressor()` is evaluated once, when
    # the function is defined; every default call would then reuse one object.
    if regressor is None:
        regressor = XGBRegressor()

    # define the encoders
    categorical_encoder = OneHotEncoder(handle_unknown="ignore")
    date_encoder = FunctionTransformer(_encode_dates)

    # define the transformation of data before using regressor; only the
    # columns named in the two lists are routed through the transformer
    preprocessor = ColumnTransformer(
        [
            ("cat", categorical_encoder, categorical_cols),
            ("passthrough", "passthrough", pass_through_cols),
        ],
    )

    pipe = make_pipeline(
        FunctionTransformer(_merge_external_data, validate=False),
        date_encoder,
        preprocessor,
        regressor,
    )

    return pipe


In [24]:
# Smoke test: build a Ridge pipeline from hand-picked columns and display it.
# `a` is passed as the pass-through columns (first argument of get_estimator),
# `b` as the columns to one-hot encode (second argument).
a =  ["sin_hours", "cos_hours", "sin_mnth","cos_mnth"]
b =  ["counter_name", "site_name","weekday", "weekend"]
model = get_estimator(a,b,regressor=Ridge())
model

In [25]:
import problem

# Load the train and test splits through the `problem` module.
X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()


# Fit the Ridge pipeline built in the previous cell on the training split.
model.fit(X_train, y_train)


In [29]:
from sklearn.metrics import mean_squared_error

# Report the root-mean-squared error on both splits. The square root is taken
# explicitly because the `squared=False` shortcut of mean_squared_error is
# rejected by recent scikit-learn releases; for a single target column
# (assumed here) the two forms give the same number.
print(
    f"Train set, RMSE={np.sqrt(mean_squared_error(y_train, model.predict(X_train))):.2f}"
)
print(
    f"Test set, RMSE={np.sqrt(mean_squared_error(y_test, model.predict(X_test))):.2f}"
)

Train set, RMSE=0.97
Test set, RMSE=0.91


In [None]:
def test_model(pass_throughs_col, categorical_cols, regressor=None):
    """Fit a pipeline on the training split and score it on both splits.

    Args:
        pass_throughs_col (list): features left untouched by the preprocessing step
        categorical_cols (list): columns to be one-hot encoded
        regressor (estimator, optional): scikit-learn compatible regressor used as
            the final pipeline step. When omitted, a new XGBRegressor() is
            created for this call.

    Returns:
        array, float, float: the predictions on the test set, the RMSE on the
        training set, the RMSE on the test set
    """
    # Create the default here rather than in the signature, where a single
    # instance would be shared (and refitted) across calls.
    if regressor is None:
        regressor = XGBRegressor()

    X_train, y_train = problem.get_train_data()
    X_test, y_test = problem.get_test_data()

    model = get_estimator(pass_throughs_col, categorical_cols, regressor)
    model.fit(X_train, y_train)

    test_predictions = model.predict(X_test)

    # mean_squared_error takes exactly one (y_true, y_pred) pair, so each split
    # is scored with its own call; the square root turns the MSE into an RMSE.
    rmse_train = float(np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
    rmse_test = float(np.sqrt(mean_squared_error(y_test, test_predictions)))

    return test_predictions, rmse_train, rmse_test

    