Data Preprocessing

Loading the data, adding variables and removing invalid cases

In [None]:
# Load data: read the first 100000 rows of the semicolon-separated
# consumption extract, expose the RTE consumption column under the name
# used by the rest of the notebook, and index the rows by the "Date - Heure"
# column parsed as UTC timestamps (the original column is kept as well).

df_load_raw = (
    pd.read_csv("consommation-quotidienne-brute.csv", sep=";", nrows=100000)
    .rename(
        columns={
            "Consommation brute électricité (MW) - RTE": "Historical_consumption"
        }
    )
    .pipe(
        lambda raw: raw.set_index(
            pd.to_datetime(raw["Date - Heure"], utc=True)
        )
    )
)


# Temperature data: read the first 100000 rows of the semicolon-separated
# temperature / pseudo-irradiance extract, index it by the "Horodate" column
# parsed as UTC timestamps, then keep only the two weather columns under
# shorter English names.

df_temperature_raw = pd.read_csv(
    "donnees-de-temperature-et-de-pseudo-rayonnement.csv", sep=";", nrows=100000
)
df_temperature_raw = (
    df_temperature_raw.set_index(
        pd.to_datetime(df_temperature_raw["Horodate"], utc=True)
    )[["Température réalisée lissée (°C)", "Pseudo rayonnement (%)"]]
    .rename(
        columns={
            "Température réalisée lissée (°C)": "Historical_temperature_smoothed",
            "Pseudo rayonnement (%)": "Historical_irradiance",
        }
    )
)

## Merging the two datasets and preliminary data viz

# Join consumption and weather on their timestamp indexes (pandas' default
# inner join, so only timestamps present in both frames survive), then
# average everything onto an hourly grid.
df_load_temperature = (
    pd.merge(
        df_load_raw[["Historical_consumption"]],
        df_temperature_raw,
        left_index=True,
        right_index=True,
    )
    .resample("H")
    .mean()
)

# Adding more features
# Every calendar feature is derived from the hourly (UTC) index.
hourly_index = df_load_temperature.index
df_load_temperature = df_load_temperature.assign(
    day_of_week=hourly_index.day_of_week,
    month=hourly_index.month,
    day_of_year=hourly_index.day_of_year,
    day=hourly_index.day,
    timestamp=hourly_index,
    hour=hourly_index.hour,
    date=hourly_index.date,
    # day_of_week is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    weekday_or_week_end=pd.Series(
        hourly_index.day_of_week >= 5, index=hourly_index
    ).map({True: "weekend", False: "weekday"}),
)
## Dropping COVID days

# Keep only the rows whose timestamp is strictly before 2020-03-15; the
# cutoff string is compared against the (UTC) index.
is_before_covid = df_load_temperature.index < "2020-03-15"
df_load_temperature = df_load_temperature.loc[is_before_covid]
## Adding the French holidays
## The library holidays provide such data

import holidays

# Ask for exactly the years present in the data instead of a hard-coded
# 2016-2020 list: with a fixed list, any holiday falling in another year
# would silently be labelled as a normal day.
holiday_years = df_load_temperature.index.year.unique().tolist()
dict_holidays = holidays.France(years=holiday_years)

# One row per holiday: index = holiday date, "holiday" = holiday name.
df_holidays = pd.DataFrame.from_dict(
    dict_holidays, orient="index", columns=["holiday"]
)
df_holidays.index = pd.to_datetime(df_holidays.index)
# Plain `date` objects, to match the "date" column of df_load_temperature.
# NOTE(review): that "date" column is taken from the UTC index, so hours
# around local midnight may be attached to the neighbouring day - confirm
# this is acceptable.
df_holidays["date"] = df_holidays.index.date

# Left-join the holiday names onto the hourly frame. The timestamp index is
# named "index" explicitly before being reset so that restoring it with
# set_index("index") does not depend on the index happening to be unnamed.
df_load_temperature_holidays = (
    df_load_temperature.rename_axis("index")
    .reset_index()
    .merge(df_holidays, how="left", on="date")
    .set_index("index")
)

# 1 on rows that received a holiday name from the join, 0 otherwise.
df_load_temperature_holidays["is_holiday"] = (
    df_load_temperature_holidays["holiday"].notna().astype(int)
)

## Aggregating the data to have daily temperatures accessible

# df_load_temperature also carries non-numeric columns ("date",
# "weekday_or_week_end", "timestamp"). Averaging those raises a TypeError on
# pandas >= 2.0, so the daily mean is restricted to the numeric columns.
df_load_temperature_daily = (
    df_load_temperature.select_dtypes(include="number").resample("D").mean()
)

# Calendar labels are rebuilt from the daily index rather than averaged.
df_load_temperature_daily[
    "day_name"
] = df_load_temperature_daily.index.day_name()
# day_of_week is 0 for Monday, so 5 and 6 are Saturday and Sunday.
df_load_temperature_daily["weekday_or_week_end"] = pd.Series(
    df_load_temperature_daily.index.day_of_week >= 5,
    index=df_load_temperature_daily.index,
).map({True: "weekend", False: "weekday"})

Modelling: Heka developed their own library, which is private (it cannot be installed from the public PyPI index)

In [3]:
## We now import our library and perform the forecasting

%pip install sia_ts_modelling
%pip install scikit-learn
from sia_ts_modelling.automate import pipeline
from sklearn.ensemble import RandomForestRegressor


(benchmarking_df, all_models, prediction_all_models,) = pipeline(
    input_df=df_load_temperature_holidays.dropna(),
    target_column="Historical_consumption",
    features_sets={
        "temperature_only": ["Historical_temperature_smoothed"],
        "temperature_day_month": [
            "Historical_temperature_smoothed",
            "day_of_week",
            "month",
        ],
        "temperature_day_month_hour": [
            "Historical_temperature_smoothed",
            "hour",
            "day_of_week",
            "month",
        ],
    },
    model_and_init_params=[
        {
            "name": "my_first_lasso",
            "model": "lasso",
            "features_sets": ["temperature_only", "temperature_day_month"],
            "model_by_moment": True,
            "period_moment": "hour",
        },
        {
            "name": "my_first_rf",
            "model": RandomForestRegressor,
            "features_sets": ["temperature_only"],
            "kwargs_build": {"n_estimators": 10},
            "model_by_moment": True,
            "period_moment": "hour",
        },
        {
            "name": "my_first_gam",
            "model": "gam",
            "features_sets": ["temperature_day_month"],
            "kwargs_build": {
                "s_terms": [
                    {
                        "feature": "Historical_temperature_smoothed",
                        "extra_kwargs": {"n_splines": 13},
                    },
                ],
                "f_terms": [
                    {
                        "feature": "day_of_week",
                    },
                    {
                        "feature": "month",
                    },
                ],
            },
            "model_by_moment": True,
            "period_moment": "hour",
        },
        {
            "name": "aggregate_model",
            "features_sets": ["temperature_day_month_hour"],
            "model": {"my_lasso": "lasso", "my_rf": RandomForestRegressor},
            "kwargs_build": {
                "weights": "trained",
                "my_rf": {"n_estimators": 10},
            },
            "model_by_moment": False,
            "period_moment": "hour",
        },
    ],
    train_test_split_method="split_proportion",
    test_size=0.25,
    metrics=["MAPE", "R2", "MSE"],
)

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement sia_ts_modelling (from versions: none)
ERROR: No matching distribution found for sia_ts_modelling


Note: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'sia_ts_modelling'

Elijo el modelo de GAM por tener menor MAPE en test

Voy a calcular los SHAP values para entender cómo influye cada feature en el resultado final

In [None]:
import shap

# NOTE(review): `temperature_irradiance_day_year_holiday` (used as a column
# selector) and `my_gam_model` are not defined in any cell visible in this
# notebook. They must be created before this cell can run on a fresh kernel,
# presumably from the feature sets and `all_models` returned by pipeline() -
# confirm.

# Feature table with complete rows only, computed once and reused for both
# the background sample and the rows to explain.
shap_features_df = df_load_temperature_holidays[
    temperature_irradiance_day_year_holiday
].dropna()

# Background data for the masker: up to 1000 rows drawn from the full feature
# table (not restricted to the training split).
background = shap.maskers.Independent(
    shap_features_df,
    max_samples=1000,
)

explainer_gam = shap.Explainer(my_gam_model.predict, background)

# The shap values are computed on at most 3000 randomly sampled rows to limit
# the computation time. The sample size is capped at the number of available
# rows (sampling more than that raises) and the draw is seeded so that
# re-running the notebook explains the same rows.
shap_sample_df = shap_features_df.sample(
    n=min(3000, len(shap_features_df)), random_state=0
)
shap_values_gam = explainer_gam(shap_sample_df)