In [95]:
import warnings

warnings.filterwarnings("ignore")

import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import sklearn
import lightgbm
import xgboost

# Classes

### DataStorage

In [96]:
class DataStorage:
    root = "/kaggle/input/predict-energy-behavior-of-prosumers"

    data_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
        "row_id",
    ]
    client_cols = [
        "product_type",
        "county",
        "eic_count",
        "installed_capacity",
        "is_business",
        "date",
    ]
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    forecast_weather_cols = [
        "latitude",
        "longitude",
        "hours_ahead",
        "temperature",
        "dewpoint",
        "cloudcover_high",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_total",
        "10_metre_u_wind_component",
        "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation",
        "surface_solar_radiation_downwards",
        "snowfall",
        "total_precipitation",
    ]
    historical_weather_cols = [
        "datetime",
        "temperature",
        "dewpoint",
        "rain",
        "snowfall",
        "surface_pressure",
        "cloudcover_total",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "windspeed_10m",
        "winddirection_10m",
        "shortwave_radiation",
        "direct_solar_radiation",
        "diffuse_radiation",
        "latitude",
        "longitude",
    ]
    location_cols = ["longitude", "latitude", "county"]
    target_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
    ]

    def __init__(self):
        self.df_data = pl.read_csv(
            os.path.join(self.root, "train.csv"),
            columns=self.data_cols,
            try_parse_dates=True,
        )
        self.df_client = pl.read_csv(
            os.path.join(self.root, "client.csv"),
            columns=self.client_cols,
            try_parse_dates=True,
        )
        self.df_gas_prices = pl.read_csv(
            os.path.join(self.root, "gas_prices.csv"),
            columns=self.gas_prices_cols,
            try_parse_dates=True,
        )
        self.df_electricity_prices = pl.read_csv(
            os.path.join(self.root, "electricity_prices.csv"),
            columns=self.electricity_prices_cols,
            try_parse_dates=True,
        )
        self.df_forecast_weather = pl.read_csv(
            os.path.join(self.root, "forecast_weather.csv"),
            columns=self.forecast_weather_cols,
            try_parse_dates=True,
        )
        self.df_historical_weather = pl.read_csv(
            os.path.join(self.root, "historical_weather.csv"),
            columns=self.historical_weather_cols,
            try_parse_dates=True,
        )
        self.df_weather_station_to_county_mapping = pl.read_csv(
            os.path.join(self.root, "weather_station_to_county_mapping.csv"),
            columns=self.location_cols,
            try_parse_dates=True,
        )
        self.df_data = self.df_data.filter(
            pl.col("datetime") >= pd.to_datetime("2022-01-01")
        )
        self.df_target = self.df_data.select(self.target_cols)

        self.schema_data = self.df_data.schema
        self.schema_client = self.df_client.schema
        self.schema_gas_prices = self.df_gas_prices.schema
        self.schema_electricity_prices = self.df_electricity_prices.schema
        self.schema_forecast_weather = self.df_forecast_weather.schema
        self.schema_historical_weather = self.df_historical_weather.schema
        self.schema_target = self.df_target.schema

        self.df_weather_station_to_county_mapping = (
            self.df_weather_station_to_county_mapping.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
        )

    def update_with_new_data(
        self,
        df_new_client,
        df_new_gas_prices,
        df_new_electricity_prices,
        df_new_forecast_weather,
        df_new_historical_weather,
        df_new_target,
    ):
        df_new_client = pl.from_pandas(
            df_new_client[self.client_cols], schema_overrides=self.schema_client
        )
        df_new_gas_prices = pl.from_pandas(
            df_new_gas_prices[self.gas_prices_cols],
            schema_overrides=self.schema_gas_prices,
        )
        df_new_electricity_prices = pl.from_pandas(
            df_new_electricity_prices[self.electricity_prices_cols],
            schema_overrides=self.schema_electricity_prices,
        )
        df_new_forecast_weather = pl.from_pandas(
            df_new_forecast_weather[self.forecast_weather_cols],
            schema_overrides=self.schema_forecast_weather,
        )
        df_new_historical_weather = pl.from_pandas(
            df_new_historical_weather[self.historical_weather_cols],
            schema_overrides=self.schema_historical_weather,
        )
        df_new_target = pl.from_pandas(
            df_new_target[self.target_cols], schema_overrides=self.schema_target
        )

        self.df_client = pl.concat([self.df_client, df_new_client]).unique(
            ["date", "county", "is_business", "product_type"]
        )
        self.df_gas_prices = pl.concat([self.df_gas_prices, df_new_gas_prices]).unique(
            ["forecast_date"]
        )
        self.df_electricity_prices = pl.concat(
            [self.df_electricity_prices, df_new_electricity_prices]
        ).unique(["forecast_date"])
        self.df_forecast_weather = pl.concat(
            [self.df_forecast_weather, df_new_forecast_weather]
        ).unique(["forecast_datetime", "latitude", "longitude", "hours_ahead"])
        self.df_historical_weather = pl.concat(
            [self.df_historical_weather, df_new_historical_weather]
        ).unique(["datetime", "latitude", "longitude"])
        self.df_target = pl.concat([self.df_target, df_new_target]).unique(
            ["datetime", "county", "is_business", "product_type", "is_consumption"]
        )

    def preprocess_test(self, df_test):
        df_test = df_test.rename(columns={"prediction_datetime": "datetime"})
        df_test = pl.from_pandas(
            df_test[self.data_cols[1:]], schema_overrides=self.schema_data
        )
        return df_test

### FeaturesGenerator

In [97]:
class FeaturesGenerator:
    def __init__(self, data_storage):
        self.data_storage = data_storage

    def _add_general_features(self, df_features):
        df_features = (
            df_features.with_columns(
                pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
                pl.col("datetime").dt.hour().alias("hour"),
                pl.col("datetime").dt.day().alias("day"),
                pl.col("datetime").dt.weekday().alias("weekday"),
                pl.col("datetime").dt.month().alias("month"),
                pl.col("datetime").dt.year().alias("year"),
            )
            .with_columns(
                pl.concat_str(
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    separator="_",
                ).alias("segment"),
            )
            .with_columns(
                (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
                (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
                (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
                (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            )
        )
        return df_features

    def _add_client_features(self, df_features):
        df_client = self.data_storage.df_client

        df_features = df_features.join(
            df_client.with_columns(
                (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
            ),
            on=["county", "is_business", "product_type", "date"],
            how="left",
        )
        return df_features

    def _add_forecast_weather_features(self, df_features):
        df_forecast_weather = self.data_storage.df_forecast_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_forecast_weather = (
            df_forecast_weather.rename({"forecast_datetime": "datetime"})
            .filter((pl.col("hours_ahead") >= 22) & pl.col("hours_ahead") <= 45)
            .drop("hours_ahead")
            .with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_forecast_weather_date = (
            df_forecast_weather.group_by("datetime").mean().drop("county")
        )

        df_forecast_weather_local = (
            df_forecast_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [0, 7 * 24]:
            df_features = df_features.join(
                df_forecast_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                df_forecast_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        return df_features

    def _add_historical_weather_features(self, df_features):
        df_historical_weather = self.data_storage.df_historical_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_historical_weather = (
            df_historical_weather.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_historical_weather_date = (
            df_historical_weather.group_by("datetime").mean().drop("county")
        )

        df_historical_weather_local = (
            df_historical_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                df_historical_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        for hours_lag in [1 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                .filter(pl.col("hour") <= 10)
                .drop("hour"),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        return df_features

    def _add_target_features(self, df_features):
        df_target = self.data_storage.df_target

        df_target_all_type_sum = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )

        df_target_all_county_type_sum = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        for hours_lag in [
            2 * 24,
            3 * 24,
            4 * 24,
            5 * 24,
            6 * 24,
            7 * 24,
            8 * 24,
            9 * 24,
            10 * 24,
            11 * 24,
            12 * 24,
            13 * 24,
            14 * 24,
        ]:
            df_features = df_features.join(
                df_target.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_{hours_lag}h"}),
                on=[
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    "datetime",
                ],
                how="left",
            )

        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            df_features = df_features.join(
                df_target_all_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_type_sum_{hours_lag}h"}),
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )

            df_features = df_features.join(
                df_target_all_county_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"}),
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )

        cols_for_stats = [
            f"target_{hours_lag}h" for hours_lag in [2 * 24, 3 * 24, 4 * 24, 5 * 24]
        ]
        df_features = df_features.with_columns(
            df_features.select(cols_for_stats).mean(axis=1).alias(f"target_mean"),
            df_features.select(cols_for_stats)
            .transpose()
            .std()
            .transpose()
            .to_series()
            .alias(f"target_std"),
        )

        for target_prefix, lag_nominator, lag_denomonator in [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]:
            df_features = df_features.with_columns(
                (
                    pl.col(f"{target_prefix}_{lag_nominator}h")
                    / (pl.col(f"{target_prefix}_{lag_denomonator}h") + 1e-3)
                ).alias(f"{target_prefix}_ratio_{lag_nominator}_{lag_denomonator}")
            )

        return df_features

    def _reduce_memory_usage(self, df_features):
        df_features = df_features.with_columns(pl.col(pl.Float64).cast(pl.Float32))
        return df_features

    def _drop_columns(self, df_features):
        df_features = df_features.drop(
            "date", "datetime", "hour", "dayofyear"
        )
        return df_features

    def _to_pandas(self, df_features, y):
        cat_cols = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "segment",
        ]

        if y is not None:
            df_features = pd.concat([df_features.to_pandas(), y.to_pandas()], axis=1)
        else:
            df_features = df_features.to_pandas()

        df_features = df_features.set_index("row_id")
                
        df_features[cat_cols] = df_features[cat_cols].astype("category")

        return df_features

    def generate_features(self, df_prediction_items):
        if "target" in df_prediction_items.columns:
            df_prediction_items, y = (
                df_prediction_items.drop("target"),
                df_prediction_items.select("target"),
            )
        else:
            y = None

        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        for add_features in [
            self._add_general_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._reduce_memory_usage,
            self._drop_columns,
        ]:
            df_features = add_features(df_features)

        df_features = self._to_pandas(df_features, y)

        return df_features

# Initialisation

In [98]:
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

# Feature Generation

In [99]:
df_train_features = features_generator.generate_features(data_storage.df_data)
df_train_features = df_train_features.dropna()

In [100]:
import holidays
import datetime

estonian_holidays = holidays.country_holidays('EE', years=range(2021, 2026))
estonian_holidays = list(estonian_holidays.keys())

def add_holidays_as_binary_features(df):
    df['country_holiday'] = df.apply(lambda row: (datetime.date(row['year'], row['month'], row['day']) in estonian_holidays) * 1, axis=1)
    
    return df

df_train_features = add_holidays_as_binary_features(df_train_features)

# Demand Prediction for Consumers

In [101]:
df_train_features = df_train_features[df_train_features['is_consumption'] == 1]
df_train_features.pop('is_consumption')
target = df_train_features.pop('target')

In [102]:
X = df_train_features
y = target

# Benchmarking Different Models

In [104]:
# X, y = X[:1000], y[:1000]
results = []

In [105]:
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits = 5)

## Decision Tree Regressor

In [106]:
from sklearn.tree import DecisionTreeRegressor

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    decisionTreeRegressor = DecisionTreeRegressor(random_state = 42)
    decisionTreeRegressor.fit(X_train, y_train)
    y_pred = decisionTreeRegressor.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))
        
results.append(["DecisionTreeRegressor", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['DecisionTreeRegressor', 41.24169759036144, 13528.550891753013, 105.00161438575813, 0.9877017171710591]


## Random Forest Regressor

In [107]:
from sklearn.ensemble import RandomForestRegressor

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    randomForestRegressor = RandomForestRegressor(random_state = 42, n_jobs = -1)
    randomForestRegressor.fit(X_train, y_train)
    y_pred = randomForestRegressor.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))

results.append(["RandomForestRegressor", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['RandomForestRegressor', 40.14285645783138, 14795.62597391169, 105.4402496722915, 0.9877477993098293]


## Gradient Boosting Regressor

In [108]:
from sklearn.ensemble import GradientBoostingRegressor

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    gradientBoostingRegressor = GradientBoostingRegressor(random_state = 42)
    gradientBoostingRegressor.fit(X_train, y_train)
    y_pred = gradientBoostingRegressor.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))

results.append(["GradientBoostingRegressor", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['GradientBoostingRegressor', 39.851074435969466, 16889.82834009955, 106.26839842012689, 0.9860977881766338]


## Hist Gradient Boosting Regressor

In [109]:
from sklearn.ensemble import HistGradientBoostingRegressor

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    histGradientBoostingRegressor = HistGradientBoostingRegressor(random_state = 42)
    histGradientBoostingRegressor.fit(X_train, y_train)
    y_pred = histGradientBoostingRegressor.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))

results.append(["HistGradientBoostingRegressor", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['HistGradientBoostingRegressor', 96.52718069089214, 79341.26886537582, 248.98548535558666, 0.9281250378233905]


## XGB Regressor

In [110]:
from xgboost import XGBRegressor

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    xgbRegressor = XGBRegressor(random_state = 42, n_jobs = -1, enable_categorical = True)
    xgbRegressor.fit(X_train, y_train)
    y_pred = xgbRegressor.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))

results.append(["XGBRegressor", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['XGBRegressor', 35.948857093856994, 21887.87244233223, 114.43493667626396, 0.9779504901450157]


## LGBM Regressor

In [111]:
from lightgbm import LGBMRegressor

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    lgbmRegressor = LGBMRegressor(random_state = 42, verbosity = -1, n_jobs = -1)
    lgbmRegressor.fit(X_train, y_train)
    y_pred = lgbmRegressor.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))

results.append(["LGBMRegressor", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['LGBMRegressor', 102.79184270403645, 86222.78431075637, 258.3775073152983, 0.9212331407068828]


# Tuned LGBM Regressor

In [112]:
maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    tuned_lgbmRegressor = LGBMRegressor(random_state = 42,
                                        num_leaves = 8,
                                        min_data_in_leaf = 17,
                                        max_bin = 123,
                                        learning_rate = 0.09444207700449292,
                                        num_iterations = 947,
                                        lambda_l1 = 0.016874770690266484,
                                        lambda_l2 = 2.999940839335436e-08,
                                        verbosity = -1,
                                        n_jobs = -1)
    tuned_lgbmRegressor.fit(X_train, y_train)
    y_pred = tuned_lgbmRegressor.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))

results.append(["LGBMRegressor (Tuned)", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['LGBMRegressor (Tuned)', 316.2927847765844, 585958.5157056641, 621.4713096065444, 0.4646482734310151]


## CatBoost Regressor

In [125]:
from catboost import CatBoostRegressor

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    catBoostRegressor = CatBoostRegressor(random_state = 42, verbose = False, cat_features = ['county', 'is_business', 'product_type', 'segment'])
    catBoostRegressor.fit(X_train, y_train)
    y_pred = catBoostRegressor.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))

results.append(["CatBoostRegressor", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

AttributeError: 'DataFrame' object has no attribute 'append'

### One-hot Encoding

In [114]:
cats = ['county', 'is_business', 'product_type', 'segment']
X_train = pd.get_dummies(X_train, columns=cats, drop_first=True)
X_test = pd.get_dummies(X_test, columns=cats, drop_first=True)

## Linear Regression

In [115]:
# from sklearn.linear_model import LinearRegression

# linearRegression = Pipeline([
#     ('StandardScaler', StandardScaler()),
#     ('LinearRegression', LinearRegression())
# ])

# linearRegression.fit(X_train, y_train)
# y_pred = linearRegression.predict(X_test)

# mae = MAE(y_test, y_pred)
# mse = MSE(y_test, y_pred)
# rmse = np.sqrt(mse)
# r2 = R2(y_test, y_pred)

# results.append(["LinearRegression", mae, mse, rmse, r2])
# print(results[-1])

## Elastic Net

In [116]:
from sklearn.linear_model import ElasticNet

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    elasticNet = Pipeline([
        ('StandardScaler', StandardScaler()),
        ('ElasticNet', ElasticNet(random_state = 42))
    ])

    elasticNet.fit(X_train, y_train)
    y_pred = elasticNet.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))

results.append(["ElasticNet", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['ElasticNet', 150.136144965297, 46695.46440575803, 208.8755046177881, 0.9526856698506055]


## Lasso

In [117]:
from sklearn.linear_model import Lasso

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    lasso = Pipeline([
        ('StandardScaler', StandardScaler()),
        ('Lasso', Lasso(random_state = 42))
    ])

    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))

results.append(["Lasso", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['Lasso', 79.72507137828163, 52066.482689565535, 171.03665478102025, 0.9502354125642434]


## Ridge

In [118]:
from sklearn.linear_model import Ridge

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    ridge = Pipeline([
        ('StandardScaler', StandardScaler()),
        ('Ridge', Ridge())
    ])

    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))

results.append(["Ridge", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['Ridge', 145.0017856326503, 166460.27208339865, 292.16792415141174, 0.8392607975412512]


## LinearSVR

In [119]:
from sklearn.svm import LinearSVR

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    linearSVR = Pipeline([
        ('StandardScaler', StandardScaler()),
        ('LinearSVR', LinearSVR())
    ])

    linearSVR.fit(X_train, y_train)
    y_pred = linearSVR.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))

results.append(["LinearSVR", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['LinearSVR', 162.81940064827273, 264395.1417599014, 437.5713513550827, 0.7412153533958503]


## SVR

In [120]:
# from sklearn.svm import SVR

# svr = Pipeline([
#     ('StandardScaler', StandardScaler()),
#     ('SVR', SVR())
# ])

# svr.fit(X_train, y_train)
# y_pred = svr.predict(X_test)

# mae = MAE(y_test, y_pred)
# mse = MSE(y_test, y_pred)
# rmse = np.sqrt(mse)
# r2 = R2(y_test, y_pred)

# results.append(["SVR", mae, mse, rmse, r2])
# print(results[-1])

## KNeighbors Regressor

In [121]:
from sklearn.neighbors import KNeighborsRegressor

maes = []
mses = []
rmses = []
r2s = []

for i, (train_index, test_index) in enumerate(tscv.split(X)):  
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    kNeighborsRegressor = Pipeline([
        ('StandardScaler', StandardScaler()),
        ('KNeighborsRegressor', KNeighborsRegressor())
    ])

    kNeighborsRegressor.fit(X_train, y_train)
    y_pred = kNeighborsRegressor.predict(X_test)

    maes.append(MAE(y_test, y_pred))
    mses.append(MSE(y_test, y_pred))
    rmses.append(np.sqrt(mses[-1]))
    r2s.append(R2(y_test, y_pred))
    
results.append(["KNeighborsRegressor", np.mean(maes), np.mean(mses), np.mean(rmses), np.mean(r2s)])
print(results[-1])

['KNeighborsRegressor', 227.62744891566263, 278547.9062211109, 517.9308342981318, 0.7211166887656375]


# Result

In [122]:
results = pd.DataFrame(results, columns = ["Models", "MAE", "MSE", "RMSE", "R^2"]).sort_values("R^2", ascending = False).reset_index(drop = True)

In [123]:
results

Unnamed: 0,Models,MAE,MSE,RMSE,R^2
0,RandomForestRegressor,40.142856,14795.625974,105.44025,0.987748
1,DecisionTreeRegressor,41.241698,13528.550892,105.001614,0.987702
2,GradientBoostingRegressor,39.851074,16889.82834,106.268398,0.986098
3,XGBRegressor,35.948857,21887.872442,114.434937,0.97795
4,ElasticNet,150.136145,46695.464406,208.875505,0.952686
5,Lasso,79.725071,52066.48269,171.036655,0.950235
6,CatBoostRegressor,74.230345,77443.291249,257.985413,0.932756
7,HistGradientBoostingRegressor,96.527181,79341.268865,248.985485,0.928125
8,LGBMRegressor,102.791843,86222.784311,258.377507,0.921233
9,Ridge,145.001786,166460.272083,292.167924,0.839261


In [124]:
results.to_csv('results.csv')