## Carga de datos

Antes de poder manipular los diferentes datos, primero es necesario
cargarlos en el sistema.

In [1]:
import polars as pl
import numpy as np 
from pathlib import Path
from sklearn.metrics.pairwise import haversine_distances

In [2]:
# Folder that holds the raw CSV files
data_folder = Path("../data")

# Short name -> CSV file name, in the order the files are loaded
files = dict(
    train="train.csv",
    gas_prices="gas_prices.csv",
    client="client.csv",
    electricity_prices="electricity_prices.csv",
    forecast_weather="forecast_weather.csv",
    historical_weather="historical_weather.csv",
    weather_station="weather_station_to_county_mapping.csv",
)

In [3]:
# Keep every loaded DataFrame in a dictionary keyed by its short name so the
# frames are easy to look up and can be iterated over when needed.
dfs: dict[str, pl.DataFrame] = {}

for key, filename in files.items():
    filepath = data_folder / filename
    try:
        # Only the read itself is guarded; nothing else here is expected to fail.
        df = pl.read_csv(filepath)
    except Exception as exc:
        # Best-effort load: report the reason and continue with the next file.
        # `Exception` (not a bare `except`) lets Ctrl-C / SystemExit propagate.
        print(
            f"No pudo cargarse el archivo {key}. "
            f"({type(exc).__name__}: {exc})"
        )
    else:
        dfs[key] = df
        print(f"Archivo {key} cargado con exito.")
    

Archivo train cargado con exito.
Archivo gas_prices cargado con exito.
Archivo client cargado con exito.
Archivo electricity_prices cargado con exito.
Archivo forecast_weather cargado con exito.
Archivo historical_weather cargado con exito.
Archivo weather_station cargado con exito.


## Asignación de longitud/latitud a un condado específico

Con el fin de poder unir los archivos "train.csv" y "forecast_weather.csv",
es necesario asignar a cada par de longitud y latitud su condado
correspondiente.

Para ello se utilizará el archivo "weather_station_to_county_mapping" con datos que asignan latitudes y longitudes a su correspondiente condado.

### Visualización de los datos

In [61]:
class DataProcessor:
    """Turn the raw CSV frames into a single feature table.

    Calling an instance with the dictionary of raw frames derives features
    for each source, left-joins them onto the ``train`` rows and returns the
    cleaned result (see ``__call__``).
    """

    # County codes removed from both the train and the client frames.
    # NOTE(review): the reason for excluding county 12 is not visible in this
    # file - confirm against the dataset documentation.
    EXCLUDED_COUNTIES = (12,)

    def __init__(self):
        # Join keys used between the train frame and each auxiliary frame.
        self.gas_join = ["date"]
        self.ep_join = ["datetime"]
        self.hw_join = ["county", "datetime"]
        self.client_join = ["product_type", "county", "is_business", "date"]
        # Names of the coordinate columns.
        self.lat_lon = ["latitude", "longitude"]

    def add_counties(self, df: pl.DataFrame, stations: pl.DataFrame):
        """Add a ``county`` column holding the county of the nearest station.

        Args:
            df: frame with ``latitude`` and ``longitude`` columns (degrees).
            stations: frame with ``latitude``, ``longitude`` and ``county``
                columns; rows containing any null are ignored.

        Returns:
            ``df`` with an extra Int64 ``county`` column. Every coordinate
            gets the county of the station with the smallest haversine
            distance (first station wins on ties). If no usable station
            exists, the county is ``-1``.
        """
        # Distances are only computed once per distinct coordinate pair.
        coords = df.select(self.lat_lon).unique()
        coords_rad = coords.select(
            pl.col(self.lat_lon).radians()
        ).to_numpy()

        known = stations.drop_nulls()
        station_rad = known.select(
            pl.col(self.lat_lon).radians()
        ).to_numpy()
        # Truncating cast, in case the county column was read as float.
        station_county = known["county"].to_numpy().astype(np.int64)

        if len(coords_rad) == 0 or len(station_rad) == 0:
            # Nothing to compare against: keep the -1 "no county" marker.
            nearest = np.full(len(coords_rad), -1, dtype=np.int64)
        else:
            # One (n_coords, n_stations) distance matrix instead of a Python
            # double loop; argmin returns the first minimum of each row.
            distances = haversine_distances(coords_rad, station_rad)
            nearest = station_county[distances.argmin(axis=1)]

        lookup = coords.with_columns(
            pl.Series("county", nearest, dtype=pl.Int64)
        )

        # The coordinates come from df itself, so the float keys match exactly.
        return df.join(lookup, on=self.lat_lon, how="left")

    def remove_counties(self, df: pl.DataFrame, counties: set[int]):
        """Return ``df`` without the rows whose ``county`` is in ``counties``.

        Callers in this class pass a list; the values are handed to
        ``Expr.is_in`` unchanged.
        """
        return df.filter(~pl.col("county").is_in(counties))

    def change_names(self, df: pl.DataFrame, suffix, no_change):
        """Append ``suffix`` to every column name not listed in ``no_change``."""
        mapping = {
            name: name + suffix
            for name in df.columns
            if name not in no_change
        }
        return df.rename(mapping)

    def to_datetime(self, df: pl.DataFrame, col: str):
        """Parse the string column ``col`` into ``pl.Datetime`` in place."""
        return df.with_columns(
            pl.col(col).str.strptime(pl.Datetime)
        )

    def get_gas_features(self, gas_df: pl.DataFrame):
        """Build the gas price features, keyed by ``date``.

        Adds the midpoint of the lowest and highest price, parses
        ``origin_date`` as a date, renames it to ``date`` and suffixes every
        other column with ``_gas``.
        """
        low = pl.col("lowest_price_per_mwh")
        high = pl.col("highest_price_per_mwh")

        df = gas_df.with_columns(
            ((low + high) / 2).alias("mean_price_per_mwh_24h")
        )
        df = df.with_columns(pl.col("origin_date").str.strptime(pl.Date))
        df = df.rename({"origin_date": "date"})

        return self.change_names(df, "_gas", self.gas_join)

    def get_client_features(self, client_df: pl.DataFrame):
        """Build the client features.

        Parses ``date``, drops the excluded counties and suffixes every
        non-join column with ``_client``.
        """
        df = client_df.with_columns(
            pl.col("date").str.strptime(pl.Date)
        )
        df = self.remove_counties(df, list(self.EXCLUDED_COUNTIES))

        return self.change_names(df, "_client", self.client_join)

    def get_hw_means(self, data: pl.DataFrame) -> pl.DataFrame:
        """Average every remaining column per ``(datetime, county)`` group."""
        return data.group_by(
            ["datetime", "county"], maintain_order=True
        ).agg(pl.all().mean())

    def get_hist_weather_features(self, hw_df: pl.DataFrame, ws_df: pl.DataFrame):
        """Build per-county, per-hour historical weather features.

        Assigns a county to every measurement, averages the measurements of
        each county and hour, drops the coordinate columns and suffixes every
        non-join column with ``_hw``.
        """
        df = self.to_datetime(hw_df, "datetime")
        df = self.add_counties(df, ws_df)
        df = self.get_hw_means(df)
        df = df.drop(self.lat_lon)

        return self.change_names(df, "_hw", self.hw_join)

    def predict_epmwh(self, df: pl.DataFrame):
        """Fill the hours missing from the electricity prices.

        A complete hourly ``origin_date`` range is built between the first
        and last known timestamp. For every hour of day that has gaps, a
        linear regression of ``euros_per_mwh`` on ``data_block_id`` is fitted
        on the known rows of that hour and used to predict the missing rows.

        Args:
            df: electricity prices with ``forecast_date``, ``euros_per_mwh``,
                ``origin_date`` (already parsed as datetimes) and
                ``data_block_id`` columns.

        Returns:
            ``df`` with the predicted rows appended; ``df`` itself when it is
            empty or has no missing hour.
        """
        from sklearn.linear_model import LinearRegression

        if df.is_empty():
            return df

        known_dates = df.select(["origin_date", "data_block_id"])

        min_date = known_dates["origin_date"].min()
        max_date = known_dates["origin_date"].max()

        origin_date_range = pl.datetime_range(
            start=min_date,
            end=max_date,
            interval="1h",
            time_unit="us",
            eager=True
        )

        # The forecast timestamps are the origin timestamps one day later.
        one_day = pl.duration(days=1)
        forecast_date_range = pl.datetime_range(
            start=min_date + one_day,
            end=max_date + one_day,
            interval="1h",
            time_unit="us",
            eager=True
        )

        # One id per block of 24 hours, starting at 1. Integer division keeps
        # the length right even when the range is not a whole number of days.
        data_block_id_range = np.arange(len(origin_date_range)) // 24 + 1

        complete_dates = pl.DataFrame({
            "forecast_date": forecast_date_range,
            "origin_date": origin_date_range,
            "data_block_id": data_block_id_range
        })

        # Timestamps of the complete range that have no row in df
        missing_dates = complete_dates.join(
            known_dates,
            on="origin_date",
            how="anti"
        )

        if missing_dates.is_empty():
            return df

        missing_dates = missing_dates.with_columns(
            pl.col("origin_date").dt.hour().alias("hour")
        )

        # Iterate the hour VALUES. Iterating a DataFrame yields its columns,
        # which only worked by accident with a single missing hour.
        hours = missing_dates["hour"].unique(maintain_order=True).to_list()

        predicted = []
        for hour in hours:
            train_rows = df.filter(
                pl.col("origin_date").dt.hour() == hour
            )
            X = train_rows["data_block_id"].to_numpy().reshape(-1, 1)
            y = train_rows["euros_per_mwh"].to_numpy()

            pred_rows = missing_dates.filter(pl.col("hour") == hour)
            X_pred = pred_rows["data_block_id"].to_numpy().reshape(-1, 1)

            y_pred = LinearRegression().fit(X, y).predict(X_pred)

            new_df = pl.DataFrame({
                "forecast_date": pred_rows["forecast_date"],
                "euros_per_mwh": y_pred,
                "origin_date": pred_rows["origin_date"],
                "data_block_id": pred_rows["data_block_id"]
            })
            # Align the column order with df before stacking.
            predicted.append(new_df.select(df.columns))

        return pl.concat([df, *predicted])

    def get_electricity_features(self, ep_df: pl.DataFrame):
        """Build the electricity price features, keyed by ``datetime``.

        Parses both date columns, fills the missing hours, renames
        ``origin_date`` to ``datetime`` and ``euros_per_mwh`` to
        ``euros_per_mwh_24h``, then suffixes every non-join column with
        ``_ep``.
        """
        df = self.to_datetime(ep_df, "origin_date")
        df = self.to_datetime(df, "forecast_date")

        # Predict euros per mwh for the missing hours
        df = self.predict_epmwh(df)

        df = df.rename({
            "origin_date": "datetime",
            "euros_per_mwh": "euros_per_mwh_24h"
        })

        return self.change_names(df, "_ep", self.ep_join)

    def get_data_features(self, data: pl.DataFrame):
        """Add calendar features to the train frame.

        Parses ``datetime``, derives the date and the calendar parts listed
        below, drops the excluded counties and removes ``row_id``.
        """
        df = self.to_datetime(data, "datetime")

        ts = pl.col("datetime")
        df = df.with_columns([
            ts.dt.truncate("1d").cast(pl.Date).alias("date"),
            ts.dt.year().alias("year"),
            ts.dt.quarter().alias("quarter"),
            ts.dt.month().alias("month"),
            ts.dt.week().alias("week"),
            ts.dt.hour().alias("hour"),
            ts.dt.ordinal_day().alias("day_of_year"),
            ts.dt.day().alias("day_of_month"),
            ts.dt.weekday().alias("day_of_week")
        ])

        df = self.remove_counties(df, list(self.EXCLUDED_COUNTIES))
        df = df.drop("row_id")

        return df

    def remove_time_changes_nulls(self, df: pl.DataFrame):
        """Drop the rows whose ``target`` is null.

        Per the original author's note, these nulls occur at daylight-saving
        time changes.
        """
        return df.filter(pl.col("target").is_not_null())

    def _add_client_id(self, df: pl.DataFrame):
        """Append a 1-based ``client_id`` column.

        Each distinct (product_type, county, is_business) combination gets
        one id, numbered in order of first appearance in ``df``.
        """
        cols = self.client_join[:3]

        # Only the key columns are needed for the lookup; joining the full
        # unique rows would duplicate every other column as ``*_right``.
        ids = (
            df.select(cols)
            .unique(maintain_order=True)
            .with_row_index('client_id', offset=1)
        )

        return df.join(ids, on=cols, how='left')

    def __call__(self, df_dict: dict):
        """Run the whole pipeline and return the final feature table.

        Args:
            df_dict: raw frames keyed by ``"train"``, ``"gas_prices"``,
                ``"electricity_prices"``, ``"client"``,
                ``"historical_weather"`` and ``"weather_station"``.

        Returns:
            The train rows up to 2023-05-29 23:00:00 with a non-null target,
            enriched with client, gas, electricity and historical weather
            features plus ``client_id``. All ``data_block_id`` and
            ``forecast_date`` columns are removed.
        """
        # Features
        data = self.get_data_features(df_dict["train"])
        gas_p = self.get_gas_features(df_dict["gas_prices"])
        el_p = self.get_electricity_features(df_dict["electricity_prices"])
        client = self.get_client_features(df_dict["client"])
        hist_w = self.get_hist_weather_features(
            df_dict["historical_weather"],
            df_dict["weather_station"]
        )

        # Joins: the train rows are always the left side
        final_data = data.join(client, on=self.client_join, how="left")
        final_data = final_data.join(gas_p, on=self.gas_join, how="left")
        final_data = final_data.join(el_p, on=self.ep_join, how="left")
        final_data = final_data.join(hist_w, on=self.hw_join, how="left")

        # Exclude everything after the limit date
        limit_date = pl.lit("2023-05-29 23:00:00").str.strptime(pl.Datetime)
        final_data = final_data.filter(pl.col("datetime") <= limit_date)

        # Remove the rows with a null target
        final_data = self.remove_time_changes_nulls(final_data)

        # Add client_id to final data
        final_data = self._add_client_id(final_data)

        # Drop every data_block_id / forecast_date column (any suffix) in a
        # single pass.
        regex = '^.*(data_block_id|forecast_date).*$'
        final_data = final_data.select(pl.all().exclude(regex))

        return final_data

In [62]:
# Run the whole preprocessing pipeline on the loaded raw frames
dp = DataProcessor()
res = dp(dfs)

In [63]:
# Preview the first rows of the processed frame
res.head()

county,is_business,product_type,target,is_consumption,datetime,prediction_unit_id,date,year,quarter,month,week,hour,day_of_year,day_of_month,day_of_week,eic_count_client,installed_capacity_client,lowest_price_per_mwh_gas,highest_price_per_mwh_gas,mean_price_per_mwh_24h_gas,euros_per_mwh_24h_ep,temperature_hw,dewpoint_hw,rain_hw,snowfall_hw,surface_pressure_hw,cloudcover_total_hw,cloudcover_low_hw,cloudcover_mid_hw,cloudcover_high_hw,windspeed_10m_hw,winddirection_10m_hw,shortwave_radiation_hw,direct_solar_radiation_hw,diffuse_radiation_hw,client_id
i64,i64,i64,f64,i64,datetime[μs],i64,date,i32,i8,i8,i8,i8,i16,i8,i8,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32
0,0,1,0.713,0,2021-09-01 00:00:00,0,2021-09-01,2021,3,9,35,0,244,1,3,108,952.89,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,1
0,0,1,96.59,1,2021-09-01 00:00:00,0,2021-09-01,2021,3,9,35,0,244,1,3,108,952.89,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,1
0,0,2,0.0,0,2021-09-01 00:00:00,1,2021-09-01,2021,3,9,35,0,244,1,3,17,166.4,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,2
0,0,2,17.314,1,2021-09-01 00:00:00,1,2021-09-01,2021,3,9,35,0,244,1,3,17,166.4,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,2
0,0,3,2.904,0,2021-09-01 00:00:00,2,2021-09-01,2021,3,9,35,0,244,1,3,688,7207.88,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,3


In [57]:
# Scratch check of the client_id numbering: rebuild the ids by hand and look
# at the rows of client 68.
cols = ['county', 'is_business', 'product_type']

# `res` already carries a `client_id` column when it comes from
# DataProcessor.__call__, which would collide with the index added below.
# Excluding it (a no-op when the column is absent) keeps this cell runnable
# in either state.
base = res.select(pl.all().exclude('client_id'))

uniques = (
    base.unique(cols, maintain_order=True)
    .with_row_index('client_id', offset=1)
)

# Join only the keys and the id so no `*_right` duplicates are created.
(
    base.join(uniques.select(cols + ['client_id']), on=cols, how='left')
    .filter(pl.col('client_id') == 68)
)

county,is_business,product_type,target,is_consumption,datetime,prediction_unit_id,date,year,quarter,month,week,hour,day_of_year,day_of_month,day_of_week,eic_count_client,installed_capacity_client,lowest_price_per_mwh_gas,highest_price_per_mwh_gas,mean_price_per_mwh_24h_gas,euros_per_mwh_24h_ep,temperature_hw,dewpoint_hw,rain_hw,snowfall_hw,surface_pressure_hw,cloudcover_total_hw,cloudcover_low_hw,cloudcover_mid_hw,cloudcover_high_hw,windspeed_10m_hw,winddirection_10m_hw,shortwave_radiation_hw,direct_solar_radiation_hw,diffuse_radiation_hw,client_id
i64,i64,i64,f64,i64,datetime[μs],i64,date,i32,i8,i8,i8,i8,i16,i8,i8,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32
14,1,2,0.0,0,2022-11-01 00:00:00,68,2022-11-01,2022,4,11,44,0,305,1,2,5,149.5,97.02,127.0,112.01,106.38,6.566667,5.433333,0.0,0.0,1010.666667,100.0,100.0,47.333333,0.0,3.944444,234.0,0.0,0.0,0.0,68
14,1,2,25.363,1,2022-11-01 00:00:00,68,2022-11-01,2022,4,11,44,0,305,1,2,5,149.5,97.02,127.0,112.01,106.38,6.566667,5.433333,0.0,0.0,1010.666667,100.0,100.0,47.333333,0.0,3.944444,234.0,0.0,0.0,0.0,68
14,1,2,0.0,0,2022-11-01 01:00:00,68,2022-11-01,2022,4,11,44,1,305,1,2,5,149.5,97.02,127.0,112.01,100.95,7.0,6.033333,0.0,0.0,1010.2,98.0,100.0,14.333333,1.333333,4.083333,231.666667,0.0,0.0,0.0,68
14,1,2,26.006,1,2022-11-01 01:00:00,68,2022-11-01,2022,4,11,44,1,305,1,2,5,149.5,97.02,127.0,112.01,100.95,7.0,6.033333,0.0,0.0,1010.2,98.0,100.0,14.333333,1.333333,4.083333,231.666667,0.0,0.0,0.0,68
14,1,2,0.0,0,2022-11-01 02:00:00,68,2022-11-01,2022,4,11,44,2,305,1,2,5,149.5,97.02,127.0,112.01,100.9,7.4,6.633333,0.033333,0.0,1009.833333,99.666667,100.0,6.0,26.333333,4.111111,224.333333,0.0,0.0,0.0,68
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
14,1,2,6.717,1,2023-05-29 21:00:00,68,2023-05-29,2023,2,5,22,21,149,29,1,7,246.0,29.0,34.0,31.5,82.1,12.666667,1.4,0.0,0.0,1010.233333,38.666667,1.0,47.333333,30.666667,2.574074,323.333333,21.0,6.333333,14.666667,68
14,1,2,0.015,0,2023-05-29 22:00:00,68,2023-05-29,2023,2,5,22,22,149,29,1,7,246.0,29.0,34.0,31.5,82.09,11.3,1.3,0.0,0.0,1010.5,49.333333,0.0,71.333333,22.0,2.453704,330.666667,0.0,0.0,0.0,68
14,1,2,10.148,1,2023-05-29 22:00:00,68,2023-05-29,2023,2,5,22,22,149,29,1,7,246.0,29.0,34.0,31.5,82.09,11.3,1.3,0.0,0.0,1010.5,49.333333,0.0,71.333333,22.0,2.453704,330.666667,0.0,0.0,0.0,68
14,1,2,0.0,0,2023-05-29 23:00:00,68,2023-05-29,2023,2,5,22,23,149,29,1,7,246.0,29.0,34.0,31.5,-1.29,10.266667,1.233333,0.0,0.0,1010.633333,59.0,0.0,78.0,41.666667,2.25,331.666667,0.0,0.0,0.0,68


In [56]:
# Display the `uniques` frame: the first row of each distinct
# (county, is_business, product_type) combination, numbered by client_id
uniques

client_id,county,is_business,product_type,target,is_consumption,datetime,prediction_unit_id,date,year,quarter,month,week,hour,day_of_year,day_of_month,day_of_week,eic_count_client,installed_capacity_client,lowest_price_per_mwh_gas,highest_price_per_mwh_gas,mean_price_per_mwh_24h_gas,euros_per_mwh_24h_ep,temperature_hw,dewpoint_hw,rain_hw,snowfall_hw,surface_pressure_hw,cloudcover_total_hw,cloudcover_low_hw,cloudcover_mid_hw,cloudcover_high_hw,windspeed_10m_hw,winddirection_10m_hw,shortwave_radiation_hw,direct_solar_radiation_hw,diffuse_radiation_hw
u32,i64,i64,i64,f64,i64,datetime[μs],i64,date,i32,i8,i8,i8,i8,i16,i8,i8,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,0,0,1,0.713,0,2021-09-01 00:00:00,0,2021-09-01,2021,3,9,35,0,244,1,3,108,952.89,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0
2,0,0,2,0.0,0,2021-09-01 00:00:00,1,2021-09-01,2021,3,9,35,0,244,1,3,17,166.4,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0
3,0,0,3,2.904,0,2021-09-01 00:00:00,2,2021-09-01,2021,3,9,35,0,244,1,3,688,7207.88,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0
4,0,1,0,0.0,0,2021-09-01 00:00:00,3,2021-09-01,2021,3,9,35,0,244,1,3,5,400.0,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0
5,0,1,1,0.0,0,2021-09-01 00:00:00,4,2021-09-01,2021,3,9,35,0,244,1,3,43,1411.0,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
64,15,1,0,0.0,0,2021-12-01 00:00:00,64,2021-12-01,2021,4,12,48,0,335,1,3,8,260.0,85.2,93.9,89.55,99.17,-6.28,-8.85,0.0,0.014,972.37,93.5,95.4,1.8,50.5,5.572222,286.3,0.0,0.0,0.0
65,2,1,1,0.0,0,2022-01-01 00:00:00,65,2022-01-01,2022,1,1,52,0,1,1,6,5,95.0,82.74,89.99,86.365,57.08,-1.690909,-2.663636,0.0,0.0,996.1,93.272727,62.272727,55.545455,32.636364,2.780303,283.545455,0.0,0.0,0.0
66,4,1,1,0.0,0,2022-01-01 00:00:00,66,2022-01-01,2022,1,1,52,0,1,1,6,8,462.6,82.74,89.99,86.365,57.08,-2.875,-3.3,0.0,0.0,993.8,98.75,87.75,45.0,20.5,2.0,262.0,0.0,0.0,0.0
67,11,1,0,0.0,0,2022-02-01 00:00:00,67,2022-02-01,2022,1,2,5,0,32,1,2,7,280.0,78.01,84.9,81.455,109.6,-3.966667,-5.05,0.0,0.081667,987.266667,100.0,93.5,96.5,36.5,3.680556,112.0,0.0,0.0,0.0


county,is_business,product_type,target,is_consumption,datetime,prediction_unit_id,date,year,quarter,month,week,hour,day_of_year,day_of_month,day_of_week,eic_count_client,installed_capacity_client,lowest_price_per_mwh_gas,highest_price_per_mwh_gas,mean_price_per_mwh_24h_gas,euros_per_mwh_24h_ep,temperature_hw,dewpoint_hw,rain_hw,snowfall_hw,surface_pressure_hw,cloudcover_total_hw,cloudcover_low_hw,cloudcover_mid_hw,cloudcover_high_hw,windspeed_10m_hw,winddirection_10m_hw,shortwave_radiation_hw,direct_solar_radiation_hw,diffuse_radiation_hw,client_id,target_right,is_consumption_right,datetime_right,prediction_unit_id_right,date_right,year_right,quarter_right,month_right,week_right,hour_right,day_of_year_right,day_of_month_right,day_of_week_right,eic_count_client_right,installed_capacity_client_right,lowest_price_per_mwh_gas_right,highest_price_per_mwh_gas_right,mean_price_per_mwh_24h_gas_right,euros_per_mwh_24h_ep_right,temperature_hw_right,dewpoint_hw_right,rain_hw_right,snowfall_hw_right,surface_pressure_hw_right,cloudcover_total_hw_right,cloudcover_low_hw_right,cloudcover_mid_hw_right,cloudcover_high_hw_right,windspeed_10m_hw_right,winddirection_10m_hw_right,shortwave_radiation_hw_right,direct_solar_radiation_hw_right,diffuse_radiation_hw_right
i64,i64,i64,f64,i64,datetime[μs],i64,date,i32,i8,i8,i8,i8,i16,i8,i8,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,f64,i64,datetime[μs],i64,date,i32,i8,i8,i8,i8,i16,i8,i8,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,0,1,0.713,0,2021-09-01 00:00:00,0,2021-09-01,2021,3,9,35,0,244,1,3,108,952.89,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,24,0.713,0,2021-09-01 00:00:00,0,2021-09-01,2021,3,9,35,0,244,1,3,108,952.89,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0
0,0,1,96.59,1,2021-09-01 00:00:00,0,2021-09-01,2021,3,9,35,0,244,1,3,108,952.89,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,24,0.713,0,2021-09-01 00:00:00,0,2021-09-01,2021,3,9,35,0,244,1,3,108,952.89,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0
0,0,2,0.0,0,2021-09-01 00:00:00,1,2021-09-01,2021,3,9,35,0,244,1,3,17,166.4,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,51,0.0,0,2021-09-01 00:00:00,1,2021-09-01,2021,3,9,35,0,244,1,3,17,166.4,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0
0,0,2,17.314,1,2021-09-01 00:00:00,1,2021-09-01,2021,3,9,35,0,244,1,3,17,166.4,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,51,0.0,0,2021-09-01 00:00:00,1,2021-09-01,2021,3,9,35,0,244,1,3,17,166.4,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0
0,0,3,2.904,0,2021-09-01 00:00:00,2,2021-09-01,2021,3,9,35,0,244,1,3,688,7207.88,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,15,2.904,0,2021-09-01 00:00:00,2,2021-09-01,2021,3,9,35,0,244,1,3,688,7207.88,45.62,46.29,45.955,96.99,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
15,1,0,188.167,1,2023-05-29 23:00:00,64,2023-05-29,2023,2,5,22,23,149,29,1,15,620.0,29.0,34.0,31.5,-1.29,11.72,7.02,0.0,0.0,1006.16,19.8,2.1,15.5,28.8,1.802778,173.1,0.0,0.0,0.0,66,0.0,0,2021-12-01 00:00:00,64,2021-12-01,2021,4,12,48,0,335,1,3,8,260.0,85.2,93.9,89.55,99.17,-6.28,-8.85,0.0,0.014,972.37,93.5,95.4,1.8,50.5,5.572222,286.3,0.0,0.0,0.0
15,1,1,0.0,0,2023-05-29 23:00:00,59,2023-05-29,2023,2,5,22,23,149,29,1,20,624.5,29.0,34.0,31.5,-1.29,11.72,7.02,0.0,0.0,1006.16,19.8,2.1,15.5,28.8,1.802778,173.1,0.0,0.0,0.0,57,0.1,0,2021-09-01 00:00:00,59,2021-09-01,2021,3,9,35,0,244,1,3,7,325.0,45.62,46.29,45.955,96.99,13.23,11.86,0.15,0.0,999.97,93.8,84.1,25.0,97.4,4.005556,321.4,0.0,0.0,0.0
15,1,1,31.484,1,2023-05-29 23:00:00,59,2023-05-29,2023,2,5,22,23,149,29,1,20,624.5,29.0,34.0,31.5,-1.29,11.72,7.02,0.0,0.0,1006.16,19.8,2.1,15.5,28.8,1.802778,173.1,0.0,0.0,0.0,57,0.1,0,2021-09-01 00:00:00,59,2021-09-01,2021,3,9,35,0,244,1,3,7,325.0,45.62,46.29,45.955,96.99,13.23,11.86,0.15,0.0,999.97,93.8,84.1,25.0,97.4,4.005556,321.4,0.0,0.0,0.0
15,1,3,0.0,0,2023-05-29 23:00:00,60,2023-05-29,2023,2,5,22,23,149,29,1,55,2188.2,29.0,34.0,31.5,-1.29,11.72,7.02,0.0,0.0,1006.16,19.8,2.1,15.5,28.8,1.802778,173.1,0.0,0.0,0.0,61,0.0,0,2021-09-01 00:00:00,60,2021-09-01,2021,3,9,35,0,244,1,3,49,1778.7,45.62,46.29,45.955,96.99,13.23,11.86,0.15,0.0,999.97,93.8,84.1,25.0,97.4,4.005556,321.4,0.0,0.0,0.0


In [12]:
# Summary statistics for every column of the processed frame
res.describe()

statistic,county,is_business,product_type,target,is_consumption,datetime,prediction_unit_id,date,year,quarter,month,week,hour,day_of_year,day_of_month,day_of_week,eic_count_client,installed_capacity_client,lowest_price_per_mwh_gas,highest_price_per_mwh_gas,mean_price_per_mwh_24h_gas,euros_per_mwh_24h_ep,temperature_hw,dewpoint_hw,rain_hw,snowfall_hw,surface_pressure_hw,cloudcover_total_hw,cloudcover_low_hw,cloudcover_mid_hw,cloudcover_high_hw,windspeed_10m_hw,winddirection_10m_hw,shortwave_radiation_hw,direct_solar_radiation_hw,diffuse_radiation_hw,client_id,target_right,is_consumption_right,datetime_right,prediction_unit_id_right,date_right,year_right,quarter_right,month_right,week_right,hour_right,day_of_year_right,day_of_month_right,day_of_week_right,eic_count_client_right,installed_capacity_client_right,lowest_price_per_mwh_gas_right,highest_price_per_mwh_gas_right,mean_price_per_mwh_24h_gas_right,euros_per_mwh_24h_ep_right,temperature_hw_right,dewpoint_hw_right,rain_hw_right,snowfall_hw_right,surface_pressure_hw_right,cloudcover_total_hw_right,cloudcover_low_hw_right,cloudcover_mid_hw_right,cloudcover_high_hw_right,windspeed_10m_hw_right,winddirection_10m_hw_right,shortwave_radiation_hw_right,direct_solar_radiation_hw_right,diffuse_radiation_hw_right
str,f64,f64,f64,f64,f64,str,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,"""1981064""",1981064.0,"""1981064""",1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,"""1981064""",1981064.0,"""1981064""",1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0,1981064.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,"""0""",0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""0""",0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",7.224649,0.529638,1.882033,274.42834,0.5,"""2022-07-19 10:20:42.386919""",32.797669,"""2022-07-18 22:50:34.354000""",2022.054101,2.479784,6.434648,26.138278,11.502231,180.204837,15.65228,4.001975,74.317444,1467.252742,95.53819,108.398718,101.968454,157.533554,5.443911,1.917979,0.048361,0.016768,1007.949243,62.035949,47.956987,35.111673,36.122755,4.468658,198.385621,105.453607,62.932712,42.520895,33.156353,0.0672,0.0,"""2021-09-09 01:00:39.760452""",32.797669,"""2021-09-09 01:00:39.760000""",2021.038225,2.985659,8.805777,35.434721,0.0,238.090033,1.0,3.149456,49.023053,1100.170338,48.447456,49.509437,48.978446,95.274842,11.887506,9.702544,0.009244,0.00114,1007.52337,58.143874,53.683559,9.904071,27.404237,4.347157,314.581703,3.453426,2.695082,0.758344
"""std""",4.781338,0.499121,1.081268,915.138469,0.5,,19.637271,,0.64396,1.180145,3.669668,15.957656,6.921727,112.110752,8.75955,1.99978,144.954364,2437.004042,47.574295,54.747009,50.919672,121.415851,8.171629,7.241336,0.175303,0.069773,12.506026,35.943462,38.982257,36.440564,39.268435,2.044979,82.541755,175.166415,127.948623,61.514422,19.411935,0.367798,0.0,,19.637271,,0.191739,0.452839,1.516123,4.539235,0.0,46.00361,0.0,0.6255,94.268038,1957.626702,9.094316,10.693385,9.880372,7.596868,3.85527,3.753467,0.036071,0.008906,7.362977,25.608047,26.980002,20.717796,30.363935,1.519415,49.033957,8.079203,6.307861,1.772197
"""min""",0.0,0.0,0.0,0.0,0.0,"""2021-09-01 00:00:00""",0.0,"""2021-09-01""",2021.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,5.0,5.5,28.1,34.0,31.1,-10.06,-20.533333,-23.366667,0.0,0.0,952.9,0.0,0.0,0.0,0.0,0.166667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,"""2021-09-01 00:00:00""",0.0,"""2021-09-01""",2021.0,1.0,1.0,5.0,0.0,1.0,1.0,2.0,5.0,16.0,45.62,46.29,45.955,57.08,-6.28,-8.85,0.0,0.0,972.37,13.142857,6.666667,0.0,0.0,2.0,112.0,0.0,0.0,0.0
"""25%""",3.0,0.0,1.0,0.356,0.0,"""2022-02-13 19:00:00""",16.0,"""2022-02-13""",2022.0,1.0,3.0,12.0,6.0,80.0,8.0,2.0,14.0,324.0,60.0,67.67,65.0,85.29,-0.475,-3.02,0.0,0.0,1000.74,28.166667,6.333333,0.75,0.0,2.909259,144.0,0.0,0.0,0.0,16.0,0.0,0.0,"""2021-09-01 00:00:00""",16.0,"""2021-09-01""",2021.0,3.0,9.0,35.0,0.0,244.0,1.0,3.0,9.0,240.7,45.62,46.29,45.955,96.99,12.1,9.966667,0.0,0.0,1004.75,38.0,34.833333,0.0,3.333333,3.458333,314.333333,0.0,0.0,0.0
"""50%""",7.0,1.0,2.0,30.963,1.0,"""2022-07-20 11:00:00""",32.0,"""2022-07-20""",2022.0,2.0,6.0,24.0,12.0,167.0,16.0,4.0,32.0,657.0,86.0,94.0,88.895,128.79,4.966667,1.433333,0.0,0.0,1009.016667,73.222222,44.823529,21.0,16.0,4.242063,206.8,6.166667,0.0,4.4,33.0,0.0,0.0,"""2021-09-01 00:00:00""",32.0,"""2021-09-01""",2021.0,3.0,9.0,35.0,0.0,244.0,1.0,3.0,19.0,503.6,45.62,46.29,45.955,96.99,12.88,10.9,0.0,0.0,1005.566667,48.866667,50.666667,0.533333,16.0,4.005556,334.666667,0.0,0.0,0.0
"""75%""",11.0,1.0,3.0,174.627,1.0,"""2022-12-23 03:00:00""",50.0,"""2022-12-23""",2022.0,4.0,10.0,41.0,18.0,285.0,23.0,6.0,71.0,1574.9,109.74,133.0,120.91,199.99,10.966667,6.8,0.007692,0.0,1016.223529,97.666667,90.333333,68.5,78.9,5.73366,257.333333,138.941176,47.823529,74.7,50.0,0.0,0.0,"""2021-09-01 00:00:00""",50.0,"""2021-09-01""",2021.0,3.0,9.0,35.0,0.0,244.0,1.0,3.0,47.0,1152.9,45.62,46.29,45.955,96.99,13.815385,11.183333,0.0,0.0,1011.569231,88.0,81.166667,8.272727,50.5,4.361111,341.384615,0.0,0.0,0.0
"""max""",15.0,1.0,3.0,15480.274,1.0,"""2023-05-29 23:00:00""",68.0,"""2023-05-29""",2023.0,4.0,12.0,52.0,23.0,365.0,31.0,7.0,1517.0,19314.31,250.0,305.0,272.5,4000.0,30.266667,21.45,4.8,1.58,1048.866667,100.0,100.0,100.0,100.0,17.290123,360.0,821.6,701.1,367.0,67.0,2.904,0.0,"""2022-11-01 00:00:00""",68.0,"""2022-11-01""",2022.0,4.0,12.0,52.0,0.0,335.0,1.0,6.0,688.0,13417.79,97.02,127.0,112.01,109.6,14.642857,12.05,0.15,0.081667,1026.227273,100.0,100.0,96.5,97.4,8.059524,356.0,23.266667,18.266667,5.0


In [64]:
# Folder that receives the processed dataset
save_path = Path("../data/processed_data")

# Create the folder on first use; an already existing one is only reported.
try:
    save_path.mkdir()
except FileExistsError:
    display(f"Path {save_path} already exists")
else:
    display(f"Path {save_path} created successfully.")

# Write the processed frame as CSV inside that folder
save_file = save_path / "proccessed_data.csv"
res.write_csv(save_file)

display(f"File {save_file} saved successfully.")

'Path ..\\data\\processed_data already exists'

'File ..\\data\\processed_data\\proccessed_data.csv saved successfully.'

In [31]:
# Train rows with a missing target, reduced to one row per distinct timestamp
dfs["train"].filter(
    pl.col("target").is_null()
).unique(subset="datetime")

county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
i64,i64,i64,f64,i64,str,i64,i64,i64
0,0,1,,0,"""2022-03-27 03:00:00""",207,634866,0
0,0,1,,0,"""2021-10-31 03:00:00""",60,178938,0
0,0,1,,0,"""2023-03-26 03:00:00""",571,1806252,0
0,0,1,,0,"""2022-10-30 03:00:00""",424,1332456,0


In [35]:
# Inspect the raw train rows of county 0, business clients, at a single
# timestamp (the raw `datetime` column is still a string here, hence the
# string comparison)
dfs["train"].filter(
    pl.col("datetime") == "2021-10-31 02:00:00",
    pl.col("county") == 0,
    pl.col("is_business") == 1
)

county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
i64,i64,i64,f64,i64,str,i64,i64,i64
0,1,0,0.0,0,"""2021-10-31 02:00:00""",60,178818,3
0,1,0,120.2,1,"""2021-10-31 02:00:00""",60,178819,3
0,1,1,0.0,0,"""2021-10-31 02:00:00""",60,178820,4
0,1,1,639.757,1,"""2021-10-31 02:00:00""",60,178821,4
0,1,2,0.0,0,"""2021-10-31 02:00:00""",60,178822,61
0,1,2,13.501,1,"""2021-10-31 02:00:00""",60,178823,61
0,1,3,0.516,0,"""2021-10-31 02:00:00""",60,178824,5
0,1,3,5314.592,1,"""2021-10-31 02:00:00""",60,178825,5


In [None]:
# Scratch check: list the hourly origin dates that are absent from the
# electricity prices.
ep_df = dfs["electricity_prices"].with_columns(
    pl.col("origin_date").str.strptime(pl.Datetime),
    pl.col("forecast_date").str.strptime(pl.Datetime),
)
date_dbid_df = ep_df.select(["origin_date", "data_block_id"])

min_date = date_dbid_df["origin_date"].min()
max_date = date_dbid_df["origin_date"].max()

# Complete hourly range between the first and last known origin date
origin_date_range = pl.datetime_range(
    start=min_date,
    end=max_date,
    interval="1h",
    time_unit="us",
    eager=True
)

# The forecast timestamps are the origin timestamps one day later
duration = pl.duration(days=1)
forecast_date_range = pl.datetime_range(
    start=min_date + duration,
    end=max_date + duration,
    interval="1h",
    time_unit="us",
    eager=True
)

# One id per block of 24 hours, starting at 1. Integer division keeps the
# length equal to the date range even when it is not a whole number of days.
data_block_id_range = np.arange(len(origin_date_range)) // 24 + 1

complete_dates = pl.DataFrame({
    "forecast_date": forecast_date_range,
    "origin_date": origin_date_range,
    "data_block_id": data_block_id_range
})

# Timestamps of the complete range that have no row in the price data
missing_dates = complete_dates.join(
    date_dbid_df,
    on="origin_date",
    how="anti"
)

missing_dates

forecast_date,origin_date,data_block_id
datetime[μs],datetime[μs],i64
2022-03-27 02:00:00,2022-03-26 02:00:00,208
2023-03-26 02:00:00,2023-03-25 02:00:00,572


In [17]:
# Number of train rows per timestamp
train = dfs["train"].with_columns(
    pl.col("datetime").str.strptime(pl.Datetime)
)

# `pl.count()` is deprecated in favour of `pl.len()`; the alias keeps the
# output column named "count" as before.
train.group_by("datetime").agg(pl.len().alias("count"))

  train.group_by("datetime").agg(pl.count())


datetime,count
datetime[μs],u32
2021-09-25 10:00:00,122
2021-11-23 13:00:00,126
2022-02-01 07:00:00,132
2022-04-04 15:00:00,136
2022-04-24 17:00:00,136
…,…
2023-05-18 22:00:00,134
2022-01-31 03:00:00,130
2022-02-26 20:00:00,134
2022-08-20 22:00:00,132
