## Carga de datos

Antes de poder manipular los diferentes datos, primero es necesário
cargarlos en el sistema

In [100]:
import polars as pl
import numpy as np 
from pathlib import Path
from sklearn.metrics.pairwise import haversine_distances

In [101]:
# Directório de archivos
data_folder = Path("../data")

# Diccionario de archivos
files = {
    "train"              : "train.csv",
    "gas_prices"         : "gas_prices.csv",
    "client"             : "client.csv",
    "electricity_prices" : "electricity_prices.csv",
    "forecast_weather"   : "forecast_weather.csv",
    "historical_weather" : "historical_weather.csv",
    "weather_station"    : "weather_station_to_county_mapping.csv"
}

In [102]:
# Se crea un diccionario de DataFrames para acceder facilmente a todos
# y poder iterar sobre ellos si es necesário
dfs: dict[str, pl.DataFrame] = {}

for key, filename in files.items():
    filepath = data_folder / filename
    try:
        df = pl.read_csv(filepath)
        dfs[key] = df
        print(f"Archivo {key} cargado con exito.")
    except:
        print(f"No pudo cargarse el archivo {key}.")
    

Archivo train cargado con exito.
Archivo gas_prices cargado con exito.
Archivo client cargado con exito.
Archivo electricity_prices cargado con exito.
Archivo forecast_weather cargado con exito.
Archivo historical_weather cargado con exito.
Archivo weather_station cargado con exito.


## Asignación de longitud/latitud a un condado especifico

Con el fin de poder unir los csv de "train.csv" y "forecast_weather.csv"
es necesario poder asignar a cada longitud y latitud, su condado
correspondiente.

Para ello se utilizará el archivo "weather_station_to_county_mapping" con datos que asignan latitudes y longitudes a su correspondiente condado.

### Visualización de los datos

In [103]:
# Se eliminan las filas con instancias nulas
weather_station = dfs["weather_station"].drop_nulls()
weather_station

county_name,longitude,latitude,county
str,f64,f64,i64
"""Saaremaa""",22.2,58.2,10
"""Saaremaa""",22.2,58.5,10
"""Saaremaa""",22.7,58.5,10
"""Hiiumaa""",22.7,58.8,1
"""Saaremaa""",23.2,58.5,10
…,…,…,…
"""Ida-Virumaa""",27.2,59.1,2
"""Ida-Virumaa""",27.2,59.4,2
"""Võrumaa""",27.7,57.9,15
"""Ida-Virumaa""",27.7,59.1,2


In [104]:
coords = dfs["historical_weather"].select(["latitude", "longitude"]).unique()
stations = dfs["weather_station"]["latitude", "longitude", "county"].drop_nulls()

coords = coords.with_columns(
    pl.col("latitude").radians().alias("lat_rad"),
    pl.col("longitude").radians().alias("lon_rad")
)
stations = stations.with_columns(
    pl.exclude("county").radians()
)

coords_array = coords.to_numpy()
stations_array = stations.to_numpy()
display(coords[:6])
display(coords_array[:6])

latitude,longitude,lat_rad,lon_rad
f64,f64,f64,f64
59.7,28.2,1.041962,0.492183
57.6,26.7,1.00531,0.466003
58.5,26.2,1.021018,0.457276
58.5,22.7,1.021018,0.39619
58.2,26.7,1.015782,0.466003
57.9,28.2,1.010546,0.492183


array([[59.7       , 28.2       ,  1.04196156,  0.49218285],
       [57.6       , 26.7       ,  1.00530965,  0.46600291],
       [58.5       , 26.2       ,  1.02101761,  0.45727626],
       [58.5       , 22.7       ,  1.02101761,  0.39618974],
       [58.2       , 26.7       ,  1.01578162,  0.46600291],
       [57.9       , 28.2       ,  1.01054564,  0.49218285]])

In [105]:
counties = np.array([], np.int32)
for coord in coords_array[:, 2:4]:
    cnty = -1
    min_dist = float("inf")
    county_zip = zip(stations_array[:, :2], stations_array[:, 2])
    
    for lat_lon, county in county_zip:
        dist = haversine_distances([coord, lat_lon])[0, 1]
        
        if dist < min_dist:
            min_dist = dist
            cnty = county
        
    counties = np.append(
        counties,
        int(cnty)
    )

counties

array([ 2, 15,  4, 10, 11, 15,  5,  7, 11,  3,  2,  1, 15,  2, 10,  7,  1,
        7, 10,  2,  0, 13, 15, 10, 10, 15, 15,  4, 13, 13, 11,  6,  9,  9,
        0,  0,  1, 10,  9,  7, 10,  7, 10,  5,  0,  0, 14,  7,  0, 15,  0,
        1,  2,  6, 15,  6,  1,  2,  7,  3, 10,  5,  0,  7,  1, 15,  5,  5,
        1, 11, 15, 11,  6,  7,  7,  6,  2,  0,  7,  0, 14,  5, 10, 10,  6,
        0,  7, 10,  2, 14,  4,  3, 10,  7,  2,  7,  6,  8, 10,  1,  4,  0,
        2,  8, 11, 10, 10,  0,  1, 10,  7,  2])

In [106]:
coords = coords.with_columns(
    pl.lit(counties).alias("county")
)
coords

latitude,longitude,lat_rad,lon_rad,county
f64,f64,f64,f64,i64
59.7,28.2,1.041962,0.492183,2
57.6,26.7,1.00531,0.466003,15
58.5,26.2,1.021018,0.457276,4
58.5,22.7,1.021018,0.39619,10
58.2,26.7,1.015782,0.466003,11
…,…,…,…,…
59.1,24.2,1.03149,0.42237,0
59.4,22.2,1.036726,0.387463,1
57.9,23.2,1.010546,0.404916,10
57.9,25.2,1.010546,0.439823,7


In [107]:
dfs["historical_weather"].join(
    coords.select(["latitude", "longitude", "county"]),
    on=["latitude", "longitude"],
    how="left"
).describe()

statistic,datetime,temperature,dewpoint,rain,snowfall,surface_pressure,cloudcover_total,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation,latitude,longitude,data_block_id,county
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""1710802""",1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0,1710802.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,5.740968,2.240312,0.04962,0.016049,1009.281515,60.912696,46.685927,34.40698,36.051408,4.849871,197.869419,106.490504,64.452917,42.037587,58.649999,24.949999,319.270778,6.633931
"""std""",,8.025647,7.224357,0.207911,0.074629,13.088915,37.769048,40.747598,38.327693,41.358521,2.47545,89.937978,179.944912,133.409951,61.952251,0.687387,2.015564,183.729798,4.718874
"""min""","""2021-09-01 00:00:00""",-23.7,-25.9,0.0,0.0,942.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.6,21.7,1.0,0.0
"""25%""",,0.0,-2.6,0.0,0.0,1001.5,25.0,3.0,0.0,0.0,3.0,139.0,0.0,0.0,0.0,57.9,23.2,160.0,2.0
"""50%""",,5.1,1.7,0.0,0.0,1010.4,72.0,39.0,16.0,10.0,4.5,208.0,1.0,0.0,1.0,58.5,24.7,319.0,7.0
"""75%""",,11.2,7.2,0.0,0.0,1018.0,100.0,94.0,72.0,85.0,6.277778,263.0,140.0,47.0,74.0,59.1,26.7,478.0,10.0
"""max""","""2023-05-30 10:00:00""",32.6,23.8,16.8,2.66,1049.3,100.0,100.0,100.0,100.0,21.75,360.0,849.0,754.0,388.0,59.7,28.2,637.0,15.0


In [108]:
coords.filter(
    (pl.col("latitude") == 57.6) &
    (pl.col("longitude") == 25.7)
)

latitude,longitude,lat_rad,lon_rad,county
f64,f64,f64,f64,i64
57.6,25.7,1.00531,0.44855,13


In [109]:
dfs["weather_station"]["latitude", "longitude", "county"].with_columns(
    pl.col("latitude", "longitude").radians()
).drop_nulls()

latitude,longitude,county
f64,f64,i64
1.015782,0.387463,10
1.021018,0.387463,10
1.021018,0.39619,10
1.026254,0.39619,1
1.021018,0.404916,10
…,…,…
1.03149,0.47473,2
1.036726,0.47473,2
1.010546,0.483456,15
1.03149,0.483456,2


In [110]:
class DataProcessor:
    def __init__(self):
        self.gas_join = ["datetime"]
        self.ep_join = ["datetime"]
        self.hw_join = [ "county", "datetime"]
        self.client_join = ["product_type", "county", "is_business", "date"]
        self.lat_lon = ["latitude", "longitude"]

    def add_counties(self, df: pl.DataFrame, stations: pl.DataFrame):
        # Se seleccionan los datos requeridos

        coords = df.select(
            pl.col("latitude", "longitude")
        ).unique()

        coords = coords.with_columns(
            pl.col("latitude").radians().alias("lat_rad"),
            pl.col("longitude").radians().alias("lon_rad")
        )

        wthr_stations = stations.drop_nulls()
        wthr_stations = wthr_stations[
            "latitude", "longitude", "county"
        ].with_columns(
                pl.col("latitude", "longitude").radians()
        ).to_numpy()

        coords_arr = coords.to_numpy()

        counties = np.array([], np.int32)
        # Se itera para obtener la menor distancia entre puntos 
        # y asignar el condado
        for coord in coords_arr[:, 2:4]:
            # Cada coordenada tiene una dist minima 
            # y un condado asignado
            min_dist = float("inf")
            cnty = -1

            station_zip = zip(wthr_stations[:, :2], wthr_stations[:, 2])

            for lat_lon, county in station_zip:
                dist = haversine_distances([coord, lat_lon])[0, 1]

                if dist < min_dist:
                    cnty = county
                    min_dist = dist

            # Se añade el condado a la lista
            counties = np.append(
                counties,
                int(cnty)
            )
        
        # Se asigna los condados a cada coordenada y se devuelve el resultado
        result = coords.with_columns(
            pl.lit(counties).alias("county")
        )

        result = df.join(
            result.select(["latitude", "longitude", "county"]),
            on=["latitude", "longitude"],
            how="left"
        )

        return result

    def change_names(self, df: pl.DataFrame, suffix, no_change):
        renamed_df = df.rename(
            lambda col: col + suffix if col not in no_change else col
        )
        return renamed_df
    
    def to_datetime(self, df: pl.DataFrame, col: str):
        # Cast str to Datetime
        result = df.with_columns(
            pl.col(col).str.strptime(pl.Datetime)
        )
        return result

    def get_gas_features(self, gas_df: pl.DataFrame):
        df = gas_df.with_columns(
            ((pl.col("lowest_price_per_mwh") + 
              pl.col("highest_price_per_mwh")) / 2).alias("mean_price_per_mwh_24h")
        )

        df = self.to_datetime(df, "origin_date")
        df = df.rename({"origin_date":"datetime"})

        return self.change_names(df, "_gas", self.gas_join)
    
    def get_client_features(self, client_df: pl.DataFrame):
        df = client_df.with_columns(
            pl.col("date").str.strptime(pl.Date)
        )
        return self.change_names(df, "_client", self.client_join)
    
    def get_hw_means(self, data: pl.DataFrame) -> pl.DataFrame:
        data = data.group_by(
        ["datetime", "county"], maintain_order=True
        ).agg(pl.all().mean())
        return data

    def get_hist_weather_features(self, hw_df: pl.DataFrame, ws_df: pl.DataFrame):
        df = self.to_datetime(hw_df, "datetime")

        df = self.add_counties(df, ws_df)
        df = self.get_hw_means(df)
        df = self.change_names(df, "_hw", self.hw_join)

        return df
    
    def get_electricity_features(self, ep_df: pl.DataFrame):
        # str to datetime
        df = self.to_datetime(ep_df, "origin_date")

        df = df.rename({"origin_date":"datetime"})
        df = df.rename({"euros_per_mwh":"euros_per_mwh_24h"})

        # Change names
        df = self.change_names(df, "_ep", self.ep_join)

        return df
    
    def get_data_features(self, data: pl.DataFrame):
        
        date = "datetime"
        # Cast to Datetime
        df = self.to_datetime(data, "datetime")

        df = df.with_columns([
            # Date
            pl.col(date).dt.truncate("1d").cast(pl.Date).alias("date"),

            # Year
            pl.col(date).dt.year().alias("year"),

            # Quarter
            pl.col(date).dt.quarter().alias("quarter"),

            # Month
            pl.col(date).dt.month().alias("month"),

            # Week
            pl.col(date).dt.week().alias("week"),

            # Hour
            pl.col(date).dt.hour().alias("hour"),

            # Day of year
            pl.col(date).dt.ordinal_day().alias("day_of_year"),

            # Day of month
            pl.col(date).dt.day().alias("day_of_month"),

            # Day of week
            pl.col(date).dt.weekday().alias("day_of_week")
        ])

        return df
    
    def __call__(self, df_dict: dict):

        data = self.get_data_features(df_dict["train"])
        gas_p = self.get_gas_features(df_dict["gas_prices"])
        el_p = self.get_electricity_features(df_dict["electricity_prices"])
        client = self.get_client_features(df_dict["client"])
        hist_w = self.get_hist_weather_features(
            df_dict["historical_weather"],
            df_dict["weather_station"]
        )

        final_data = data.join(client, on=self.client_join, how="left")
        final_data = final_data.join(gas_p, on=self.gas_join, how="left")
        final_data = final_data.join(el_p, on=self.ep_join, how="left")
        final_data = final_data.join(hist_w, on=self.hw_join, how="left")

        return final_data

In [111]:
dp = DataProcessor()
res = dp(dfs)

In [114]:
res

county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,date,year,quarter,month,week,hour,day_of_year,day_of_month,day_of_week,eic_count_client,installed_capacity_client,data_block_id_client,forecast_date_gas,lowest_price_per_mwh_gas,highest_price_per_mwh_gas,data_block_id_gas,mean_price_per_mwh_24h_gas,forecast_date_ep,euros_per_mwh_24h_ep,data_block_id_ep,temperature_hw,dewpoint_hw,rain_hw,snowfall_hw,surface_pressure_hw,cloudcover_total_hw,cloudcover_low_hw,cloudcover_mid_hw,cloudcover_high_hw,windspeed_10m_hw,winddirection_10m_hw,shortwave_radiation_hw,direct_solar_radiation_hw,diffuse_radiation_hw,latitude_hw,longitude_hw,data_block_id_hw
i64,i64,i64,f64,i64,datetime[μs],i64,i64,i64,date,i32,i8,i8,i8,i8,i16,i8,i8,i64,f64,i64,str,f64,f64,i64,f64,str,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0,2021-09-01,2021,3,9,35,0,244,1,3,108,952.89,2,"""2021-09-02""",45.62,46.29,2,45.955,"""2021-09-02 00:00:00""",96.99,2,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,59.492308,24.584615,1.0
0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0,2021-09-01,2021,3,9,35,0,244,1,3,108,952.89,2,"""2021-09-02""",45.62,46.29,2,45.955,"""2021-09-02 00:00:00""",96.99,2,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,59.492308,24.584615,1.0
0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1,2021-09-01,2021,3,9,35,0,244,1,3,17,166.4,2,"""2021-09-02""",45.62,46.29,2,45.955,"""2021-09-02 00:00:00""",96.99,2,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,59.492308,24.584615,1.0
0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1,2021-09-01,2021,3,9,35,0,244,1,3,17,166.4,2,"""2021-09-02""",45.62,46.29,2,45.955,"""2021-09-02 00:00:00""",96.99,2,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,59.492308,24.584615,1.0
0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2,2021-09-01,2021,3,9,35,0,244,1,3,688,7207.88,2,"""2021-09-02""",45.62,46.29,2,45.955,"""2021-09-02 00:00:00""",96.99,2,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,59.492308,24.584615,1.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
15,1,0,197.233,1,2023-05-31 23:00:00,637,2018347,64,2023-05-31,2023,2,5,22,23,151,31,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,
15,1,1,0.0,0,2023-05-31 23:00:00,637,2018348,59,2023-05-31,2023,2,5,22,23,151,31,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,
15,1,1,28.404,1,2023-05-31 23:00:00,637,2018349,59,2023-05-31,2023,2,5,22,23,151,31,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,
15,1,3,0.0,0,2023-05-31 23:00:00,637,2018350,60,2023-05-31,2023,2,5,22,23,151,31,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [113]:
res.filter(
    (pl.col("county") == 0) &
    (pl.col("product_type") == 1) &
    (pl.col("is_business") == 0)
)

county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,date,year,quarter,month,week,hour,day_of_year,day_of_month,day_of_week,eic_count_client,installed_capacity_client,data_block_id_client,forecast_date_gas,lowest_price_per_mwh_gas,highest_price_per_mwh_gas,data_block_id_gas,mean_price_per_mwh_24h_gas,forecast_date_ep,euros_per_mwh_24h_ep,data_block_id_ep,temperature_hw,dewpoint_hw,rain_hw,snowfall_hw,surface_pressure_hw,cloudcover_total_hw,cloudcover_low_hw,cloudcover_mid_hw,cloudcover_high_hw,windspeed_10m_hw,winddirection_10m_hw,shortwave_radiation_hw,direct_solar_radiation_hw,diffuse_radiation_hw,latitude_hw,longitude_hw,data_block_id_hw
i64,i64,i64,f64,i64,datetime[μs],i64,i64,i64,date,i32,i8,i8,i8,i8,i16,i8,i8,i64,f64,i64,str,f64,f64,i64,f64,str,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0,2021-09-01,2021,3,9,35,0,244,1,3,108,952.89,2,"""2021-09-02""",45.62,46.29,2,45.955,"""2021-09-02 00:00:00""",96.99,2,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,59.492308,24.584615,1.0
0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0,2021-09-01,2021,3,9,35,0,244,1,3,108,952.89,2,"""2021-09-02""",45.62,46.29,2,45.955,"""2021-09-02 00:00:00""",96.99,2,13.815385,10.6,0.0,0.0,1011.569231,42.461538,44.153846,2.923077,3.384615,5.619658,341.384615,0.0,0.0,0.0,59.492308,24.584615,1.0
0,0,1,1.132,0,2021-09-01 01:00:00,0,122,0,2021-09-01,2021,3,9,35,1,244,1,3,108,952.89,2,,,,,,"""2021-09-02 01:00:00""",94.77,2,13.584615,10.307692,0.0,0.0,1011.453846,23.769231,21.461538,1.153846,12.153846,5.49359,342.692308,0.0,0.0,0.0,59.492308,24.584615,1.0
0,0,1,77.691,1,2021-09-01 01:00:00,0,123,0,2021-09-01,2021,3,9,35,1,244,1,3,108,952.89,2,,,,,,"""2021-09-02 01:00:00""",94.77,2,13.584615,10.307692,0.0,0.0,1011.453846,23.769231,21.461538,1.153846,12.153846,5.49359,342.692308,0.0,0.0,0.0,59.492308,24.584615,1.0
0,0,1,0.49,0,2021-09-01 02:00:00,0,244,0,2021-09-01,2021,3,9,35,2,244,1,3,108,952.89,2,,,,,,"""2021-09-02 02:00:00""",93.88,2,13.192308,10.138462,0.0,0.0,1011.115385,12.692308,13.076923,0.153846,2.692308,5.337607,337.692308,0.0,0.0,0.0,59.492308,24.584615,1.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0,0,1,569.301,1,2023-05-31 21:00:00,637,2017963,0,2023-05-31,2023,2,5,22,21,151,31,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,1,5.086,0,2023-05-31 22:00:00,637,2018092,0,2023-05-31,2023,2,5,22,22,151,31,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,1,577.499,1,2023-05-31 22:00:00,637,2018093,0,2023-05-31,2023,2,5,22,22,151,31,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,1,2.921,0,2023-05-31 23:00:00,637,2018222,0,2023-05-31,2023,2,5,22,23,151,31,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [14]:
res

datetime,temperature_hw,dewpoint_hw,rain_hw,snowfall_hw,surface_pressure_hw,cloudcover_total_hw,cloudcover_low_hw,cloudcover_mid_hw,cloudcover_high_hw,windspeed_10m_hw,winddirection_10m_hw,shortwave_radiation_hw,direct_solar_radiation_hw,diffuse_radiation_hw,latitude_hw,longitude_hw,data_block_id_hw,county
datetime[μs],f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,i64,f64,f64,f64,f64,f64,f64,i64
2021-09-01 00:00:00,14.2,11.6,0.0,0.0,1015.9,31,31,0,11,7.083333,8,0.0,0.0,0.0,57.6,21.7,1.0,10
2021-09-01 00:00:00,13.9,11.5,0.0,0.0,1010.7,33,37,0,0,5.111111,359,0.0,0.0,0.0,57.6,22.2,1.0,10
2021-09-01 00:00:00,14.0,12.5,0.0,0.0,1015.0,31,34,0,0,6.333333,355,0.0,0.0,0.0,57.6,22.7,1.0,10
2021-09-01 00:00:00,14.6,11.5,0.0,0.0,1017.3,0,0,0,0,8.083333,297,358.0,277.0,81.0,57.6,23.2,1.0,10
2021-09-01 00:00:00,15.7,12.9,0.0,0.0,1014.0,22,25,0,0,8.416667,5,0.0,0.0,0.0,57.6,23.7,1.0,7
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2023-05-30 10:00:00,11.7,4.6,0.0,0.0,1018.9,40,9,54,0,1.055556,253,567.0,392.0,175.0,59.7,26.2,637.0,5
2023-05-30 10:00:00,12.3,3.5,0.0,0.0,1019.0,46,4,70,0,0.805556,263,581.0,407.0,174.0,59.7,26.7,637.0,5
2023-05-30 10:00:00,9.8,3.0,0.0,0.0,1019.2,41,4,62,0,1.972222,285,609.0,432.0,177.0,59.7,27.2,637.0,2
2023-05-30 10:00:00,11.7,1.6,0.0,0.0,1019.0,44,0,73,0,3.5,307,658.0,521.0,137.0,59.7,27.7,637.0,2


In [8]:
dfs["historical_weather"]

NameError: name 'dfs' is not defined

In [11]:
# Adivinar que hace la variable hours ahead f mean
# No añadir los Targets de horas anteriores

In [31]:
dfs["client"].tail()

product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
i64,i64,i64,f64,i64,str,i64
1,15,51,415.6,0,"""2023-05-29""",637
3,15,161,2035.75,0,"""2023-05-29""",637
0,15,15,620.0,1,"""2023-05-29""",637
1,15,20,624.5,1,"""2023-05-29""",637
3,15,55,2188.2,1,"""2023-05-29""",637


In [41]:
display(dfs["train"])
display(dfs["electricity_prices"])

county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
i64,i64,i64,f64,i64,str,i64,i64,i64
0,0,1,0.713,0,"""2021-09-01 00:00:00""",0,0,0
0,0,1,96.59,1,"""2021-09-01 00:00:00""",0,1,0
0,0,2,0.0,0,"""2021-09-01 00:00:00""",0,2,1
0,0,2,17.314,1,"""2021-09-01 00:00:00""",0,3,1
0,0,3,2.904,0,"""2021-09-01 00:00:00""",0,4,2
…,…,…,…,…,…,…,…,…
15,1,0,197.233,1,"""2023-05-31 23:00:00""",637,2018347,64
15,1,1,0.0,0,"""2023-05-31 23:00:00""",637,2018348,59
15,1,1,28.404,1,"""2023-05-31 23:00:00""",637,2018349,59
15,1,3,0.0,0,"""2023-05-31 23:00:00""",637,2018350,60


forecast_date,euros_per_mwh,origin_date,data_block_id
str,f64,str,i64
"""2021-09-01 00:00:00""",92.51,"""2021-08-31 00:00:00""",1
"""2021-09-01 01:00:00""",88.9,"""2021-08-31 01:00:00""",1
"""2021-09-01 02:00:00""",87.35,"""2021-08-31 02:00:00""",1
"""2021-09-01 03:00:00""",86.88,"""2021-08-31 03:00:00""",1
"""2021-09-01 04:00:00""",88.43,"""2021-08-31 04:00:00""",1
…,…,…,…
"""2023-05-30 19:00:00""",82.1,"""2023-05-29 19:00:00""",637
"""2023-05-30 20:00:00""",150.85,"""2023-05-29 20:00:00""",637
"""2023-05-30 21:00:00""",82.1,"""2023-05-29 21:00:00""",637
"""2023-05-30 22:00:00""",82.09,"""2023-05-29 22:00:00""",637


In [None]:
for key in dfs.keys():
    print(f"{key}:")
    display(dfs[key].head(1))

train:


county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
i64,i64,i64,f64,i64,str,i64,i64,i64
0,0,1,0.713,0,"""2021-09-01 00:00:00""",0,0,0


gas_prices:


forecast_date,lowest_price_per_mwh,highest_price_per_mwh,origin_date,data_block_id
str,f64,f64,str,i64
"""2021-09-01""",45.23,46.32,"""2021-08-31""",1


client:


product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
i64,i64,i64,f64,i64,str,i64
1,0,108,952.89,0,"""2021-09-01""",2


electricity_prices:


forecast_date,euros_per_mwh,origin_date,data_block_id
str,f64,str,i64
"""2021-09-01 00:00:00""",92.51,"""2021-08-31 00:00:00""",1


forecast_weather:


latitude,longitude,origin_datetime,hours_ahead,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,data_block_id,forecast_datetime,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation
f64,f64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,i64,str,f64,f64,f64,f64
57.6,21.7,"""2021-09-01 02:00:00""",1,15.655786,11.553613,0.904816,0.019714,0.0,0.905899,-0.411328,-9.106137,1,"""2021-09-01 03:00:00""",0.0,0.0,0.0,0.0


historical_weather:


datetime,temperature,dewpoint,rain,snowfall,surface_pressure,cloudcover_total,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation,latitude,longitude,data_block_id
str,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,i64,f64,f64,f64,f64,f64,f64
"""2021-09-01 00:00:00""",14.2,11.6,0.0,0.0,1015.9,31,31,0,11,7.083333,8,0.0,0.0,0.0,57.6,21.7,1.0


county_lon_lats:


Unnamed: 0_level_0,county,longitude,latitude
i64,i64,f64,f64
0,0,24.2,59.1


weather_station:


county_name,longitude,latitude,county
str,f64,f64,i64
,21.7,57.6,


In [None]:
dfs["client"].with_columns(
    pl.col("date").str.strptime(pl.Date)
)
.0

product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
i64,i64,i64,f64,i64,date,i64
1,0,108,952.89,0,2021-09-01,2
2,0,17,166.4,0,2021-09-01,2
3,0,688,7207.88,0,2021-09-01,2
0,0,5,400.0,1,2021-09-01,2
1,0,43,1411.0,1,2021-09-01,2
…,…,…,…,…,…,…
1,15,51,415.6,0,2023-05-29,637
3,15,161,2035.75,0,2023-05-29,637
0,15,15,620.0,1,2023-05-29,637
1,15,20,624.5,1,2023-05-29,637


In [17]:
display(dfs["client"].filter(
    (pl.col("product_type") == 1) & 
    (pl.col("county") == 0) & 
    (pl.col("is_business") == 0)
))

product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
i64,i64,i64,f64,i64,str,i64
1,0,108,952.89,0,"""2021-09-01""",2
1,0,108,952.89,0,"""2021-09-02""",3
1,0,108,952.89,0,"""2021-09-03""",4
1,0,108,952.89,0,"""2021-09-04""",5
1,0,108,952.89,0,"""2021-09-05""",6
…,…,…,…,…,…,…
1,0,508,4968.215,0,"""2023-05-25""",633
1,0,507,4960.215,0,"""2023-05-26""",634
1,0,507,4960.215,0,"""2023-05-27""",635
1,0,508,4964.215,0,"""2023-05-28""",636


In [34]:
dfs["train"].filter(
    pl.col("data_block_id") == 1
)

county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
i64,i64,i64,f64,i64,str,i64,i64,i64
0,0,1,1.687,0,"""2021-09-02 00:00:00""",1,2928,0
0,0,1,109.366,1,"""2021-09-02 00:00:00""",1,2929,0
0,0,2,0.0,0,"""2021-09-02 00:00:00""",1,2930,1
0,0,2,21.008,1,"""2021-09-02 00:00:00""",1,2931,1
0,0,3,1.003,0,"""2021-09-02 00:00:00""",1,2932,2
…,…,…,…,…,…,…,…,…
15,0,3,59.302,1,"""2021-09-02 23:00:00""",1,5851,58
15,1,1,0.0,0,"""2021-09-02 23:00:00""",1,5852,59
15,1,1,21.756,1,"""2021-09-02 23:00:00""",1,5853,59
15,1,3,0.0,0,"""2021-09-02 23:00:00""",1,5854,60


In [13]:
dfs["forecast_weather"].filter(
    (pl.col("latitude") == 57.6) &
    (pl.col("longitude") == 21.7) &
    (pl.col("origin_datetime") == "2021-09-01 02:00:00")
).select(
    ["latitude", "longitude", "hours_ahead", "origin_datetime"]
).group_by("origin_datetime").agg(pl.col("hours_ahead").mean())

origin_datetime,hours_ahead
str,f64
"""2021-09-01 02:00:00""",24.5


In [14]:
agg_columns = [col for col in dfs["forecast_weather"].columns 
               if col not in ["latitude", "longitude"] + 
               ["forecast_datetime", 'data_block_id']]
agg_dict = {agg_col: ["mean"] for agg_col in agg_columns}

dfs["forecast_weather"].group_by(["forecast_datetime", 'data_block_id']).agg(
    pl.mean(agg_columns)
)

forecast_datetime,data_block_id,origin_datetime,hours_ahead,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation
str,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""2021-11-05 02:00:00""",65,,25.0,7.517538,6.483776,0.374459,0.788241,0.930931,0.999334,-5.107638,2.407442,0.0,0.0,0.0,0.000828
"""2023-01-13 01:00:00""",498,,48.0,2.547195,1.822729,0.230212,0.996794,0.714138,0.99985,1.465805,6.375389,0.0,0.0,0.000116,0.000515
"""2022-11-07 06:00:00""",432,,29.0,7.805334,6.237741,0.9839,0.646853,0.686397,0.9998,3.204765,5.962434,0.0,0.0,0.0,0.000066
"""2021-11-26 22:00:00""",86,,45.0,0.462178,-1.622852,0.68024,0.548291,0.894366,0.961348,-3.388509,-1.414764,0.0,0.0,0.000109,0.000138
"""2023-04-06 15:00:00""",583,,13.0,7.781676,3.271873,0.371023,0.161743,0.121847,0.429985,-3.997157,-1.128954,631.914921,456.694643,0.0,6.4926e-7
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2023-02-25 21:00:00""",542,,44.0,-3.086759,-5.321074,0.739758,0.942859,0.748293,0.955175,-3.73673,-7.943222,0.0,0.0,0.000057,0.000057
"""2023-02-27 15:00:00""",544,,38.0,-2.958223,-5.219591,0.231138,0.465095,0.008772,0.525808,3.614254,1.136308,414.168333,183.839881,1.5966e-7,7.8763e-8
"""2022-06-18 06:00:00""",290,,28.0,14.00735,12.099392,0.999698,0.017041,0.60023,0.999998,1.996138,3.770625,77.048254,108.461587,0.0,0.000002
"""2022-01-30 00:00:00""",150,,47.0,1.290996,-0.116119,0.253062,0.99653,0.933028,0.999971,1.889387,9.336265,0.0,0.0,0.000676,0.000913


In [15]:
dfs["forecast_weather"].head()


latitude,longitude,origin_datetime,hours_ahead,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,data_block_id,forecast_datetime,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation
f64,f64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,i64,str,f64,f64,f64,f64
57.6,21.7,"""2021-09-01 02:00:00""",1,15.655786,11.553613,0.904816,0.019714,0.0,0.905899,-0.411328,-9.106137,1,"""2021-09-01 03:00:00""",0.0,0.0,0.0,0.0
57.6,22.2,"""2021-09-01 02:00:00""",1,13.003931,10.689844,0.886322,0.004456,0.0,0.886658,0.206347,-5.355405,1,"""2021-09-01 03:00:00""",0.0,0.0,0.0,0.0
57.6,22.7,"""2021-09-01 02:00:00""",1,14.206567,11.671777,0.729034,0.005615,0.0,0.730499,1.451587,-7.417905,1,"""2021-09-01 03:00:00""",0.0,0.0,0.0,0.0
57.6,23.2,"""2021-09-01 02:00:00""",1,14.844507,12.264917,0.336304,0.074341,0.000626,0.385468,1.090869,-9.163999,1,"""2021-09-01 03:00:00""",0.0,0.0,0.0,0.0
57.6,23.7,"""2021-09-01 02:00:00""",1,15.293848,12.458887,0.102875,0.088074,1.5e-05,0.17659,1.268481,-8.975766,1,"""2021-09-01 03:00:00""",0.0,0.0,0.0,0.0


In [32]:
result = dfs["train"].with_columns(
    pl.col("datetime").str.strptime(
        pl.Datetime, "%Y-%m-%d %H:%M:%S"
    )
)

In [33]:
result

county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
i64,i64,i64,f64,i64,datetime[μs],i64,i64,i64
0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0
0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0
0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1
0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1
0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2
…,…,…,…,…,…,…,…,…
15,1,0,197.233,1,2023-05-31 23:00:00,637,2018347,64
15,1,1,0.0,0,2023-05-31 23:00:00,637,2018348,59
15,1,1,28.404,1,2023-05-31 23:00:00,637,2018349,59
15,1,3,0.0,0,2023-05-31 23:00:00,637,2018350,60


In [42]:
result = result.with_columns([
    pl.col("datetime").dt.truncate("1d").cast(pl.Date).alias("date"),
    pl.col("datetime").dt.year().alias("year")
])

In [24]:
dfa = pl.DataFrame({
    "A":[1, 3, 4, 3, 2],
    "B":["uno", "tres", "cuatro", "tres", "dos"]
})
dfa

A,B
i64,str
1,"""uno"""
3,"""tres"""
4,"""cuatro"""
3,"""tres"""
2,"""dos"""


In [25]:
dfb = pl.DataFrame({
    "A":[1, 2, 3, 4],
    "C":["one", "two", "three", "four"]
})
dfb

A,C
i64,str
1,"""one"""
2,"""two"""
3,"""three"""
4,"""four"""


In [26]:
dfa.join(dfb, how="left", on="A")

A,B,C
i64,str,str
1,"""uno""","""one"""
3,"""tres""","""three"""
4,"""cuatro""","""four"""
3,"""tres""","""three"""
2,"""dos""","""two"""


In [None]:
# PASAR A RADIANES
# HIVERSINE DISTANCE
# JOIN CON LOS COUNTIES MAS CERCANOS