## Carga de datos

Antes de poder manipular los diferentes datos, primero es necesario
cargarlos en el sistema.

In [1]:
import pandas as pd
import polars as pl
import numpy as np 
from pathlib import Path

In [2]:
# Folder that holds the competition CSV files
data_folder = Path("..") / "data" / "predict-energy-behavior-of-prosumers"

# Map of dataset key -> CSV file name.
# Every dataset except the last one is stored as "<key>.csv".
files = {
    name: f"{name}.csv"
    for name in (
        "train",
        "gas_prices",
        "client",
        "electricity_prices",
        "forecast_weather",
        "historical_weather",
        "county_lon_lats",
    )
}
files["weather_station"] = "weather_station_to_county_mapping.csv"

In [3]:
# Build a dictionary of DataFrames keyed by dataset name so that every
# table can be accessed easily and iterated over when needed.
dfs = {}

for key, filename in files.items():
    filepath = data_folder / filename
    try:
        # Keep the try body limited to the call that can actually fail.
        dfs[key] = pl.read_csv(filepath)
    except Exception as exc:
        # Best-effort load: report the failure and its cause, then continue
        # with the remaining files. Catching Exception (instead of a bare
        # except) lets KeyboardInterrupt/SystemExit propagate.
        print(f"No pudo cargarse el archivo {key}: {exc}")
    else:
        print(f"Archivo {key} cargado con exito.")
    

Archivo train cargado con exito.
Archivo gas_prices cargado con exito.
Archivo client cargado con exito.
Archivo electricity_prices cargado con exito.
Archivo forecast_weather cargado con exito.
Archivo historical_weather cargado con exito.
Archivo county_lon_lats cargado con exito.
Archivo weather_station cargado con exito.


## Asignación de longitud/latitud a un condado específico

Con el fin de poder unir los datos de "train.csv" y "forecast_weather.csv",
es necesario asignar a cada par de longitud y latitud su condado
correspondiente.

Para ello se utilizará el archivo "weather_station_to_county_mapping.csv", que contiene datos que asignan latitudes y longitudes a su condado correspondiente.

### Visualización de los datos

In [4]:
from geopy.geocoders import Nominatim

In [5]:
# Keep only the rows of the station-to-county mapping without null values
weather_station = (
    dfs["weather_station"]
    .drop_nulls()
)
weather_station.head()

county_name,longitude,latitude,county
str,f64,f64,i64
"""Saaremaa""",22.2,58.2,10
"""Saaremaa""",22.2,58.5,10
"""Saaremaa""",22.7,58.5,10
"""Hiiumaa""",22.7,58.8,1
"""Saaremaa""",23.2,58.5,10


In [6]:
# Create the Nominatim geocoder instance, identified by the "my-app"
# user agent.
# NOTE(review): Nominatim's usage policy asks for a user agent that
# identifies the application — confirm "my-app" is acceptable.
geolocator = Nominatim(user_agent="my-app")

In [7]:
import re

class Regex_dict:
    """
    Dictionary-like container whose keys are regular expressions.

    Patterns are compiled when they are added; a lookup returns the value
    of the first pattern (in insertion order) that matches the input.
    """

    def __init__(self):
        # List of (compiled pattern, value) pairs, in insertion order.
        self.keywords = []

    def add(self, expr, value):
        """
        Store the regular expression ``expr`` together with ``value``.

        ``expr`` is compiled with ``re.compile``, so an invalid pattern
        raises ``re.error`` here rather than at lookup time.
        """
        self.keywords.append(
            (re.compile(expr), value)
        )

    def get(self, string):
        """
        Return the value of the first stored pattern found in ``string``.

        Returns ``None`` when no pattern matches, or when ``string`` is not
        a ``str`` (for example ``None`` coming from a missing field), so
        callers do not have to guard against a ``TypeError``.
        """
        if not isinstance(string, str):
            return None

        for expr, value in self.keywords:
            if expr.search(string):
                return value
        return None

In [8]:
# Columns that identify a county: its name and its numeric id
c = ("county_name", "county")

# Array with the distinct (county name, county id) pairs
counties = (
    weather_station[c]
    .unique()
    .to_numpy()
)

# Regex-keyed dictionary that will hold the county patterns
county_dict = Regex_dict()

In [9]:
# Register every county: its name is the pattern, its id is the value
for county_name, county in counties:
    county_dict.add(expr=county_name, value=county)

In [10]:
class DataProcessor:
    """
    Prepare the individual competition tables so they can be joined.

    Holds the join keys of each table and a regex dictionary mapping county
    names to county ids, built from the station-to-county mapping table.
    """

    def __init__(self, lon_lat_dataframe):
        """
        Args:
            lon_lat_dataframe: polars DataFrame that must contain the
                "county_name" and "county" columns.

        Raises:
            RuntimeError: if either of those columns is missing.
        """
        # Columns used as join keys; change_names() leaves them unsuffixed.
        self.gas_join = ["data_block_id"]
        self.ep_join = ["datetime", "data_block_id"]
        self.hw_join = ["datetime", "county", "data_block_id"]
        self.client_join = ["county", "is_business", "product_type", "data_block_id"]
        self.lat_lon = ["latitude", "longitude"]

        self.county_dict = self.__create_dict(lon_lat_dataframe)

    def __create_dict(self, lon_lat_dataframe: pl.DataFrame):
        """Build the county-name -> county-id regex dictionary."""
        re_dict = Regex_dict()
        cols = ["county_name", "county"]

        # Each column has to be checked on its own: testing the whole pair
        # against the list of column names is never true.
        if not all(col in lon_lat_dataframe.columns for col in cols):
            raise RuntimeError(
                "No existen las columnas \"county_name\" y/o \"county\" en el dataframe"
            )

        # Drop rows with nulls, then keep the distinct (name, id) pairs.
        counties = lon_lat_dataframe.drop_nulls().select(cols).unique()

        for county_name, county in counties.iter_rows():
            re_dict.add(county_name, county)

        return re_dict

    def __lookup_county(self, geolocator, point):
        """Reverse-geocode one (latitude, longitude) pair to a county id.

        Returns None when the coordinate is incomplete, the geocoder finds
        nothing, the address has no "municipality" entry, or no known
        county name matches it.
        """
        if None in point:
            return None

        location = geolocator.reverse(point, exactly_one=True)
        if location is None:
            return None

        county_name = location.raw.get("address", {}).get("municipality")
        if county_name is None:
            return None

        return self.county_dict.get(county_name)

    def add_counties(self, df: pl.DataFrame):
        """
        Return ``df`` with a "county" column derived from its coordinates.

        Each distinct (latitude, longitude) pair is reverse-geocoded only
        once and the result is reused for every row sharing it, so the
        number of geocoder requests depends on the number of distinct
        coordinates rather than on the number of rows.

        Rows whose county cannot be resolved get a null value.
        """
        # NOTE(review): the public Nominatim service is rate limited;
        # consider throttling the requests if many distinct points exist.
        geolocator = Nominatim(user_agent="get-county")

        resolved = {}  # (latitude, longitude) -> county id or None
        county_list = []
        for point in df.select(self.lat_lon).iter_rows():
            if point not in resolved:
                resolved[point] = self.__lookup_county(geolocator, point)
            county_list.append(resolved[point])

        return df.with_columns(pl.Series(name="county", values=county_list))

    def change_names(self, df, suffix, no_change):
        """Append ``suffix`` to every column name not listed in ``no_change``."""
        renamed_df = df.rename(
            lambda col: col + suffix if col not in no_change else col
        )
        return renamed_df

    def get_gas_features(self, gas_df):
        """Add the mean of the lowest/highest gas price and suffix columns with "_gas"."""
        gas_df = gas_df.with_columns(
            ((pl.col("lowest_price_per_mwh") +
              pl.col("highest_price_per_mwh")) / 2).alias("mean_price_per_mwh")
        )

        return self.change_names(gas_df, "_gas", self.gas_join)

    def get_client_features(self, client_df):
        """Suffix the non-key client columns with "_client"."""
        return self.change_names(client_df, "_client", self.client_join)

    def get_hist_weather_features(self, hw_df: pl.DataFrame):
        """
        Parse "datetime", attach the county id and suffix columns with "_hw".
        """
        hw_df = hw_df.with_columns(
            pl.col("datetime").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")
        )

        hw_df = self.add_counties(hw_df)
        hw_df = self.change_names(hw_df, "_hw", self.hw_join)

        return hw_df

    def get_forecast_weather_features(self, fw_df):
        """Placeholder: not implemented yet, always returns None."""
        return None
    
    

In [None]:
# Adivinar qué hace la variable hours_ahead f mean
# No añadir los targets de horas anteriores

In [16]:
# Show the first row of every loaded table under its dataset name
for key, frame in dfs.items():
    print(f"{key}:")
    display(frame.head(1))
    

train:


county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
i64,i64,i64,f64,i64,str,i64,i64,i64
0,0,1,0.713,0,"""2021-09-01 00:00:00""",0,0,0


gas_prices:


forecast_date,lowest_price_per_mwh,highest_price_per_mwh,origin_date,data_block_id
str,f64,f64,str,i64
"""2021-09-01""",45.23,46.32,"""2021-08-31""",1


client:


product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
i64,i64,i64,f64,i64,str,i64
1,0,108,952.89,0,"""2021-09-01""",2


electricity_prices:


forecast_date,euros_per_mwh,origin_date,data_block_id
str,f64,str,i64
"""2021-09-01 00:00:00""",92.51,"""2021-08-31 00:00:00""",1


forecast_weather:


latitude,longitude,origin_datetime,hours_ahead,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,data_block_id,forecast_datetime,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation
f64,f64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,i64,str,f64,f64,f64,f64
57.6,21.7,"""2021-09-01 02:00:00""",1,15.655786,11.553613,0.904816,0.019714,0.0,0.905899,-0.411328,-9.106137,1,"""2021-09-01 03:00:00""",0.0,0.0,0.0,0.0


historical_weather:


datetime,temperature,dewpoint,rain,snowfall,surface_pressure,cloudcover_total,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation,latitude,longitude,data_block_id
str,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,i64,f64,f64,f64,f64,f64,f64
"""2021-09-01 00:00:00""",14.2,11.6,0.0,0.0,1015.9,31,31,0,11,7.083333,8,0.0,0.0,0.0,57.6,21.7,1.0


county_lon_lats:


Unnamed: 0_level_0,county,longitude,latitude
i64,i64,f64,f64
0,0,24.2,59.1


weather_station:


county_name,longitude,latitude,county
str,f64,f64,i64
,21.7,57.6,
