# **Preparación de datos previa al proceso de modelado**

## **Librerías**

In [1]:
from modeling_auxiliary_functions import (add_sin_cos_transforms, 
                                          create_hourly_features,
                                          create_weekly_features,
                                          prepare_predictor_dataframe, 
                                          get_demand_df
                                 )
import warnings
import pandas as pd
import datetime as dt
from tqdm import tqdm

In [2]:
# Notebook-wide settings: remove the cap on displayed columns, limit displayed
# rows to 10, and install a blanket 'ignore' filter for Python warnings.
_display_options = (
    ('display.max_columns', None),
    ('display.max_rows', 10),
)
for _option_name, _option_value in _display_options:
    pd.set_option(_option_name, _option_value)
warnings.filterwarnings('ignore')

## **Transformaciones**

### **Lectura y unión de datasets**

In [3]:
# Load the base dataset from a pickle file located in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so this file should only
# ever come from a trusted source — confirm where 'basic_dataset.pkl' is produced.
df = pd.read_pickle('basic_dataset.pkl')
# Sanity check: print the (rows, columns) shape, then display the first three rows
# as the cell's output.
print("Shape:", df.shape)
df.head(3)

Shape: (35060, 71)


Unnamed: 0,time,temp_Barcelona,temp_Bilbao,temp_Madrid,temp_Seville,temp_Valencia,temp_min_Barcelona,temp_min_Bilbao,temp_min_Madrid,temp_min_Seville,temp_min_Valencia,temp_max_Barcelona,temp_max_Bilbao,temp_max_Madrid,temp_max_Seville,temp_max_Valencia,pressure_Barcelona,pressure_Bilbao,pressure_Madrid,pressure_Seville,pressure_Valencia,humidity_Barcelona,humidity_Bilbao,humidity_Madrid,humidity_Seville,humidity_Valencia,wind_speed_Barcelona,wind_speed_Bilbao,wind_speed_Madrid,wind_speed_Seville,wind_speed_Valencia,wind_deg_Barcelona,wind_deg_Bilbao,wind_deg_Madrid,wind_deg_Seville,wind_deg_Valencia,rain_1h_Barcelona,rain_1h_Bilbao,rain_1h_Madrid,rain_1h_Seville,rain_1h_Valencia,rain_3h_Barcelona,rain_3h_Bilbao,rain_3h_Madrid,rain_3h_Seville,rain_3h_Valencia,snow_3h_Barcelona,snow_3h_Bilbao,snow_3h_Madrid,snow_3h_Seville,snow_3h_Valencia,clouds_all_Barcelona,clouds_all_Bilbao,clouds_all_Madrid,clouds_all_Seville,clouds_all_Valencia,generation_biomass,generation_fossil_brown_coal/lignite,generation_fossil_gas,generation_fossil_hard_coal,generation_fossil_oil,generation_hydro_pumped_storage_consumption,generation_hydro_run-of-river_and_poundage,generation_hydro_water_reservoir,generation_nuclear,generation_other,generation_other_renewable,generation_solar,generation_waste,generation_wind_onshore,total_load_actual
0,2015-01-01 00:00:00,281.625,269.657312,267.325,273.375,270.475,281.625,269.657312,267.325,273.375,270.475,281.625,269.657312,267.325,273.375,270.475,1035.0,1070.205106,1070.205106,1070.205106,1001.0,100.0,97.0,63.0,75.0,77.0,7.0,0.0,1.0,1.0,1.0,58.0,226.0,309.0,21.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,447.0,329.0,4844.0,4821.0,298.324249,863.0,1051.0,1899.0,7096.0,43.0,73.0,49.0,196.0,6378.0,25385.0
1,2015-01-01 01:00:00,281.625,269.7635,267.325,273.375,270.475,281.625,269.7635,267.325,273.375,270.475,281.625,269.7635,267.325,273.375,270.475,1035.0,1035.0,1070.205106,1070.205106,1001.0,100.0,97.0,63.0,75.0,77.0,7.0,0.0,1.0,1.0,1.0,58.0,229.0,309.0,21.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,449.0,328.0,5196.0,4755.0,298.324249,920.0,1009.0,1658.0,7096.0,43.0,71.0,50.0,195.0,5890.0,24382.0
2,2015-01-01 02:00:00,281.286,269.251688,289.708181,274.086,269.686,281.286,269.251688,266.186,274.086,269.686,281.286,269.251688,291.172867,274.086,269.686,1070.205106,1070.205106,1070.205106,1070.205106,1002.0,100.0,97.0,64.0,71.0,78.0,7.0,1.0,1.0,3.0,0.0,48.0,224.0,273.0,27.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,448.0,323.0,4857.0,4581.0,298.324249,1164.0,973.0,1371.0,7099.0,43.0,73.0,50.0,196.0,5461.0,22734.0


### **Transformaciones de representación de períodos pasados**

In [4]:
# Build the "market_offer" predictor dataset from the base frame `df`, using
# 2015-01-02 00:00 and 2018-12-30 00:00 as the start/end arguments, pass the
# result through add_sin_cos_transforms for "month", "dayofweek" and "hour",
# and save it as a pickle. The recorded run of this cell took about 20 minutes.
# NOTE(review): the two trailing 4s are positional arguments whose meaning is
# defined in modeling_auxiliary_functions — confirm there before changing them.
market_offer_dataset = add_sin_cos_transforms(
    prepare_predictor_dataframe(
        df,
        dt.datetime(2015, 1, 2, 0),
        dt.datetime(2018, 12, 30, 0),
        "market_offer",
        4,
        4,
    ),
    ["month", "dayofweek", "hour"],
)
market_offer_dataset.to_pickle('market_offer_dataset.pkl')

100%|██████████| 1459/1459 [20:37<00:00,  1.18it/s]


In [5]:
# Build the "first_session" predictor dataset from the base frame `df`, using
# 2015-01-02 00:00 and 2018-12-30 00:00 as the start/end arguments, pass the
# result through add_sin_cos_transforms for "month", "dayofweek" and "hour",
# and save it as a pickle. The recorded run of this cell took about 20 minutes.
# NOTE(review): the two trailing 4s are positional arguments whose meaning is
# defined in modeling_auxiliary_functions — confirm there before changing them.
first_session_dataset = add_sin_cos_transforms(
    prepare_predictor_dataframe(
        df,
        dt.datetime(2015, 1, 2, 0),
        dt.datetime(2018, 12, 30, 0),
        "first_session",
        4,
        4,
    ),
    ["month", "dayofweek", "hour"],
)
first_session_dataset.to_pickle('first_session_dataset.pkl')

100%|██████████| 1459/1459 [20:47<00:00,  1.17it/s]


In [6]:
# Build the "last_session" predictor dataset from the base frame `df`, using
# 2015-01-02 00:00 and 2018-12-30 00:00 as the start/end arguments, pass the
# result through add_sin_cos_transforms for "month", "dayofweek" and "hour",
# and save it as a pickle. The recorded run of this cell took about 9 minutes.
# NOTE(review): the two trailing 4s are positional arguments whose meaning is
# defined in modeling_auxiliary_functions — confirm there before changing them.
last_session_dataset = add_sin_cos_transforms(
    prepare_predictor_dataframe(
        df,
        dt.datetime(2015, 1, 2, 0),
        dt.datetime(2018, 12, 30, 0),
        "last_session",
        4,
        4,
    ),
    ["month", "dayofweek", "hour"],
)
last_session_dataset.to_pickle('last_session_dataset.pkl')

100%|██████████| 1459/1459 [09:09<00:00,  2.65it/s]


## **Modelado predictivo**

El objetivo es predecir las 24 horas de la demanda total de energía con una anticipación de 12, 24 y 36 horas, según las sesiones intradiarias del mercado eléctrico mayorista.