# **Preparación de datos previa a proceso de modelado**

## **Librerías**

In [1]:
from fe_auxiliary_functions import (add_sin_cos_transforms, 
                                          prepare_predictor_dataframe, 
                                          pivot_from_column_ref
                                 )
import warnings
import pandas as pd
import datetime as dt

In [2]:
# Notebook-wide display configuration: render every column of wide frames
# (no column limit) while capping the number of rendered rows at 10.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', 10,
)
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

## **Lectura y unificación de información**

### **Lectura**

In [3]:
# Load the two cleaned source tables (energy first, then weather).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
energy, weather = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './clean_datasets/energy_clean_dataset.pkl',
        './clean_datasets/weather_clean_dataset.pkl',
    )
)

### ***Weather***: **Separación de variables por ciudad**

Se requiere evaluar las variables de las 5 ciudades simultáneamente para cada instante de tiempo. Para ello se realiza un pivot de la información, generando una nueva columna por cada combinación de variable y ciudad.

In [4]:
# Pivot the weather table so that each timestamp becomes a single row, using
# "time" as the index column and "city_name" as the reference for new columns.
# NOTE(review): `weather` is rebound to the pivoted frame, so re-running this
# cell without reloading the data presumably fails — confirm.
weather = pivot_from_column_ref(
    weather,
    index_col="time",
    new_columns_ref="city_name",
)
print("Shape:", weather.shape)
weather.head(n=3)

Shape: (35060, 56)


Unnamed: 0,time,temp_Barcelona,temp_Bilbao,temp_Madrid,temp_Seville,temp_Valencia,temp_min_Barcelona,temp_min_Bilbao,temp_min_Madrid,temp_min_Seville,temp_min_Valencia,temp_max_Barcelona,temp_max_Bilbao,temp_max_Madrid,temp_max_Seville,temp_max_Valencia,pressure_Barcelona,pressure_Bilbao,pressure_Madrid,pressure_Seville,pressure_Valencia,humidity_Barcelona,humidity_Bilbao,humidity_Madrid,humidity_Seville,humidity_Valencia,wind_speed_Barcelona,wind_speed_Bilbao,wind_speed_Madrid,wind_speed_Seville,wind_speed_Valencia,wind_deg_Barcelona,wind_deg_Bilbao,wind_deg_Madrid,wind_deg_Seville,wind_deg_Valencia,rain_1h_Barcelona,rain_1h_Bilbao,rain_1h_Madrid,rain_1h_Seville,rain_1h_Valencia,rain_3h_Barcelona,rain_3h_Bilbao,rain_3h_Madrid,rain_3h_Seville,rain_3h_Valencia,snow_3h_Barcelona,snow_3h_Bilbao,snow_3h_Madrid,snow_3h_Seville,snow_3h_Valencia,clouds_all_Barcelona,clouds_all_Bilbao,clouds_all_Madrid,clouds_all_Seville,clouds_all_Valencia
0,2015-01-01 00:00:00,281.625,269.657312,267.325,273.375,270.475,281.625,269.657312,267.325,273.375,270.475,281.625,269.657312,267.325,273.375,270.475,1035.0,1070.205106,1070.205106,1070.205106,1001.0,100.0,97.0,63.0,75.0,77.0,7.0,0.0,1.0,1.0,1.0,58.0,226.0,309.0,21.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-01 01:00:00,281.625,269.7635,267.325,273.375,270.475,281.625,269.7635,267.325,273.375,270.475,281.625,269.7635,267.325,273.375,270.475,1035.0,1035.0,1070.205106,1070.205106,1001.0,100.0,97.0,63.0,75.0,77.0,7.0,0.0,1.0,1.0,1.0,58.0,229.0,309.0,21.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2015-01-01 02:00:00,281.286,269.251688,289.708181,274.086,269.686,281.286,269.251688,266.186,274.086,269.686,281.286,269.251688,291.172867,274.086,269.686,1070.205106,1070.205106,1070.205106,1070.205106,1002.0,100.0,97.0,64.0,71.0,78.0,7.0,1.0,1.0,3.0,0.0,48.0,224.0,273.0,27.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### **Unificación de fuentes de información**

In [5]:
# Inner-join weather and energy on the shared "time" key: only timestamps
# present in both tables are kept.
basic_dataset = weather.merge(energy, how="inner", on="time")
print("Shape:", basic_dataset.shape)
basic_dataset.head(n=3)

Shape: (35013, 71)


Unnamed: 0,time,temp_Barcelona,temp_Bilbao,temp_Madrid,temp_Seville,temp_Valencia,temp_min_Barcelona,temp_min_Bilbao,temp_min_Madrid,temp_min_Seville,temp_min_Valencia,temp_max_Barcelona,temp_max_Bilbao,temp_max_Madrid,temp_max_Seville,temp_max_Valencia,pressure_Barcelona,pressure_Bilbao,pressure_Madrid,pressure_Seville,pressure_Valencia,humidity_Barcelona,humidity_Bilbao,humidity_Madrid,humidity_Seville,humidity_Valencia,wind_speed_Barcelona,wind_speed_Bilbao,wind_speed_Madrid,wind_speed_Seville,wind_speed_Valencia,wind_deg_Barcelona,wind_deg_Bilbao,wind_deg_Madrid,wind_deg_Seville,wind_deg_Valencia,rain_1h_Barcelona,rain_1h_Bilbao,rain_1h_Madrid,rain_1h_Seville,rain_1h_Valencia,rain_3h_Barcelona,rain_3h_Bilbao,rain_3h_Madrid,rain_3h_Seville,rain_3h_Valencia,snow_3h_Barcelona,snow_3h_Bilbao,snow_3h_Madrid,snow_3h_Seville,snow_3h_Valencia,clouds_all_Barcelona,clouds_all_Bilbao,clouds_all_Madrid,clouds_all_Seville,clouds_all_Valencia,generation biomass,generation fossil brown coal/lignite,generation fossil gas,generation fossil hard coal,generation fossil oil,generation hydro pumped storage consumption,generation hydro run-of-river and poundage,generation hydro water reservoir,generation nuclear,generation other,generation other renewable,generation solar,generation waste,generation wind onshore,total load actual
0,2015-01-01 00:00:00,281.625,269.657312,267.325,273.375,270.475,281.625,269.657312,267.325,273.375,270.475,281.625,269.657312,267.325,273.375,270.475,1035.0,1070.205106,1070.205106,1070.205106,1001.0,100.0,97.0,63.0,75.0,77.0,7.0,0.0,1.0,1.0,1.0,58.0,226.0,309.0,21.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,447.0,329.0,4844.0,4821.0,298.33756,863.0,1051.0,1899.0,7096.0,43.0,73.0,49.0,196.0,6378.0,25385.0
1,2015-01-01 01:00:00,281.625,269.7635,267.325,273.375,270.475,281.625,269.7635,267.325,273.375,270.475,281.625,269.7635,267.325,273.375,270.475,1035.0,1035.0,1070.205106,1070.205106,1001.0,100.0,97.0,63.0,75.0,77.0,7.0,0.0,1.0,1.0,1.0,58.0,229.0,309.0,21.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,449.0,328.0,5196.0,4755.0,298.33756,920.0,1009.0,1658.0,7096.0,43.0,71.0,50.0,195.0,5890.0,24382.0
2,2015-01-01 02:00:00,281.286,269.251688,289.708181,274.086,269.686,281.286,269.251688,266.186,274.086,269.686,281.286,269.251688,291.172867,274.086,269.686,1070.205106,1070.205106,1070.205106,1070.205106,1002.0,100.0,97.0,64.0,71.0,78.0,7.0,1.0,1.0,3.0,0.0,48.0,224.0,273.0,27.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,448.0,323.0,4857.0,4581.0,298.33756,1164.0,973.0,1371.0,7099.0,43.0,73.0,50.0,196.0,5461.0,22734.0


Este dataset contiene todas las variables que han pasado debidamente por los procesos de limpieza y calidad de la información, por lo que será utilizado en la siguiente fase de modelado predictivo.

## **Transformaciones**

### **Revisiones básicas de formato**

In [6]:
# Order rows chronologically and replace the spaces in column names with
# underscores (e.g. "total load actual" -> "total_load_actual").
basic_dataset = (
    basic_dataset
    .sort_values(by='time')
    .rename(columns=lambda label: label.replace(' ', '_'))
)

In [7]:
# Persist the unified, formatted dataset for the modeling stage.
pd.to_pickle(basic_dataset, './modeling_datasets/basic_dataset.pkl')

### **Transformaciones para información disponible para predicciones según escenarios**

In [8]:
# Build the predictor frame for the "market_offer" scenario between the two
# datetimes below, then apply the sin/cos transforms to the month, dayofweek
# and hour fields, and save the result.
# NOTE(review): the two trailing 4s are positional arguments whose meaning is
# defined in fe_auxiliary_functions — confirm there before changing them.
market_offer_dataset = add_sin_cos_transforms(
    prepare_predictor_dataframe(
        basic_dataset,
        dt.datetime(2015, 1, 2, 0),
        dt.datetime(2018, 12, 30, 0),
        "market_offer",
        4,
        4,
    ),
    ["month", "dayofweek", "hour"],
)
market_offer_dataset.to_pickle('./modeling_datasets/market_offer_dataset.pkl')

In [9]:
# Build the predictor frame for the "first_session" scenario between the two
# datetimes below, then apply the sin/cos transforms to the month, dayofweek
# and hour fields, and save the result.
# NOTE(review): the two trailing 4s are positional arguments whose meaning is
# defined in fe_auxiliary_functions — confirm there before changing them.
first_session_dataset = add_sin_cos_transforms(
    prepare_predictor_dataframe(
        basic_dataset,
        dt.datetime(2015, 1, 2, 0),
        dt.datetime(2018, 12, 30, 0),
        "first_session",
        4,
        4,
    ),
    ["month", "dayofweek", "hour"],
)
first_session_dataset.to_pickle('./modeling_datasets/first_session_dataset.pkl')

In [10]:
# Build the predictor frame for the "last_session" scenario between the two
# datetimes below, then apply the sin/cos transforms to the month, dayofweek
# and hour fields, and save the result.
# NOTE(review): the two trailing 4s are positional arguments whose meaning is
# defined in fe_auxiliary_functions — confirm there before changing them.
last_session_dataset = add_sin_cos_transforms(
    prepare_predictor_dataframe(
        basic_dataset,
        dt.datetime(2015, 1, 2, 0),
        dt.datetime(2018, 12, 30, 0),
        "last_session",
        4,
        4,
    ),
    ["month", "dayofweek", "hour"],
)
last_session_dataset.to_pickle('./modeling_datasets/last_session_dataset.pkl')