# Generating Features and Datasets for ML Models

In [94]:
import warnings

import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2
from data_preprocessing import *
from feature_engineering import *

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Raise pandas' display limits (rows, columns, total width) for this notebook.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 1000),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Final Preprocessing of the Historical Dataset

In [95]:
# Load the preprocessed station data, discard the 120 m wind-speed column and
# convert DATETIME to a pandas datetime column, all in one chain.
df_stations = (
    pd.read_csv("./../../data/preprocessed/wind_data_conde_and_adjacent_stations.csv")
    .drop(columns=['WIND_SPEED_120m_ms'])
    .assign(DATETIME=lambda frame: pd.to_datetime(frame['DATETIME']))
)

In [96]:
# Stations kept for the comparison; matched against the NAME column.
stations_to_compare = ["AMARGOSA", "ARACAJU", "CONDE", "JEREMOABO", "SALVADOR"]

# Every column except the three wind measurements is passed to the helper as a
# column to fill.
cols_to_fill = df_stations.columns.drop(
    ['WIND_SPEED_ms', 'WIND_DIRECTION_degrees', 'WIND_MAX_GUNS_ms']
).tolist()

# NOTE(review): judging by its log output the helper works station by station
# and reports resampled hour points -- confirm details in data_preprocessing.
df_stations_processed = prepare_data_for_feature_generation(
    df=df_stations.loc[df_stations['NAME'].isin(stations_to_compare)],
    fillna_method='rolling',
    cols_to_fill=cols_to_fill,
)

Processing station A431-CONDE. Current size: 26179 hour points.
Resampled 149 hour points

Processing station A409-ARACAJU. Current size: 26328 hour points.
Resampled 0 hour points

Processing station A401-SALVADOR. Current size: 26309 hour points.
Resampled 19 hour points

Processing station A450-JEREMOABO. Current size: 26261 hour points.
Resampled 67 hour points

Processing station A434-AMARGOSA. Current size: 25194 hour points.
Resampled 1134 hour points



## Generating Target and Features for Hourly Predictions

In [99]:
# Full list of station measurements passed to the dataset builder as main features.
hourly_features = [
    'TOTAL_PRECIPITATION_mm',
    'ATM_PRESSURE_mB',
    'ATM_PRESSURE_SEA_LEVEL_mB',
    'MAX_ATM_PRESSURE_PREV_HOUR_mB',
    'MIN_ATM_PRESSURE_PREV_HOUR_mB',
    'GLOBAL_RADIATION_Kjm2',
    'AIR_TEMPERATURE_DRY_BULB_Celsius',
    'DEW_POINT_TEMPERATURE_Celsius',
    'MAX_TEMPERATURE_PREV_HOUR_Celsius',
    'MIN_TEMPERATURE_PREV_HOUR_Celsius',
    'DEW_POINT_MAX_TEMPERATURE_PREV_HOUR_Celsius',
    'DEW_POINT_MIN_TEMPERATURE_PREV_HOUR_Celsius',
    'MAX_RELATIVE_HUMIDITY_PREV_HOUR_percentage',
    'MIN_RELATIVE_HUMIDITY_PREV_HOUR_percentage',
    'RELATIVE_HUMIDITY_percentage',
    'WIND_DIRECTION_degrees',
    'WIND_MAX_GUNS_ms',
    'WIND_SPEED_ms',
]

# Wind variables used as prediction targets.
targets = ['WIND_DIRECTION_degrees', 'WIND_MAX_GUNS_ms', 'WIND_SPEED_ms']

# NOTE(review): lags, rolling windows and target shifts are presumably counted
# in hours because granularity='HOUR' -- confirm in feature_engineering.
df_final_dataset = make_wind_prediction_dataset(
    df=df_stations_processed,
    main_features=hourly_features,
    lags=[*range(1, 7), 12, 24],
    rolling_windows=[4, 12, 24],
    granularity='HOUR',
    targets=targets,
    target_shift=[1, 3, 6],
)

# Per the helper's log output: removes features with more than 20% null values
# and instances with less than 80% of available data.
df_selected = drop_null_features_and_instances(
    df_final_dataset,
    feature_null_percentage=0.2,
    instance_null_percentage=0.8,
)
df_selected.shape

Initial dataset size: (131640, 454)
There were 10 features with more than 20.0% of null values:
There were a total of 10504 with less than 80.0% of avaiable data (features).


(121136, 444)

Saving final dataset

In [46]:
# Persist the full hourly dataset as Parquet.
# The file name ends in ".gzip", but DataFrame.to_parquet compresses with snappy
# unless told otherwise, so gzip is requested explicitly to keep the file
# contents consistent with its name.
# NOTE(review): df_selected is reassigned by every dataset-generation cell in
# this notebook; run this cell right after the matching generation cell.
df_selected.to_parquet(
    "./../../data/model_train/2022_06_20_wind_dataset_lags_central_tendency_dispersion_hour.gzip",
    index=False,
    compression="gzip",
)

## Generating a Smaller Dataset for Hourly Predictions

In [98]:
# Reduced list of station measurements used as main features for the smaller dataset.
hourly_features = [
    'TOTAL_PRECIPITATION_mm',
    'ATM_PRESSURE_mB',
    'GLOBAL_RADIATION_Kjm2',
    'AIR_TEMPERATURE_DRY_BULB_Celsius',
    'DEW_POINT_TEMPERATURE_Celsius',
    'RELATIVE_HUMIDITY_percentage',
    'WIND_DIRECTION_degrees',
    'WIND_SPEED_ms',
]

# Wind variables used as prediction targets (no gust column in this variant).
targets = ['WIND_DIRECTION_degrees', 'WIND_SPEED_ms']

# Defined here but not referenced anywhere else in this cell.
auxiliary_columns = ['NAME', 'CODE', 'DATETIME', 'YEAR']

# NOTE(review): lags, rolling windows and target shifts are presumably counted
# in hours because granularity='HOUR' -- confirm in feature_engineering.
df_final_dataset = make_wind_prediction_dataset(
    df=df_stations_processed,
    main_features=hourly_features,
    lags=[1, 2, 3],
    rolling_windows=[4, 12],
    granularity='HOUR',
    targets=targets,
    target_shift=[1, 3, 6],
)

# Per the helper's log output: removes features with more than 20% null values
# and instances with less than 80% of available data.
df_selected = drop_null_features_and_instances(
    df_final_dataset,
    feature_null_percentage=0.2,
    instance_null_percentage=0.8,
)
df_selected.shape


Initial dataset size: (131640, 141)
There were 0 features with more than 20.0% of null values:
There were a total of 8999 with less than 80.0% of avaiable data (features).


(122641, 141)

In [65]:
# Persist the smaller hourly dataset as Parquet.
# The file name ends in ".gzip", but DataFrame.to_parquet compresses with snappy
# unless told otherwise, so gzip is requested explicitly to keep the file
# contents consistent with its name.
# NOTE(review): df_selected is reassigned by every dataset-generation cell in
# this notebook; run this cell right after the matching generation cell.
df_selected.to_parquet(
    "./../../data/model_train/2022_06_20_wind_smaller_dataset_lags_central_tendency_dispersion_hour.gzip",
    index=False,
    compression="gzip",
)

## Generating Target and Features for Daily Predictions

In [100]:
# Full list of station measurements passed to the dataset builder as main features.
# The variable keeps the name `hourly_features` even though this cell requests
# granularity='DAY'.
hourly_features = [
    'TOTAL_PRECIPITATION_mm',
    'ATM_PRESSURE_mB',
    'ATM_PRESSURE_SEA_LEVEL_mB',
    'MAX_ATM_PRESSURE_PREV_HOUR_mB',
    'MIN_ATM_PRESSURE_PREV_HOUR_mB',
    'GLOBAL_RADIATION_Kjm2',
    'AIR_TEMPERATURE_DRY_BULB_Celsius',
    'DEW_POINT_TEMPERATURE_Celsius',
    'MAX_TEMPERATURE_PREV_HOUR_Celsius',
    'MIN_TEMPERATURE_PREV_HOUR_Celsius',
    'DEW_POINT_MAX_TEMPERATURE_PREV_HOUR_Celsius',
    'DEW_POINT_MIN_TEMPERATURE_PREV_HOUR_Celsius',
    'MAX_RELATIVE_HUMIDITY_PREV_HOUR_percentage',
    'MIN_RELATIVE_HUMIDITY_PREV_HOUR_percentage',
    'RELATIVE_HUMIDITY_percentage',
    'WIND_DIRECTION_degrees',
    'WIND_MAX_GUNS_ms',
    'WIND_SPEED_ms',
]

# Wind variables used as prediction targets.
targets = ['WIND_DIRECTION_degrees', 'WIND_MAX_GUNS_ms', 'WIND_SPEED_ms']

# NOTE(review): lags, rolling windows and target shifts are presumably counted
# in days because granularity='DAY' -- confirm in feature_engineering.
df_final_dataset_day = make_wind_prediction_dataset(
    df=df_stations_processed,
    main_features=hourly_features,
    lags=[*range(1, 8), 15, 30, 60, 90],
    rolling_windows=[7, 15, 30, 60],
    granularity='DAY',
    targets=targets,
    target_shift=list(range(1, 8)),
)

# Per the helper's log output: removes features with more than 20% null values
# and instances with less than 90% of available data.
df_selected = drop_null_features_and_instances(
    df_final_dataset_day,
    feature_null_percentage=0.2,
    instance_null_percentage=0.9,
)
df_selected.shape

Initial dataset size: (5485, 613)
There were 30 features with more than 20.0% of null values:
There were a total of 884 with less than 90.0% of avaiable data (features).


(4601, 583)

In [89]:
# Persist the full daily dataset as Parquet.
# The file name ends in ".gzip", but DataFrame.to_parquet compresses with snappy
# unless told otherwise, so gzip is requested explicitly to keep the file
# contents consistent with its name.
# NOTE(review): df_selected is reassigned by every dataset-generation cell in
# this notebook; run this cell right after the matching generation cell.
df_selected.to_parquet(
    "./../../data/model_train/2022_06_20_wind_dataset_lags_central_tendency_dispersion_day.gzip",
    index=False,
    compression="gzip",
)

## Generating a Smaller Dataset for Daily Predictions

In [91]:
# Reduced list of station measurements used as main features for the smaller dataset.
# The variable keeps the name `hourly_features` even though this cell requests
# granularity='DAY'.
hourly_features = [
    'TOTAL_PRECIPITATION_mm',
    'ATM_PRESSURE_mB',
    'GLOBAL_RADIATION_Kjm2',
    'AIR_TEMPERATURE_DRY_BULB_Celsius',
    'DEW_POINT_TEMPERATURE_Celsius',
    'RELATIVE_HUMIDITY_percentage',
    'WIND_DIRECTION_degrees',
    'WIND_SPEED_ms',
]

# Wind variables used as prediction targets (no gust column in this variant).
targets = ['WIND_DIRECTION_degrees', 'WIND_SPEED_ms']

# NOTE(review): lags, rolling windows and target shifts are presumably counted
# in days because granularity='DAY' -- confirm in feature_engineering.
df_final_dataset_day = make_wind_prediction_dataset(
    df=df_stations_processed,
    main_features=hourly_features,
    lags=[1, 2, 3],
    rolling_windows=[7, 15],
    granularity='DAY',
    targets=targets,
    target_shift=list(range(1, 8)),
)

# Per the helper's log output: removes features with more than 20% null values
# and instances with less than 90% of available data.
df_selected = drop_null_features_and_instances(
    df_final_dataset_day,
    feature_null_percentage=0.2,
    instance_null_percentage=0.9,
)


Initial dataset size: (5485, 152)
There were 0 features with more than 20.0% of null values:
There were a total of 482 with less than 90.0% of avaiable data (features).


In [93]:
# Persist the smaller daily dataset as Parquet.
# The file name ends in ".gzip", but DataFrame.to_parquet compresses with snappy
# unless told otherwise, so gzip is requested explicitly to keep the file
# contents consistent with its name.
# NOTE(review): df_selected is reassigned by every dataset-generation cell in
# this notebook; run this cell right after the matching generation cell.
df_selected.to_parquet(
    "./../../data/model_train/2022_06_20_wind_smaller_dataset_lags_central_tendency_dispersion_day.gzip",
    index=False,
    compression="gzip",
)