# Generating Features and Dataset for ML Models

In [93]:
import glob
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 2
from data_preprocessing import *
from feature_engineering import *

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Widen the pandas display limits so that large frames can be inspected inline.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 1000),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Final Preprocessing of the Historical Dataset

In [78]:
# Load the preprocessed station data, discard the 120 m wind-speed column and
# convert DATETIME from its CSV text form into pandas datetimes.
df_stations = (
    pd.read_csv("./../../data/preprocessed/wind_data_conde_and_adjacent_stations.csv")
    .drop(columns=['WIND_SPEED_120m_ms'])
    .assign(DATETIME=lambda frame: pd.to_datetime(frame['DATETIME']))
)

In [79]:
# Every column except the three wind measurements is handed to the helper as
# `cols_to_fill`.
cols_to_fill = df_stations.columns.drop(
    ['WIND_SPEED_ms', 'WIND_DIRECTION_degrees', 'WIND_MAX_GUNS_ms']
).tolist()

# NOTE(review): the result name ends in `_forward` although the fill method
# requested here is 'rolling'; the name is kept because later cells use it.
df_stations_processed_forward = prepare_data_for_feature_generation(
    df=df_stations,
    fillna_method='rolling',
    cols_to_fill=cols_to_fill,
)

Processing station A431-CONDE. Current size: 26179 hour points.
Resampled 149 hour points

Processing station A458-RIBEIRA DO AMPARO. Current size: 16432 hour points.
Resampled 3443 hour points

Processing station A409-ARACAJU. Current size: 26328 hour points.
Resampled 0 hour points

Processing station A401-SALVADOR. Current size: 26309 hour points.
Resampled 19 hour points

Processing station A413-FEIRA DE SANTANA. Current size: 24295 hour points.
Resampled 2033 hour points

Processing station A406-CRUZ DAS ALMAS. Current size: 21906 hour points.
Resampled 4418 hour points

Processing station A442-EUCLIDES DA CUNHA. Current size: 21968 hour points.
Resampled 4348 hour points

Processing station A450-JEREMOABO. Current size: 26261 hour points.
Resampled 67 hour points

Processing station A434-AMARGOSA. Current size: 25194 hour points.
Resampled 1134 hour points

Processing station A436-QUEIMADAS. Current size: 22296 hour points.
Resampled 3966 hour points



In [4]:
# Disabled alternative: the same preprocessing call, but with
# fillna_method='forward_fill' instead of 'rolling'.
# NOTE(review): the result name below ends in `_rolling` although the method
# is 'forward_fill' -- confirm the intended naming before re-enabling.
# cols_to_fill = df_stations.drop(columns=['WIND_SPEED_ms','WIND_DIRECTION_degrees','WIND_MAX_GUNS_ms']).columns.tolist()
# df_stations_processed_rolling = prepare_data_for_feature_generation(
#                                     df=df_stations,
#                                     fillna_method='forward_fill',
#                                     cols_to_fill=cols_to_fill)

### Generating Target and Features

In [81]:
# Hourly measurement columns passed to the dataset builder as `hourly_features`.
# The order is preserved exactly; the last three entries are also the targets.
hourly_features = [
    'TOTAL_PRECIPITATION_mm',
    'ATM_PRESSURE_mB',
    'ATM_PRESSURE_SEA_LEVEL_mB',
    'MAX_ATM_PRESSURE_PREV_HOUR_mB',
    'MIN_ATM_PRESSURE_PREV_HOUR_mB',
    'GLOBAL_RADIATION_Kjm2',
    'AIR_TEMPERATURE_DRY_BULB_Celsius',
    'DEW_POINT_TEMPERATURE_Celsius',
    'MAX_TEMPERATURE_PREV_HOUR_Celsius',
    'MIN_TEMPERATURE_PREV_HOUR_Celsius',
    'DEW_POINT_MAX_TEMPERATURE_PREV_HOUR_Celsius',
    'DEW_POINT_MIN_TEMPERATURE_PREV_HOUR_Celsius',
    'MAX_RELATIVE_HUMIDITY_PREV_HOUR_percentage',
    'MIN_RELATIVE_HUMIDITY_PREV_HOUR_percentage',
    'RELATIVE_HUMIDITY_percentage',
    'WIND_DIRECTION_degrees',
    'WIND_MAX_GUNS_ms',
    'WIND_SPEED_ms',
]

# Wind variables passed to the dataset builder as `targets`.
targets = [
    'WIND_DIRECTION_degrees',
    'WIND_MAX_GUNS_ms',
    'WIND_SPEED_ms',
]

# Not used in this cell -- presumably station/time identifier columns for
# later cells; verify before removing.
auxiliary_columns = [
    'NAME',
    'CODE',
    'DATETIME',
    'YEAR',
]

# Build the experiment dataset from the processed station frame using the
# lag list, rolling-window list and target shift given below.
df_final_dataset = make_experiment_dataset(
    df=df_stations_processed_forward,
    hourly_features=hourly_features,
    lags=[1, 2, 3, 4, 5, 6, 12, 24],
    rolling_windows=[4, 12, 24],
    hour_feature='HOUR',
    targets=targets,
    target_shift=1,
)

In [82]:
# Shape (rows, columns) of the generated experiment dataset.
df_final_dataset.shape

(256745, 449)

In [83]:
# Display the processed station frame; pandas truncates the rendering to the
# first and last rows.
df_stations_processed_forward

Unnamed: 0,DATETIME,TOTAL_PRECIPITATION_mm,ATM_PRESSURE_mB,ATM_PRESSURE_SEA_LEVEL_mB,MAX_ATM_PRESSURE_PREV_HOUR_mB,MIN_ATM_PRESSURE_PREV_HOUR_mB,GLOBAL_RADIATION_Kjm2,AIR_TEMPERATURE_DRY_BULB_Celsius,DEW_POINT_TEMPERATURE_Celsius,MAX_TEMPERATURE_PREV_HOUR_Celsius,MIN_TEMPERATURE_PREV_HOUR_Celsius,DEW_POINT_MAX_TEMPERATURE_PREV_HOUR_Celsius,DEW_POINT_MIN_TEMPERATURE_PREV_HOUR_Celsius,MAX_RELATIVE_HUMIDITY_PREV_HOUR_percentage,MIN_RELATIVE_HUMIDITY_PREV_HOUR_percentage,RELATIVE_HUMIDITY_percentage,WIND_DIRECTION_degrees,WIND_MAX_GUNS_ms,WIND_SPEED_ms,LATITUDE,LONGITUDE,ALTITUDE,YEAR,MONTH,DAY,HOUR,NAME,CODE
0,2018-01-01 00:00:00,0.0,1009.6,,1009.6,1009.3,-3.540,26.1,23.5,26.3,26.0,23.6,23.3,86.0,84.0,86.0,81.0,7.4,3.9,-12.035833,-37.683889,31.90,2018,1,1,0,CONDE,A431
1,2018-01-01 01:00:00,0.0,1009.7,1013.559272,1009.8,1009.6,-3.540,26.1,23.3,26.1,26.0,23.7,23.3,87.0,85.0,85.0,80.0,8.1,4.3,-12.035833,-37.683889,31.90,2018,1,1,1,CONDE,A431
2,2018-01-01 02:00:00,0.0,1009.7,1013.560564,1009.9,1009.7,-3.540,26.0,23.4,26.1,26.0,23.5,23.3,86.0,85.0,85.0,89.0,9.3,5.6,-12.035833,-37.683889,31.90,2018,1,1,2,CONDE,A431
3,2018-01-01 03:00:00,0.0,1009.0,1012.859181,1009.7,1009.0,-3.540,25.9,23.3,26.1,25.9,23.4,23.1,86.0,84.0,86.0,82.0,10.9,4.9,-12.035833,-37.683889,31.90,2018,1,1,3,CONDE,A431
4,2018-01-01 04:00:00,0.0,1008.5,1012.359855,1009.0,1008.5,-3.540,25.7,23.1,25.9,25.6,23.3,23.0,86.0,85.0,86.0,77.0,9.3,4.2,-12.035833,-37.683889,31.90,2018,1,1,4,CONDE,A431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26257,2020-12-30 11:00:00,,,,,,,,,,,,,,,,,,,-10.984722,-39.616944,310.11,2020,12,30,11,QUEIMADAS,A436
26258,2020-12-30 12:00:00,,,,,,,,,,,,,,,,,,,-10.984722,-39.616944,310.11,2020,12,30,12,QUEIMADAS,A436
26259,2020-12-30 13:00:00,,,,,,,,,,,,,,,,,,,-10.984722,-39.616944,310.11,2020,12,30,13,QUEIMADAS,A436
26260,2020-12-30 14:00:00,,,,,,,,,,,,,,,,,,,-10.984722,-39.616944,310.11,2020,12,30,14,QUEIMADAS,A436


In [84]:
# Dataset shape again, as a baseline for the NaN-filtering checks in the
# following cells.
df_final_dataset.shape

(256745, 449)

In [88]:
# Shape that remains when every row containing at least one NaN is discarded.
df_final_dataset.dropna(axis="index", how="any").shape

(46845, 449)

In [91]:
# Require a row to have non-NaN values in at least 1/1.5 (about two thirds)
# of all columns.
minimum_filled_features = int(np.round(len(df_final_dataset.columns) / 1.5))
print("Minimum required features:", minimum_filled_features)

# `thresh` keeps only the rows with at least that many non-NaN values.
df_final_dataset.dropna(axis=0, thresh=minimum_filled_features).shape

Minimum required features: 299


(217373, 449)