# Generating Features and Dataset for ML Models

In [4]:
import glob
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 2
from data_preprocessing import *

# Silence every warning category for the whole notebook session.
warnings.filterwarnings("ignore")

# Widen the pandas display limits so wide/long frames are not truncated in cell output.
for _option, _value in {
    'display.max_rows': 100,
    'display.max_columns': 1000,
    'display.width': 1000,
}.items():
    pd.set_option(_option, _value)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Final Preprocessing of the Historical Dataset

In [5]:
# Read the preprocessed station CSV, drop the 120 m wind-speed column and
# parse DATETIME into real timestamps, all in one chain.
df_stations = (
    pd.read_csv("./../../data/preprocessed/wind_data_conde_and_adjacent_stations.csv")
    .drop(columns=['WIND_SPEED_120m_ms'])
    .assign(DATETIME=lambda frame: pd.to_datetime(frame['DATETIME']))
)

In [6]:
# Columns handed to the gap-filling step: every column except the three wind
# measurements.
cols_to_fill = df_stations.drop(
    columns=['WIND_SPEED_ms', 'WIND_DIRECTION_degrees', 'WIND_MAX_GUNS_ms']
).columns.tolist()

# The variable is named "*_forward", so it must be built with the forward-fill
# strategy. It was previously built with 'rolling', which meant later cells that
# select `df_stations_processed_forward` silently worked on rolling-filled data.
# NOTE(review): if the rolling-filled frame was the one actually wanted
# downstream, switch the downstream reference instead of this argument.
df_stations_processed_forward = prepare_data_for_feature_generation(
    df=df_stations,
    fillna_method='forward_fill',
    cols_to_fill=cols_to_fill,
)

Processing station A431-CONDE. Current size: 26179 hour points.
Resampled 149 hour points

Processing station A458-RIBEIRA DO AMPARO. Current size: 16432 hour points.
Resampled 3443 hour points

Processing station A409-ARACAJU. Current size: 26328 hour points.
Resampled 0 hour points

Processing station A401-SALVADOR. Current size: 26309 hour points.
Resampled 19 hour points

Processing station A413-FEIRA DE SANTANA. Current size: 24295 hour points.
Resampled 2033 hour points

Processing station A406-CRUZ DAS ALMAS. Current size: 21906 hour points.
Resampled 4418 hour points

Processing station A442-EUCLIDES DA CUNHA. Current size: 21968 hour points.
Resampled 4348 hour points

Processing station A450-JEREMOABO. Current size: 26261 hour points.
Resampled 67 hour points

Processing station A434-AMARGOSA. Current size: 25194 hour points.
Resampled 1134 hour points

Processing station A436-QUEIMADAS. Current size: 22296 hour points.
Resampled 3966 hour points



In [7]:
# Columns handed to the gap-filling step: every column except the three wind
# measurements. Recomputed here so the cell can be run on its own.
cols_to_fill = df_stations.drop(
    columns=['WIND_SPEED_ms', 'WIND_DIRECTION_degrees', 'WIND_MAX_GUNS_ms']
).columns.tolist()

# The variable is named "*_rolling", so it must be built with the rolling
# strategy. It was previously built with 'forward_fill', contradicting its name.
df_stations_processed_rolling = prepare_data_for_feature_generation(
    df=df_stations,
    fillna_method='rolling',
    cols_to_fill=cols_to_fill,
)

Processing station A431-CONDE. Current size: 26179 hour points.
Resampled 149 hour points

Processing station A458-RIBEIRA DO AMPARO. Current size: 16432 hour points.
Resampled 3443 hour points

Processing station A409-ARACAJU. Current size: 26328 hour points.
Resampled 0 hour points

Processing station A401-SALVADOR. Current size: 26309 hour points.
Resampled 19 hour points

Processing station A413-FEIRA DE SANTANA. Current size: 24295 hour points.
Resampled 2033 hour points

Processing station A406-CRUZ DAS ALMAS. Current size: 21906 hour points.
Resampled 4418 hour points

Processing station A442-EUCLIDES DA CUNHA. Current size: 21968 hour points.
Resampled 4348 hour points

Processing station A450-JEREMOABO. Current size: 26261 hour points.
Resampled 67 hour points

Processing station A434-AMARGOSA. Current size: 25194 hour points.
Resampled 1134 hour points

Processing station A436-QUEIMADAS. Current size: 22296 hour points.
Resampled 3966 hour points



### Generating Hourly Features

In [8]:
# Hourly measurement columns (includes the three wind columns listed in `targets`).
main_hourly_features = [
    'TOTAL_PRECIPITATION_mm',
    'ATM_PRESSURE_mB',
    'ATM_PRESSURE_SEA_LEVEL_mB',
    'MAX_ATM_PRESSURE_PREV_HOUR_mB',
    'MIN_ATM_PRESSURE_PREV_HOUR_mB',
    'GLOBAL_RADIATION_Kjm2',
    'AIR_TEMPERATURE_DRY_BULB_Celsius',
    'DEW_POINT_TEMPERATURE_Celsius',
    'MAX_TEMPERATURE_PREV_HOUR_Celsius',
    'MIN_TEMPERATURE_PREV_HOUR_Celsius',
    'DEW_POINT_MAX_TEMPERATURE_PREV_HOUR_Celsius',
    'DEW_POINT_MIN_TEMPERATURE_PREV_HOUR_Celsius',
    'MAX_RELATIVE_HUMIDITY_PREV_HOUR_percentage',
    'MIN_RELATIVE_HUMIDITY_PREV_HOUR_percentage',
    'RELATIVE_HUMIDITY_percentage',
    'WIND_DIRECTION_degrees',
    'WIND_MAX_GUNS_ms',
    'WIND_SPEED_ms',
]

# Station/time descriptor columns.
fixed_features = ['LATITUDE', 'LONGITUDE', 'ALTITUDE', 'MONTH', 'DAY', 'HOUR']

# Wind columns that get shifted in the per-station loop below.
targets = ['WIND_DIRECTION_degrees', 'WIND_MAX_GUNS_ms', 'WIND_SPEED_ms']

# Identifier columns; not used in this cell.
cols_to_drop = ['NAME', 'CODE', 'DATETIME']

df = df_stations_processed_forward

# Take each station's code and name from the same row. Zipping two independent
# `unique()` arrays misaligns as soon as a name repeats or a code carries more
# than one spelling of its name.
stations = df[['CODE', 'NAME']].drop_duplicates(subset='CODE')
station_codes = stations['CODE'].to_numpy()
station_names = stations['NAME'].to_numpy()

# One frame per station: the time-sorted rows plus the shifted wind columns.
stations_data = []
for code, name in zip(station_codes, station_names):

    # `sort_values` returns a new frame; sorting the `query` result in place
    # mutates a slice of `df` and triggers SettingWithCopyWarning.
    df_code = df.query("CODE == @code").sort_values(by='DATETIME', ascending=True)
    size_before = df_code.shape[0]
    print(f"Processing station {code}-{name}. Current size: {size_before} hour points.")

    # target
    # The shifted values used to be computed and thrown away; keep them as extra
    # columns so the loop actually produces something.
    # NOTE(review): shift(1) puts the PREVIOUS row's wind values on each row
    # (a lag). If the target is meant to be the NEXT hour's wind, this should be
    # shift(-1) - confirm before training on these columns.
    shifted_targets = df_code[targets].shift(1).add_suffix('_SHIFT_1')
    stations_data.append(df_code.join(shifted_targets))


In [13]:
# Preview the wind speed of station A413 shifted down by one row
# (the first entry becomes NaN because nothing precedes it).
_is_a413 = df_stations_processed_forward['CODE'] == 'A413'
df_stations_processed_forward.loc[_is_a413, 'WIND_SPEED_ms'].shift(1)

0        NaN
1        4.2
2        3.9
3        2.9
4        3.0
        ... 
26323    3.6
26324    2.7
26325    3.8
26326    4.2
26327    4.0
Name: WIND_SPEED_ms, Length: 24295, dtype: float64