# Generating Features and Dataset for ML Models

In [1]:
import glob

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 2
from data_preprocessing import *

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)

## Final Cleaning of the Historical Dataset

In [27]:
# Load the pre-processed station data, discard the two columns that are not
# used in this notebook, and convert DATETIME from text to pandas datetimes.
df_stations = (
    pd.read_csv("./../../data/preprocessed/wind_data_conde_and_adjacent_stations.csv")
    .drop(columns=['WIND_SPEED_120m_ms', 'NAME'])
    .assign(DATETIME=lambda frame: pd.to_datetime(frame['DATETIME']))
)


In [28]:
# Preview the first rows of the loaded station data.
df_stations.head()

Unnamed: 0,TOTAL_PRECIPITATION_mm,ATM_PRESSURE_mB,ATM_PRESSURE_SEA_LEVEL_mB,MAX_ATM_PRESSURE_PREV_HOUR_mB,MIN_ATM_PRESSURE_PREV_HOUR_mB,GLOBAL_RADIATION_Kjm2,AIR_TEMPERATURE_DRY_BULB_Celsius,DEW_POINT_TEMPERATURE_Celsius,MAX_TEMPERATURE_PREV_HOUR_Celsius,MIN_TEMPERATURE_PREV_HOUR_Celsius,DEW_POINT_MAX_TEMPERATURE_PREV_HOUR_Celsius,DEW_POINT_MIN_TEMPERATURE_PREV_HOUR_Celsius,MAX_RELATIVE_HUMIDITY_PREV_HOUR_percentage,MIN_RELATIVE_HUMIDITY_PREV_HOUR_percentage,RELATIVE_HUMIDITY_percentage,WIND_DIRECTION_degrees,WIND_MAX_GUNS_ms,WIND_SPEED_ms,LATITUDE,LONGITUDE,ALTITUDE,YEAR,MONTH,DAY,HOUR,DATETIME,CODE
0,0.0,1009.6,,1009.6,1009.3,-3.54,26.1,23.5,26.3,26.0,23.6,23.3,86.0,84.0,86.0,81.0,7.4,3.9,-12.035833,-37.683889,31.9,2018,1,1,0,2018-01-01 00:00:00,A431
1,0.0,1009.7,1013.559272,1009.8,1009.6,-3.54,26.1,23.3,26.1,26.0,23.7,23.3,87.0,85.0,85.0,80.0,8.1,4.3,-12.035833,-37.683889,31.9,2018,1,1,1,2018-01-01 01:00:00,A431
2,0.0,1009.7,1013.560564,1009.9,1009.7,-3.54,26.0,23.4,26.1,26.0,23.5,23.3,86.0,85.0,85.0,89.0,9.3,5.6,-12.035833,-37.683889,31.9,2018,1,1,2,2018-01-01 02:00:00,A431
3,0.0,1009.0,1012.859181,1009.7,1009.0,-3.54,25.9,23.3,26.1,25.9,23.4,23.1,86.0,84.0,86.0,82.0,10.9,4.9,-12.035833,-37.683889,31.9,2018,1,1,3,2018-01-01 03:00:00,A431
4,0.0,1008.5,1012.359855,1009.0,1008.5,-3.54,25.7,23.1,25.9,25.6,23.3,23.0,86.0,85.0,86.0,77.0,9.3,4.2,-12.035833,-37.683889,31.9,2018,1,1,4,2018-01-01 04:00:00,A431


In [44]:
# (rows, columns) of the loaded station data.
df_stations.shape

(237168, 27)

In [53]:
def prepare_data_for_feature_generation(df, fillna_method='forward_fill', columns_not_to_fill=None):
    '''
    Function to prepare dataset for feature generation.

    Every station (each distinct value of the CODE column) is processed on its own:

    - Resample the station data with ``resample_hours_for_wind_speed`` in order to maintain
      the hourly time structure, since there are some missing hours.

    - If ``fillna_method`` is 'forward_fill', impute missing values with
      ``fill_missing_values_for_features``.

    The per-station results are concatenated in the order in which the station codes first
    appear in ``df``.

    Parameters
    ----------

    df: main dataset containing meteorological data from automatic stations. It must have
        the columns CODE (station identifier) and DATETIME.

    fillna_method: Flag to indicate type of missing value filling. Possible methods are:
        None, 'forward_fill'. If it is None, no imputation is used.

    columns_not_to_fill: names of the columns that are handed to
        ``fill_missing_values_for_features`` as columns to leave un-imputed. When None
        (default), ['WIND_SPEED_ms', 'WIND_DIRECTION_degrees', 'WIND_MAX_GUNS_ms'] is used.

    Returns
    ---------

    DataFrame with the processed rows of all stations stacked on top of each other. Its
    layout is whatever ``resample_hours_for_wind_speed`` produces (in this notebook:
    DATETIME as the index instead of a column).

    Raises
    ---------

    ValueError: if ``fillna_method`` is not one of the supported values. Previously an
        unknown value (e.g. a typo) silently disabled imputation.
    '''
    supported_fillna_methods = (None, 'forward_fill')
    if fillna_method not in supported_fillna_methods:
        raise ValueError(
            f"Unsupported fillna_method {fillna_method!r}; expected one of {supported_fillna_methods}."
        )

    # Build the default inside the call: a list literal in the signature would be one shared
    # object that any in-place modification would leak into later calls.
    if columns_not_to_fill is None:
        columns_not_to_fill = ['WIND_SPEED_ms', 'WIND_DIRECTION_degrees', 'WIND_MAX_GUNS_ms']
    else:
        columns_not_to_fill = list(columns_not_to_fill)

    stations_data = []
    for code in df['CODE'].unique():

        df_code = df.query("CODE == @code")
        size_before = df_code.shape[0]
        print(f"Processing station {code}. Current size: {size_before} hour points.")
        df_code = resample_hours_for_wind_speed(df_code)
        # 'DATETIME' is resolved by name, so this works whether it is a column or the index.
        df_code = df_code.sort_values(by='DATETIME', ascending=True)
        size_after = df_code.shape[0]
        print(f"Filled {size_after-size_before} hour points\n")
        if fillna_method == 'forward_fill':
            df_code = fill_missing_values_for_features(df_code, columns_not_to_fill)

        stations_data.append(df_code)

    if not stations_data:
        # pd.concat raises on an empty list. For an input without rows, return an empty frame
        # indexed by DATETIME, mirroring the layout the resampling helper of this notebook
        # gives to non-empty results.
        return df.set_index('DATETIME')

    return pd.concat(stations_data)

def resample_hours_for_wind_speed(df):
    '''
    Put the data of a single station on a regular hourly time grid.

    DATETIME becomes the index of the result (it is no longer a column). Hours without any
    observation between the first and the last timestamp show up as rows of missing values.
    When several rows fall into the same hour, the first non-null value of each column is kept.

    Parameters
    ----------

    df: DataFrame with a datetime-typed DATETIME column.

    Returns
    ---------

    DataFrame indexed by DATETIME with one row per hour.
    '''
    # 'h' is the hourly frequency alias; the upper-case 'H' spelling is deprecated since pandas 2.2.
    return df.set_index('DATETIME').resample('h').first()

def fill_missing_values_for_features(df, columns_not_to_fill, limit=None):
    '''
    Forward fill missing values, leaving the excluded columns untouched.

    Parameters
    ----------

    df: DataFrame whose rows are in chronological order.

    columns_not_to_fill: names of the columns that must keep their missing values.
        Names that are not present in ``df`` are ignored; None or an empty collection
        means every column is filled.

    limit: maximum number of consecutive missing rows to fill after a valid value.
        None (default) fills gaps of any length. On hourly data, ``limit=1`` fills at most
        one hour.

    Returns
    ---------

    A new DataFrame with the same index and column order as ``df``; ``df`` itself is not
    modified.
    '''
    # TODO: callers still use the unlimited default; pass limit=1 to restrict forward fill
    # to a maximum of 1h.
    excluded = set(columns_not_to_fill or ())
    columns_to_fill = [column for column in df.columns if column not in excluded]

    filled = df.copy()
    if columns_to_fill:
        # .ffill() replaces the deprecated fillna(method='ffill') spelling.
        filled[columns_to_fill] = df[columns_to_fill].ffill(limit=limit)
    return filled

In [54]:
# Resample every station to an hourly grid and forward fill the gaps.
df_stations_processed = prepare_data_for_feature_generation(
    df=df_stations,
    fillna_method='forward_fill',
)

Processing station A431
Filled 149 hour points

Processing station A458
Filled 3443 hour points

Processing station A409
Filled 0 hour points

Processing station A401
Filled 19 hour points

Processing station A413
Filled 2033 hour points

Processing station A406
Filled 4418 hour points

Processing station A442
Filled 4348 hour points

Processing station A450
Filled 67 hour points

Processing station A434
Filled 1134 hour points

Processing station A436
Filled 3966 hour points



In [52]:
# Shape of the input frame, for comparison with the processed frame.
df_stations.shape

(237168, 27)

In [51]:
# Shape after processing. resample_hours_for_wind_speed moves DATETIME from the columns
# to the index, which accounts for one column fewer than in df_stations.
df_stations_processed.shape

(256745, 26)