# Forecasting Energy Production 2

Step 2. The goal here is to create a range of features from our chosen variables that will be useful for our machine learning model. 
For now this will be the usual lags, rolling averages, cumulative totals and rates of change for all variables (except time obviously). I will then focus on training and validating the model. If I come back and make changes here after assessing the model I will highlight them. 

In [65]:
import pandas as pd
import numpy as np

import fsspec

fs = fsspec.filesystem("")

In [66]:
import warnings
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning) # bad form I know but this dataset is small enough for ugly code

In [67]:
df = pd.read_csv('data/forecast_data_cleaned.csv')
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
cols = list(df)
cols

['temperature_2m',
 'relative_humidity_2m',
 'rain',
 'cloud_cover_low',
 'cloud_cover_mid',
 'cloud_cover_high',
 'wind_speed_10m',
 'wind_speed_100m',
 'wind_direction_10m',
 'wind_direction_100m',
 'direct_radiation',
 'diffuse_radiation',
 'power_x',
 'TOD',
 'TOD_sin',
 'TOD_cos']

In [68]:
feature_cols = cols#[:-6]

In [69]:
feature_cols

['temperature_2m',
 'relative_humidity_2m',
 'rain',
 'cloud_cover_low',
 'cloud_cover_mid',
 'cloud_cover_high',
 'wind_speed_10m',
 'wind_speed_100m',
 'wind_direction_10m',
 'wind_direction_100m',
 'direct_radiation',
 'diffuse_radiation',
 'power_x',
 'TOD',
 'TOD_sin',
 'TOD_cos']

Some specific weather related features first up:

In [70]:
df['wind_speed_ratio'] = df['wind_speed_100m'] / df['wind_speed_10m'] # ratio of upper ws to lower ws

# wind shear is a great predictor of future weather and I imagine has implications for turbine productivity
df['wind_shear'] = np.log(df['wind_speed_100m'] / df['wind_speed_10m']) / np.log(100 / 10)

# wind speed components are sometimes easier for models than ws and direction
def wind_components(speed, direction):
    rad = np.radians(direction)
    u = -speed * np.sin(rad)
    v = -speed * np.cos(rad)
    return u, v

df['U_10m'], df['V_10m'] = wind_components(df['wind_speed_10m'], df['wind_direction_10m'])
df['U_100m'], df['V_100m'] = wind_components(df['wind_speed_100m'], df['wind_direction_100m'])

# magnitude of wind shear may be usefull
df['wind_shear_magnitude'] = np.sqrt((df['U_100m'] - df['U_10m'])**2 + (df['V_100m'] - df['V_10m'])**2) / (100 - 10)

# convert wind directions to cardinal bins to increase sample size. (in practice 355 deg wind is == 005 degree wind, given oscillations during gusts)
directions = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 
              'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
angles = np.linspace(0, 360, len(directions) + 1)  # Create bins

# Function to map wind direction to cardinal points
def wind_to_cardinal(angle):
    index = np.digitize([angle], angles, right=True)[0] % len(directions)
    return directions[index]

# Apply mapping
df["wind_direction_10m"] = df["wind_direction_10m"].apply(wind_to_cardinal)
df["wind_direction_100m"] = df["wind_direction_100m"].apply(wind_to_cardinal)

df["wind_direction_10m"] = df["wind_direction_10m"].astype("category").cat.codes
df["wind_direction_100m"] = df["wind_direction_100m"].astype("category").cat.codes

In [71]:
feature_cols = feature_cols + ['wind_speed_ratio', 'wind_shear', 'U_10m', 'U_100m', 'wind_shear_magnitude']

In [72]:
feature_cols

['temperature_2m',
 'relative_humidity_2m',
 'rain',
 'cloud_cover_low',
 'cloud_cover_mid',
 'cloud_cover_high',
 'wind_speed_10m',
 'wind_speed_100m',
 'wind_direction_10m',
 'wind_direction_100m',
 'direct_radiation',
 'diffuse_radiation',
 'power_x',
 'TOD',
 'TOD_sin',
 'TOD_cos',
 'wind_speed_ratio',
 'wind_shear',
 'U_10m',
 'U_100m',
 'wind_shear_magnitude']

Lags, The LSTM model itself will inherintly have some lag knowledge built in but only over short horizons. I will include some extended horizons to enhance forecasting performance. 

In [73]:
df = df.copy()
for col in feature_cols:
    for lag in [12, 24, 48, 72]:
        df[f'{col}_lag_{lag}'] = df[col].shift(lag)

Moving averages:

In [74]:
df = df.copy()
for col in feature_cols:
    for window in [2, 6, 12, 24, 48]:
        df[f'{col}_ma_{window}'] = df[col].rolling(window=window, center = False).mean() # center = False t avoid data leakage

Rates of change:

In [75]:
df = df.copy()
for col in feature_cols:
    for period in [2, 6, 12, 24, 48]:
        df[f'{col}_roc_{period}'] = (df[col] - df[col].shift(period)) / period

In [76]:
df.head()

Unnamed: 0_level_0,temperature_2m,relative_humidity_2m,rain,cloud_cover_low,cloud_cover_mid,cloud_cover_high,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,...,U_100m_roc_2,U_100m_roc_6,U_100m_roc_12,U_100m_roc_24,U_100m_roc_48,wind_shear_magnitude_roc_2,wind_shear_magnitude_roc_6,wind_shear_magnitude_roc_12,wind_shear_magnitude_roc_24,wind_shear_magnitude_roc_48
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01 02:00:00,19.130999,34.82317,0.0,0.0,0.0,0.0,4.393177,8.099383,9,9,...,,,,,,,,,,
2017-01-01 03:00:00,19.181,32.45357,0.0,0.0,0.0,0.0,3.580503,7.379024,9,9,...,,,,,,,,,,
2017-01-01 04:00:00,18.681,33.363396,0.0,0.0,0.0,0.0,2.641969,6.140033,9,2,...,0.450001,,,,,-0.001032,,,,
2017-01-01 05:00:00,17.380999,36.462326,0.0,0.0,0.0,0.0,2.24722,4.2107,2,2,...,1.200001,,,,,-0.009617,,,,
2017-01-01 06:00:00,18.230999,35.558502,0.0,0.0,0.0,0.0,1.615549,2.86007,5,4,...,2.099997,,,,,-0.012567,,,,


In [77]:
list(df)

['temperature_2m',
 'relative_humidity_2m',
 'rain',
 'cloud_cover_low',
 'cloud_cover_mid',
 'cloud_cover_high',
 'wind_speed_10m',
 'wind_speed_100m',
 'wind_direction_10m',
 'wind_direction_100m',
 'direct_radiation',
 'diffuse_radiation',
 'power_x',
 'TOD',
 'TOD_sin',
 'TOD_cos',
 'wind_speed_ratio',
 'wind_shear',
 'U_10m',
 'V_10m',
 'U_100m',
 'V_100m',
 'wind_shear_magnitude',
 'temperature_2m_lag_12',
 'temperature_2m_lag_24',
 'temperature_2m_lag_48',
 'temperature_2m_lag_72',
 'relative_humidity_2m_lag_12',
 'relative_humidity_2m_lag_24',
 'relative_humidity_2m_lag_48',
 'relative_humidity_2m_lag_72',
 'rain_lag_12',
 'rain_lag_24',
 'rain_lag_48',
 'rain_lag_72',
 'cloud_cover_low_lag_12',
 'cloud_cover_low_lag_24',
 'cloud_cover_low_lag_48',
 'cloud_cover_low_lag_72',
 'cloud_cover_mid_lag_12',
 'cloud_cover_mid_lag_24',
 'cloud_cover_mid_lag_48',
 'cloud_cover_mid_lag_72',
 'cloud_cover_high_lag_12',
 'cloud_cover_high_lag_24',
 'cloud_cover_high_lag_48',
 'cloud_cover_

In [78]:
df = df.dropna(how = 'any') # hmm the 48 hour lags are going to cost us a bit of training data, might need to rethink that

In [79]:
df.to_csv('data/forecast_data_cleaned_feature_engineered.csv')