In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Import Data

In [4]:
energy_data_path = './data/clean/energy.csv'
weather_data_path = './data/clean/weather.csv'

In [5]:
energy_df = pd.read_csv(energy_data_path)
weather_df = pd.read_csv(weather_data_path)

# Helper Functions

In [None]:
def rolling_means(df, columns, days):
    df = df.copy()
    window_size = days * 24
    for column in columns:
        df[f'{column}_roll_{days}d'] = df[column].rolling(window=window_size, min_periods=1).mean()
    return df

In [7]:
def lag_features(df, features_to_lag, amount_to_lag):
    new_df = df.copy()
    for feature in features_to_lag:
        lagged_column_name = f"{feature}_{amount_to_lag}_lag"
        new_df[lagged_column_name] = df[feature].shift(amount_to_lag)
    return new_df

In [8]:
def run_linear_regression(X_train, Y_train, X_test):
    model = RandomForestRegressor(n_estimators=50)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    return y_pred

In [9]:
def lasso_regression_weights(X_train, Y_train):
    model = Lasso(max_iter=10000)
    model.fit(X_train, Y_train)
    weights = model.coef_
    feature_weights = pd.Series(weights, index=X_train.columns)
    return feature_weights

In [10]:
def evaluate_regression(y_true, y_pred):
    mean_error = mean_absolute_error(y_true, y_pred)
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"Error: {mean_error} \nRoot MSE: {root_mse} \nR2: {r2}")

# Modeling

In [11]:
df = pd.merge(weather_df, energy_df, on='dt_iso', how='inner')
df = df.reset_index()
df = df.drop(columns=['dt_iso'])

In [None]:
features_to_use = ['valencia_temp', 'valencia_wind_speed', 'valencia_cloud_cover_pct', 'valencia_precipitation_last_3hr', 'madrid_temp', 'madrid_wind_speed', 'madrid_cloud_cover_pct', 'madrid_precipitation_last_3hr', 'bilbao_temp', 'bilbao_wind_speed', 'bilbao_cloud_cover_pct', 'bilbao_precipitation_last_3hr', 'barcelona_temp', 'barcelona_wind_speed', 'barcelona_cloud_cover_pct', 'barcelona_precipitation_last_3hr', 'seville_temp', 'seville_wind_speed', 'seville_cloud_cover_pct', 'seville_precipitation_last_3hr']
features_to_predict = ['price actual', 'generation wind', 'generation water', 'generation solar']

## Price

In [None]:
X_price = df[features_to_use]
Y_price = df['price actual']
for days in [7,30]:
    X_price = rolling_means(X_price, features_to_use, days)
for hour_lag in [1,3,6,12,24]:
    X_price = lag_features(X_price, features_to_use, hour_lag)
X_price = X_price[721:]
Y_price = Y_price[721:]
X_train_price, X_test_price, Y_train_price, Y_test_price = train_test_split(X_price, Y_price, test_size=.2)

scaler = StandardScaler()
X_train_price = pd.DataFrame(scaler.fit_transform(X_train_price), columns=X_train_price.columns, index=X_train_price.index)
X_test_price = pd.DataFrame(scaler.transform(X_test_price), columns=X_test_price.columns, index=X_test_price.index)

price_feature_weights = lasso_regression_weights(X_train_price, Y_train_price)

In [25]:
price_feature_weights.sort_values(key=lambda x: x.abs(), ascending=False).head(20)

barcelona_wind_speed_roll_30d            -3.321248
madrid_wind_speed_roll_30d               -1.846323
seville_temp_12_lag                      -1.488361
madrid_wind_speed_12_lag                 -1.123649
madrid_precipitation_last_3hr_roll_30d   -0.991364
bilbao_wind_speed_roll_30d               -0.552679
bilbao_wind_speed_12_lag                 -0.535790
madrid_wind_speed                        -0.474724
valencia_wind_speed_12_lag               -0.441122
madrid_wind_speed_6_lag                  -0.351094
madrid_wind_speed_1_lag                  -0.303150
bilbao_wind_speed                        -0.295527
valencia_temp                             0.224555
seville_cloud_cover_pct_roll_30d         -0.083425
madrid_temp                               0.067814
madrid_wind_speed_3_lag                  -0.040518
bilbao_cloud_cover_pct_24_lag            -0.000000
madrid_temp_6_lag                         0.000000
seville_wind_speed_24_lag                 0.000000
madrid_cloud_cover_pct_6_lag   

In [26]:
most_important_price_features = ['barcelona_wind_speed_roll_30d', 'madrid_wind_speed_roll_30d' ,'seville_temp_12_lag','madrid_wind_speed_12_lag','madrid_precipitation_last_3hr_roll_30d','madrid_wind_speed','bilbao_wind_speed_12_lag','bilbao_wind_speed_roll_30d','valencia_wind_speed_12_lag','bilbao_wind_speed','madrid_wind_speed_1_lag','madrid_wind_speed_6_lag','madrid_temp','madrid_wind_speed_3_lag','madrid_temp_24_lag','seville_cloud_cover_pct_roll_30d','bilbao_cloud_cover_pct_roll_30d']

X_train_price = X_train_price[most_important_price_features]
X_test_price = X_test_price[most_important_price_features]
Y_pred_price = run_linear_regression(X_train_price, Y_train_price, X_test_price)
evaluate_regression(Y_test_price, Y_pred_price)

Error: 3.2271885200370325 
Root MSE: 4.460025277100643 
R2: 0.8997545350183631


## Solar

In [220]:
X_solar = df[features_to_use]
Y_solar = df['generation solar']
X_solar = rolling_means(X_solar, features_to_use, 7)
X_solar = rolling_means(X_solar, features_to_use, 30)
X_solar = lag_features(X_solar, features_to_use, 1)
X_solar = lag_features(X_solar, features_to_use, 3)
X_solar = lag_features(X_solar, features_to_use, 6)
X_solar = lag_features(X_solar, features_to_use, 12)
X_solar = lag_features(X_solar, features_to_use, 24)
X_solar = X_solar[721:]
Y_solar = Y_solar[721:]
X_train_solar, X_test_solar, Y_train_solar, Y_test_solar = train_test_split(X_solar, Y_solar, test_size=.2)

scaler = StandardScaler()
X_train_solar = pd.DataFrame(scaler.fit_transform(X_train_solar), columns=X_train_solar.columns, index=X_train_solar.index)
X_test_solar = pd.DataFrame(scaler.transform(X_test_solar), columns=X_test_solar.columns, index=X_test_solar.index)

solar_feature_weights = lasso_regression_weights(X_train_solar, Y_train_solar)

In [221]:
solar_feature_weights.sort_values(key=lambda x: x.abs(), ascending=False).head(20)

barcelona_temp                             715.045632
valencia_temp                              673.770398
seville_temp                               640.795367
madrid_temp_3_lag                         -412.214449
bilbao_temp                                373.620246
madrid_temp                                364.223039
seville_temp_6_lag                        -337.148128
barcelona_temp_3_lag                      -321.089229
bilbao_temp_3_lag                         -307.912887
valencia_temp_3_lag                       -267.624579
barcelona_temp_6_lag                      -244.635620
seville_temp_3_lag                        -244.293801
valencia_cloud_cover_pct_roll_7d          -187.581407
madrid_temp_24_lag                         178.340625
valencia_temp_roll_7d                     -176.772722
valencia_cloud_cover_pct_roll_30d          154.947476
valencia_wind_speed_roll_30d              -145.869518
valencia_precipitation_last_3hr_roll_7d    132.833361
madrid_temp_roll_7d         

In [222]:
most_important_solar_features = ['barcelona_temp', 'valencia_temp', 'seville_temp', 'madrid_temp_3_lag', 'bilbao_temp', 'madrid_temp', 'seville_temp_6_lag', 'barcelona_temp_3_lag', 'bilbao_temp_3_lag']

X_train_solar = X_train_solar[most_important_solar_features]
X_test_solar = X_test_solar[most_important_solar_features]
Y_pred_solar = run_linear_regression(X_train_solar, Y_train_solar, X_test_solar)
evaluate_regression(Y_test_solar, Y_pred_solar)

Error: 518.7916532630563 
Root MSE: 784.1639578421194 
R2: 0.7759053536211877


## Water

In [217]:
X_water = df[features_to_use]
Y_water = df['generation water']
X_water = rolling_means(X_water, features_to_use, 7)
X_water = rolling_means(X_water, features_to_use, 30)
X_water = lag_features(X_water, features_to_use, 1)
X_water = lag_features(X_water, features_to_use, 3)
X_water = lag_features(X_water, features_to_use, 6)
X_water = lag_features(X_water, features_to_use, 12)
X_water = lag_features(X_water, features_to_use, 24)
X_water = X_water[721:]
Y_water = Y_water[721:]
X_train_water, X_test_water, Y_train_water, Y_test_water = train_test_split(X_water, Y_water, test_size=.2)

scaler = StandardScaler()
X_train_water = pd.DataFrame(scaler.fit_transform(X_train_water), columns=X_train_water.columns, index=X_train_water.index)
X_test_water = pd.DataFrame(scaler.transform(X_test_water), columns=X_test_water.columns, index=X_test_water.index)

water_feature_weights = lasso_regression_weights(X_train_water, Y_train_water)

In [218]:
water_feature_weights.sort_values(key=lambda x: x.abs(), ascending=False).head(20)

seville_temp_roll_30d                      -1122.765811
barcelona_temp_roll_7d                      -975.191866
madrid_temp_roll_30d                        -910.005177
valencia_precipitation_last_3hr_roll_30d     774.903927
bilbao_precipitation_last_3hr_roll_30d      -751.591069
madrid_temp_roll_7d                          728.987239
barcelona_temp_roll_30d                      679.800593
seville_wind_speed_roll_30d                  665.123693
seville_temp_roll_7d                         615.358641
barcelona_wind_speed_roll_30d                590.552910
seville_precipitation_last_3hr_roll_30d     -483.465387
valencia_temp_roll_30d                       476.005810
bilbao_wind_speed_roll_7d                   -467.228011
madrid_precipitation_last_3hr_roll_30d       371.586044
valencia_temp_6_lag                          355.227766
valencia_wind_speed_roll_7d                  328.250598
bilbao_wind_speed_roll_30d                   311.920016
barcelona_temp                               277

In [219]:
most_important_water_features = ['seville_temp_roll_30d', 'barcelona_temp_roll_7d', 'madrid_temp_roll_30d', 'valencia_precipitation_last_3hr_roll_30d', 'bilbao_precipitation_last_3hr_roll_30d', 'madrid_temp_roll_7d', 'barcelona_temp_roll_30d', 'seville_wind_speed_roll_30d', 'seville_temp_roll_7d', 'barcelona_wind_speed_roll_30d']

X_train_water = X_train_water[most_important_water_features]
X_test_water = X_test_water[most_important_water_features]
Y_pred_water = run_linear_regression(X_train_water, Y_train_water, X_test_water)
evaluate_regression(Y_test_water, Y_pred_water)

Error: 473.07097209363843 
Root MSE: 650.4418554472056 
R2: 0.908558262292581


## Wind

In [214]:
X_wind = df[features_to_use]
Y_wind = df['generation wind']
X_wind = rolling_means(X_wind, features_to_use, 7)
X_wind = rolling_means(X_wind, features_to_use, 30)
X_wind = lag_features(X_wind, features_to_use, 1)
X_wind = lag_features(X_wind, features_to_use, 3)
X_wind = lag_features(X_wind, features_to_use, 6)
X_wind = lag_features(X_wind, features_to_use, 12)
X_wind = lag_features(X_wind, features_to_use, 24)
X_wind = X_wind[721:]
Y_wind = Y_wind[721:]
X_train_wind, X_test_wind, Y_train_wind, Y_test_wind = train_test_split(X_wind, Y_wind, test_size=.2)

scaler = StandardScaler()
X_train_wind = pd.DataFrame(scaler.fit_transform(X_train_wind), columns=X_train_wind.columns, index=X_train_wind.index)
X_test_wind = pd.DataFrame(scaler.transform(X_test_wind), columns=X_test_wind.columns, index=X_test_wind.index)

wind_feature_weights = lasso_regression_weights(X_train_wind, Y_train_wind)

In [215]:
wind_feature_weights.sort_values(key=lambda x: x.abs(), ascending=False).head(20)

valencia_temp_roll_30d                      -1850.552456
bilbao_temp_roll_30d                         1737.325994
madrid_temp_roll_30d                         1433.335334
bilbao_temp_roll_7d                         -1084.104961
barcelona_temp_roll_7d                      -1072.165830
valencia_temp_roll_7d                         768.243242
madrid_temp                                  -526.789922
seville_temp_roll_7d                         -508.920002
valencia_cloud_cover_pct_roll_7d              441.490574
seville_wind_speed_roll_30d                   423.181526
valencia_precipitation_last_3hr_roll_7d      -408.472856
barcelona_temp_roll_30d                      -328.255845
valencia_cloud_cover_pct_roll_30d            -320.668749
seville_precipitation_last_3hr_roll_7d        268.724279
madrid_precipitation_last_3hr_roll_7d        -267.989595
madrid_wind_speed                             259.111699
barcelona_precipitation_last_3hr_roll_30d     247.569428
bilbao_wind_speed_roll_30d     

In [216]:
most_important_wind_features = ['valencia_temp_roll_30d','bilbao_temp_roll_30d','madrid_temp_roll_30d','barcelona_temp_roll_7d','bilbao_temp_roll_7d','valencia_temp_roll_7d']

X_train_wind = X_train_wind[most_important_wind_features]
X_test_wind = X_test_wind[most_important_wind_features]
Y_pred_wind = run_linear_regression(X_train_wind, Y_train_wind, X_test_wind)
evaluate_regression(Y_test_wind, Y_pred_wind)

Error: 413.4435762465282 
Root MSE: 715.8171771442223 
R2: 0.9513004182081046
