In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from datetime import datetime, timedelta
pd.options.display.max_rows = 100

In [9]:
pv_models = [
    # "full_predictions_cross_validation_v4_without_prob_with_30min_unormalised_formated",
    # "pvnet_predicitons_2021-2023_formated_v2",
    # # "full_pred_v6_2_xgb_pvnet_blend",
    # "forecast_v=6__model_name=national_xg__model_version=1.0.23__start_date=2016-12-01__end_date=2022-08-08",
    # # "forecast_v=7__model_name=national_xg_and_pv_net__model_version=2.0.0__start_date=2016-12-01__end_date=2022-08-08",
    # # "full_pred_v5_2_xgb_pvnet_merge_formated_fix_ts",
    "forecast_v=7__model_name_1=national_xg__model_version_1=1.0.23__model_name_2=pv_net__model_version_2=2.6.10__start_date=2016-12-01__end_date=2022-08-08",
    # # "full_pred_v5_3_xgb_pvnet_merge_formated_fix_ts_blend",
    # # "full_pred_v6_2_xgb_pvnet_blend",
    "full_pred_v6_3_xgb_pvnet_blend",
    # "full_pred_v8_1_2_xgb_pvnet_no_blend_formated",
    "full_pred_v8_2_1_xgb_pvnet_with_blend_formated",
    "full_pred_v8_5_1_xgb_pvnet_with_blend_formated"
]

model_mapping = {
    "full_predictions_cross_validation_v4_without_prob_with_30min_unormalised_formated":"v6.1 XGB + Inter + Bug Fix ",
    "pvnet_predicitons_2021-2023_formated_v2":"v7 PVNet 2020-2022",
    # "full_pred_v6_2_xgb_pvnet_blend":"df_pvnet_form_2",
    "forecast_v=6__model_name=national_xg__model_version=1.0.23__start_date=2016-12-01__end_date=2022-08-08":"v6 XGb",
    # "forecast_v=7__model_name=national_xg_and_pv_net__model_version=2.0.0__start_date=2016-12-01__end_date=2022-08-08":"model_v7_df",
    # "full_pred_v5_2_xgb_pvnet_merge_formated_fix_ts":"model_v7_df_v2",
    "forecast_v=7__model_name_1=national_xg__model_version_1=1.0.23__model_name_2=pv_net__model_version_2=2.6.10__start_date=2016-12-01__end_date=2022-08-08":"v7 XGb/PVNet",
    # "full_pred_v5_3_xgb_pvnet_merge_formated_fix_ts_blend":"model_v7_df_v3",
    # "full_pred_v6_2_xgb_pvnet_blend":"model_v7_df_v4_blend",
    "full_pred_v6_3_xgb_pvnet_blend":"v7 XGb/PVNet + Blend",
    "full_pred_v8_1_2_xgb_pvnet_no_blend_formated":"v8 XGb/PVNet Sum",
    "full_pred_v8_2_1_xgb_pvnet_with_blend_formated":"v8 XGb/PVNet Sum + Blend",
    "full_pred_v8_5_1_xgb_pvnet_with_blend_formated":"v8 XGb/PVNet Sum + Blend v2"
}

pvlive_df = pd.read_csv("../pvlive_2016_2022.csv")

In [10]:
def calculate_mae_for_forecast_horizons(pv_models, df_actual, min_end_datetime_utc=None):
    mae_results = []  # List to store MAE results for each model

    for pv_model in pv_models:
        print(pv_model)
        try:
            df_forecast = pd.read_csv("../data/" + pv_model + ".csv.gz")
        except FileNotFoundError:
            df_forecast = pd.read_csv("../data/" + pv_model + ".csv")
        merged_df = pd.merge(df_forecast, df_actual, on='end_datetime_utc', suffixes=('_forecast', '_actual'))

        # Filter the merged dataframe by the minimum end_datetime_utc if provided
        if min_end_datetime_utc is not None:
            merged_df['end_datetime_utc'] = pd.to_datetime(merged_df['end_datetime_utc'])
            merged_df = merged_df[merged_df['end_datetime_utc'] >= min_end_datetime_utc]

            max_end_datetime_utc = pd.to_datetime("2020-01-01 00:00:00").tz_localize('UTC')
            merged_df = merged_df[merged_df['end_datetime_utc'] <= max_end_datetime_utc]

        # Calculate the absolute error between the forecasted and actual generation
        merged_df['absolute_error'] = np.abs(merged_df['generation_mw_forecast'] - merged_df['generation_mw_actual'])
        # Calculate the forecast horizon in hours
        merged_df['forecast_horizon_hours'] = (pd.to_datetime(merged_df['end_datetime_utc']) - pd.to_datetime(merged_df['forecasting_creation_datetime_utc'])).dt.total_seconds() / 3600
        mae_by_horizon = merged_df.groupby('forecast_horizon_hours')['absolute_error'].mean().reset_index(name='MAE')

        # Add a column for the model name
        mae_by_horizon['Model'] = pv_model

        mae_results.append(mae_by_horizon)

    # Concatenate all MAE results into a single DataFrame
    merged_forecasts = pd.concat(mae_results)

    merged_forecasts['Model'] = merged_forecasts['Model'].replace(model_mapping)

    return merged_forecasts

In [11]:
mae_results = calculate_mae_for_forecast_horizons(pv_models, pvlive_df)

forecast_v=7__model_name_1=national_xg__model_version_1=1.0.23__model_name_2=pv_net__model_version_2=2.6.10__start_date=2016-12-01__end_date=2022-08-08
full_pred_v6_3_xgb_pvnet_blend
full_pred_v8_2_1_xgb_pvnet_with_blend_formated
full_pred_v8_5_1_xgb_pvnet_with_blend_formated


In [13]:
mae_results

Unnamed: 0,forecast_horizon_hours,MAE,Model
0,0.0,115.395093,v7 XGb/PVNet
1,0.5,135.870877,v7 XGb/PVNet
2,1.0,148.720270,v7 XGb/PVNet
3,1.5,170.553101,v7 XGb/PVNet
4,2.0,187.024768,v7 XGb/PVNet
...,...,...,...
76,38.0,244.722297,v8 XGb/PVNet Sum + Blend v2
77,38.5,256.024284,v8 XGb/PVNet Sum + Blend v2
78,39.0,257.639912,v8 XGb/PVNet Sum + Blend v2
79,39.5,271.194385,v8 XGb/PVNet Sum + Blend v2


In [11]:
# Print the MAE results just for model: model_v7_df_v4_blend
test = mae_results[mae_results['Model'] == 'model_v7_df_v4_blend']
test

Unnamed: 0,forecast_horizon_hours,MAE,Model


In [5]:
def calculate_average_mae(df):
    # Get the unique models from the DataFrame
    models = df['Model'].unique()
    
    # Iterate through each model and calculate the average MAE for the specified time frames
    for model in models:
        model_df = df[df['Model'] == model]
        
        # Calculate average MAE for 0-8 hours
        avg_mae_0_8 = model_df[model_df['forecast_horizon_hours'] <= 8]['MAE'].mean()
        
        # Calculate average MAE for 0-40 hours
        avg_mae_0_40 = model_df[model_df['forecast_horizon_hours'] <= 40]['MAE'].mean()
        
        # Print the results
        print(f"Model: {model}")
        print(f"Average MAE for 0-8 hours: {avg_mae_0_8:.2f}")
        print(f"Average MAE for 0-40 hours: {avg_mae_0_40:.2f}")


In [6]:
calculate_average_mae(mae_results)

Model: v7 XGb/PVNet + Blend
Average MAE for 0-8 hours: 180.53
Average MAE for 0-40 hours: 221.28
Model: v8 XGb/PVNet Sum + Blend
Average MAE for 0-8 hours: 129.20
Average MAE for 0-40 hours: 210.51
Model: v8 XGb/PVNet Sum + Blend v2
Average MAE for 0-8 hours: 131.17
Average MAE for 0-40 hours: 210.92


In [66]:
min_date = pd.to_datetime("2019-01-01 00:00:00").tz_localize('UTC')
mae_results_pvnet = calculate_mae_for_forecast_horizons(pv_models, pvlive_df, min_end_datetime_utc=min_date)

full_pred_v6_3_xgb_pvnet_blend
full_pred_v8_2_1_xgb_pvnet_with_blend_formated


In [68]:
calculate_average_mae(mae_results_pvnet)

Model: v7 XGb/PVNet + Blend
Average MAE for 0-8 hours: 187.97
Average MAE for 0-40 hours: 215.20
Model: v8 XGb/PVNet Sum + Blend
Average MAE for 0-8 hours: 123.67
Average MAE for 0-40 hours: 201.70


In [14]:
def plot_multiple_mae_forecast_horizons(df):
    # Create a plotly figure
    fig = go.Figure()

    colors = ['#7BCDF3', '#63BCAF', '#FF9736', '#FFD053' , '#14120E', '#4C9A8E']

    # Get unique models
    models = df['Model'].unique()
    
    # Sort models alphabetically
    models = sorted(models)

    for i, model in enumerate(models):
        # Filter the DataFrame for the current model
        model_df = df[df['Model'] == model]

        # Add line plot for MAE across different forecast horizons with color
        fig.add_trace(go.Scatter(x=model_df['forecast_horizon_hours'], y=model_df['MAE'],
                                 mode='lines+markers', name=model, line=dict(color=colors[i % len(colors)])))

    # Update layout with titles and axis labels
    fig.update_layout(title='MAE for Multiple OCF Models',
                      xaxis_title='Forecast Horizon',
                      yaxis_title='Eror (MAE) MW',
                      template='plotly_white')

    # Show plot
    fig.show()

In [15]:
plot_multiple_mae_forecast_horizons(mae_results)

In [None]:
plot_multiple_mae_forecast_horizons(mae_results_pvnet)

In [38]:
# Function to plot multiple forecasts on the same graph including pvlive_df for 2 days ahead
def plot_multiple_forecasts_with_pvlive(forecasting_datetime, df_list, df_names, pvlive_df):
    fig = go.Figure()
    
    forecasting_datetime_obj = datetime.strptime(forecasting_datetime, "%Y-%m-%d %H:%M:%S%z")
    # Calculate 2 days ahead datetime
    two_days_ahead_datetime = forecasting_datetime_obj + timedelta(days=1)
    pvlive_filtered = pvlive_df[(pvlive_df['start_datetime_utc'] >= forecasting_datetime) & (pvlive_df['end_datetime_utc'] <= two_days_ahead_datetime.isoformat())]
    
    fig.add_trace(go.Scatter(x=pvlive_filtered['end_datetime_utc'], y=pvlive_filtered['generation_mw'], mode='lines', name='pvlive_data'))
    
    for df, name in zip(df_list, df_names):
        filtered_df = df[df['forecasting_creation_datetime_utc'] == forecasting_datetime]
        generation_data = filtered_df[['end_datetime_utc', 'generation_mw']]
        # Use model_mapping to rename the models
        mapped_name = model_mapping.get(name, name)
        fig.add_trace(go.Scatter(x=generation_data['end_datetime_utc'], y=generation_data['generation_mw'], mode='lines', name=mapped_name))
    
    fig.update_layout(title=f'Generation Data for {forecasting_datetime} including pvlive data for 2 days ahead', xaxis_title='End Datetime UTC', yaxis_title='Generation MW')
    fig.show()

# Example usage
forecasting_datetime = "2021-07-05 02:30:00+00:00"

models_df = []

for pv_model in pv_models:
    try:
        df_forecast = pd.read_csv("../data/" + pv_model + ".csv.gz")
    except FileNotFoundError:
        df_forecast = pd.read_csv("../data/" + pv_model + ".csv")

    models_df.append(df_forecast)

plot_multiple_forecasts_with_pvlive(forecasting_datetime, models_df, model_mapping, pvlive_df)