# Exploratory analysis of combined dataset

In [1]:
from pathlib import Path

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Let pandas render up to 50 columns when displaying a DataFrame (default is 20)
pd.set_option('display.max_columns', 50)

In [3]:
# Data directories live next to the notebook folder, i.e. under the parent
# of the current working directory
cwd_path = Path.cwd()
project_root = cwd_path.parent
data_path = project_root / 'data'
data_push_path = project_root / 'data_to_push'

#### Read in data 

In [39]:
# Load the combined dataset from its pickle file and print a summary of
# its index, columns, non-null counts and dtypes
main_pickle_path = data_push_path / 'df_main_smard_era5_final.pkl'
df = pd.read_pickle(main_pickle_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 48168 entries, 2018-01-01 00:00:00+00:00 to 2023-06-30 23:00:00+00:00
Data columns (total 34 columns):
 #   Column                                          Non-Null Count  Dtype              
---  ------                                          --------------  -----              
 0   hour                                            48168 non-null  int32              
 1   day_of_week                                     48168 non-null  int32              
 2   day_of_month                                    48168 non-null  int32              
 3   month_number                                    48168 non-null  int32              
 4   year                                            48168 non-null  int32              
 5   meteorological_season                           48168 non-null  object             
 6   turbines_in_operation                           48168 non-null  int64              
 7   total_nominal_capacity_operational_tur

In [40]:
# Number of missing values in each column
missing_per_column = df.isna().sum()
missing_per_column

hour                                                 0
day_of_week                                          0
day_of_month                                         0
month_number                                         0
year                                                 0
meteorological_season                                0
turbines_in_operation                                0
total_nominal_capacity_operational_turbines_mw       0
datetime_cet                                         0
actual_generated_smard_mwh                           0
day_ahead_price_eur_mwh                           6550
total_nominal_capacity_smard_mw                      0
forecasted_generation_smard_mwh                     24
total_net_load_smard_mwh                             0
residual_load_smard_mwh                              0
mean_wind_speed_10m                                  0
mean_wind_speed_100m                                 0
wind_direction_angle_10m                             0
wind_direc

In [6]:
# Show 10 randomly chosen rows; the fixed seed keeps the displayed sample
# identical across "Restart Kernel -> Run All" executions
df.sample(10, random_state=42)

Unnamed: 0_level_0,hour,day_of_week,day_of_month,month_number,year,meteorological_season,turbines_in_operation,total_nominal_capacity_operational_turbines_mw,datetime_cet,actual_generated_smard_mwh,day_ahead_price_eur_mwh,total_nominal_capacity_smard_mw,forecasted_generation_smard_mwh,total_net_load_smard_mwh,residual_load_smard_mwh,mean_wind_speed_10m,mean_wind_speed_100m,wind_direction_angle_10m,wind_direction_angle_100m,mean_sea_level_pressure_mb,wind_gusts_10m,temp_2m_celsius,wind_direction_intercardinal_10m,wind_direction_intercardinal_100m,weighted_temp_2m_celsius,weighted_wind_gusts_10m,weighted_mean_sea_level_pressure_mb,weighted_mean_wind_speed_10m,weighted_mean_wind_speed_100m,weighted_mean_wind_direction_angle_10m,weighted_mean_wind_direction_angle_100m,weighted_wind_direction_intercardinal_10m,weighted_wind_direction_intercardinal_100m
datetime_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
2019-06-14 06:00:00+00:00,6,4,14,6,2019,summer,28302,52377.389417,2019-06-14 08:00:00+02:00,5462.5,59.61,52792.0,4293.0,63771.0,43719.25,2.666891,4.849588,129.407745,135.220627,1016.7475,5.477653,17.053339,SE,SE,16.949889,5.589068,1016.632775,2.725251,4.784163,138.541558,144.489988,SE,SE
2018-02-19 21:00:00+00:00,21,0,19,2,2018,winter,27543,49974.095897,2018-02-19 22:00:00+01:00,1308.75,,51633.0,1317.75,61670.5,60235.0,1.492947,2.729662,119.603149,127.88726,1019.072031,2.588897,-1.131903,ESE,SE,-0.577655,2.533024,1019.324213,1.514602,2.751679,114.681731,123.585384,ESE,ESE
2021-05-12 18:00:00+00:00,18,2,12,5,2021,spring,28819,54848.997577,2021-05-12 20:00:00+02:00,4946.0,80.97,54499.0,5188.25,59293.5,53668.25,2.391572,4.140753,263.408417,265.30896,1010.152344,4.490983,12.193842,W,W,12.573389,4.787108,1010.074557,2.524244,4.281547,278.043539,278.618391,W,W
2023-03-05 06:00:00+00:00,6,6,5,3,2023,spring,29271,58190.218447,2023-03-05 07:00:00+01:00,12473.5,108.39,57590.0,14305.5,43750.5,26678.25,3.546808,6.162352,280.875336,287.69101,1016.389453,6.825418,0.926843,W,WNW,1.213635,7.555302,1015.45991,3.963148,6.86212,285.886766,291.324786,WNW,WNW
2022-04-15 09:00:00+00:00,9,4,15,4,2022,spring,29007,56259.653287,2022-04-15 11:00:00+02:00,5917.5,186.55,55289.0,6601.5,53611.5,28171.0,3.49263,4.933488,259.193237,256.21994,1025.147109,7.794785,10.159723,W,WSW,9.096638,8.108298,1025.653047,3.794623,5.395402,264.161202,257.059241,W,WSW
2020-09-20 08:00:00+00:00,8,6,20,9,2020,autumn,28657,53751.942467,2020-09-20 10:00:00+02:00,1715.25,34.2,53184.0,1950.5,46829.75,23446.0,2.382816,3.449249,78.688354,81.343117,1018.835234,5.651401,13.931238,ENE,E,13.59205,6.016556,1019.735354,2.673587,3.879704,74.341962,77.91516,ENE,ENE
2023-03-05 00:00:00+00:00,0,6,5,3,2023,spring,29271,58190.218447,2023-03-05 01:00:00+01:00,12439.75,106.25,57590.0,12924.25,42541.25,26453.0,3.302279,6.037405,278.224243,285.365204,1019.262812,6.508569,1.659692,W,WNW,2.117535,7.153936,1018.543953,3.750458,6.733505,288.552123,295.28703,WNW,WNW
2020-03-24 18:00:00+00:00,18,1,24,3,2020,spring,28554,53348.998167,2020-03-24 19:00:00+01:00,15562.75,33.78,53184.0,14753.0,64164.75,44617.75,3.481202,7.305354,97.554916,102.570747,1032.079141,6.538061,4.275781,E,ESE,4.919772,6.264026,1032.769011,3.416201,7.237682,108.87337,113.705056,ESE,ESE
2022-09-27 21:00:00+00:00,21,1,27,9,2022,autumn,29189,57260.553187,2022-09-27 23:00:00+02:00,9844.5,319.71,55289.0,10411.5,46893.75,36906.5,3.502725,6.491419,228.165756,234.493332,998.732813,6.649259,7.925745,SW,SW,8.15175,5.899264,997.289835,3.278446,6.112616,227.174181,233.604928,SW,SW
2020-02-17 19:00:00+00:00,19,0,17,2,2020,winter,28534,53253.448167,2020-02-17 20:00:00+01:00,31238.0,32.27,53184.0,28234.25,68084.5,30742.25,5.746999,9.795965,233.932205,236.456497,1015.40375,11.612583,6.578119,SW,WSW,6.635989,12.039689,1013.118461,6.130406,10.337284,231.231785,233.710189,SW,SW


-----

### Just had an idea: Add German public holiday data to dataset!

#### Fetch German public holiday dates from free API service (thanks, guys! 👍🏻)
- Link: https://feiertage-api.de
- 117 public holidays fall within the dataset's time period (2018-01-01 to 2023-06-30)

In [7]:
# NOTE(review): this whole cell is commented out, so the `6` displayed below
# appears to be the output of an earlier run (one decoded response per year).
# When enabled, it requests one JSON payload per year from feiertage-api.de and
# collects the decoded dicts in `list_of_datadicts`; failed requests are only printed.
# NOTE(review): `requests.get` is called without a timeout - consider adding one
# if this cell is ever re-enabled.

# years = [2018, 2019, 2020, 2021, 2022, 2023]
# list_of_datadicts = []

# for year in years:
#     url_endpoint = f'https://feiertage-api.de/api/?jahr={year}&nur_daten'
#     response = requests.get(url_endpoint)

#     if response.status_code == 200:  
#             # Decode JSON response object to Python dict
#             list_of_datadicts.append(response.json())
#     else:
#         print(f'Request failed with status code: {response.status_code}')
        
# len(list_of_datadicts)

6

In [29]:
# NOTE(review): this cell is commented out; it appears to be kept as a record of
# how the `public_holiday` column was built.
# NOTE(review): `df.index` is hourly, and the holiday values presumably parse to
# midnight UTC (date-only strings - TODO confirm). `isin` matches exact timestamps,
# so only the 00:00 UTC row of each holiday gets flagged, not the whole day. The
# recorded sum of 117 equals the number of holidays, which is consistent with this.
# If whole days should be flagged, compare on the date instead, e.g.
# `df.index.normalize().isin(public_holiday_timestamps)` (or derive the day from
# `datetime_cet` if German local days are intended).

# df_public_holidays = pd.DataFrame(list_of_datadicts)

# # Map all dates to datetime timestamps localised to UTC (since only day alignment is needed, not hourly)
# df_public_holidays = df_public_holidays.map(lambda date: pd.to_datetime(date, yearfirst=True, utc=True))

# # Flatten the dataframe to 1D array of Timestamps!
# public_holiday_timestamps = df_public_holidays.to_numpy().flatten()

# # Create new column by putting bool array as condition inside np.where and replacing bool with 0s & 1s
# df['public_holiday'] = np.where(df.index.isin(public_holiday_timestamps), 1, 0)

In [37]:
# Disabled sanity check: counts the rows flagged as public holiday.
# The value displayed below appears to come from an earlier run of this cell.
# df['public_holiday'].sum()

117

#### Save to pickle

In [38]:
# Disabled: when enabled, this writes `df` back to the same pickle file that the
# "Read in data" cell loads, i.e. it overwrites the input in place.
# df.to_pickle(data_push_path / 'df_main_smard_era5_final.pkl')

-----