# Building the ML dataframe

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr

In [2]:
# Set max number of columns to display; default 20
pd.options.display.max_columns = 80

In [3]:
# Directory where data files will be downloaded
cwd_path = Path.cwd()
data_path = cwd_path.parent.joinpath('data')
data_push_path = cwd_path.parent.joinpath('data_to_push')

### Load in latest wind turbine data
- Note: Only 9 turbines decommissioned before 2019!
- Now only 751 unique grid points used

In [4]:
# Read in latest turbine data
# Now only 30_638 after removing 3 turbines outside bounding box and 4 clearly outside Germany
# Update: now only 30_547 after filtering for operational turbines for period 2018-end of June 2023
df_turbines = pd.read_pickle(data_push_path / 'df_turbines_knn_blades_haversine_elevation_utc_2_2018_2023.pkl')
df_turbines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30547 entries, 0 to 30546
Data columns (total 30 columns):
 #   Column                                            Non-Null Count  Dtype              
---  ------                                            --------------  -----              
 0   EinheitMastrNummer                                30547 non-null  object             
 1   DatumLetzteAktualisierung                         30547 non-null  datetime64[ns, UTC]
 2   Bundesland                                        30547 non-null  object             
 3   Postleitzahl                                      30547 non-null  int64              
 4   Ort                                               30547 non-null  object             
 5   Laengengrad                                       30547 non-null  float64            
 6   Breitengrad                                       30547 non-null  float64            
 7   Registrierungsdatum                               30547 non-null  d

In [5]:
# List of datetime column names for convenient indexing later
turbine_date_columns = [
    'DatumLetzteAktualisierung',
    'Registrierungsdatum',
    'Inbetriebnahmedatum',
    'DatumEndgueltigeStilllegung',
    'DatumBeginnVoruebergehendeStilllegung',
    'DatumWiederaufnahmeBetrieb'
]

In [6]:
# Use these weights to calculate weighted means for my various weather params
grid_point_weights_dict = df_turbines['nearest_grid_point'].value_counts().to_dict()

#### The below code subsets the turbines dataset by keeping only turbines operational during the chosen time series period

In [1567]:
# Get datetimes in index before ERA5 data cutoff 
# subset_datetime_idx = df_main.loc[df_main.index < pd.to_datetime('2023-06-30 23:00:00', utc=True)].index

# # initialise empty set to update
# unique_set_of_idx = set()

# # loop through all timestamps 
# for timestamp in subset_datetime_idx:
#     # return bool series of turbines in operation on timestamp
#     is_operational_bool_mask = is_operational(timestamp, df_turbines)
    
#     # Then get list of grid points for those turbines, make set, and update main set
#     idx_set = set(df_turbines.loc[is_operational_bool_mask].index.to_list())
#     unique_set_of_idx.update(idx_set)

In [None]:
# New dataframe to save to pickle
# df_turbines_2018_2023 = df_turbines.iloc[list(unique_set_of_idx)]

In [371]:
# # Cast all tz naive datetime64 columns to UTC tz aware
# df_turbines[turbine_date_columns] = df_turbines[turbine_date_columns].apply(lambda series: series.dt.tz_localize('UTC'), axis=0)

# # save to pickle
# df_turbines.to_pickle(data_push_path / 'df_turbines_knn_blades_haversine_elevation_utc.pkl')

-----

# Load in all SMARD data

### Load in SMARD energy generated data (actual measured generation)
- My response variable `y` (aka target/label) 

In [23]:
# Get all SMARD CSV files, load into dataframes, and concat them

def load_SMARD_generated_data():
    """
    Look for CSV files in the SMARD directory
    """
    list_of_dataframes = []
    for filepath in data_path.joinpath('SMARD').iterdir():
        if filepath.name.startswith('Realisierte_Erzeugung'):
            list_of_dataframes.append(pd.read_csv(filepath, sep=';'))
            
    # Check that num of rows is what I expect after concatenating dfs
    num_of_rows = 0
    for df in list_of_dataframes:
        num_of_rows += len(df)
    
    # concat dataframes
    df = pd.concat(list_of_dataframes)
    print(f'Number of rows match up: {num_of_rows == len(df)}')
    
    # Select columns to keep
    # Note: I'm using the interval start time to create the timestamps
    # ERA5 surface parameters are instantaneous so can't perfectly align anyway
    df = df[[
        'Datum',
        'Anfang',
        # 'Ende',
        'Wind Onshore [MWh] Berechnete Auflösungen'
    ]]
    
    # rename columns
    df.rename(columns={
        'Datum': 'date',
        'Anfang': 'interval_start_time', 
        # 'Ende': 'interval_end_time',
        'Wind Onshore [MWh] Berechnete Auflösungen': 'actual_generated_smard_mwh'
    }, inplace=True)
    
    # Handle dates and times to create unified datetime64 timestamps
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
    df['start_time_delta'] = pd.to_timedelta(df['interval_start_time'] + ':00')
    df['datetime_cet'] = df['date'] + df['start_time_delta']
    # Add CET timezone info and infer change from CET->CEST->CET, etc
    df['datetime_cet'] = df['datetime_cet'].dt.tz_localize(tz='CET', ambiguous='infer')
    
    # Drop un-needed columns
    df = df.drop(columns=['date', 'start_time_delta', 'interval_start_time'])
    # Re-arrange columns
    df = df[['datetime_cet', 'actual_generated_smard_mwh']]
    
    # Convert European thousands and decimal seperators in values to (US/UK) decimal full stop format
    translation_table = str.maketrans({'.': None, ',': '.'})
    df['actual_generated_smard_mwh'] = df['actual_generated_smard_mwh'].str.translate(translation_table).astype(float)
    
    # Sort rows by datetime_cet
    df.sort_values('datetime_cet', inplace=True)
    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    return df.copy()

In [24]:
# Load SMARD data
df_smard_generated = load_SMARD_generated_data()
df_smard_generated.info()

Number of rows match up: True
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49655 entries, 0 to 49654
Data columns (total 2 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   datetime_cet                49655 non-null  datetime64[ns, CET]
 1   actual_generated_smard_mwh  49655 non-null  float64            
dtypes: datetime64[ns, CET](1), float64(1)
memory usage: 776.0 KB


In [25]:
df_smard_generated.isna().sum()

datetime_cet                  0
actual_generated_smard_mwh    0
dtype: int64

-----

### Load in SMARD day-ahead prices
- Wholesale prices: https://www.smard.de/page/en/wiki-article/5884/5976
- This is the day-ahead price!
    - "Weighted wholesale electricity price (day-ahead price on the exchange) for each hour [€/MWh] determined on the day-ahead auction that took place ones on the previous day - data is delivered no later than 2 hours after trading closes. Source: ENTSO-E"
- Data only goes back to 2018-10-01 (October 1, 2018)
- This is for the bidding area Germany/Luxembourg (not just Germany)

In [26]:
# Get all SMARD CSV files, load into dataframes, and concat them

def load_SMARD_market_price_data():
    """
    Look for CSV files in the SMARD directory
    """
    list_of_dataframes = []
    for filepath in data_path.joinpath('SMARD').iterdir():
        if filepath.name.startswith('Gro_handelspreise'):
            list_of_dataframes.append(pd.read_csv(filepath, sep=';', na_values='-'))
            
    # Check that num of rows is what I expect after concatenating dfs
    num_of_rows = 0
    for df in list_of_dataframes:
        num_of_rows += len(df)
    
    # concat dataframes
    df = pd.concat(list_of_dataframes)
    print(f'Number of rows match up: {num_of_rows == len(df)}')
    
    # Select columns to keep
    # Note: I'm using the interval start time to create the timestamps
    # ERA5 surface parameters are instantaneous so can't perfectly align anyway
    df = df[[
        'Datum',
        'Anfang',
        # 'Ende',
        'Deutschland/Luxemburg [€/MWh] Originalauflösungen'
    ]]
    
    # rename columns
    df.rename(columns={
        'Datum': 'date',
        'Anfang': 'interval_start_time', 
        # 'Ende': 'interval_end_time',
        'Deutschland/Luxemburg [€/MWh] Originalauflösungen': 'day_ahead_price_eur_mwh'
    }, inplace=True)
    
    # Handle dates and times to create unified datetime64 timestamps
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
    df['start_time_delta'] = pd.to_timedelta(df['interval_start_time'] + ':00')
    df['datetime_cet'] = df['date'] + df['start_time_delta']
    # Add CET timezone info and infer change from CET->CEST->CET, etc
    df['datetime_cet'] = df['datetime_cet'].dt.tz_localize(tz='CET', ambiguous='infer')
    
    # Drop un-needed columns
    df = df.drop(columns=['date', 'start_time_delta', 'interval_start_time'])
    # Re-arrange columns
    df = df[['datetime_cet', 'day_ahead_price_eur_mwh']]
    
    # Convert European thousands and decimal seperators in values to (US/UK) decimal full stop format
    translation_table = str.maketrans({'.': None, ',': '.'})
    df['day_ahead_price_eur_mwh'] = df['day_ahead_price_eur_mwh'].str.translate(translation_table).astype(float)
    
    # Drop the numerous (700+) duplicate rows
    df.drop_duplicates(subset='datetime_cet', inplace=True)
    # Sort rows by datetime_cet
    df.sort_values('datetime_cet', inplace=True)
    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    return df.copy()

In [27]:
df_smard_market_price = load_SMARD_market_price_data()

Number of rows match up: True


In [28]:
df_smard_market_price.isna().sum()

datetime_cet                  0
day_ahead_price_eur_mwh    6551
dtype: int64

In [29]:
df_smard_market_price['day_ahead_price_eur_mwh'].describe()

count    43104.000000
mean        97.537291
std        106.238714
min       -500.000000
25%         35.260000
50%         55.155000
75%        118.750000
max        871.000000
Name: day_ahead_price_eur_mwh, dtype: float64

-----

### Load in SMARD installed capacity data
- Note: CSV reader infers decimal comma (European) format 54.499 as decimal point 54.499 even though it's 54,499.00!
    - Specify dtype of column in the read_csv method

In [30]:
def load_SMARD_installed_capacity_data():
    """
    Look for CSV files in the SMARD directory
    """
    list_of_dataframes = []
    for filepath in data_path.joinpath('SMARD').iterdir():
        if filepath.name.startswith('Installierte_Erzeugungsleistung'):
            list_of_dataframes.append(
                pd.read_csv(filepath, sep=';', dtype={'Wind Onshore [MW] Berechnete Auflösungen': str})
            )
            
    # Check that num of rows is what I expect after concatenating dfs
    num_of_rows = 0
    for df in list_of_dataframes:
        num_of_rows += len(df)
    
    # concat dataframes
    df = pd.concat(list_of_dataframes)
    print(f'Number of rows match up: {num_of_rows == len(df)}')
    
    # Select columns to keep
    # Note: I'm using the interval start time to create the timestamps
    # ERA5 surface parameters are instantaneous so can't perfectly align anyway
    df = df[[
        'Datum',
        'Anfang',
        # 'Ende',
        'Wind Onshore [MW] Berechnete Auflösungen'
    ]]
    
    # rename columns
    df.rename(columns={
        'Datum': 'date',
        'Anfang': 'interval_start_time', 
        # 'Ende': 'interval_end_time',
        'Wind Onshore [MW] Berechnete Auflösungen': 'total_nominal_capacity_smard_mw'
    }, inplace=True)
    
    # Handle dates and times to create unified datetime64 timestamps
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
    df['start_time_delta'] = pd.to_timedelta(df['interval_start_time'] + ':00')
    df['datetime_cet'] = df['date'] + df['start_time_delta']
    # Add CET timezone info and infer change from CET->CEST->CET, etc
    df['datetime_cet'] = df['datetime_cet'].dt.tz_localize(tz='CET', ambiguous='infer')
    
    # Drop un-needed columns
    df = df.drop(columns=['date', 'start_time_delta', 'interval_start_time'])
    # Re-arrange columns
    df = df[['datetime_cet', 'total_nominal_capacity_smard_mw']]
    
    # Convert European thousands and decimal seperators in values to (US/UK) decimal full stop format
    translation_table = str.maketrans({'.': None, ',': '.'})
    df['total_nominal_capacity_smard_mw'] = df['total_nominal_capacity_smard_mw'].str.translate(translation_table).astype(float)
    
    # Sort rows by datetime_cet
    df.sort_values('datetime_cet', inplace=True)
    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    return df.copy()

In [31]:
df_smard_installed_capacity = load_SMARD_installed_capacity_data()

Number of rows match up: True


In [32]:
df_smard_installed_capacity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49655 entries, 0 to 49654
Data columns (total 2 columns):
 #   Column                           Non-Null Count  Dtype              
---  ------                           --------------  -----              
 0   datetime_cet                     49655 non-null  datetime64[ns, CET]
 1   total_nominal_capacity_smard_mw  49655 non-null  float64            
dtypes: datetime64[ns, CET](1), float64(1)
memory usage: 776.0 KB


In [33]:
df_smard_installed_capacity.isna().sum()

datetime_cet                       0
total_nominal_capacity_smard_mw    0
dtype: int64

------

### Load in SMARD day-ahead generation forecast
- Wind Onshore [MWh] Berechnete Auflösungen for Germany
- Only has 24 missing values for the day of 2022-12-21 (verified on SMARD data visuals)
- TSOs transmit their forecasts to the ENTSO-E transparency platform no later than 18:00 of the previous day

In [34]:
def load_SMARD_forecasted_generation_data():
    """
    Look for CSV files in the SMARD directory
    """
    list_of_dataframes = []
    for filepath in data_path.joinpath('SMARD').iterdir():
        if filepath.name.startswith('Prognostizierte_Erzeugung_Day-Ahead'):
            list_of_dataframes.append(pd.read_csv(filepath, sep=';', na_values='-'))
            
    # Check that num of rows is what I expect after concatenating dfs
    num_of_rows = 0
    for df in list_of_dataframes:
        num_of_rows += len(df)
    
    # concat dataframes
    df = pd.concat(list_of_dataframes)
    print(f'Number of rows match up: {num_of_rows == len(df)}')
    
    # Select columns to keep
    # Note: I'm using the interval start time to create the timestamps
    # ERA5 surface parameters are instantaneous so can't perfectly align anyway
    df = df[[
        'Datum',
        'Anfang',
        # 'Ende',
        'Wind Onshore [MWh] Berechnete Auflösungen'
    ]]
    
    # rename columns
    df.rename(columns={
        'Datum': 'date',
        'Anfang': 'interval_start_time', 
        # 'Ende': 'interval_end_time',
        'Wind Onshore [MWh] Berechnete Auflösungen': 'forecasted_generation_smard_mwh'
    }, inplace=True)
    
    # Handle dates and times to create unified datetime64 timestamps
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
    df['start_time_delta'] = pd.to_timedelta(df['interval_start_time'] + ':00')
    df['datetime_cet'] = df['date'] + df['start_time_delta']
    # Add CET timezone info and infer change from CET->CEST->CET, etc
    df['datetime_cet'] = df['datetime_cet'].dt.tz_localize(tz='CET', ambiguous='infer')
    
    # Drop un-needed columns
    df = df.drop(columns=['date', 'start_time_delta', 'interval_start_time'])
    # Re-arrange columns
    df = df[['datetime_cet', 'forecasted_generation_smard_mwh']]
    
    # Convert European thousands and decimal seperators in values to (US/UK) decimal full stop format
    translation_table = str.maketrans({'.': None, ',': '.'})
    df['forecasted_generation_smard_mwh'] = df['forecasted_generation_smard_mwh'].str.translate(translation_table).astype(float)
        
    # Sort rows by datetime_cet
    df.sort_values('datetime_cet', inplace=True)
    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    return df.copy()

In [35]:
df_smard_forecasted_generation = load_SMARD_forecasted_generation_data()

Number of rows match up: True


In [36]:
df_smard_forecasted_generation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49655 entries, 0 to 49654
Data columns (total 2 columns):
 #   Column                           Non-Null Count  Dtype              
---  ------                           --------------  -----              
 0   datetime_cet                     49655 non-null  datetime64[ns, CET]
 1   forecasted_generation_smard_mwh  49631 non-null  float64            
dtypes: datetime64[ns, CET](1), float64(1)
memory usage: 776.0 KB


In [37]:
df_smard_forecasted_generation.isna().sum()

datetime_cet                        0
forecasted_generation_smard_mwh    24
dtype: int64

----

### Load in SMARD power consumption data
- Validated for correctness by comparing to visuals on SMARD.de

In [38]:
def load_SMARD_power_consumption_data():
    """
    Look for CSV files in the SMARD directory
    """
    list_of_dataframes = []
    for filepath in data_path.joinpath('SMARD').iterdir():
        if filepath.name.startswith('Realisierter_Stromverbrauch'):
            list_of_dataframes.append(pd.read_csv(filepath, sep=';', na_values='-'))
            
    # Check that num of rows is what I expect after concatenating dfs
    num_of_rows = 0
    for df in list_of_dataframes:
        num_of_rows += len(df)
    
    # concat dataframes
    df = pd.concat(list_of_dataframes)
    print(f'Number of rows match up: {num_of_rows == len(df)}')
    
    # Select columns to keep
    # Note: I'm using the interval start time to create the timestamps
    # ERA5 surface parameters are instantaneous so can't perfectly align anyway
    df = df[[
        'Datum',
        'Anfang',
        # 'Ende',
        'Gesamt (Netzlast) [MWh] Berechnete Auflösungen',
        'Residuallast [MWh] Berechnete Auflösungen'
    ]]
    
    # rename columns
    df.rename(columns={
        'Datum': 'date',
        'Anfang': 'interval_start_time', 
        # 'Ende': 'interval_end_time',
        'Gesamt (Netzlast) [MWh] Berechnete Auflösungen': 'total_net_load_smard_mwh',
        'Residuallast [MWh] Berechnete Auflösungen': 'residual_load_smard_mwh'
    }, inplace=True)
    
    # Handle dates and times to create unified datetime64 timestamps
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
    df['start_time_delta'] = pd.to_timedelta(df['interval_start_time'] + ':00')
    df['datetime_cet'] = df['date'] + df['start_time_delta']
    # Add CET timezone info and infer change from CET->CEST->CET, etc
    df['datetime_cet'] = df['datetime_cet'].dt.tz_localize(tz='CET', ambiguous='infer')
    
    # Drop un-needed columns
    df = df.drop(columns=['date', 'start_time_delta', 'interval_start_time'])
    # Re-arrange columns
    df = df[['datetime_cet', 'total_net_load_smard_mwh', 'residual_load_smard_mwh']]
    
    # Convert European thousands and decimal seperators in values to (US/UK) decimal full stop format
    translation_table = str.maketrans({'.': None, ',': '.'})
    df['total_net_load_smard_mwh'] = df['total_net_load_smard_mwh'].str.translate(translation_table).astype(float)
    df['residual_load_smard_mwh'] = df['residual_load_smard_mwh'].str.translate(translation_table).astype(float)
        
    # Sort rows by datetime_cet
    df.sort_values('datetime_cet', inplace=True)
    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    return df.copy()

In [39]:
df_smard_consumption = load_SMARD_power_consumption_data()

Number of rows match up: True


In [40]:
df_smard_consumption

Unnamed: 0,datetime_cet,total_net_load_smard_mwh,residual_load_smard_mwh
0,2018-01-01 00:00:00+01:00,45433.25,14015.50
1,2018-01-01 01:00:00+01:00,44270.00,11634.25
2,2018-01-01 02:00:00+01:00,43195.75,9928.75
3,2018-01-01 03:00:00+01:00,42527.50,8371.50
4,2018-01-01 04:00:00+01:00,42515.75,8291.25
...,...,...,...
49650,2023-08-31 19:00:00+02:00,55208.75,42348.25
49651,2023-08-31 20:00:00+02:00,54667.25,41814.75
49652,2023-08-31 21:00:00+02:00,52445.75,38865.00
49653,2023-08-31 22:00:00+02:00,48692.25,34340.00


In [41]:
df_smard_consumption.isna().sum()

datetime_cet                0
total_net_load_smard_mwh    0
residual_load_smard_mwh     0
dtype: int64

-----

-----

# Begin building ML dataframe `df_main`
- Start with one full year: 2022
    - Then bring in another year like 2021 and run through the same transformations and then concat along datetimeindex?
- Good resource on time-related feature engineering: https://scikit-learn.org/stable/auto_examples/applications/plot_cyclical_feature_engineering.html
- Once I validate my approach, create the full period 2018-01-01 to 2023-08-31

In [14]:
# Make time zone aware UTC?
datetime_index_utc = pd.date_range(start='2018-01-01', end='2023-08-31 23:59:59', freq='H', name='datetime_utc', tz='UTC')

# Make index to dataframe and reset
df_main = datetime_index_utc.to_frame().reset_index(drop=True)

# Extract properties and derive new columns
df_main['hour'] = df_main['datetime_utc'].dt.hour
# 0-6 (Monday-Sunday)
df_main['day_of_week'] = df_main['datetime_utc'].dt.dayofweek
df_main['day_of_month'] = df_main['datetime_utc'].dt.day
df_main['month_number'] = df_main['datetime_utc'].dt.month
df_main['year'] = df_main['datetime_utc'].dt.year

# Function to get meteorological season based on month number (1-12)
# Should I just go ahead and encode the discrete numerical value?
def get_meteorological_season(month):
    if 3 <= month <= 5:
        return 'spring'
    elif 6 <= month <= 8:
        return 'summer'
    elif 9 <= month <= 11:
        return 'autumn'
    else:
        return 'winter'
    
df_main['meteorological_season'] = df_main['month_number'].apply(get_meteorological_season)

df_main

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,year,meteorological_season
0,2018-01-01 00:00:00+00:00,0,0,1,1,2018,winter
1,2018-01-01 01:00:00+00:00,1,0,1,1,2018,winter
2,2018-01-01 02:00:00+00:00,2,0,1,1,2018,winter
3,2018-01-01 03:00:00+00:00,3,0,1,1,2018,winter
4,2018-01-01 04:00:00+00:00,4,0,1,1,2018,winter
...,...,...,...,...,...,...,...
49651,2023-08-31 19:00:00+00:00,19,3,31,8,2023,summer
49652,2023-08-31 20:00:00+00:00,20,3,31,8,2023,summer
49653,2023-08-31 21:00:00+00:00,21,3,31,8,2023,summer
49654,2023-08-31 22:00:00+00:00,22,3,31,8,2023,summer


----

## Derive new columns `turbines_in_operation` and `total_nominal_capacity_operational_turbines_mw`

#### Function to check if turbine operational at a given UTC hour timestamp

In [15]:
def is_operational(timestamp_utc, df_turbines):
    """
    Inputs: hourly tz aware utc timestamps, turbine dataframe
    Get a bool series of turbines that are operational at the time of the [hourly] timestamp
    """
    # Turbines that went into operation before the timestamp; returns bool series for bool indexing
    started_operations = df_turbines['Inbetriebnahmedatum'] <= timestamp_utc

    # Turbines already decommissioned before the timestamp; returns bool series for bool indexing
    already_decommissioned = df_turbines['DatumEndgueltigeStilllegung'] <= timestamp_utc

    # Turbines that went into maintenance before timestamp and haven't come back into operation before the timestamp
    # Note: Some turbines go straight from temporary maintenance to decommissioned without ever going back into operation
    still_in_maintenance_or_decommissioned = (df_turbines['DatumBeginnVoruebergehendeStilllegung'] <= timestamp_utc) & \
    ((df_turbines['DatumWiederaufnahmeBetrieb'] > timestamp_utc) | df_turbines['DatumWiederaufnahmeBetrieb'].isna())

    # Number of turbines operational at the timestamp
    # Note the tildas to inverse these bool series
    # This is a bool series with df_turbines index
    turbines_in_operation_bool_series = (started_operations & ~already_decommissioned & ~still_in_maintenance_or_decommissioned)

    return turbines_in_operation_bool_series

##### Derive column for total turbines in operation for every hourly timestamp

In [16]:
# 1min 7s to run for 2018-23
df_main['turbines_in_operation'] = df_main['datetime_utc'].apply(lambda timestamp: is_operational(timestamp, df_turbines).sum())

##### Derive column for total nominal capacity for every hourly timestamp

In [17]:
# Nettonennleistung is in kW; I divide by 1_000 to convert kilo-watts (kW) to mega-watts (MW)
# 1min 30s to run for 2018-23
df_main['total_nominal_capacity_operational_turbines_mw'] = df_main['datetime_utc'].apply(
    lambda timestamp: df_turbines.loc[:, 'Nettonennleistung'].loc[is_operational(timestamp, df_turbines)].sum() / 1_000)

#### Save progress to pickle

In [18]:
# df_main.to_pickle(data_push_path / 'df_main.pkl')

In [1241]:
# df_main['turbines_in_operation'].plot()

In [1242]:
# df_main['total_nominal_capacity_operational_turbines_mw'].plot()

In [19]:
df_main.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49656 entries, 0 to 49655
Data columns (total 9 columns):
 #   Column                                          Non-Null Count  Dtype              
---  ------                                          --------------  -----              
 0   datetime_utc                                    49656 non-null  datetime64[ns, UTC]
 1   hour                                            49656 non-null  int32              
 2   day_of_week                                     49656 non-null  int32              
 3   day_of_month                                    49656 non-null  int32              
 4   month_number                                    49656 non-null  int32              
 5   year                                            49656 non-null  int32              
 6   meteorological_season                           49656 non-null  object             
 7   turbines_in_operation                           49656 non-null  int64              
 8

-----

## Merge the 5 SMARD datasets onto `df_main` on key datetime
- Dataframes to merge with `df_main`:
    - `df_smard_generated`
    - `df_smard_market_price`
    - `df_smard_installed_capacity`
    - `df_smard_forecasted_generation`
    - `df_smard_consumption`

##### Load in latest `df_main`

In [42]:
df_main = pd.read_pickle(data_push_path / 'df_main.pkl')
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49656 entries, 0 to 49655
Data columns (total 9 columns):
 #   Column                                          Non-Null Count  Dtype              
---  ------                                          --------------  -----              
 0   datetime_utc                                    49656 non-null  datetime64[ns, UTC]
 1   hour                                            49656 non-null  int32              
 2   day_of_week                                     49656 non-null  int32              
 3   day_of_month                                    49656 non-null  int32              
 4   month_number                                    49656 non-null  int32              
 5   year                                            49656 non-null  int32              
 6   meteorological_season                           49656 non-null  object             
 7   turbines_in_operation                           49656 non-null  int64              
 8

##### Merge operations

In [43]:
# Generated data; the response variable `y`
# keep the datetime_cet column for reference
df_main = df_main.merge(df_smard_generated, left_on='datetime_utc', right_on='datetime_cet', how='left')

# The day-ahead price data
# -> only add suffix to overlapping columns in right df being merged
df_main = df_main.merge(df_smard_market_price, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

# Installed capacity
df_main = df_main.merge(df_smard_installed_capacity, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

# The day-ahead forecasted generation
df_main = df_main.merge(df_smard_forecasted_generation, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

# Power consumption (total net load and residual load)
# Merges both on left
df_main = df_main.merge(df_smard_consumption, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

##### Remove all the duplicate `datetime_cet` columns with suffix `_remove`

In [44]:
# filter using regex; $ binds the expression to end of a string
df_main.drop(df_main.filter(regex='_remove$').columns, axis=1, inplace=True)

#### Save progress to pickle

In [45]:
# df_main.to_pickle(data_push_path / 'df_main_smard.pkl')

-----

----

# Checkpoints: Next step here...

##### Load in latest `df_main`

In [49]:
df_main = pd.read_pickle(data_push_path / 'df_main_smard.pkl')
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49656 entries, 0 to 49655
Data columns (total 16 columns):
 #   Column                                          Non-Null Count  Dtype              
---  ------                                          --------------  -----              
 0   datetime_utc                                    49656 non-null  datetime64[ns, UTC]
 1   hour                                            49656 non-null  int32              
 2   day_of_week                                     49656 non-null  int32              
 3   day_of_month                                    49656 non-null  int32              
 4   month_number                                    49656 non-null  int32              
 5   year                                            49656 non-null  int32              
 6   meteorological_season                           49656 non-null  object             
 7   turbines_in_operation                           49656 non-null  int64              
 

#### Set datetime_utc column to index

In [51]:
df_main.set_index('datetime_utc', inplace=True)

-------

## Merge mean hourly ERA5 *MASKED* data vars across all used grid points into `df_main`
- This is the masked data where I kept data var values for grid points linked to turbines
- Grid points not linked to a nearby turbine have data vars set to nan (i.e. the mask)
- 750 grid points connected to turbines that were operational between start of 2018 to end of June 2023

#### Read in latest ERA5 dataset that I've combined, subsetted, and with the new data vars I derived

In [53]:
ds = xr.open_dataset(data_path.joinpath('ERA5').joinpath('era5_combined_subset_derived_dropped_masked_2018_2023.nc'))
ds.data_vars

Data variables:
    t2m                        (time, latitude, longitude) float32 ...
    i10fg                      (time, latitude, longitude) float32 ...
    msl                        (time, latitude, longitude) float32 ...
    mean_wind_speed_10m        (time, latitude, longitude) float32 ...
    mean_wind_speed_100m       (time, latitude, longitude) float32 ...
    wind_direction_angle_10m   (time, latitude, longitude) float32 ...
    wind_direction_angle_100m  (time, latitude, longitude) float32 ...

#### How to get mean for a data var for every hour in time dimension?
- Use `xarray.DataArray.mean`: https://docs.xarray.dev/en/stable/generated/xarray.DataArray.mean.html

-----

## Taking the mean of my 7 data vars across all lats and lons and assigning to pandas dataframes
- Also localise ERA5 tz naive datetimeindex objects to tz UTC aware!

In [1400]:
# I validated that it is indeed the hourly mean across all lats and lons!
# (ds.sel(time=['2018-01-01 00:00:00'])['mean_wind_speed_10m'].values).mean()

#### All dataframes have index set to datetimeindex

In [55]:
# Calculate mean across all lats and lons, assign to new df, and convert naive datetimeindex to UTC tz aware

# Units: m/s
df_mean_wind_speed_10m = ds['mean_wind_speed_10m'].mean(dim=['latitude', 'longitude']).to_dataframe()
df_mean_wind_speed_10m.index = df_mean_wind_speed_10m.index.tz_localize(tz='utc')

# Units: m/s
df_mean_wind_speed_100m = ds['mean_wind_speed_100m'].mean(dim=['latitude', 'longitude']).to_dataframe()
df_mean_wind_speed_100m.index = df_mean_wind_speed_100m.index.tz_localize(tz='utc')

# Units: degrees
df_wind_direction_angle_10m = ds['wind_direction_angle_10m'].mean(dim=['latitude', 'longitude']).to_dataframe()
df_wind_direction_angle_10m.index = df_wind_direction_angle_10m.index.tz_localize(tz='utc')

# Units: degrees
df_wind_direction_angle_100m = ds['wind_direction_angle_100m'].mean(dim=['latitude', 'longitude']).to_dataframe()
df_wind_direction_angle_100m.index = df_wind_direction_angle_100m.index.tz_localize(tz='utc')

# Units: Pascals (Pa)
df_msl = ds['msl'].mean(dim=['latitude', 'longitude']).to_dataframe()
df_msl.index = df_msl.index.tz_localize(tz='utc')

# instantaneous_10m_wind_gust; units: m/s
df_i10fg = ds['i10fg'].mean(dim=['latitude', 'longitude']).to_dataframe()
df_i10fg.index = df_i10fg.index.tz_localize(tz='utc')

# 2m_temperature; units: Kelvin
df_t2m = ds['t2m'].mean(dim=['latitude', 'longitude']).to_dataframe()
df_t2m.index = df_t2m.index.tz_localize(tz='utc')

#### Left merge above 7 dataframes of mean values onto `df_main` using indices of both

In [56]:
df_main = df_main.merge(df_mean_wind_speed_10m, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_mean_wind_speed_100m, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_wind_direction_angle_10m, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_wind_direction_angle_100m, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_msl, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_i10fg, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_t2m, left_index=True, right_index=True, how='left')

In [57]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 49656 entries, 2018-01-01 00:00:00+00:00 to 2023-08-31 23:00:00+00:00
Data columns (total 22 columns):
 #   Column                                          Non-Null Count  Dtype              
---  ------                                          --------------  -----              
 0   hour                                            49656 non-null  int32              
 1   day_of_week                                     49656 non-null  int32              
 2   day_of_month                                    49656 non-null  int32              
 3   month_number                                    49656 non-null  int32              
 4   year                                            49656 non-null  int32              
 5   meteorological_season                           49656 non-null  object             
 6   turbines_in_operation                           49656 non-null  int64              
 7   total_nominal_capacity_operational_tur

In [58]:
df_main.isna().sum()

hour                                                 0
day_of_week                                          0
day_of_month                                         0
month_number                                         0
year                                                 0
meteorological_season                                0
turbines_in_operation                                0
total_nominal_capacity_operational_turbines_mw       0
datetime_cet                                         2
actual_generated_smard_mwh                           2
day_ahead_price_eur_mwh                           6552
total_nominal_capacity_smard_mw                      2
forecasted_generation_smard_mwh                     26
total_net_load_smard_mwh                             2
residual_load_smard_mwh                              2
mean_wind_speed_10m                               1488
mean_wind_speed_100m                              1488
wind_direction_angle_10m                          1488
wind_direc

#### Here I'll rename some features, convert to other units, and derive the intercardinal wind directions!
- Pascals to hectopascal (hPa); same as millibar --> (1 hPa = 1 mb)

In [59]:
df_main.rename(columns={
    'msl': 'mean_sea_level_pressure_mb',
    'i10fg': 'wind_gusts_10m',
    't2m': 'temp_2m_celsius',
}, inplace=True)

In [60]:
# Convert Pascals to hPA/mb
df_main['mean_sea_level_pressure_mb'] = df_main['mean_sea_level_pressure_mb'].apply(lambda pressure: pressure / 100)
df_main['mean_sea_level_pressure_mb'].mean()

1016.3648106983629

In [61]:
# Convert Kelvin to Celsius
df_main['temp_2m_celsius'] = df_main['temp_2m_celsius'].apply(lambda temp: temp - 273.15)
df_main['temp_2m_celsius'].mean()

10.216760820947304

In [62]:
df_main.sample(10)

Unnamed: 0_level_0,hour,day_of_week,day_of_month,month_number,year,meteorological_season,turbines_in_operation,total_nominal_capacity_operational_turbines_mw,datetime_cet,actual_generated_smard_mwh,day_ahead_price_eur_mwh,total_nominal_capacity_smard_mw,forecasted_generation_smard_mwh,total_net_load_smard_mwh,residual_load_smard_mwh,mean_wind_speed_10m,mean_wind_speed_100m,wind_direction_angle_10m,wind_direction_angle_100m,mean_sea_level_pressure_mb,wind_gusts_10m,temp_2m_celsius
datetime_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2021-07-13 17:00:00+00:00,17,1,13,7,2021,summer,28865,55109.247577,2021-07-13 19:00:00+02:00,5344.5,105.0,54499.0,6245.75,62619.25,53600.75,2.734197,4.450896,158.554245,152.361435,1009.228984,5.523561,21.436884
2020-01-03 04:00:00+00:00,4,4,3,1,2020,winter,28494,53064.101667,2020-01-03 05:00:00+01:00,24809.25,9.92,53184.0,25690.75,49863.0,18429.0,4.945132,8.611056,206.563034,214.337662,1021.287266,9.149274,2.671259
2018-06-02 16:00:00+00:00,16,5,2,6,2018,summer,27855,50943.563777,2018-06-02 18:00:00+02:00,3892.5,,51633.0,4063.75,51636.25,40492.25,3.016524,4.214153,272.805481,271.064972,1016.768281,6.659554,21.102655
2019-03-12 22:00:00+00:00,22,1,12,3,2019,spring,28293,52297.867817,2019-03-12 23:00:00+01:00,34365.0,0.36,52792.0,32634.5,56701.75,16668.75,6.400259,11.044964,207.111298,211.175735,1003.150469,13.716187,6.345209
2018-08-22 00:00:00+00:00,0,2,22,8,2018,summer,28024,51488.382777,2018-08-22 02:00:00+02:00,6530.5,,51633.0,5811.0,44995.5,37491.75,2.193209,4.708311,104.559769,110.493813,1019.978438,4.192609,16.356348
2018-08-23 08:00:00+00:00,8,3,23,8,2018,summer,28024,51488.382777,2018-08-23 10:00:00+02:00,2264.25,,51633.0,1983.5,67671.0,47792.5,2.04669,2.704856,220.829956,221.579132,1013.4925,5.422263,22.682581
2022-08-17 17:00:00+00:00,17,2,17,8,2022,summer,29121,56923.069087,2022-08-17 19:00:00+02:00,4319.25,747.93,55289.0,3752.0,56500.25,47141.5,2.099707,3.537026,133.637558,129.329376,1010.254609,4.388546,26.432214
2023-05-02 13:00:00+00:00,13,1,2,5,2023,spring,29328,58708.285537,2023-05-02 15:00:00+02:00,11176.0,81.97,57590.0,11742.25,58379.25,23867.25,3.682838,4.957393,309.917938,309.386749,1023.652891,8.758039,11.985406
2023-08-31 12:00:00+00:00,12,3,31,8,2023,summer,29359,59228.739337,2023-08-31 14:00:00+02:00,16225.25,86.26,57590.0,14523.5,58496.75,16515.0,,,,,,,
2020-10-21 11:00:00+00:00,11,2,21,10,2020,autumn,28680,53870.948467,2020-10-21 13:00:00+02:00,17960.5,31.83,53184.0,20634.25,69363.25,38439.0,4.401401,7.301148,183.995621,187.923492,1008.467813,9.324576,13.439386


In [63]:
df_main.head(2)

Unnamed: 0_level_0,hour,day_of_week,day_of_month,month_number,year,meteorological_season,turbines_in_operation,total_nominal_capacity_operational_turbines_mw,datetime_cet,actual_generated_smard_mwh,day_ahead_price_eur_mwh,total_nominal_capacity_smard_mw,forecasted_generation_smard_mwh,total_net_load_smard_mwh,residual_load_smard_mwh,mean_wind_speed_10m,mean_wind_speed_100m,wind_direction_angle_10m,wind_direction_angle_100m,mean_sea_level_pressure_mb,wind_gusts_10m,temp_2m_celsius
datetime_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-01 00:00:00+00:00,0,0,1,1,2018,winter,27464,49732.097897,2018-01-01 01:00:00+01:00,29638.0,,51633.0,29632.75,44270.0,11634.25,5.978772,10.395984,220.756622,225.207733,1000.349453,12.4197,8.510492
2018-01-01 01:00:00+00:00,1,0,1,1,2018,winter,27464,49732.097897,2018-01-01 02:00:00+01:00,30173.75,,51633.0,30978.75,43195.75,9928.75,6.309438,10.805341,224.697327,228.564713,1000.427969,13.007113,8.281854


#### Save `df_main` progress to pickle

In [65]:
# df_main.to_pickle(data_push_path / 'df_main_smard_era5.pkl')

------

# Next step: derive intercardinal directions!

##### Read in latest `df_main`

In [73]:
df_main = pd.read_pickle(data_push_path / 'df_main_smard_era5.pkl')

In [75]:
def wind_direction_to_intercardinal(wind_direction_degrees):
    # Make sure value is between 0-360 using modulo python assignment operator %= (same as x = x % 360)
    # If 380 --> 20 degrees (starts back at 0 at 360)
    wind_direction_degrees %= 360
    
    # The 16 intercardinal direction sectors in order as they appear on compass and their corresponding ranges spanning 22.5 degrees each
    # Note: the boundaries are 11.25 in both directions from the direction's true centre line, creating sectors
    intercardinal_directions = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW', 'N']
    sector_boundaries = [11.25, 33.75, 56.25, 78.75, 101.25, 123.75, 146.25, 168.75, 191.25, 213.75, 236.25, 258.75, 281.25, 303.75, 326.25, 348.75, 360]

    # Loop through sector boundaries and return first direction at index i when condition is False
    for i in range(len(sector_boundaries)):
        # Only enter if statement once conditional is False (i.e. when wind angle is no longer less than sector boundary)
        if wind_direction_degrees < sector_boundaries[i]:
            return intercardinal_directions[i]

In [76]:
df_main['wind_direction_intercardinal_10m'] = df_main['wind_direction_angle_10m'].apply(wind_direction_to_intercardinal)
df_main['wind_direction_intercardinal_100m'] = df_main['wind_direction_angle_100m'].apply(wind_direction_to_intercardinal)

--------

## Calculate/derive weighted mean features for ERA5 data variables!
- The weights will be applied to each grid point linked to a turbine that was operational during relevant period
- The weights are the number of turbines linked to each grid point, so ERA5 data is more heavily weighted where turbines density is higher when calculating the mean

In [82]:
# Get latest unique grid points
used_grid_points = df_turbines['nearest_grid_point'].unique()

In [83]:
used_grid_points.shape

(750,)

In [85]:
# Zeros array with same shape at lat and lon dimensions (30,37)
weights_array = np.zeros((len(ds['latitude']), len(ds['longitude'])))

# loop through all 750 grid points
for lat, lon in used_grid_points:
    lat_idx = np.where(ds['latitude'] == lat)[0][0]
    lon_idx = np.where(ds['longitude'] == lon)[0][0]
    # Use broadcasting to insert weights into weights array
    weights_array[lat_idx, lon_idx] = grid_point_weights_dict[(lat, lon)]
    
# create the DataArray of weights
da_weights = xr.DataArray(weights_array, coords=[ds['latitude'], ds['longitude']], dims=['latitude', 'longitude'])

# total of the weights
total_weight = da_weights.sum(dim=['latitude', 'longitude'])

# calculate the weighted means for all 7 of my data variables
weighted_mean_t2m = (ds['t2m'] * da_weights).sum(dim=['latitude', 'longitude']) / total_weight
weighted_mean_i10fg = (ds['i10fg'] * da_weights).sum(dim=['latitude', 'longitude']) / total_weight
weighted_mean_msl = (ds['msl'] * da_weights).sum(dim=['latitude', 'longitude']) / total_weight
weighted_mean_mean_wind_speed_10m = (ds['mean_wind_speed_10m'] * da_weights).sum(dim=['latitude', 'longitude']) / total_weight
weighted_mean_mean_wind_speed_100m = (ds['mean_wind_speed_100m'] * da_weights).sum(dim=['latitude', 'longitude']) / total_weight
weighted_mean_wind_direction_angle_10m = (ds['wind_direction_angle_10m'] * da_weights).sum(dim=['latitude', 'longitude']) / total_weight
weighted_mean_wind_direction_angle_100m = (ds['wind_direction_angle_100m'] * da_weights).sum(dim=['latitude', 'longitude']) / total_weight

In [87]:
# Convert weighted mean DataArrays for each data variable to pandas dataframe
# And set index to tz aware UTC
df_weighted_mean_t2m = weighted_mean_t2m.to_dataframe(name='weighted_t2m')
df_weighted_mean_t2m.index = df_weighted_mean_t2m.index.tz_localize(tz='utc')

df_weighted_mean_i10fg = weighted_mean_i10fg.to_dataframe(name='weighted_i10fg')
df_weighted_mean_i10fg.index = df_weighted_mean_i10fg.index.tz_localize(tz='utc')

df_weighted_mean_msl = weighted_mean_msl.to_dataframe(name='weighted_msl')
df_weighted_mean_msl.index = df_weighted_mean_msl.index.tz_localize(tz='utc')

df_weighted_mean_mean_wind_speed_10m = weighted_mean_mean_wind_speed_10m.to_dataframe(name='weighted_mean_wind_speed_10m')
df_weighted_mean_mean_wind_speed_10m.index = df_weighted_mean_mean_wind_speed_10m.index.tz_localize(tz='utc')

df_weighted_mean_mean_wind_speed_100m = weighted_mean_mean_wind_speed_100m.to_dataframe(name='weighted_mean_wind_speed_100m')
df_weighted_mean_mean_wind_speed_100m.index = df_weighted_mean_mean_wind_speed_100m.index.tz_localize(tz='utc')

df_weighted_mean_wind_direction_angle_10m = weighted_mean_wind_direction_angle_10m.to_dataframe(name='weighted_mean_wind_direction_angle_10m')
df_weighted_mean_wind_direction_angle_10m.index = df_weighted_mean_wind_direction_angle_10m.index.tz_localize(tz='utc')

df_weighted_mean_wind_direction_angle_100m = weighted_mean_wind_direction_angle_100m.to_dataframe(name='weighted_mean_wind_direction_angle_100m')
df_weighted_mean_wind_direction_angle_100m.index = df_weighted_mean_wind_direction_angle_100m.index.tz_localize(tz='utc')

#### Left merge above 7 dataframes of mean values onto `df_main` using indices of both

In [88]:
df_main = df_main.merge(df_weighted_mean_t2m, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_weighted_mean_i10fg, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_weighted_mean_msl, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_weighted_mean_mean_wind_speed_10m, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_weighted_mean_mean_wind_speed_100m, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_weighted_mean_wind_direction_angle_10m, left_index=True, right_index=True, how='left')
df_main = df_main.merge(df_weighted_mean_wind_direction_angle_100m, left_index=True, right_index=True, how='left')

In [89]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 49656 entries, 2018-01-01 00:00:00+00:00 to 2023-08-31 23:00:00+00:00
Data columns (total 31 columns):
 #   Column                                          Non-Null Count  Dtype              
---  ------                                          --------------  -----              
 0   hour                                            49656 non-null  int32              
 1   day_of_week                                     49656 non-null  int32              
 2   day_of_month                                    49656 non-null  int32              
 3   month_number                                    49656 non-null  int32              
 4   year                                            49656 non-null  int32              
 5   meteorological_season                           49656 non-null  object             
 6   turbines_in_operation                           49656 non-null  int64              
 7   total_nominal_capacity_operational_tur

In [90]:
df_main.rename(columns={
    'weighted_msl': 'weighted_mean_sea_level_pressure_mb',
    'weighted_i10fg': 'weighted_wind_gusts_10m',
    'weighted_t2m': 'weighted_temp_2m_celsius',
}, inplace=True)

In [91]:
# Convert Pascals to hPA/mb
df_main['weighted_mean_sea_level_pressure_mb'] = df_main['weighted_mean_sea_level_pressure_mb'].apply(lambda pressure: pressure / 100)
df_main['weighted_mean_sea_level_pressure_mb'].mean()

1015.9126181005638

In [92]:
# Convert Kelvin to Celsius
df_main['weighted_temp_2m_celsius'] = df_main['weighted_temp_2m_celsius'].apply(lambda temp: temp - 273.15)
df_main['weighted_temp_2m_celsius'].mean()

10.357187341127316

In [93]:
df_main['weighted_wind_direction_intercardinal_10m'] = df_main['weighted_mean_wind_direction_angle_10m'].apply(wind_direction_to_intercardinal)
df_main['weighted_wind_direction_intercardinal_100m'] = df_main['weighted_mean_wind_direction_angle_100m'].apply(wind_direction_to_intercardinal)

In [94]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 49656 entries, 2018-01-01 00:00:00+00:00 to 2023-08-31 23:00:00+00:00
Data columns (total 33 columns):
 #   Column                                          Non-Null Count  Dtype              
---  ------                                          --------------  -----              
 0   hour                                            49656 non-null  int32              
 1   day_of_week                                     49656 non-null  int32              
 2   day_of_month                                    49656 non-null  int32              
 3   month_number                                    49656 non-null  int32              
 4   year                                            49656 non-null  int32              
 5   meteorological_season                           49656 non-null  object             
 6   turbines_in_operation                           49656 non-null  int64              
 7   total_nominal_capacity_operational_tur

In [95]:
df_main

Unnamed: 0_level_0,hour,day_of_week,day_of_month,month_number,year,meteorological_season,turbines_in_operation,total_nominal_capacity_operational_turbines_mw,datetime_cet,actual_generated_smard_mwh,day_ahead_price_eur_mwh,total_nominal_capacity_smard_mw,forecasted_generation_smard_mwh,total_net_load_smard_mwh,residual_load_smard_mwh,mean_wind_speed_10m,mean_wind_speed_100m,wind_direction_angle_10m,wind_direction_angle_100m,mean_sea_level_pressure_mb,wind_gusts_10m,temp_2m_celsius,wind_direction_intercardinal_10m,wind_direction_intercardinal_100m,weighted_temp_2m_celsius,weighted_wind_gusts_10m,weighted_mean_sea_level_pressure_mb,weighted_mean_wind_speed_10m,weighted_mean_wind_speed_100m,weighted_mean_wind_direction_angle_10m,weighted_mean_wind_direction_angle_100m,weighted_wind_direction_intercardinal_10m,weighted_wind_direction_intercardinal_100m
datetime_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
2018-01-01 00:00:00+00:00,0,0,1,1,2018,winter,27464,49732.097897,2018-01-01 01:00:00+01:00,29638.00,,51633.0,29632.75,44270.00,11634.25,5.978772,10.395984,220.756622,225.207733,1000.349453,12.419700,8.510492,SW,SW,9.207286,13.089894,997.749018,6.337690,10.819870,223.051711,226.127482,SW,SW
2018-01-01 01:00:00+00:00,1,0,1,1,2018,winter,27464,49732.097897,2018-01-01 02:00:00+01:00,30173.75,,51633.0,30978.75,43195.75,9928.75,6.309438,10.805341,224.697327,228.564713,1000.427969,13.007113,8.281854,SW,SW,8.921161,13.771541,997.737445,6.722992,11.314126,226.120270,228.464003,SW,SW
2018-01-01 02:00:00+00:00,2,0,1,1,2018,winter,27464,49732.097897,2018-01-01 03:00:00+01:00,31021.50,,51633.0,32154.25,42527.50,8371.50,6.685641,11.288467,228.800705,232.298874,1000.821328,13.622252,8.135431,SW,SW,8.717961,14.434282,998.046871,7.154381,11.899254,229.267554,231.341858,SW,SW
2018-01-01 03:00:00+00:00,3,0,1,1,2018,winter,27464,49732.097897,2018-01-01 04:00:00+01:00,31015.00,,51633.0,33045.50,42515.75,8291.25,6.894922,11.552816,231.736252,235.041382,1001.264219,14.090265,7.977838,SW,SW,8.495501,14.913757,998.378348,7.391741,12.219056,230.808404,232.740422,SW,SW
2018-01-01 04:00:00+00:00,4,0,1,1,2018,winter,27464,49732.097897,2018-01-01 05:00:00+01:00,31534.00,,51633.0,33644.50,42278.00,7465.50,7.060956,11.751130,234.274536,236.804169,1001.436406,14.186666,7.647913,SW,WSW,8.080761,14.866367,998.388523,7.513337,12.358206,231.433313,233.460190,SW,SW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-31 19:00:00+00:00,19,3,31,8,2023,summer,29359,59228.739337,2023-08-31 21:00:00+02:00,11080.75,125.50,57590.0,9225.00,52445.75,38865.00,,,,,,,,,,,,,,,,,,
2023-08-31 20:00:00+00:00,20,3,31,8,2023,summer,29359,59228.739337,2023-08-31 22:00:00+02:00,11553.25,106.03,57590.0,9885.75,48692.25,34340.00,,,,,,,,,,,,,,,,,,
2023-08-31 21:00:00+00:00,21,3,31,8,2023,summer,29359,59228.739337,2023-08-31 23:00:00+02:00,11290.00,96.89,57590.0,10282.50,45385.75,31281.25,,,,,,,,,,,,,,,,,,
2023-08-31 22:00:00+00:00,22,3,31,8,2023,summer,29359,59228.739337,NaT,,,,,,,,,,,,,,,,,,,,,,,,


#### Save `df_main` progress to pickle

In [96]:
# df_main.to_pickle(data_push_path / 'df_main_smard_era5_prefinal.pkl')

## Next: cut off tail end of rows that go beyond ERA5 data (beyond end of June 2023)

In [104]:
df_main = df_main.loc[df_main.index < pd.to_datetime('2023-07-01 00:00:00+00:00', utc=True)]

In [106]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 48168 entries, 2018-01-01 00:00:00+00:00 to 2023-06-30 23:00:00+00:00
Data columns (total 33 columns):
 #   Column                                          Non-Null Count  Dtype              
---  ------                                          --------------  -----              
 0   hour                                            48168 non-null  int32              
 1   day_of_week                                     48168 non-null  int32              
 2   day_of_month                                    48168 non-null  int32              
 3   month_number                                    48168 non-null  int32              
 4   year                                            48168 non-null  int32              
 5   meteorological_season                           48168 non-null  object             
 6   turbines_in_operation                           48168 non-null  int64              
 7   total_nominal_capacity_operational_tur

#### Save final `df_main` progress to pickle

In [107]:
# df_main.to_pickle(data_push_path / 'df_main_smard_era5_final.pkl')

In [109]:
df_main.isna().sum()

hour                                                 0
day_of_week                                          0
day_of_month                                         0
month_number                                         0
year                                                 0
meteorological_season                                0
turbines_in_operation                                0
total_nominal_capacity_operational_turbines_mw       0
datetime_cet                                         0
actual_generated_smard_mwh                           0
day_ahead_price_eur_mwh                           6550
total_nominal_capacity_smard_mw                      0
forecasted_generation_smard_mwh                     24
total_net_load_smard_mwh                             0
residual_load_smard_mwh                              0
mean_wind_speed_10m                                  0
mean_wind_speed_100m                                 0
wind_direction_angle_10m                             0
wind_direc

## Idea - weight the mean calculation using number of turbines linked to each grid point!
- run for every hourly timestamp and use value counts for turbines in operation at that time
- Get value counts of nearest grid points for turbines operational during 01.01.2018 to 01.06.2023!
- Run through is_operational function and keep track of grid point occurences by updating a dict's keys and values?
    - if key seen for first time, add key to dict
    - keep tally of times that key (the coord pair) comes up
    - Use Counter?

### Get overall coord pair weights since 2018 

### ACTUAL relevant grid points if my time series is only going back to start of 2018!
- Basically same as the 751 I had before (750 now)
- This also doesn't give me the WEIGHTS... WTF!
- Looks like 30_547 turbines were active during my time series period (out of 30_638) That means 91 turbines operated completely outside of this period!

In [1567]:
# This codeblock takes 5+ minutes to run! I didn't bother to write something more performant as it's not critical

# Get datetimes in index before ERA5 data cutoff 
# subset_datetime_idx = df_main.loc[df_main.index < pd.to_datetime('2023-06-30 23:00:00', utc=True)].index

# # initialise empty set to update
# unique_set_of_idx = set()

# # loop through all timestamps 
# for timestamp in subset_datetime_idx:
#     # return bool series of turbines in operation on timestamp
#     is_operational_bool_mask = is_operational(timestamp, df_turbines)
    
#     # Then get list of grid points for those turbines, make set, and update main set
#     idx_set = set(df_turbines.loc[is_operational_bool_mask].index.to_list())
#     unique_set_of_idx.update(idx_set)

#### Now filter df_turbines using this set of indexes

In [1577]:
# New dataframe to save to pickle
# df_turbines_2018_2023 = df_turbines.iloc[list(unique_set_of_idx)]

In [1601]:
# Reset index before saving to pickle
# df_turbines_2018_2023.reset_index(drop=True, inplace=True)

In [1603]:
# Save to pickle
# df_turbines_2018_2023.to_pickle(data_push_path / 'df_turbines_knn_blades_haversine_elevation_utc_2_2018_2023.pkl')

In [1594]:
# df_turbines_2018_2023['nearest_grid_point'].nunique()

750

#### These are the weights for each grid point!

In [1595]:
# df_turbines_2018_2023['nearest_grid_point'].value_counts()

nearest_grid_point
(54.07, 8.97)     454
(54.57, 8.97)     356
(54.57, 9.22)     343
(51.57, 8.72)     325
(53.32, 13.97)    286
                 ... 
(48.82, 8.97)       1
(47.57, 10.72)      1
(52.32, 12.97)      1
(50.57, 7.47)       1
(48.82, 8.22)       1
Name: count, Length: 750, dtype: int64

----

-----

## Scratch book of random code I want to keep for reference 

In [1411]:
ds.sel(time=['2022-01-01T00:00:00'], latitude=47.82, longitude=10.97)['mean_wind_speed_10m'].values

array([3.1659725], dtype=float32)

In [1412]:
# All values for a data var for a specific grid point across all timestamps
len(ds.sel(latitude=47.82, longitude=10.97)['mean_wind_speed_10m'].values)

48168

In [1413]:
ds.sel(latitude=47.82, longitude=10.97)['mean_wind_speed_10m'].values

array([2.603206 , 2.6633532, 2.7366667, ..., 2.0954723, 3.0069587,
       3.1353345], dtype=float32)