# Building the ML dataframe

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr

In [3]:
# Set max number of columns to display; default 20
pd.options.display.max_columns = 80

In [4]:
# Directory where data files will be downloaded
cwd_path = Path.cwd()
data_path = cwd_path.parent.joinpath('data')
data_push_path = cwd_path.parent.joinpath('data_to_push')

### Load in latest wind turbine data
- Note: Only 9 turbines decommissioned before 2019!

In [375]:
# Read in latest turbine data
# Now only 30,642 after removing 3 turbines outside bounding box
df_turbines = pd.read_pickle(data_push_path / 'df_turbines_knn_blades_haversine_elevation_utc.pkl')
df_turbines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30642 entries, 0 to 30641
Data columns (total 30 columns):
 #   Column                                            Non-Null Count  Dtype              
---  ------                                            --------------  -----              
 0   EinheitMastrNummer                                30642 non-null  object             
 1   DatumLetzteAktualisierung                         30642 non-null  datetime64[ns, UTC]
 2   Bundesland                                        30642 non-null  object             
 3   Postleitzahl                                      30642 non-null  int64              
 4   Ort                                               30642 non-null  object             
 5   Laengengrad                                       30642 non-null  float64            
 6   Breitengrad                                       30642 non-null  float64            
 7   Registrierungsdatum                               30642 non-null  d

In [376]:
# List of datetime column names for convenient indexing later
turbine_date_columns = [
    'DatumLetzteAktualisierung',
    'Registrierungsdatum',
    'Inbetriebnahmedatum',
    'DatumEndgueltigeStilllegung',
    'DatumBeginnVoruebergehendeStilllegung',
    'DatumWiederaufnahmeBetrieb'
]

In [371]:
# # Cast all tz naive datetime64 columns to UTC tz aware
# df_turbines[turbine_date_columns] = df_turbines[turbine_date_columns].apply(lambda series: series.dt.tz_localize('UTC'), axis=0)

# # save to pickle
# df_turbines.to_pickle(data_push_path / 'df_turbines_knn_blades_haversine_elevation_utc.pkl')

-----

# Load in all SMARD data

### Load in SMARD energy generated data (actual measured generation)
- My response variable `y` (aka target/label) 

In [853]:
# Get all SMARD CSV files, load into dataframes, and concat them

def load_SMARD_generated_data():
    """
    Look for CSV files in the SMARD directory
    """
    list_of_dataframes = []
    for filepath in data_path.joinpath('SMARD').iterdir():
        if filepath.name.startswith('Realisierte_Erzeugung'):
            list_of_dataframes.append(pd.read_csv(filepath, sep=';'))
            
    # Check that num of rows is what I expect after concatenating dfs
    num_of_rows = 0
    for df in list_of_dataframes:
        num_of_rows += len(df)
    
    # concat dataframes
    df = pd.concat(list_of_dataframes)
    print(f'Number of rows match up: {num_of_rows == len(df)}')
    
    # Select columns to keep
    # Note: I'm using the interval start time to create the timestamps
    # ERA5 surface parameters are instantaneous so can't perfectly align anyway
    df = df[[
        'Datum',
        'Anfang',
        # 'Ende',
        'Wind Onshore [MWh] Berechnete Auflösungen'
    ]]
    
    # rename columns
    df.rename(columns={
        'Datum': 'date',
        'Anfang': 'interval_start_time', 
        # 'Ende': 'interval_end_time',
        'Wind Onshore [MWh] Berechnete Auflösungen': 'generated_smard_mwh'
    }, inplace=True)
    
    # Handle dates and times to create unified datetime64 timestamps
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
    df['start_time_delta'] = pd.to_timedelta(df['interval_start_time'] + ':00')
    df['datetime_cet'] = df['date'] + df['start_time_delta']
    # Add CET timezone info and infer change from CET->CEST->CET, etc
    df['datetime_cet'] = df['datetime_cet'].dt.tz_localize(tz='CET', ambiguous='infer')
    
    # Drop un-needed columns
    df = df.drop(columns=['date', 'start_time_delta', 'interval_start_time'])
    # Re-arrange columns
    df = df[['datetime_cet', 'generated_smard_mwh']]
    
    # Convert European thousands and decimal seperators in values to (US/UK) decimal full stop format
    translation_table = str.maketrans({'.': None, ',': '.'})
    df['generated_smard_mwh'] = df['generated_smard_mwh'].str.translate(translation_table).astype(float)
    
    # Sort rows by datetime_cet
    df.sort_values('datetime_cet', inplace=True)
    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    return df.copy()

In [854]:
# Load SMARD data
df_smard_generated = load_SMARD_generated_data()
df_smard_generated.info()

Number of rows match up: True
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49655 entries, 0 to 49654
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   datetime_cet         49655 non-null  datetime64[ns, CET]
 1   generated_smard_mwh  49655 non-null  float64            
dtypes: datetime64[ns, CET](1), float64(1)
memory usage: 776.0 KB


In [855]:
# df_smard[df_smard['datetime'] < pd.Timestamp('2022-10-30')]

In [856]:
df_smard_generated.isna().sum()

datetime_cet           0
generated_smard_mwh    0
dtype: int64

In [857]:
df_smard_generated.head(3)

Unnamed: 0,datetime_cet,generated_smard_mwh
0,2018-01-01 00:00:00+01:00,28503.5
1,2018-01-01 01:00:00+01:00,29638.0
2,2018-01-01 02:00:00+01:00,30173.75


-----

### Load in SMARD day-ahead prices
- Wholesale prices: https://www.smard.de/page/en/wiki-article/5884/5976
- This is the day-ahead price!
    - "Weighted wholesale electricity price (day-ahead price on the exchange) for each hour [€/MWh] determined on the day-ahead auction that took place ones on the previous day - data is delivered no later than 2 hours after trading closes. Source: ENTSO-E"
- Data only goes back to 2018-10-01 (October 1, 2018)

In [940]:
# Get all SMARD CSV files, load into dataframes, and concat them

def load_SMARD_market_price_data():
    """
    Look for CSV files in the SMARD directory
    """
    list_of_dataframes = []
    for filepath in data_path.joinpath('SMARD').iterdir():
        if filepath.name.startswith('Gro_handelspreise'):
            list_of_dataframes.append(pd.read_csv(filepath, sep=';', na_values='-'))
            
    # Check that num of rows is what I expect after concatenating dfs
    num_of_rows = 0
    for df in list_of_dataframes:
        num_of_rows += len(df)
    
    # concat dataframes
    df = pd.concat(list_of_dataframes)
    print(f'Number of rows match up: {num_of_rows == len(df)}')
    
    # Select columns to keep
    # Note: I'm using the interval start time to create the timestamps
    # ERA5 surface parameters are instantaneous so can't perfectly align anyway
    df = df[[
        'Datum',
        'Anfang',
        # 'Ende',
        'Deutschland/Luxemburg [€/MWh] Originalauflösungen'
    ]]
    
    # rename columns
    df.rename(columns={
        'Datum': 'date',
        'Anfang': 'interval_start_time', 
        # 'Ende': 'interval_end_time',
        'Deutschland/Luxemburg [€/MWh] Originalauflösungen': 'day_ahead_price_eur_mwh'
    }, inplace=True)
    
    # Handle dates and times to create unified datetime64 timestamps
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
    df['start_time_delta'] = pd.to_timedelta(df['interval_start_time'] + ':00')
    df['datetime_cet'] = df['date'] + df['start_time_delta']
    # Add CET timezone info and infer change from CET->CEST->CET, etc
    df['datetime_cet'] = df['datetime_cet'].dt.tz_localize(tz='CET', ambiguous='infer')
    
    # Drop un-needed columns
    df = df.drop(columns=['date', 'start_time_delta', 'interval_start_time'])
    # Re-arrange columns
    df = df[['datetime_cet', 'day_ahead_price_eur_mwh']]
    
    # Convert European thousands and decimal seperators in values to (US/UK) decimal full stop format
    translation_table = str.maketrans({'.': None, ',': '.'})
    df['day_ahead_price_eur_mwh'] = df['day_ahead_price_eur_mwh'].str.translate(translation_table).astype(float)
    
    # Drop the numerous (700+) duplicate rows
    df.drop_duplicates(subset='datetime_cet', inplace=True)
    # Sort rows by datetime_cet
    df.sort_values('datetime_cet', inplace=True)
    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    return df.copy()

In [941]:
df_smard_market_price = load_SMARD_market_price_data()

Number of rows match up: True


In [942]:
# df_smard_market_price = pd.read_csv(data_path.joinpath('SMARD') / 'Gro_handelspreise_202201010000_202212312359_Stunde.csv', sep=';')

In [943]:
df_smard_market_price.isna().sum()

datetime_cet                  0
day_ahead_price_eur_mwh    6551
dtype: int64

In [944]:
# df_smard_market_price.iloc[6550:].head(20)

In [945]:
df_smard_market_price.isna().sum()

datetime_cet                  0
day_ahead_price_eur_mwh    6551
dtype: int64

In [946]:
df_smard_market_price['day_ahead_price_eur_mwh'].describe()

count    43104.000000
mean        97.537291
std        106.238714
min       -500.000000
25%         35.260000
50%         55.155000
75%        118.750000
max        871.000000
Name: day_ahead_price_eur_mwh, dtype: float64

-----

### Load in SMARD installed capacity data
- Note: CSV reader infers decimal comma (European) format 54.499 as decimal point 54.499 even though it's 54,499.00!
    - Specify dtype of column in the read_csv method

In [881]:
def load_SMARD_installed_capacity_data():
    """
    Look for CSV files in the SMARD directory
    """
    list_of_dataframes = []
    for filepath in data_path.joinpath('SMARD').iterdir():
        if filepath.name.startswith('Installierte_Erzeugungsleistung'):
            list_of_dataframes.append(
                pd.read_csv(filepath, sep=';', dtype={'Wind Onshore [MW] Berechnete Auflösungen': str})
            )
            
    # Check that num of rows is what I expect after concatenating dfs
    num_of_rows = 0
    for df in list_of_dataframes:
        num_of_rows += len(df)
    
    # concat dataframes
    df = pd.concat(list_of_dataframes)
    print(f'Number of rows match up: {num_of_rows == len(df)}')
    
    # Select columns to keep
    # Note: I'm using the interval start time to create the timestamps
    # ERA5 surface parameters are instantaneous so can't perfectly align anyway
    df = df[[
        'Datum',
        'Anfang',
        # 'Ende',
        'Wind Onshore [MW] Berechnete Auflösungen'
    ]]
    
    # rename columns
    df.rename(columns={
        'Datum': 'date',
        'Anfang': 'interval_start_time', 
        # 'Ende': 'interval_end_time',
        'Wind Onshore [MW] Berechnete Auflösungen': 'total_nominal_capacity_smard_mw'
    }, inplace=True)
    
    # Handle dates and times to create unified datetime64 timestamps
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
    df['start_time_delta'] = pd.to_timedelta(df['interval_start_time'] + ':00')
    df['datetime_cet'] = df['date'] + df['start_time_delta']
    # Add CET timezone info and infer change from CET->CEST->CET, etc
    df['datetime_cet'] = df['datetime_cet'].dt.tz_localize(tz='CET', ambiguous='infer')
    
    # Drop un-needed columns
    df = df.drop(columns=['date', 'start_time_delta', 'interval_start_time'])
    # Re-arrange columns
    df = df[['datetime_cet', 'total_nominal_capacity_smard_mw']]
    
    # Convert European thousands and decimal seperators in values to (US/UK) decimal full stop format
    translation_table = str.maketrans({'.': None, ',': '.'})
    df['total_nominal_capacity_smard_mw'] = df['total_nominal_capacity_smard_mw'].str.translate(translation_table).astype(float)
    
    # Sort rows by datetime_cet
    df.sort_values('datetime_cet', inplace=True)
    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    return df.copy()

In [882]:
df_smard_installed_capacity = load_SMARD_installed_capacity_data()

Number of rows match up: True


In [883]:
df_smard_installed_capacity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41639 entries, 0 to 41638
Data columns (total 2 columns):
 #   Column                           Non-Null Count  Dtype              
---  ------                           --------------  -----              
 0   datetime_cet                     41639 non-null  datetime64[ns, CET]
 1   total_nominal_capacity_smard_mw  41639 non-null  float64            
dtypes: datetime64[ns, CET](1), float64(1)
memory usage: 650.7 KB


In [885]:
df_smard_installed_capacity.isna().sum()

datetime_cet                       0
total_nominal_capacity_smard_mw    0
dtype: int64

------

### Load in SMARD day-ahead generation forecast
- Wind Onshore [MWh] Berechnete Auflösungen for Germany
- Only has 24 missing values for the day of 2022-12-21 (verified on SMARD data visuals)

In [1058]:
def load_SMARD_forecasted_generation_data():
    """
    Look for CSV files in the SMARD directory
    """
    list_of_dataframes = []
    for filepath in data_path.joinpath('SMARD').iterdir():
        if filepath.name.startswith('Prognostizierte_Erzeugung_Day-Ahead'):
            list_of_dataframes.append(pd.read_csv(filepath, sep=';', na_values='-'))
            
    # Check that num of rows is what I expect after concatenating dfs
    num_of_rows = 0
    for df in list_of_dataframes:
        num_of_rows += len(df)
    
    # concat dataframes
    df = pd.concat(list_of_dataframes)
    print(f'Number of rows match up: {num_of_rows == len(df)}')
    
    # Select columns to keep
    # Note: I'm using the interval start time to create the timestamps
    # ERA5 surface parameters are instantaneous so can't perfectly align anyway
    df = df[[
        'Datum',
        'Anfang',
        # 'Ende',
        'Wind Onshore [MWh] Berechnete Auflösungen'
    ]]
    
    # rename columns
    df.rename(columns={
        'Datum': 'date',
        'Anfang': 'interval_start_time', 
        # 'Ende': 'interval_end_time',
        'Wind Onshore [MWh] Berechnete Auflösungen': 'forecasted_generation_smard_mwh'
    }, inplace=True)
    
    # Handle dates and times to create unified datetime64 timestamps
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
    df['start_time_delta'] = pd.to_timedelta(df['interval_start_time'] + ':00')
    df['datetime_cet'] = df['date'] + df['start_time_delta']
    # Add CET timezone info and infer change from CET->CEST->CET, etc
    df['datetime_cet'] = df['datetime_cet'].dt.tz_localize(tz='CET', ambiguous='infer')
    
    # Drop un-needed columns
    df = df.drop(columns=['date', 'start_time_delta', 'interval_start_time'])
    # Re-arrange columns
    df = df[['datetime_cet', 'forecasted_generation_smard_mwh']]
    
    # Convert European thousands and decimal seperators in values to (US/UK) decimal full stop format
    translation_table = str.maketrans({'.': None, ',': '.'})
    # df['forecasted_generation_smard_mwh'] = df['forecasted_generation_smard_mwh'].str.translate(translation_table).astype(float)
        
    # Sort rows by datetime_cet
    df.sort_values('datetime_cet', inplace=True)
    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    return df.copy()

In [1059]:
df_smard_forecasted_generation = load_SMARD_forecasted_generation_data()

Number of rows match up: True


In [1060]:
df_smard_forecasted_generation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49655 entries, 0 to 49654
Data columns (total 2 columns):
 #   Column                           Non-Null Count  Dtype              
---  ------                           --------------  -----              
 0   datetime_cet                     49655 non-null  datetime64[ns, CET]
 1   forecasted_generation_smard_mwh  49631 non-null  object             
dtypes: datetime64[ns, CET](1), object(1)
memory usage: 776.0+ KB


In [1071]:
df_smard_forecasted_generation.isna().sum()

datetime_cet                        0
forecasted_generation_smard_mwh    24
dtype: int64

-----

-----

# Begin building ML dataframe `df_main`
- Start with one full year: 2022
    - Then bring in another year like 2021 and run through the same transformations and then concat along datetimeindex?
- Good resource on time-related feature engineering: https://scikit-learn.org/stable/auto_examples/applications/plot_cyclical_feature_engineering.html

In [987]:
# Make time zone aware UTC?
datetime_index_utc = pd.date_range(start='2018-01-01', end='2023-08-31 23:59:59', freq='H', name='datetime_utc', tz='UTC')

# Make index to dataframe and reset
df_main = datetime_index_utc.to_frame().reset_index(drop=True)

# Extract properties and derive new columns
df_main['hour'] = df_main['datetime_utc'].dt.hour
# 0-6 (Monday-Sunday)
df_main['day_of_week'] = df_main['datetime_utc'].dt.dayofweek
df_main['day_of_month'] = df_main['datetime_utc'].dt.day
df_main['month_number'] = df_main['datetime_utc'].dt.month
df_main['year'] = df_main['datetime_utc'].dt.year

# Function to get meteorological season based on month number (1-12)
# Should I just go ahead and encode the discrete numerical value?
def get_meteorological_season(month):
    if 3 <= month <= 5:
        return 'spring'
    elif 6 <= month <= 8:
        return 'summer'
    elif 9 <= month <= 11:
        return 'autumn'
    else:
        return 'winter'
    
df_main['meteorological_season'] = df_main['month_number'].apply(get_meteorological_season)

df_main

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,year,meteorological_season
0,2018-01-01 00:00:00+00:00,0,0,1,1,2018,winter
1,2018-01-01 01:00:00+00:00,1,0,1,1,2018,winter
2,2018-01-01 02:00:00+00:00,2,0,1,1,2018,winter
3,2018-01-01 03:00:00+00:00,3,0,1,1,2018,winter
4,2018-01-01 04:00:00+00:00,4,0,1,1,2018,winter
...,...,...,...,...,...,...,...
49651,2023-08-31 19:00:00+00:00,19,3,31,8,2023,summer
49652,2023-08-31 20:00:00+00:00,20,3,31,8,2023,summer
49653,2023-08-31 21:00:00+00:00,21,3,31,8,2023,summer
49654,2023-08-31 22:00:00+00:00,22,3,31,8,2023,summer


----

## Derive new columns `turbines_in_operation` and `total_nominal_capacity_operational_turbines_mw`

#### Function to check if turbine operational at a given UTC hour timestamp

In [988]:
def is_operational(timestamp_utc, df_turbines):
    """
    Inputs: hourly tz aware utc timestamps, turbine dataframe
    Get a bool series of turbines that are operational at the time of the [hourly] timestamp
    """
    # Turbines that went into operation before the timestamp; returns bool series for bool indexing
    started_operations = df_turbines['Inbetriebnahmedatum'] <= timestamp_utc

    # Turbines already decommissioned before the timestamp; returns bool series for bool indexing
    already_decommissioned = df_turbines['DatumEndgueltigeStilllegung'] <= timestamp_utc

    # Turbines that went into maintenance before timestamp and haven't come back into operation before the timestamp
    # Note: Some turbines go straight from temporary maintenance to decommissioned without ever going back into operation
    still_in_maintenance_or_decommissioned = (df_turbines['DatumBeginnVoruebergehendeStilllegung'] <= timestamp_utc) & \
    ((df_turbines['DatumWiederaufnahmeBetrieb'] > timestamp_utc) | df_turbines['DatumWiederaufnahmeBetrieb'].isna())

    # Number of turbines operational at the timestamp
    # Note the tildas to inverse these bool series
    # This is a bool series with df_turbines index
    turbines_in_operation_bool_series = (started_operations & ~already_decommissioned & ~still_in_maintenance_or_decommissioned)

    return turbines_in_operation_bool_series

##### Derive column for total turbines in operation for every hourly timestamp

In [989]:
# 1min 7s to run for 2018-23
df_main['turbines_in_operation'] = df_main['datetime_utc'].apply(lambda timestamp: is_operational(timestamp, df_turbines).sum())

##### Derive column for total nominal capacity for every hourly timestamp

In [990]:
# Nettonennleistung is in kW; I divide by 1_000 to convert kilo-watts (kW) to mega-watts (MW)
# 1min 30s to run for 2018-23
df_main['total_nominal_capacity_operational_turbines_mw'] = df_main['datetime_utc'].apply(
    lambda timestamp: df_turbines.loc[:, 'Nettonennleistung'].loc[is_operational(timestamp, df_turbines)].sum() / 1_000)

##### Save to pickle

In [997]:
# df_main.to_pickle(data_push_path / 'df_main.pkl')

In [858]:
# df_main['turbines_in_operation'].plot()

In [859]:
# df_main['total_nominal_capacity_operational_turbines_mw'].plot()

In [808]:
df_main.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49656 entries, 0 to 49655
Data columns (total 8 columns):
 #   Column                                          Non-Null Count  Dtype              
---  ------                                          --------------  -----              
 0   datetime_utc                                    49656 non-null  datetime64[ns, UTC]
 1   hour                                            49656 non-null  int32              
 2   day_of_week                                     49656 non-null  int32              
 3   day_of_month                                    49656 non-null  int32              
 4   month_number                                    49656 non-null  int32              
 5   meteorological_season                           49656 non-null  object             
 6   turbines_in_operation                           49656 non-null  int64              
 7   total_nominal_capacity_operational_turbines_mw  49656 non-null  float64            
dt

-----

## Test merging SMARD data onto `df_main` on key datetime
- Dataframes to merge with `df_main`:
    - `df_smard_generated`
        - Is missing 1 hour timestamp row - find it
    - `df_smard_market_price`
        - 744 rows are duplicates
        - 6_552 nans
    - `df_smard_installed_capacity`
        - has 41_639 rows... so missing lots of hourly timestamps
        - 8_018 nans
    - `df_smard_forecasted_generation`
        - only 41_639 rows when should be 49_656?
        - 17_544 are rows with nans/nats in both columns
        - 25_562 nans after merging

#### Problems:
- I get duplicated timestamps on left merge key `datetime_utc`
    - 50_400 after removing duplicated cet columns; should only be 49_656 unique rows

##### Load in latest `df_main`

In [1063]:
df_main = pd.read_pickle(data_push_path / 'df_main.pkl')
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49656 entries, 0 to 49655
Data columns (total 9 columns):
 #   Column                                          Non-Null Count  Dtype              
---  ------                                          --------------  -----              
 0   datetime_utc                                    49656 non-null  datetime64[ns, UTC]
 1   hour                                            49656 non-null  int32              
 2   day_of_week                                     49656 non-null  int32              
 3   day_of_month                                    49656 non-null  int32              
 4   month_number                                    49656 non-null  int32              
 5   year                                            49656 non-null  int32              
 6   meteorological_season                           49656 non-null  object             
 7   turbines_in_operation                           49656 non-null  int64              
 8

#### Try removing datetime_cet altogether

In [1011]:
# df_smard_forecasted_generation.value_counts(dropna=False)

In [985]:
df_smard_forecasted_generation.loc[df_smard_forecasted_generation['datetime_cet'].isin(df_main['datetime_cet'])]

Unnamed: 0,datetime_cet,forecasted_generation_smard_mwh
1,2018-01-01 01:00:00+01:00,29632.75
2,2018-01-01 02:00:00+01:00,30978.75
3,2018-01-01 03:00:00+01:00,32154.25
4,2018-01-01 04:00:00+01:00,33045.50
5,2018-01-01 05:00:00+01:00,33644.50
...,...,...
41634,NaT,
41635,NaT,
41636,NaT,
41637,NaT,


In [995]:
df_smard_forecasted_generation.loc[df_smard_forecasted_generation['datetime_cet'] == pd.to_datetime('2019-12-31 18:00:00+00:00')]

Unnamed: 0,datetime_cet,forecasted_generation_smard_mwh


In [993]:
df_smard_forecasted_generation.iloc[8755:8763]

Unnamed: 0,datetime_cet,forecasted_generation_smard_mwh
8755,2018-12-31 19:00:00+01:00,12666.75
8756,2018-12-31 20:00:00+01:00,14047.75
8757,2018-12-31 21:00:00+01:00,15222.75
8758,2018-12-31 22:00:00+01:00,16090.25
8759,2018-12-31 23:00:00+01:00,17375.0
8760,2021-01-01 00:00:00+01:00,4102.25
8761,2021-01-01 01:00:00+01:00,3653.75
8762,2021-01-01 02:00:00+01:00,3218.0


In [986]:
df_main.loc[~(df_main['datetime_cet'].isin(df_smard_forecasted_generation['datetime_cet']))]

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,meteorological_season,turbines_in_operation,total_nominal_capacity_operational_turbines_mw,datetime_cet,generated_smard_mwh,day_ahead_price_eur_mwh,total_nominal_capacity_smard_mw,forecasted_generation_smard_mwh
8759,2018-12-31 23:00:00+00:00,23,0,31,12,winter,28266,52200.785317,2019-01-01 00:00:00+01:00,20391.75,28.32,52792.0,
8760,2019-01-01 00:00:00+00:00,0,1,1,1,winter,28266,52200.785317,2019-01-01 01:00:00+01:00,22365.50,10.07,52792.0,
8761,2019-01-01 01:00:00+00:00,1,1,1,1,winter,28266,52200.785317,2019-01-01 02:00:00+01:00,23213.25,-4.08,52792.0,
8762,2019-01-01 02:00:00+00:00,2,1,1,1,winter,28266,52200.785317,2019-01-01 03:00:00+01:00,24493.50,-9.91,52792.0,
8763,2019-01-01 03:00:00+00:00,3,1,1,1,winter,28266,52200.785317,2019-01-01 04:00:00+01:00,26399.00,-7.41,52792.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43818,2022-12-31 18:00:00+00:00,18,5,31,12,winter,29262,57973.178937,2022-12-31 19:00:00+01:00,30471.25,-1.01,,
43819,2022-12-31 19:00:00+00:00,19,5,31,12,winter,29262,57973.178937,2022-12-31 20:00:00+01:00,30505.00,-1.39,,
43820,2022-12-31 20:00:00+00:00,20,5,31,12,winter,29262,57973.178937,2022-12-31 21:00:00+01:00,30768.50,-1.04,,
43821,2022-12-31 21:00:00+00:00,21,5,31,12,winter,29262,57973.178937,2022-12-31 22:00:00+01:00,30490.75,-1.07,,


In [977]:
df_smard_forecasted_generation.loc[~(df_smard_forecasted_generation['datetime_cet'].isna()) &
                                  ~(df_smard_forecasted_generation['forecasted_generation_smard_mwh'].isna())]

Unnamed: 0,datetime_cet,forecasted_generation_smard_mwh
0,2018-01-01 00:00:00+01:00,28432.25
1,2018-01-01 01:00:00+01:00,29632.75
2,2018-01-01 02:00:00+01:00,30978.75
3,2018-01-01 03:00:00+01:00,32154.25
4,2018-01-01 04:00:00+01:00,33045.50
...,...,...
24090,2023-08-31 19:00:00+02:00,9062.75
24091,2023-08-31 20:00:00+02:00,8645.25
24092,2023-08-31 21:00:00+02:00,9225.00
24093,2023-08-31 22:00:00+02:00,9885.75


In [968]:
df_smard_forecasted_generation.set_index('datetime_cet')

Unnamed: 0_level_0,forecasted_generation_smard_mwh
datetime_cet,Unnamed: 1_level_1
2018-01-01 00:00:00+01:00,28432.25
2018-01-01 01:00:00+01:00,29632.75
2018-01-01 02:00:00+01:00,30978.75
2018-01-01 03:00:00+01:00,32154.25
2018-01-01 04:00:00+01:00,33045.50
...,...
NaT,
NaT,
NaT,
NaT,


In [1064]:
# Generated data; the response variable `y`
# keep the datetime_cet column for reference
df_main = df_main.merge(df_smard_generated, left_on='datetime_utc', right_on='datetime_cet', how='left')

In [1065]:
df_main = df_main.merge(df_smard_forecasted_generation, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

In [1066]:
df_main = df_main.merge(df_smard_market_price, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

In [1067]:
df_main = df_main.merge(df_smard_installed_capacity, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

In [1068]:
df_main.drop(df_main.filter(regex='_remove$').columns, axis=1, inplace=True)

In [1069]:
df_main.isna().sum()

datetime_utc                                         0
hour                                                 0
day_of_week                                          0
day_of_month                                         0
month_number                                         0
year                                                 0
meteorological_season                                0
turbines_in_operation                                0
total_nominal_capacity_operational_turbines_mw       0
datetime_cet                                         2
generated_smard_mwh                                  2
forecasted_generation_smard_mwh                     26
day_ahead_price_eur_mwh                           6552
total_nominal_capacity_smard_mw                   8018
dtype: int64

In [1070]:
df_main.loc[df_main['forecasted_generation_smard_mwh'].isna()]

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,year,meteorological_season,turbines_in_operation,total_nominal_capacity_operational_turbines_mw,datetime_cet,generated_smard_mwh,forecasted_generation_smard_mwh,day_ahead_price_eur_mwh,total_nominal_capacity_smard_mw
43559,2022-12-20 23:00:00+00:00,23,1,20,12,2022,winter,29240,57831.253937,2022-12-21 00:00:00+01:00,16965.0,,182.11,
43560,2022-12-21 00:00:00+00:00,0,2,21,12,2022,winter,29245,57850.803937,2022-12-21 01:00:00+01:00,16051.25,,174.76,
43561,2022-12-21 01:00:00+00:00,1,2,21,12,2022,winter,29245,57850.803937,2022-12-21 02:00:00+01:00,15389.0,,178.93,
43562,2022-12-21 02:00:00+00:00,2,2,21,12,2022,winter,29245,57850.803937,2022-12-21 03:00:00+01:00,14840.25,,167.93,
43563,2022-12-21 03:00:00+00:00,3,2,21,12,2022,winter,29245,57850.803937,2022-12-21 04:00:00+01:00,13779.0,,168.96,
43564,2022-12-21 04:00:00+00:00,4,2,21,12,2022,winter,29245,57850.803937,2022-12-21 05:00:00+01:00,12826.0,,180.0,
43565,2022-12-21 05:00:00+00:00,5,2,21,12,2022,winter,29245,57850.803937,2022-12-21 06:00:00+01:00,11942.0,,218.57,
43566,2022-12-21 06:00:00+00:00,6,2,21,12,2022,winter,29245,57850.803937,2022-12-21 07:00:00+01:00,11285.5,,253.16,
43567,2022-12-21 07:00:00+00:00,7,2,21,12,2022,winter,29245,57850.803937,2022-12-21 08:00:00+01:00,10974.25,,273.0,
43568,2022-12-21 08:00:00+00:00,8,2,21,12,2022,winter,29245,57850.803937,2022-12-21 09:00:00+01:00,11002.0,,269.84,


In [1019]:
df_main.isna().sum()

datetime_utc                                         0
hour                                                 0
day_of_week                                          0
day_of_month                                         0
month_number                                         0
year                                                 0
meteorological_season                                0
turbines_in_operation                                0
total_nominal_capacity_operational_turbines_mw       0
datetime_cet                                         2
generated_smard_mwh                                  2
forecasted_generation_smard_mwh                   8018
day_ahead_price_eur_mwh                           6552
total_nominal_capacity_smard_mw                   8018
dtype: int64

# HERE!!!!!! ------

In [1038]:
df_smard_forecasted_generation.loc[df_smard_forecasted_generation['datetime_cet'] > pd.to_datetime('2018-10-27 18:00:00+00:00', utc=True)].head(10)

Unnamed: 0,datetime_cet,forecasted_generation_smard_mwh
7196,2018-10-27 21:00:00+02:00,5955.75
7197,2018-10-27 22:00:00+02:00,6557.5
7198,2018-10-27 23:00:00+02:00,7169.5
7199,2018-10-28 00:00:00+02:00,8617.25
7200,2018-10-28 01:00:00+02:00,9332.5
7201,2018-10-28 02:00:00+02:00,10032.5
7202,2018-10-28 02:00:00+01:00,10562.0
7203,2018-10-28 03:00:00+01:00,11291.75
7204,2018-10-28 04:00:00+01:00,12070.75
7205,2018-10-28 05:00:00+01:00,12746.0


In [None]:
28.10.2018

In [1023]:
df_main.loc[df_main['forecasted_generation_smard_mwh'].isna()].head(20)

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,year,meteorological_season,turbines_in_operation,total_nominal_capacity_operational_turbines_mw,datetime_cet,generated_smard_mwh,forecasted_generation_smard_mwh,day_ahead_price_eur_mwh,total_nominal_capacity_smard_mw
35807,2022-01-31 23:00:00+00:00,23,0,31,1,2022,winter,28971,55947.742347,2022-02-01 00:00:00+01:00,17846.75,,160.15,
35808,2022-02-01 00:00:00+00:00,0,1,1,2,2022,winter,28970,55947.142347,2022-02-01 01:00:00+01:00,17980.75,,154.54,
35809,2022-02-01 01:00:00+00:00,1,1,1,2,2022,winter,28970,55947.142347,2022-02-01 02:00:00+01:00,18460.75,,156.24,
35810,2022-02-01 02:00:00+00:00,2,1,1,2,2022,winter,28970,55947.142347,2022-02-01 03:00:00+01:00,18544.5,,155.17,
35811,2022-02-01 03:00:00+00:00,3,1,1,2,2022,winter,28970,55947.142347,2022-02-01 04:00:00+01:00,19305.75,,165.13,
35812,2022-02-01 04:00:00+00:00,4,1,1,2,2022,winter,28970,55947.142347,2022-02-01 05:00:00+01:00,20852.25,,163.01,
35813,2022-02-01 05:00:00+00:00,5,1,1,2,2022,winter,28970,55947.142347,2022-02-01 06:00:00+01:00,23527.25,,206.15,
35814,2022-02-01 06:00:00+00:00,6,1,1,2,2022,winter,28970,55947.142347,2022-02-01 07:00:00+01:00,25740.75,,239.17,
35815,2022-02-01 07:00:00+00:00,7,1,1,2,2022,winter,28970,55947.142347,2022-02-01 08:00:00+01:00,27419.0,,243.72,
35816,2022-02-01 08:00:00+00:00,8,1,1,2,2022,winter,28970,55947.142347,2022-02-01 09:00:00+01:00,29627.25,,213.0,


In [966]:
df_main.loc[:, ['datetime_utc', 'datetime_cet', 'forecasted_generation_smard_mwh']].loc[df_main['forecasted_generation_smard_mwh'].isna()]

Unnamed: 0,datetime_utc,datetime_cet,forecasted_generation_smard_mwh
8759,2018-12-31 23:00:00+00:00,2019-01-01 00:00:00+01:00,
8760,2019-01-01 00:00:00+00:00,2019-01-01 01:00:00+01:00,
8761,2019-01-01 01:00:00+00:00,2019-01-01 02:00:00+01:00,
8762,2019-01-01 02:00:00+00:00,2019-01-01 03:00:00+01:00,
8763,2019-01-01 03:00:00+00:00,2019-01-01 04:00:00+01:00,
...,...,...,...
43820,2022-12-31 20:00:00+00:00,2022-12-31 21:00:00+01:00,
43821,2022-12-31 21:00:00+00:00,2022-12-31 22:00:00+01:00,
43822,2022-12-31 22:00:00+00:00,2022-12-31 23:00:00+01:00,
49654,2023-08-31 22:00:00+00:00,NaT,


In [950]:
# df_smard_market_price.drop_duplicates(subset='datetime_cet')

In [951]:
# df_smard_market_price.loc[df_smard_market_price.duplicated(subset='datetime_cet')].sample(5)

In [952]:
# df_smard_market_price.loc[df_smard_market_price['datetime_cet'] == pd.to_datetime('2022-01-28 04:00:00+01:00')]

In [953]:
df_smard_market_price['datetime_cet'].value_counts()

datetime_cet
2018-01-01 00:00:00+01:00    1
2021-10-10 01:00:00+02:00    1
2021-10-11 03:00:00+02:00    1
2021-10-11 04:00:00+02:00    1
2021-10-11 05:00:00+02:00    1
                            ..
2019-11-21 18:00:00+01:00    1
2019-11-21 19:00:00+01:00    1
2019-11-21 20:00:00+01:00    1
2019-11-21 21:00:00+01:00    1
2023-08-31 23:00:00+02:00    1
Name: count, Length: 49655, dtype: int64

In [906]:
df_smard_market_price

Unnamed: 0,datetime_cet,day_ahead_price_eur_mwh
0,2018-01-01 00:00:00+01:00,
1,2018-01-01 01:00:00+01:00,
2,2018-01-01 02:00:00+01:00,
3,2018-01-01 03:00:00+01:00,
4,2018-01-01 04:00:00+01:00,
...,...,...
50394,2023-08-31 19:00:00+02:00,148.55
50395,2023-08-31 20:00:00+02:00,139.15
50396,2023-08-31 21:00:00+02:00,125.50
50397,2023-08-31 22:00:00+02:00,106.03


In [887]:
# The day-ahead price data
# -> only add suffix to overlapping columns in right df being merged
df_main = df_main.merge(df_smard_market_price, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

# Installed capacity
df_main = df_main.merge(df_smard_installed_capacity, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

# The day-ahead forecasted generation
df_main = df_main.merge(df_smard_forecasted_generation, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

##### Merge operations

In [887]:
# Generated data; the response variable `y`
# keep the datetime_cet column for reference
df_main = df_main.merge(df_smard_generated, left_on='datetime_utc', right_on='datetime_cet', how='left')

# The day-ahead price data
# -> only add suffix to overlapping columns in right df being merged
df_main = df_main.merge(df_smard_market_price, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

# Installed capacity
df_main = df_main.merge(df_smard_installed_capacity, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

# The day-ahead forecasted generation
df_main = df_main.merge(df_smard_forecasted_generation, left_on='datetime_utc', right_on='datetime_cet', how='left', suffixes=(None, '_remove'))

##### Remove all the duplicate `datetime_cet` columns with suffix `_remove`

In [888]:
# filter using regex; $ binds the expression to end of a string
df_main.drop(df_main.filter(regex='_remove$').columns, axis=1, inplace=True)

In [889]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50400 entries, 0 to 50399
Data columns (total 13 columns):
 #   Column                                          Non-Null Count  Dtype              
---  ------                                          --------------  -----              
 0   datetime_utc                                    50400 non-null  datetime64[ns, UTC]
 1   hour                                            50400 non-null  int32              
 2   day_of_week                                     50400 non-null  int32              
 3   day_of_month                                    50400 non-null  int32              
 4   month_number                                    50400 non-null  int32              
 5   meteorological_season                           50400 non-null  object             
 6   turbines_in_operation                           50400 non-null  int64              
 7   total_nominal_capacity_operational_turbines_mw  50400 non-null  float64            
 

In [891]:
df_main.isna().sum()

datetime_utc                                          0
hour                                                  0
day_of_week                                           0
day_of_month                                          0
month_number                                          0
meteorological_season                                 0
turbines_in_operation                                 0
total_nominal_capacity_operational_turbines_mw        0
datetime_cet                                          2
generated_smard_mwh                                   2
day_ahead_price_eur_mwh                            6552
total_nominal_capacity_smard_mw                    8018
forecasted_generation_smard_mwh                   25562
dtype: int64

In [892]:
df_main

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,meteorological_season,turbines_in_operation,total_nominal_capacity_operational_turbines_mw,datetime_cet,generated_smard_mwh,day_ahead_price_eur_mwh,total_nominal_capacity_smard_mw,forecasted_generation_smard_mwh
0,2018-01-01 00:00:00+00:00,0,0,1,1,winter,27468,49734.697897,2018-01-01 01:00:00+01:00,29638.00,,51633.0,29632.75
1,2018-01-01 01:00:00+00:00,1,0,1,1,winter,27468,49734.697897,2018-01-01 02:00:00+01:00,30173.75,,51633.0,30978.75
2,2018-01-01 02:00:00+00:00,2,0,1,1,winter,27468,49734.697897,2018-01-01 03:00:00+01:00,31021.50,,51633.0,32154.25
3,2018-01-01 03:00:00+00:00,3,0,1,1,winter,27468,49734.697897,2018-01-01 04:00:00+01:00,31015.00,,51633.0,33045.50
4,2018-01-01 04:00:00+00:00,4,0,1,1,winter,27468,49734.697897,2018-01-01 05:00:00+01:00,31534.00,,51633.0,33644.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,2023-08-31 19:00:00+00:00,19,3,31,8,summer,29445,59571.404737,2023-08-31 21:00:00+02:00,11080.75,125.50,57590.0,9225.00
50396,2023-08-31 20:00:00+00:00,20,3,31,8,summer,29445,59571.404737,2023-08-31 22:00:00+02:00,11553.25,106.03,57590.0,9885.75
50397,2023-08-31 21:00:00+00:00,21,3,31,8,summer,29445,59571.404737,2023-08-31 23:00:00+02:00,11290.00,96.89,57590.0,10282.50
50398,2023-08-31 22:00:00+00:00,22,3,31,8,summer,29445,59571.404737,NaT,,,,


In [893]:
df_main['datetime_utc'].value_counts().head(20)

datetime_utc
2022-01-19 09:00:00+00:00    2
2022-01-27 15:00:00+00:00    2
2022-01-27 06:00:00+00:00    2
2022-01-27 07:00:00+00:00    2
2022-01-27 08:00:00+00:00    2
2022-01-27 09:00:00+00:00    2
2022-01-27 10:00:00+00:00    2
2022-01-27 11:00:00+00:00    2
2022-01-27 12:00:00+00:00    2
2022-01-27 13:00:00+00:00    2
2022-01-27 14:00:00+00:00    2
2022-01-27 16:00:00+00:00    2
2022-01-26 05:00:00+00:00    2
2022-01-27 17:00:00+00:00    2
2022-01-27 18:00:00+00:00    2
2022-01-27 19:00:00+00:00    2
2022-01-27 20:00:00+00:00    2
2022-01-27 21:00:00+00:00    2
2022-01-27 22:00:00+00:00    2
2022-01-27 23:00:00+00:00    2
Name: count, dtype: int64

In [894]:
df_main.loc[df_main['datetime_utc'] == pd.to_datetime('2022-01-19 09:00:00+00:00')]

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,meteorological_season,turbines_in_operation,total_nominal_capacity_operational_turbines_mw,datetime_cet,generated_smard_mwh,day_ahead_price_eur_mwh,total_nominal_capacity_smard_mw,forecasted_generation_smard_mwh
35947,2022-01-19 09:00:00+00:00,9,2,19,1,winter,28968,55923.362347,2022-01-19 10:00:00+01:00,14687.75,221.75,55289.0,13588.5
35948,2022-01-19 09:00:00+00:00,9,2,19,1,winter,28968,55923.362347,2022-01-19 10:00:00+01:00,14687.75,221.75,55289.0,13588.5
