# Building the ML dataframe

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr

In [3]:
# Set max number of columns to display; default 20
pd.options.display.max_columns = 80

In [4]:
# Data directories live one level above the current working directory.
# `data_path` is where data files are downloaded to; `data_push_path` points
# at the sibling 'data_to_push' folder.
cwd_path = Path.cwd()
data_path, data_push_path = (
    cwd_path.parent / folder_name for folder_name in ('data', 'data_to_push')
)

#### Load in latest wind turbine data

In [375]:
# Read in latest turbine data
# Now only 30,642 after removing 3 turbines outside bounding box
df_turbines = pd.read_pickle(data_push_path / 'df_turbines_knn_blades_haversine_elevation_utc.pkl')
df_turbines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30642 entries, 0 to 30641
Data columns (total 30 columns):
 #   Column                                            Non-Null Count  Dtype              
---  ------                                            --------------  -----              
 0   EinheitMastrNummer                                30642 non-null  object             
 1   DatumLetzteAktualisierung                         30642 non-null  datetime64[ns, UTC]
 2   Bundesland                                        30642 non-null  object             
 3   Postleitzahl                                      30642 non-null  int64              
 4   Ort                                               30642 non-null  object             
 5   Laengengrad                                       30642 non-null  float64            
 6   Breitengrad                                       30642 non-null  float64            
 7   Registrierungsdatum                               30642 non-null  d

In [376]:
# List of datetime column names for convenient indexing later
turbine_date_columns = [
    'DatumLetzteAktualisierung',
    'Registrierungsdatum',
    'Inbetriebnahmedatum',
    'DatumEndgueltigeStilllegung',
    'DatumBeginnVoruebergehendeStilllegung',
    'DatumWiederaufnahmeBetrieb'
]

In [371]:
# # Cast all tz naive datetime64 columns to UTC tz aware
# df_turbines[turbine_date_columns] = df_turbines[turbine_date_columns].apply(lambda series: series.dt.tz_localize('UTC'), axis=0)

# # save to pickle
# df_turbines.to_pickle(data_push_path / 'df_turbines_knn_blades_haversine_elevation_utc.pkl')

#### Load in SMARD data

In [159]:
# Get all SMARD CSV files, load into dataframes, and concat them

def load_SMARD_data(smard_dir=None):
    """
    Load the SMARD 'Realisierte_Erzeugung' CSV exports into one dataframe.

    Every file in the SMARD directory whose name starts with
    'Realisierte_Erzeugung' is read (semicolon separated) and the files are
    concatenated in sorted file-name order. Only the onshore wind generation
    column is kept, alongside a timezone-aware timestamp built from the date
    and the interval START time.

    Parameters
    ----------
    smard_dir : str or pathlib.Path, optional
        Directory to search for the CSV files. Defaults to
        ``data_path.joinpath('SMARD')``.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'datetime_cet' (tz-aware, 'CET' zone) and
        'onshore_generated_mwh' (float), on a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the directory contains no 'Realisierte_Erzeugung*' file.
    """
    if smard_dir is None:
        smard_dir = data_path.joinpath('SMARD')

    # iterdir() yields files in arbitrary order. Sort them so the rows are
    # deterministic; tz_localize(ambiguous='infer') below relies on row order
    # to resolve the repeated hour at the autumn DST change.
    # NOTE(review): this assumes file names sort chronologically - confirm for
    # the SMARD export naming scheme in use.
    csv_paths = sorted(
        filepath for filepath in Path(smard_dir).iterdir()
        if filepath.name.startswith('Realisierte_Erzeugung')
    )
    if not csv_paths:
        raise ValueError(f"No 'Realisierte_Erzeugung*' files found in {smard_dir}")

    list_of_dataframes = [pd.read_csv(filepath, sep=';') for filepath in csv_paths]

    # Expected row count, used for the sanity check printed at the end
    num_of_rows = sum(len(frame) for frame in list_of_dataframes)

    # ignore_index avoids duplicated index labels when several files are loaded
    df = pd.concat(list_of_dataframes, ignore_index=True)

    # Select and rename the columns to keep.
    # Note: the interval start time is used to create the timestamps.
    # ERA5 surface parameters are instantaneous so can't perfectly align anyway.
    # Chaining .rename() (instead of inplace=True on a column slice) yields an
    # independent frame, so the assignments below do not warn.
    df = df[[
        'Datum',
        'Anfang',
        # 'Ende',
        'Wind Onshore [MWh] Berechnete Auflösungen'
    ]].rename(columns={
        'Datum': 'date',
        'Anfang': 'interval_start_time',
        # 'Ende': 'interval_end_time',
        'Wind Onshore [MWh] Berechnete Auflösungen': 'onshore_generated_mwh'
    })

    # Handle dates and times to create unified datetime64 timestamps
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
    df['start_time_delta'] = pd.to_timedelta(df['interval_start_time'] + ':00')
    df['datetime_cet'] = df['date'] + df['start_time_delta']
    # Add CET timezone info and infer change from CET->CEST->CET, etc
    df['datetime_cet'] = df['datetime_cet'].dt.tz_localize(tz='CET', ambiguous='infer')

    # Keep only the final columns, in the desired order
    df = df[['datetime_cet', 'onshore_generated_mwh']].copy()

    # Convert European thousands and decimal separators in values to (US/UK) decimal full stop format
    translation_table = str.maketrans({'.': None, ',': '.'})
    df['onshore_generated_mwh'] = df['onshore_generated_mwh'].str.translate(translation_table).astype(float)

    print(f'Number of rows match up: {num_of_rows == len(df)}')

    return df

In [161]:
# Load SMARD data
df_smard = load_SMARD_data()
df_smard.info()

Number of rows match up: True
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   datetime_cet           8760 non-null   datetime64[ns, CET]
 1   onshore_generated_mwh  8760 non-null   float64            
dtypes: datetime64[ns, CET](1), float64(1)
memory usage: 137.0 KB


In [162]:
# df_smard[df_smard['datetime'] < pd.Timestamp('2022-10-30')]

In [163]:
df_smard.head(3)

Unnamed: 0,datetime_cet,onshore_generated_mwh
0,2022-01-01 00:00:00+01:00,25487.75
1,2022-01-01 01:00:00+01:00,24280.5
2,2022-01-01 02:00:00+01:00,23114.0


-----

## Begin building ML dataframe
- Start with one full year: 2022
    - Then bring in another year like 2021 and run through the same transformations and then concat along datetimeindex?
- Good resource on time-related feature engineering: https://scikit-learn.org/stable/auto_examples/applications/plot_cyclical_feature_engineering.html

In [140]:
# Year 2022: hourly, UTC-aware timestamps from 00:00 on 1 Jan to 23:00 on 31 Dec.
# The lowercase 'h' alias is used because the uppercase 'H' frequency alias
# is deprecated since pandas 2.2.
datetime_index_utc = pd.date_range(
    start='2022-01-01',
    end='2022-12-31 23:59:59',
    freq='h',
    name='datetime_utc',
    tz='UTC',
)
datetime_index_utc

DatetimeIndex(['2022-01-01 00:00:00+00:00', '2022-01-01 01:00:00+00:00',
               '2022-01-01 02:00:00+00:00', '2022-01-01 03:00:00+00:00',
               '2022-01-01 04:00:00+00:00', '2022-01-01 05:00:00+00:00',
               '2022-01-01 06:00:00+00:00', '2022-01-01 07:00:00+00:00',
               '2022-01-01 08:00:00+00:00', '2022-01-01 09:00:00+00:00',
               ...
               '2022-12-31 14:00:00+00:00', '2022-12-31 15:00:00+00:00',
               '2022-12-31 16:00:00+00:00', '2022-12-31 17:00:00+00:00',
               '2022-12-31 18:00:00+00:00', '2022-12-31 19:00:00+00:00',
               '2022-12-31 20:00:00+00:00', '2022-12-31 21:00:00+00:00',
               '2022-12-31 22:00:00+00:00', '2022-12-31 23:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='datetime_utc', length=8760, freq='H')

In [141]:
# Create dataframe from datetimeindex, reset index and drop the duplicated datetime column
# datetime_index_utc.to_frame().reset_index(drop=True, inplace=True)

In [142]:
df = datetime_index_utc.to_frame().reset_index(drop=True)
df

Unnamed: 0,datetime_utc
0,2022-01-01 00:00:00+00:00
1,2022-01-01 01:00:00+00:00
2,2022-01-01 02:00:00+00:00
3,2022-01-01 03:00:00+00:00
4,2022-01-01 04:00:00+00:00
...,...
8755,2022-12-31 19:00:00+00:00
8756,2022-12-31 20:00:00+00:00
8757,2022-12-31 21:00:00+00:00
8758,2022-12-31 22:00:00+00:00


In [143]:
# Derive calendar features from the UTC timestamp via the .dt accessor.
# Maps new column name -> .dt attribute; day_of_week runs 0-6 (Monday-Sunday).
calendar_features = {
    'hour': 'hour',
    'day_of_week': 'dayofweek',
    'day_of_month': 'day',
    'month_number': 'month',
}
for feature_name, dt_attribute in calendar_features.items():
    df[feature_name] = getattr(df['datetime_utc'].dt, dt_attribute)

# Function to get meteorological season based on month number (1-12)
# Should I just go ahead and encode the discrete numerical value?
def get_meteorological_season(month):
    """
    Map a month number to its meteorological season name.

    Months 3-5 are 'spring', 6-8 'summer' and 9-11 'autumn'. Any other value
    (including 12, 1 and 2) is reported as 'winter'.
    """
    season_bounds = (
        (3, 5, 'spring'),
        (6, 8, 'summer'),
        (9, 11, 'autumn'),
    )
    for first_month, last_month, season in season_bounds:
        if first_month <= month <= last_month:
            return season
    # Everything outside March-November falls through to winter
    return 'winter'
    
df['meteorological_season'] = df['month_number'].apply(get_meteorological_season)

In [144]:
df

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,meteorological_season
0,2022-01-01 00:00:00+00:00,0,5,1,1,winter
1,2022-01-01 01:00:00+00:00,1,5,1,1,winter
2,2022-01-01 02:00:00+00:00,2,5,1,1,winter
3,2022-01-01 03:00:00+00:00,3,5,1,1,winter
4,2022-01-01 04:00:00+00:00,4,5,1,1,winter
...,...,...,...,...,...,...
8755,2022-12-31 19:00:00+00:00,19,5,31,12,winter
8756,2022-12-31 20:00:00+00:00,20,5,31,12,winter
8757,2022-12-31 21:00:00+00:00,21,5,31,12,winter
8758,2022-12-31 22:00:00+00:00,22,5,31,12,winter


----

## Number of active turbines per hour and total active nominal power 
- Convert all date columns in the turbine dataframe to time-zone-aware UTC, then re-save to pickle. Does pickling keep the tz-aware dtype?
    - Yes!

In [377]:
# Date columns of every turbine whose current status is anything other than 'In Betrieb'
df_turbines.loc[df_turbines['EinheitBetriebsstatus'] != 'In Betrieb', turbine_date_columns]

Unnamed: 0,DatumLetzteAktualisierung,Registrierungsdatum,Inbetriebnahmedatum,DatumEndgueltigeStilllegung,DatumBeginnVoruebergehendeStilllegung,DatumWiederaufnahmeBetrieb
26,2022-12-01 07:52:10.180000+00:00,2019-02-01 00:00:00+00:00,2000-08-03 00:00:00+00:00,2021-03-01 00:00:00+00:00,NaT,NaT
27,2022-12-01 07:52:10.180000+00:00,2019-02-01 00:00:00+00:00,2000-08-04 00:00:00+00:00,2021-03-01 00:00:00+00:00,NaT,NaT
28,2022-12-01 07:52:10.180000+00:00,2019-02-01 00:00:00+00:00,2000-08-03 00:00:00+00:00,2021-03-01 00:00:00+00:00,NaT,NaT
29,2022-12-01 07:52:10.180000+00:00,2019-02-01 00:00:00+00:00,2000-08-09 00:00:00+00:00,2021-03-01 00:00:00+00:00,NaT,NaT
30,2022-12-01 07:52:10.180000+00:00,2019-02-01 00:00:00+00:00,2000-08-07 00:00:00+00:00,2021-03-01 00:00:00+00:00,NaT,NaT
...,...,...,...,...,...,...
30559,2023-05-31 19:42:04.614054100+00:00,2022-12-15 00:00:00+00:00,2023-02-04 00:00:00+00:00,NaT,2023-05-19 00:00:00+00:00,NaT
30561,2023-03-16 07:00:12.982323600+00:00,2022-10-26 00:00:00+00:00,1992-08-07 00:00:00+00:00,NaT,2021-12-31 00:00:00+00:00,NaT
30570,2023-03-08 09:45:31.647308700+00:00,2022-11-06 00:00:00+00:00,1991-06-24 00:00:00+00:00,NaT,2023-01-01 00:00:00+00:00,NaT
30574,2023-02-19 09:31:58.205552900+00:00,2022-12-05 00:00:00+00:00,2022-12-05 00:00:00+00:00,2023-02-01 00:00:00+00:00,NaT,NaT


##### I'm expecting the number of active turbines at the start of 2022 to be around 30,049!
- Way to check this:
    - `df_turbines[turbine_date_columns][~(df_turbines['DatumEndgueltigeStilllegung'] < pd.to_datetime('2022-01-01', utc=True))]`
    - The c.600 turbines excluded by this filter (final decommissioning date before 2022-01-01) are no longer active come the start of 2022!

In [317]:
# df_turbines[turbine_date_columns][(df_turbines['Inbetriebnahmedatum'] < pd.to_datetime('2022-01-01'))]

# This looks good! Need to validate!

In [312]:
df_turbines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30642 entries, 0 to 30641
Data columns (total 30 columns):
 #   Column                                            Non-Null Count  Dtype              
---  ------                                            --------------  -----              
 0   EinheitMastrNummer                                30642 non-null  object             
 1   DatumLetzteAktualisierung                         30642 non-null  datetime64[ns]     
 2   Bundesland                                        30642 non-null  object             
 3   Postleitzahl                                      30642 non-null  int64              
 4   Ort                                               30642 non-null  object             
 5   Laengengrad                                       30642 non-null  float64            
 6   Breitengrad                                       30642 non-null  float64            
 7   Registrierungsdatum                               30642 non-null  d

In [378]:
def count_active_turbines(df, df_turbines):
    """
    Add a 'num_active_turbines' column to `df`, one count per timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'datetime_utc' column. It is modified in place (the
        new column is added) and also returned.
    df_turbines : pandas.DataFrame
        Turbine register with the date columns 'Inbetriebnahmedatum',
        'DatumEndgueltigeStilllegung',
        'DatumBeginnVoruebergehendeStilllegung' and
        'DatumWiederaufnahmeBetrieb'.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, now with 'num_active_turbines'.
    """
    # Look the date columns up once; they do not change between timestamps
    commissioned_on = df_turbines['Inbetriebnahmedatum']
    decommissioned_on = df_turbines['DatumEndgueltigeStilllegung']
    shutdown_began_on = df_turbines['DatumBeginnVoruebergehendeStilllegung']
    resumed_on = df_turbines['DatumWiederaufnahmeBetrieb']
    never_resumed = resumed_on.isna()

    def is_operational(timestamp):
        """
        Count the turbines that are operational at the given [hourly] timestamp
        """
        # Went into operation on or before the timestamp
        commissioned = commissioned_on <= timestamp

        # Permanently decommissioned on or before the timestamp
        retired = decommissioned_on <= timestamp

        # Temporary shutdown began on or before the timestamp and operation has
        # not resumed by then.
        # Note: some turbines go straight from a temporary shutdown to
        # decommissioned without ever resuming, so the resume date may be missing.
        offline = (shutdown_began_on <= timestamp) & ((resumed_on > timestamp) | never_resumed)

        # Operational = commissioned and neither retired nor offline;
        # summing the boolean series counts the True entries
        return (commissioned & ~(retired | offline)).sum()

    df['num_active_turbines'] = df['datetime_utc'].apply(is_operational)
    return df

In [379]:
# About 8s to run for all 8,760 hours in the year
df = count_active_turbines(df, df_turbines)

In [380]:
df

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,meteorological_season,num_active_turbines
0,2022-01-01 00:00:00+00:00,0,5,1,1,winter,28965
1,2022-01-01 01:00:00+00:00,1,5,1,1,winter,28965
2,2022-01-01 02:00:00+00:00,2,5,1,1,winter,28965
3,2022-01-01 03:00:00+00:00,3,5,1,1,winter,28965
4,2022-01-01 04:00:00+00:00,4,5,1,1,winter,28965
...,...,...,...,...,...,...,...
8755,2022-12-31 19:00:00+00:00,19,5,31,12,winter,29262
8756,2022-12-31 20:00:00+00:00,20,5,31,12,winter,29262
8757,2022-12-31 21:00:00+00:00,21,5,31,12,winter,29262
8758,2022-12-31 22:00:00+00:00,22,5,31,12,winter,29262


In [316]:
df['num_active_turbines'].describe()

count     8760.000000
mean     29090.161644
std         91.452150
min      28964.000000
25%      29001.000000
50%      29093.000000
75%      29183.000000
max      29277.000000
Name: num_active_turbines, dtype: float64

### Validate the function!

In [318]:
timestamp = df['datetime_utc'][0]
timestamp

Timestamp('2022-01-01 00:00:00+0000', tz='UTC')

In [345]:
started_operations = df_turbines['Inbetriebnahmedatum'] <= timestamp
started_operations

0         True
1         True
2         True
3         True
4         True
         ...  
30637    False
30638    False
30639    False
30640     True
30641    False
Name: Inbetriebnahmedatum, Length: 30642, dtype: bool

In [346]:
df_turbines[turbine_date_columns][-3:]

Unnamed: 0,Inbetriebnahmedatum,DatumEndgueltigeStilllegung,DatumBeginnVoruebergehendeStilllegung,DatumWiederaufnahmeBetrieb
30639,2023-07-20 00:00:00+00:00,NaT,NaT,NaT
30640,1996-01-11 00:00:00+00:00,NaT,NaT,NaT
30641,2023-06-23 00:00:00+00:00,NaT,NaT,NaT


In [347]:
already_decommissioned = df_turbines['DatumEndgueltigeStilllegung'] <= timestamp
already_decommissioned

0        False
1        False
2        False
3        False
4        False
         ...  
30637    False
30638    False
30639    False
30640    False
30641    False
Name: DatumEndgueltigeStilllegung, Length: 30642, dtype: bool

In [348]:
df_turbines[turbine_date_columns][already_decommissioned].notna().sum()

Inbetriebnahmedatum                      594
DatumEndgueltigeStilllegung              594
DatumBeginnVoruebergehendeStilllegung     14
DatumWiederaufnahmeBetrieb                 1
dtype: int64

In [349]:
still_in_maintenance_or_decommissioned = (df_turbines['DatumBeginnVoruebergehendeStilllegung'] <= timestamp) & \
              ((df_turbines['DatumWiederaufnahmeBetrieb'] > timestamp) | df_turbines['DatumWiederaufnahmeBetrieb'].isna())

In [350]:
# Show the turbines flagged by the mask built in the previous cell.
# (The mask is named `still_in_maintenance_or_decommissioned`; the shorter
# `still_in_maintenance` is not defined and raises NameError on a fresh kernel.)
df_turbines.loc[still_in_maintenance_or_decommissioned, turbine_date_columns]

Unnamed: 0,Inbetriebnahmedatum,DatumEndgueltigeStilllegung,DatumBeginnVoruebergehendeStilllegung,DatumWiederaufnahmeBetrieb
2159,2000-02-04 00:00:00+00:00,NaT,2019-09-18 00:00:00+00:00,NaT
7986,2000-09-30 00:00:00+00:00,NaT,2020-09-30 00:00:00+00:00,NaT
8173,2000-09-30 00:00:00+00:00,NaT,2021-09-12 00:00:00+00:00,NaT
8809,2010-03-25 00:00:00+00:00,2020-07-15 00:00:00+00:00,2020-07-14 00:00:00+00:00,NaT
8815,2010-03-23 00:00:00+00:00,2020-07-14 00:00:00+00:00,2020-07-13 00:00:00+00:00,NaT
8821,2010-03-18 00:00:00+00:00,2020-07-25 00:00:00+00:00,2020-07-24 00:00:00+00:00,NaT
8838,2010-03-19 00:00:00+00:00,2020-05-29 00:00:00+00:00,2020-05-28 00:00:00+00:00,NaT
8839,2010-03-22 00:00:00+00:00,2020-07-23 00:00:00+00:00,2020-07-22 00:00:00+00:00,NaT
8842,2010-03-19 00:00:00+00:00,2020-07-17 00:00:00+00:00,2020-07-16 00:00:00+00:00,NaT
8875,2003-12-06 00:00:00+00:00,2020-08-11 00:00:00+00:00,2020-08-10 00:00:00+00:00,NaT


In [353]:
operational_bool_series = started_operations & ~already_decommissioned & ~still_in_maintenance_or_decommissioned
operational_bool_series

0         True
1         True
2         True
3         True
4         True
         ...  
30637    False
30638    False
30639    False
30640     True
30641    False
Length: 30642, dtype: bool

In [356]:
operational_bool_series.value_counts()

True     28965
False     1677
Name: count, dtype: int64

In [355]:
operational_bool_series.sum()

28965

In [368]:
df_turbines[turbine_date_columns][operational_bool_series].sample(10)

Unnamed: 0,Inbetriebnahmedatum,DatumEndgueltigeStilllegung,DatumBeginnVoruebergehendeStilllegung,DatumWiederaufnahmeBetrieb
17716,2014-12-16 00:00:00+00:00,NaT,NaT,NaT
29116,2004-05-27 00:00:00+00:00,NaT,NaT,NaT
11725,2004-09-01 00:00:00+00:00,NaT,NaT,NaT
27521,2002-08-05 00:00:00+00:00,NaT,NaT,NaT
27559,2012-03-21 00:00:00+00:00,NaT,NaT,NaT
15835,2012-11-06 00:00:00+00:00,NaT,NaT,NaT
3172,2015-01-26 00:00:00+00:00,NaT,NaT,NaT
1353,2016-04-21 00:00:00+00:00,NaT,NaT,NaT
4695,2015-08-25 00:00:00+00:00,NaT,NaT,NaT
12692,2012-08-29 00:00:00+00:00,NaT,NaT,NaT


In [357]:
df_turbines[turbine_date_columns]

Unnamed: 0,Inbetriebnahmedatum,DatumEndgueltigeStilllegung,DatumBeginnVoruebergehendeStilllegung,DatumWiederaufnahmeBetrieb
0,2017-09-01 00:00:00+00:00,NaT,NaT,NaT
1,2017-09-28 00:00:00+00:00,NaT,NaT,NaT
2,2017-09-04 00:00:00+00:00,NaT,NaT,NaT
3,2017-08-31 00:00:00+00:00,NaT,NaT,NaT
4,2017-01-11 00:00:00+00:00,NaT,NaT,NaT
...,...,...,...,...
30637,2023-06-24 00:00:00+00:00,NaT,NaT,NaT
30638,2023-07-28 00:00:00+00:00,NaT,NaT,NaT
30639,2023-07-20 00:00:00+00:00,NaT,NaT,NaT
30640,1996-01-11 00:00:00+00:00,NaT,NaT,NaT


-----

## Test merging SMARD label data onto `df` on key datetime

In [167]:
# Left-join the SMARD generation values onto the hourly frame, matching the
# UTC timestamps against the tz-aware SMARD timestamps.
df_merged = df.merge(
    df_smard,
    how='left',
    left_on='datetime_utc',
    right_on='datetime_cet',
)
df_merged

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,meteorological_season,datetime_cet,onshore_generated_mwh
0,2022-01-01 00:00:00+00:00,0,5,1,1,winter,2022-01-01 01:00:00+01:00,24280.50
1,2022-01-01 01:00:00+00:00,1,5,1,1,winter,2022-01-01 02:00:00+01:00,23114.00
2,2022-01-01 02:00:00+00:00,2,5,1,1,winter,2022-01-01 03:00:00+01:00,21998.25
3,2022-01-01 03:00:00+00:00,3,5,1,1,winter,2022-01-01 04:00:00+01:00,20495.00
4,2022-01-01 04:00:00+00:00,4,5,1,1,winter,2022-01-01 05:00:00+01:00,19494.50
...,...,...,...,...,...,...,...,...
8755,2022-12-31 19:00:00+00:00,19,5,31,12,winter,2022-12-31 20:00:00+01:00,30505.00
8756,2022-12-31 20:00:00+00:00,20,5,31,12,winter,2022-12-31 21:00:00+01:00,30768.50
8757,2022-12-31 21:00:00+00:00,21,5,31,12,winter,2022-12-31 22:00:00+01:00,30490.75
8758,2022-12-31 22:00:00+00:00,22,5,31,12,winter,2022-12-31 23:00:00+01:00,29306.00


In [168]:
df_merged.head()

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,meteorological_season,datetime_cet,onshore_generated_mwh
0,2022-01-01 00:00:00+00:00,0,5,1,1,winter,2022-01-01 01:00:00+01:00,24280.5
1,2022-01-01 01:00:00+00:00,1,5,1,1,winter,2022-01-01 02:00:00+01:00,23114.0
2,2022-01-01 02:00:00+00:00,2,5,1,1,winter,2022-01-01 03:00:00+01:00,21998.25
3,2022-01-01 03:00:00+00:00,3,5,1,1,winter,2022-01-01 04:00:00+01:00,20495.0
4,2022-01-01 04:00:00+00:00,4,5,1,1,winter,2022-01-01 05:00:00+01:00,19494.5


In [169]:
df_merged.isna().sum()

datetime_utc             0
hour                     0
day_of_week              0
day_of_month             0
month_number             0
meteorological_season    0
datetime_cet             1
onshore_generated_mwh    1
dtype: int64

In [170]:
df_merged[7240:].head(15)

Unnamed: 0,datetime_utc,hour,day_of_week,day_of_month,month_number,meteorological_season,datetime_cet,onshore_generated_mwh
7240,2022-10-29 16:00:00+00:00,16,5,29,10,autumn,2022-10-29 18:00:00+02:00,4657.25
7241,2022-10-29 17:00:00+00:00,17,5,29,10,autumn,2022-10-29 19:00:00+02:00,5988.25
7242,2022-10-29 18:00:00+00:00,18,5,29,10,autumn,2022-10-29 20:00:00+02:00,7333.25
7243,2022-10-29 19:00:00+00:00,19,5,29,10,autumn,2022-10-29 21:00:00+02:00,8419.0
7244,2022-10-29 20:00:00+00:00,20,5,29,10,autumn,2022-10-29 22:00:00+02:00,8901.0
7245,2022-10-29 21:00:00+00:00,21,5,29,10,autumn,2022-10-29 23:00:00+02:00,9505.25
7246,2022-10-29 22:00:00+00:00,22,5,29,10,autumn,2022-10-30 00:00:00+02:00,6546.5
7247,2022-10-29 23:00:00+00:00,23,5,29,10,autumn,2022-10-30 01:00:00+02:00,9525.5
7248,2022-10-30 00:00:00+00:00,0,6,30,10,autumn,2022-10-30 02:00:00+02:00,9475.5
7249,2022-10-30 01:00:00+00:00,1,6,30,10,autumn,2022-10-30 02:00:00+01:00,9311.25


-------

## Looking at other SMARD data like market price of electricity 

In [152]:
price = pd.read_csv(data_path.joinpath('SMARD') / 'Gro_handelspreise_202201010000_202212312359_Stunde.csv', sep=';')

In [164]:
# Strip the '.' thousands separators and turn the ',' decimal separator into
# '.', then cast the price column to float.
translation_table = str.maketrans({'.': None, ',': '.'})
price_column = 'Deutschland/Luxemburg [€/MWh] Originalauflösungen'
price[price_column] = price[price_column].str.translate(translation_table).astype(float)

In [166]:
price['Deutschland/Luxemburg [€/MWh] Originalauflösungen'].describe()

count    8760.000000
mean      235.446143
std       142.809409
min       -19.040000
25%       134.197500
50%       208.340000
75%       310.080000
max       871.000000
Name: Deutschland/Luxemburg [€/MWh] Originalauflösungen, dtype: float64