# Build dataset from all market data

In [183]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import os
import warnings
from IPython.display import Image

![Dataset Timeline](media/individual_datasets_DE_timeline.png)
<!--  Image("media/individual_datasets_DE_timeline.png") -->

- DA (day-ahead)
    - DA 60min - DE-AT-LU (ENTSO-e)
    - DA 60min - DE-LU (ENTSO-e)
    - DA 15min - DE-LU (ENTSO-e)
- ID (intraday)
    - IP-index (netztransparenz)
- FCR (frequency containment reserve)
    - Procurement price 1W (SMARD)
    - Procurement price 24h (SMARD)
    - Procurement price 4h (SMARD)
- aFRR capacity (automatic frequency restoration reserve)
    - Cost of capacity 12h (SMARD)
    - Cost of capacity 4h (SMARD)
- aFRR energy (automatic frequency restoration reserve)
    - Cost of activation 15min (SMARD)

## Day-ahead markets

In [241]:
def clean_ENTSOE_data(path, colname, areacode):
    """Load all ENTSO-E day-ahead price CSV exports in a directory into one dataframe.

    Every ``*.csv`` file directly inside ``path`` is read. Per file, rows without a
    ``Currency`` value are dropped, the first 16 characters of the
    ``MTU (CET/CEST)`` label are parsed as the datetime index, entries of the first
    remaining column that consist only of characters other than digits and dots
    become NaN before the column is cast to float, and repeated timestamps keep
    their first row. The files are then combined, the ``areacode`` and ``Currency``
    columns are removed, ``'Day-ahead Price [EUR/MWh]'`` is renamed to ``colname``,
    rows with NaNs are dropped and the result is sorted by time.

    Parameters
    ----------
    path : str
        Directory holding the CSV files; a trailing separator is optional.
    colname : str
        Name given to the price column in the result.
    areacode : str
        Name of the bidding-zone column to discard, e.g. ``'BZN|DE-LU'``.

    Returns
    -------
    pandas.DataFrame
        Frame sorted by a unique datetime index with the price column named
        ``colname``. Empty (but still carrying the ``colname`` column) when the
        directory yields no usable CSV rows.
    """
    frames = []
    # os.listdir returns names in arbitrary order; sorting keeps the import log and
    # the "first row wins" de-duplication below reproducible between runs.
    for fname in sorted(os.listdir(path)):
        if not fname.endswith(".csv"):
            continue
        fpath = os.path.join(path, fname)  # works with or without a trailing separator
        print('Importing:', fpath)
        df_int = pd.read_csv(fpath).dropna(subset=['Currency'])  # Drop rows with no currency
        if df_int.empty:  # Nothing left in this file
            continue
        df_int = df_int.set_index('MTU (CET/CEST)')
        # Only the first 16 characters of the MTU label are parsed as the timestamp
        df_int.index = pd.to_datetime(df_int.index.str.slice(0, 16), format="%d.%m.%Y %H:%M")
        # Convert entries made up solely of non-digit, non-dot characters to NaN
        price_col = df_int.columns[0]
        df_int[price_col] = (
            df_int[price_col]
            .astype("string")
            .replace('^[^0-9.]+$', np.nan, regex=True)
            .astype(float)
        )
        frames.append(df_int[~df_int.index.duplicated(keep='first')])  # Drop duplicate indexes

    if not frames:
        # No CSV contributed any rows: hand back an empty, correctly shaped frame
        # instead of failing on the column drop below.
        return pd.DataFrame(columns=[colname], index=pd.DatetimeIndex([]), dtype=float)

    # Concatenate once (not inside the loop) and finish the clean-up on the whole set.
    df = (
        pd.concat(frames)
        .drop(columns=[areacode, 'Currency'])
        .rename(columns={'Day-ahead Price [EUR/MWh]': colname})
        .dropna()  # Drop rows with nans
        .sort_index(kind='mergesort')  # Stable sort: ties keep file order
    )
    # Timestamps repeated across files keep the row from the first file (sorted by name).
    return df[~df.index.duplicated(keep='first')]


# Root of the ENTSO-E exports plus one sub-directory per day-ahead dataset.
# The trailing comment on each line is the area-code column found in those files.
ENTSOE_DE_DATA_dir = "../../20_data/entsoe_data/DE"
DA_60min_DEATLU_subdir = "/DA_60min/DE-AT-LU/"  # 'BZN|DE-AT-LU'
DA_60min_DELU_subdir = "/DA_60min/DE-LU/"  # 'BZN|DE-LU'
DA_15min_DELU_subdir = "/DA_15min/DE-LU/"  # 'BZN|DE-LU'

# Load each dataset into its own single-column frame.
df_DA_60min_DEATLU = clean_ENTSOE_data(
    ENTSOE_DE_DATA_dir + DA_60min_DEATLU_subdir,
    colname='DA_60min_DEATLU',
    areacode='BZN|DE-AT-LU',
)
df_DA_60min_DELU = clean_ENTSOE_data(
    ENTSOE_DE_DATA_dir + DA_60min_DELU_subdir,
    colname='DA_60min_DELU',
    areacode='BZN|DE-LU',
)
df_DA_15min_DELU = clean_ENTSOE_data(
    ENTSOE_DE_DATA_dir + DA_15min_DELU_subdir,
    colname='DA_15min_DELU',
    areacode='BZN|DE-LU',
)
# Quick range check per frame, e.g.:
# print(df_DA_60min_DEATLU.index.min(), df_DA_60min_DEATLU.index.max())

# Put the hourly series on the 15-minute grid and stack all three frames.
# NOTE(review): asfreq('15min') is called without a fill method, so the inserted
# quarter-hour rows hold NaN -- confirm hourly prices should not be forward-filled.
df_DA = pd.concat(
    [
        df_DA_60min_DEATLU.asfreq('15min'),
        df_DA_60min_DELU.asfreq('15min'),
        df_DA_15min_DELU,
    ]
).sort_index()

# Rows sharing a timestamp (15min and 60min intervals) collapse into one row that
# takes the first non-null value of every column.
df_DA = df_DA.groupby(df_DA.index).first()
# df_DA.plot()

Importing: ../../20_data/entsoe_data/DE/DA_60min/DE-AT-LU/Day-ahead Prices_202001010000-202101010000.csv
Importing: ../../20_data/entsoe_data/DE/DA_60min/DE-AT-LU/Day-ahead Prices_202201010000-202301010000.csv
Importing: ../../20_data/entsoe_data/DE/DA_60min/DE-AT-LU/Day-ahead Prices_202101010000-202201010000.csv
Importing: ../../20_data/entsoe_data/DE/DA_60min/DE-AT-LU/Day-ahead Prices_201901010000-202001010000.csv
Importing: ../../20_data/entsoe_data/DE/DA_60min/DE-AT-LU/Day-ahead Prices_202401010000-202501010000.csv
Importing: ../../20_data/entsoe_data/DE/DA_60min/DE-AT-LU/Day-ahead Prices_202301010000-202401010000.csv
Importing: ../../20_data/entsoe_data/DE/DA_60min/DE-AT-LU/Day-ahead Prices_201801010000-201901010000.csv
Importing: ../../20_data/entsoe_data/DE/DA_60min/DE-AT-LU/Day-ahead Prices_201701010000-201801010000.csv
Importing: ../../20_data/entsoe_data/DE/DA_60min/DE-AT-LU/Day-ahead Prices_201601010000-201701010000.csv
Importing: ../../20_data/entsoe_data/DE/DA_60min/DE-AT-

## Intra-day markets

In [185]:
netztransparenz_DATA_dir = "../../20_data/netztransparenz"
IP_index = "/IP_index/Index Ausgleichsenergiepreis [2024-10-11 17-44-33].csv"

# The export is semicolon-separated
df_IP_index = pd.read_csv(netztransparenz_DATA_dir + IP_index, sep=";")

# Index each row by the start of its interval (date and time sit in separate columns)
df_IP_index.index = pd.to_datetime(
    df_IP_index['Datum von'] + " " + df_IP_index['(Uhrzeit) von'],
    format="%d.%m.%Y %H:%M",
)

# Keep only the price column and give it a short name
df_IP_index = (
    df_IP_index
    .drop(columns=df_IP_index.columns.difference(['ID AEP in €/MWh']))
    .rename(columns={'ID AEP in €/MWh': 'IP_index'})
)

# Prices are written with a decimal comma: swap it for a dot, then cast to float
df_IP_index = df_IP_index.assign(
    IP_index=df_IP_index['IP_index'].replace(',', '.', regex=True).astype(float)
)

# For repeated timestamps only the first row is kept
df_IP_index = df_IP_index.loc[~df_IP_index.index.duplicated(keep='first')]

# df_IP_index

Unnamed: 0,IP_index
2020-07-01 00:00:00,43.75
2020-07-01 00:15:00,37.76
2020-07-01 00:30:00,26.63
2020-07-01 00:45:00,20.38
2020-07-01 01:00:00,33.22
...,...
2024-10-10 22:45:00,115.49
2024-10-10 23:00:00,114.59
2024-10-10 23:15:00,97.92
2024-10-10 23:30:00,99.21


## FCR

### SMARD data

In [238]:
# SMARD import
# NOTE(review): the "Regelleistung data" cell further down assigns df_FCR again,
# so on a top-to-bottom run the frame built here is replaced before the join.
SMARD_dir = "../../20_data/SMARD"
FCR_subdir = "/FCR/"

# Short column names for the two SMARD value columns
FCR_column_renames = {
    'Volume procured [MW] Original resolutions': 'volume_procured_MW',
    'Procurement price [€/MW] Original resolutions': 'procurement_price_EUR_MW',
}

dirlist = os.listdir(SMARD_dir + FCR_subdir)
FCR_frames = [pd.DataFrame()]  # empty seed keeps the result a DataFrame when no csv is found
for fname in dirlist:
    if not fname.endswith(".csv"):
        continue
    print('Importing:', fname)
    df_int = pd.read_csv(SMARD_dir + FCR_subdir + fname, sep=";").set_index('Start date')
    df_int.index = pd.to_datetime(df_int.index, format="%b %d, %Y %I:%M %p")
    df_int = (
        df_int
        .drop(columns=['End date'])
        .rename(columns=FCR_column_renames)
        .replace(',', '', regex=True)  # strip commas from the numbers
        .replace('^[^0-9.]+$', np.nan, regex=True)  # entries without any digit or dot -> NaN
        .astype(float)
    )
    FCR_frames.append(df_int)

# Combine all files, order by time and keep the first row of any repeated timestamp
df_FCR = pd.concat(FCR_frames).sort_index()
df_FCR = df_FCR[~df_FCR.index.duplicated(keep='first')]

# Rows with NaNs are kept on purpose (no dropna here).
# Quick visual check: plt.plot(df_FCR.index, df_FCR['procurement_price_EUR_MW'])

Importing: Frequency_Containment_Reserve_201501010000_202101010000_Quarterhour.csv
Importing: Frequency_Containment_Reserve_202101010000_202501010000_Quarterhour.csv


### Regelleistung data

In [234]:
# Regelleistung, starting July 2019, 24h
regelleistung_dir = "../../20_data/regelleistung"
FCR_subdir = "/FCR/"

dirlist = os.listdir(regelleistung_dir + FCR_subdir)

# Columns that survive the per-file clean-up
FCR_keep_columns = ['period_start', 'FCR length', 'settled price [EUR/MW]']

df_FCR = pd.DataFrame()
for fname in dirlist:
    if not fname.endswith(".xlsx"):
        continue
    # Silence the warnings raised while the workbook is being read
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        df_int = pd.read_excel(regelleistung_dir + FCR_subdir + fname)

    # The second "_"-separated token of PRODUCTNAME is used as the start hour
    start_offset = df_int['PRODUCTNAME'].map(lambda name: name.split("_")[1] + ":00:00")
    df_int['period_start'] = df_int['DATE_FROM'] + pd.to_timedelta(start_offset)
    # Characters 7-11 equal to "00_24" mark a one-day product; everything else is labelled 4h
    df_int['FCR length'] = df_int['PRODUCTNAME'].map(
        lambda name: '1D' if name[7:12] == "00_24" else '4h'
    )
    print('Importing:', fname)

    # The settlement price column is prefixed either DE_ or GERMANY_
    if 'DE_SETTLEMENTCAPACITY_PRICE_[EUR/MW]' in df_int.columns:
        price_source = 'DE_SETTLEMENTCAPACITY_PRICE_[EUR/MW]'
    else:
        price_source = 'GERMANY_SETTLEMENTCAPACITY_PRICE_[EUR/MW]'
    df_int['settled price [EUR/MW]'] = df_int[price_source]

    df_int = (
        df_int
        .drop(columns=df_int.columns.difference(FCR_keep_columns))
        .set_index('period_start')
    )
    df_FCR = pd.concat([df_FCR, df_int])  # Concatenate to main dataframe

# Order by time and keep the first row of any repeated timestamp
df_FCR = df_FCR.sort_index()
df_FCR = df_FCR[~df_FCR.index.duplicated(keep='first')]

Importing: RESULT_OVERVIEW_CAPACITY_MARKET_FCR_2021-01-01_2021-12-31.xlsx
Importing: RESULT_OVERVIEW_CAPACITY_MARKET_FCR_2020-01-01_2020-12-31.xlsx
Importing: RESULT_OVERVIEW_CAPACITY_MARKET_FCR_2022-01-01_2022-12-31.xlsx
Importing: RESULT_OVERVIEW_CAPACITY_MARKET_FCR_2023-01-01_2023-12-31.xlsx
Importing: RESULT_OVERVIEW_CAPACITY_MARKET_FCR_2019-01-01_2019-12-31.xlsx


## aFRR

### SMARD

In [228]:
# Value columns of the SMARD aFRR export, in the order used for the short names below
columns_SMARD_aFRR = [
    'Volume activated (+) [MWh] Original resolutions',
    'Volume activated (-) [MWh] Original resolutions',
    'Activation price (+) [€/MWh] Original resolutions',
    'Activation price (-) [€/MWh] Original resolutions',
    'Volume procured (+) [MW] Original resolutions',
    'Volume procured (-) [MW] Original resolutions',
    'Procurement price (+) [€/MW] Original resolutions',
    'Procurement price (-) [€/MW] Original resolutions',
]

# SMARD column name -> short name (pairs follow the order of columns_SMARD_aFRR)
aFRR_column_renames = dict(zip(
    columns_SMARD_aFRR,
    [
        'aFRR_en_pos',
        'aFRR_en_neg',
        'aFRR_en_price_pos',
        'aFRR_en_price_neg',
        'aFRR_cap_pos',
        'aFRR_cap_neg',
        'aFRR_cap_price_pos',
        'aFRR_cap_price_neg',
    ],
))

# SMARD import
SMARD_dir = "../../20_data/SMARD"
aFRR_subdir = "/aFRR/"

dirlist = os.listdir(SMARD_dir + aFRR_subdir)
aFRR_frames = [pd.DataFrame()]  # empty seed keeps the result a DataFrame when no csv is found
for fname in dirlist:
    if not fname.endswith(".csv"):
        continue
    print('Importing:', fname)
    df_int = pd.read_csv(SMARD_dir + aFRR_subdir + fname, sep=";", low_memory=False)
    df_int = df_int.set_index('Start date')
    df_int.index = pd.to_datetime(df_int.index, format="%b %d, %Y %I:%M %p")
    df_int = (
        df_int
        .drop(columns=['End date'])
        .rename(columns=aFRR_column_renames)
        .replace(',', '', regex=True)  # strip commas from the numbers
        .replace('^[^0-9.]+$', np.nan, regex=True)  # entries without any digit or dot -> NaN
        .astype(float)
    )
    aFRR_frames.append(df_int)

# Combine all files, order by time and keep the first row of any repeated timestamp
df_aFRR = pd.concat(aFRR_frames).sort_index()
df_aFRR = df_aFRR[~df_aFRR.index.duplicated(keep='first')]

# Rows with NaNs are kept on purpose (no dropna here).
# Quick visual check: plt.plot(df_aFRR.index, df_aFRR['aFRR_en_pos'])

Importing: Automatic_Frequency_Restoration_Reserve_202101010000_202501010000_Quarterhour.csv
Importing: Automatic_Frequency_Restoration_Reserve_201501010000_202101010000_Quarterhour.csv


## Join, test, analyse and export

In [235]:
# Print the number of duplicated index entries of every frame that is about to be
# joined; each line should read 0.
for frame in (df_DA, df_IP_index, df_FCR, df_aFRR):
    print(frame.index.duplicated().sum())

0
0
0
0


In [236]:
# Join all datasets side by side, aligned on their timestamps
combined_df = pd.concat([df_IP_index, df_aFRR, df_FCR, df_DA], axis=1)

# total scope of data excluding DA datasets (as they don't overlap):
# time between the first and last row where every non-DA column has a value
non_DA_columns = combined_df.columns.difference(df_DA.columns)
complete_non_DA = combined_df.dropna(subset=non_DA_columns)
print(complete_non_DA.index.max() - complete_non_DA.index.min())

# row count of the joined frame vs. row count once rows that are entirely NaN are removed
print(len(combined_df), '-', len(combined_df.dropna(how='all')))

720 days 20:00:00
350688 - 342776
