In [3]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta

In [78]:
def xml_to_gen_data(xml_data) -> dict:
    """
    Parse the XML data of generation into a dictionary of DataFrames, one for each PsrType.
    """

    # Define the XML namespace
    namespace = {'ns': 'urn:iec62325.351:tc57wg16:451-6:generationloaddocument:3:0'}
    
    # Parse the XML data
    root = ET.fromstring(xml_data)
    
    # Get all TimeSeries tags
    time_series_tags = root.findall('.//ns:TimeSeries', namespace)
    
    # Initialize a dictionary to hold the data
    data = {"StartTime": [], "EndTime": [], "AreaID": [], "UnitName": [], "PsrType": [], "quantity": []}

    # Loop over all TimeSeries tags
    for ts in time_series_tags:
        # Extract PsrType from MktPSRType if it exists
        psr_type_tag = ts.find('ns:MktPSRType/ns:psrType', namespace)
        psr_type = psr_type_tag.text if psr_type_tag is not None else None

        # Extract AreaID and UnitName if they exist
        area_id_tag = ts.find('ns:inBiddingZone_Domain.mRID', namespace)
        area_id = area_id_tag.text if area_id_tag is not None else None
        unit_name_tag = ts.find('ns:quantity_Measure_Unit.name', namespace)
        unit_name = unit_name_tag.text if unit_name_tag is not None else None

        # Extract the time period start and end if it exists
        time_period = ts.find('ns:Period', namespace)
        if time_period is not None:
            period_start = time_period.find('ns:timeInterval/ns:start', namespace).text
            period_end = time_period.find('ns:timeInterval/ns:end', namespace).text
            resolution = time_period.find('ns:resolution', namespace).text

            # Resolution is PT15M or PT60M
            resolution_minutes = int(resolution.replace('PT', '').replace('M', ''))

            # Extract the point values
            points = time_period.findall('ns:Point', namespace)
            for point in points:
                position = point.find('ns:position', namespace).text
                quantity = point.find('ns:quantity', namespace).text

                # Calculate the actual start and end time for each resolution_minutes interval
                start_time_interval = datetime.fromisoformat(period_start.replace('Z', '+00:00'))
                end_time_interval = start_time_interval + timedelta(minutes=resolution_minutes*(int(position)-1))
                start_time_interval = end_time_interval - timedelta(minutes=resolution_minutes)

                # Append the StartTime, EndTime, AreaID, UnitName, PsrType, and quantity values to the data dictionary
                data["StartTime"].append(start_time_interval.isoformat(timespec='minutes')+'Z')
                data["EndTime"].append(end_time_interval.isoformat(timespec='minutes')+'Z')
                data["AreaID"].append(area_id)
                data["UnitName"].append(unit_name)
                data["PsrType"].append(psr_type)
                data["quantity"].append(quantity)

    # Convert the data dictionary into a pandas DataFrame
    df = pd.DataFrame(data)

    # Create a separate DataFrame for each PsrType
    df_dict = {psr_type: df[df["PsrType"] == psr_type] for psr_type in df["PsrType"].unique()}
    
    return df_dict

In [79]:
def xml_to_load_dataframe(xml_data) -> pd.DataFrame:
    """
    Parse the XML data of Load into a pandas DataFrame.
    """
    namespace = {'ns': 'urn:iec62325.351:tc57wg16:451-6:generationloaddocument:3:0'}
    root = ET.fromstring(xml_data)

    data = []
    for time_series in root.findall('.//ns:TimeSeries', namespace):
        series_id = time_series.find('ns:mRID', namespace).text
        business_type = time_series.find('ns:businessType', namespace).text
        object_aggregation = time_series.find('ns:objectAggregation', namespace).text
        domain_mrid = time_series.find('ns:outBiddingZone_Domain.mRID', namespace).text
        unit_name = time_series.find('ns:quantity_Measure_Unit.name', namespace).text
        curve_type = time_series.find('ns:curveType', namespace).text

        for period in time_series.findall('ns:Period', namespace):
            start_time = period.find('ns:timeInterval/ns:start', namespace).text
            end_time = period.find('ns:timeInterval/ns:end', namespace).text
            resolution = period.find('ns:resolution', namespace).text

            # Resolution is PT15M or PT60M
            resolution_minutes = int(resolution.replace('PT', '').replace('M', ''))
            
            for point in period.findall('ns:Point', namespace):
                position = point.find('ns:position', namespace).text
                quantity = point.find('ns:quantity', namespace).text

                # calculate the actual start and end time for each resolution_minutes interval
                start_time_interval = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
                end_time_interval = start_time_interval + timedelta(minutes=resolution_minutes*(int(position)-1))
                start_time_interval = end_time_interval - timedelta(minutes=resolution_minutes)

                data.append([start_time_interval.isoformat(timespec='minutes')+'Z', end_time_interval.isoformat(timespec='minutes')+'Z', 
                             domain_mrid, unit_name, quantity])

    df = pd.DataFrame(data, columns=['StartTime', 'EndTime', 'AreaID', 'UnitName', 'Load'])
    return df

In [80]:
def make_url(base_url, params):
    """Create URL"""
    query_string = "&".join([f"{k}={v}" for k, v in params.items()])
    print("11111111111")
    print(query_string)
    return f"{base_url}?{query_string}"

In [81]:
def perform_get_request(base_url, params):
    """Perform request to API"""
    url = make_url(base_url, params)
    response = requests.get(url)
    print("2222211111111111")
    print(response)
    if response.status_code == 200:
        return response.text
    else:
        return response.content

In [82]:
def get_load_data_from_entsoe(regions, period_start='202302240000', 
                              period_end='202303240000'):
    """ load data"""
    # TODO: There is a period range limit of 1 year for this API. Process in 1 year chunks if needed
    
    # URL of the RESTful API
    url = 'https://web-api.tp.entsoe.eu/api'

    # General parameters for the API
    # Refer to https://transparency.entsoe.eu/content/static_content/Static%20content/web%20api/Guide.html#_documenttype
    params = {
        'securityToken': '1d9cd4bd-f8aa-476c-8cc1-3442dc91506d',
        'documentType': 'A65',
        'processType': 'A16',
        'outBiddingZone_Domain': 'FILL_IN', # used for Load data
        'periodStart': period_start, # in the format YYYYMMDDHHMM
        'periodEnd': period_end # in the format YYYYMMDDHHMM
    }

    # Loop through the regions and get data for each region
    for region, area_code in regions.items():
        print(f'Fetching data for {region}...')
        params['outBiddingZone_Domain'] = area_code
    
        # Use the requests library to get data from the API for the specified time range
        response_content = perform_get_request(url, params)
#         print(response_content)
        df = xml_to_load_dataframe(response_content)
        print(df)
        df.to_csv(f'load_{region}.csv', index=False)
    return
        

In [66]:
regions = {
        'HU': '10YHU-MAVIR----U',
        'IT': '10YIT-GRTN-----B',
        'PO': '10YPL-AREA-----S',
        'SP': '10YES-REE------0',
        'UK': '10Y1001A1001A92E',
        'DE': '10Y1001A1001A83F',
        'DK': '10Y1001A1001A65H',
        'SE': '10YSE-1--------K',
        'NE': '10YNL----------L',
    }
get_load_data_from_entsoe(regions)

Fetching data for HU...
11111111111
securityToken=1d9cd4bd-f8aa-476c-8cc1-3442dc91506d&documentType=A65&processType=A16&outBiddingZone_Domain=10YHU-MAVIR----U&periodStart=202302240000&periodEnd=202303240000
2222211111111111
<Response [200]>
                    StartTime                  EndTime            AreaID   
0     2023-02-23T23:45+00:00Z  2023-02-24T00:00+00:00Z  10YHU-MAVIR----U  \
1     2023-02-24T00:00+00:00Z  2023-02-24T00:15+00:00Z  10YHU-MAVIR----U   
2     2023-02-24T00:15+00:00Z  2023-02-24T00:30+00:00Z  10YHU-MAVIR----U   
3     2023-02-24T00:30+00:00Z  2023-02-24T00:45+00:00Z  10YHU-MAVIR----U   
4     2023-02-24T00:45+00:00Z  2023-02-24T01:00+00:00Z  10YHU-MAVIR----U   
...                       ...                      ...               ...   
2683  2023-03-23T22:30+00:00Z  2023-03-23T22:45+00:00Z  10YHU-MAVIR----U   
2684  2023-03-23T22:45+00:00Z  2023-03-23T23:00+00:00Z  10YHU-MAVIR----U   
2685  2023-03-23T23:00+00:00Z  2023-03-23T23:15+00:00Z  10YHU-MAVIR----U   

2222211111111111
<Response [200]>
                   StartTime                  EndTime            AreaID   
0    2023-02-23T23:00+00:00Z  2023-02-24T00:00+00:00Z  10Y1001A1001A65H  \
1    2023-02-24T00:00+00:00Z  2023-02-24T01:00+00:00Z  10Y1001A1001A65H   
2    2023-02-24T01:00+00:00Z  2023-02-24T02:00+00:00Z  10Y1001A1001A65H   
3    2023-02-24T02:00+00:00Z  2023-02-24T03:00+00:00Z  10Y1001A1001A65H   
4    2023-02-24T03:00+00:00Z  2023-02-24T04:00+00:00Z  10Y1001A1001A65H   
..                       ...                      ...               ...   
667  2023-03-23T18:00+00:00Z  2023-03-23T19:00+00:00Z  10Y1001A1001A65H   
668  2023-03-23T19:00+00:00Z  2023-03-23T20:00+00:00Z  10Y1001A1001A65H   
669  2023-03-23T20:00+00:00Z  2023-03-23T21:00+00:00Z  10Y1001A1001A65H   
670  2023-03-23T21:00+00:00Z  2023-03-23T22:00+00:00Z  10Y1001A1001A65H   
671  2023-03-23T22:00+00:00Z  2023-03-23T23:00+00:00Z  10Y1001A1001A65H   

    UnitName  Load  
0        MAW  3849  
1        MAW  3844  
2 

In [67]:
def get_gen_data_from_entsoe(regions, period_start='202302240000', 
                             period_end='202303240000', output_path='./data'):
    """Get data"""
    # TODO: There is a period range limit of 1 day for this API. Process in 1 day chunks if needed

    # URL of the RESTful API
    url = 'https://web-api.tp.entsoe.eu/api'

    # General parameters for the API
    params = {
        'securityToken': '1d9cd4bd-f8aa-476c-8cc1-3442dc91506d',
        'documentType': 'A75',
        'processType': 'A16',
        'outBiddingZone_Domain': 'FILL_IN', # used for Load data
        'in_Domain': 'FILL_IN', # used for Generation data
        'periodStart': period_start, # in the format YYYYMMDDHHMM
        'periodEnd': period_end # in the format YYYYMMDDHHMM
    }

    # Loop through the regions and get data for each region
    for region, area_code in regions.items():
        print(f'Fetching data for {region}...')
        params['outBiddingZone_Domain'] = area_code
        params['in_Domain'] = area_code
    
        # Use the requests library to get data from the API for the specified time range
        response_content = perform_get_request(url, params)
        print('response_content000000')
        print(response_content)
        print('response_content11111')
        # Response content is a string of XML data
        dfs = xml_to_gen_data(response_content)
        print(dfs)
        print('dfs111111')
        # Save the dfs to CSV files
#         for psr_type, df in dfs.items():
#             # Save the DataFrame to a CSV file
#             df.to_csv(f'gen_{region}_{psr_type}.csv', index=False)

In [68]:
regions = {
        'HU': '10YHU-MAVIR----U',
#         'IT': '10YIT-GRTN-----B',
#         'PO': '10YPL-AREA-----S',
#         'SP': '10YES-REE------0',
#         'UK': '10Y1001A1001A92E',
#         'DE': '10Y1001A1001A83F',
#         'DK': '10Y1001A1001A65H',
#         'SE': '10YSE-1--------K',
#         'NE': '10YNL----------L',
    }
get_gen_data_from_entsoe(regions)

Fetching data for HU...


NameError: name 'perform_get_request' is not defined

In [72]:
#load data
import pandas as pd
import os

In [73]:
# Define the list of codes representing green energy
GREEN_ENERGY = [
    "B01", "B09", "B10", "B11", "B12",
    "B13", "B15", "B16", "B18", "B19"
]

In [74]:
# Initialize an empty DataFrame to hold concatenated data
concatenated_df = pd.DataFrame()

In [75]:
folder_path = '../data/ingestion'

In [76]:
# List all files in the directory
all_files = os.listdir(folder_path)

In [77]:
all_files

['load_SP.csv',
 'gen_DE_B09.csv',
 'gen_DE_B20.csv',
 'load_SE.csv',
 'gen_NE_B20.csv',
 'gen_NE_B18.csv',
 'gen_NE_B19.csv',
 'load_DK.csv',
 'gen_DE_B18.csv',
 'gen_DE_B19.csv',
 'load_PO.csv',
 'gen_DK_B01.csv',
 'gen_HU_B01.csv',
 'gen_HU_B15.csv',
 'gen_SP_B16.csv',
 'gen_SP_B02.csv',
 'gen_IT_B04.csv',
 'gen_IT_B10.csv',
 'gen_SE_B16.csv',
 'gen_IT_B11.csv',
 'gen_IT_B05.csv',
 'gen_UK_B19.csv',
 'gen_SP_B03.csv',
 'gen_SP_B17.csv',
 'gen_HU_B14.csv',
 'gen_DK_B16.csv',
 'gen_HU_B16.csv',
 'gen_HU_B02.csv',
 'gen_SP_B01.csv',
 'gen_SP_B15.csv',
 'gen_PO_B19.csv',
 'gen_SE_B14.csv',
 'gen_IT_B06.csv',
 'gen_IT_B12.csv',
 'gen_SP_B14.csv',
 'gen_HU_B17.csv',
 'gen_DK_B17.csv',
 'gen_SP_B04.csv',
 'gen_SP_B10.csv',
 'gen_IT_B16.csv',
 'gen_SE_B04.csv',
 'gen_IT_B03.csv',
 'gen_IT_B17.csv',
 'gen_SP_B11.csv',
 'gen_SP_B05.csv',
 'gen_HU_B06.csv',
 'gen_HU_B12.csv',
 'gen_DK_B06.csv',
 'gen_DK_B04.csv',
 'gen_HU_B04.csv',
 'gen_SP_B13.csv',
 'gen_SP_B07.csv',
 'gen_IT_B01.csv',
 'gen

In [78]:
# Loop through the files and concatenate the ones that meet the criteria
for file_name in all_files:
    if file_name.endswith('.csv') and any(code in file_name for code in GREEN_ENERGY):
        # Read the csv file
        df = pd.read_csv(os.path.join(folder_path, file_name))
        
        # Split the file name and create new columns
        split_name = file_name.replace('.csv', '').split('_')
        df['Country'] = split_name[1]
        
        # Concatenate the DataFrame to the main concatenated_df
        concatenated_df = pd.concat([concatenated_df, df])
#Delete AreaID because already have country code
concatenated_df = concatenated_df.drop(columns=['AreaID'])
# Save the concatenated DataFrame to a new csv file
output_file_path = os.path.join(folder_path, 'concatenated_green_energy.csv')
concatenated_df.to_csv(output_file_path, index=False)

concatenated_df

Unnamed: 0,StartTime,EndTime,UnitName,PsrType,quantity,Country
0,2021-12-31T23:45+00:00Z,2022-01-01T00:00+00:00Z,MAW,B09,26,DE
1,2022-01-01T00:00+00:00Z,2022-01-01T00:15+00:00Z,MAW,B09,26,DE
2,2022-01-01T00:15+00:00Z,2022-01-01T00:30+00:00Z,MAW,B09,26,DE
3,2022-01-01T00:30+00:00Z,2022-01-01T00:45+00:00Z,MAW,B09,26,DE
4,2022-01-01T00:45+00:00Z,2022-01-01T01:00+00:00Z,MAW,B09,26,DE
...,...,...,...,...,...,...
70075,2022-12-31T22:30+00:00Z,2022-12-31T22:45+00:00Z,MAW,B10,3809,DE
70076,2022-12-31T22:45+00:00Z,2022-12-31T23:00+00:00Z,MAW,B10,1699,DE
70077,2022-12-31T23:00+00:00Z,2022-12-31T23:15+00:00Z,MAW,B10,1774,DE
70078,2022-12-31T23:15+00:00Z,2022-12-31T23:30+00:00Z,MAW,B10,2185,DE


In [66]:
concatenated_df

Unnamed: 0,StartTime,EndTime,UnitName,PsrType,quantity,Country
0,2021-12-31T23:45+00:00Z,2022-01-01T00:00+00:00Z,MAW,B09,26,DE
1,2022-01-01T00:00+00:00Z,2022-01-01T00:15+00:00Z,MAW,B09,26,DE
2,2022-01-01T00:15+00:00Z,2022-01-01T00:30+00:00Z,MAW,B09,26,DE
3,2022-01-01T00:30+00:00Z,2022-01-01T00:45+00:00Z,MAW,B09,26,DE
4,2022-01-01T00:45+00:00Z,2022-01-01T01:00+00:00Z,MAW,B09,26,DE
...,...,...,...,...,...,...
70075,2022-12-31T22:30+00:00Z,2022-12-31T22:45+00:00Z,MAW,B10,3809,DE
70076,2022-12-31T22:45+00:00Z,2022-12-31T23:00+00:00Z,MAW,B10,1699,DE
70077,2022-12-31T23:00+00:00Z,2022-12-31T23:15+00:00Z,MAW,B10,1774,DE
70078,2022-12-31T23:15+00:00Z,2022-12-31T23:30+00:00Z,MAW,B10,2185,DE


In [79]:
# Load the dataset
df = pd.read_csv('../data/ingestion/concatenated_green_energy.csv')


In [80]:
null_values = df.isnull().sum()
null_values

StartTime    0
EndTime      0
UnitName     0
PsrType      0
quantity     0
Country      0
dtype: int64

In [81]:
from sqlalchemy import create_engine

In [82]:
engine = create_engine('sqlite://', echo=False)

In [83]:
df.to_sql('energy_data', con=engine, index=False)

1466395

In [84]:
pd.read_sql_query("SELECT * FROM energy_data LIMIT 100", con=engine)

Unnamed: 0,StartTime,EndTime,UnitName,PsrType,quantity,Country
0,2021-12-31T23:45+00:00Z,2022-01-01T00:00+00:00Z,MAW,B09,26,DE
1,2022-01-01T00:00+00:00Z,2022-01-01T00:15+00:00Z,MAW,B09,26,DE
2,2022-01-01T00:15+00:00Z,2022-01-01T00:30+00:00Z,MAW,B09,26,DE
3,2022-01-01T00:30+00:00Z,2022-01-01T00:45+00:00Z,MAW,B09,26,DE
4,2022-01-01T00:45+00:00Z,2022-01-01T01:00+00:00Z,MAW,B09,26,DE
...,...,...,...,...,...,...
95,2022-01-01T23:30+00:00Z,2022-01-01T23:45+00:00Z,MAW,B09,28,DE
96,2022-01-01T23:45+00:00Z,2022-01-02T00:00+00:00Z,MAW,B09,28,DE
97,2022-01-02T00:00+00:00Z,2022-01-02T00:15+00:00Z,MAW,B09,28,DE
98,2022-01-02T00:15+00:00Z,2022-01-02T00:30+00:00Z,MAW,B09,28,DE


In [87]:
sql ="""
with data_raw as (
  SELECT
  substr(StartTime, 1, 10) as StartDate,
  substr(EndTime, 1, 10) as EndDate,
  substr(StartTime, 12, 2) as dataHour,
  *
FROM energy_data)

SELECT * 
FROM data_raw
WHERE StartDate <> '2021-12-31'
LIMIT 1000
"""

In [88]:
pd.read_sql_query(sql, con=engine)

Unnamed: 0,StartDate,EndDate,dataHour,StartTime,EndTime,UnitName,PsrType,quantity,Country
0,2022-01-01,2022-01-01,00,2022-01-01T00:00+00:00Z,2022-01-01T00:15+00:00Z,MAW,B09,26,DE
1,2022-01-01,2022-01-01,00,2022-01-01T00:15+00:00Z,2022-01-01T00:30+00:00Z,MAW,B09,26,DE
2,2022-01-01,2022-01-01,00,2022-01-01T00:30+00:00Z,2022-01-01T00:45+00:00Z,MAW,B09,26,DE
3,2022-01-01,2022-01-01,00,2022-01-01T00:45+00:00Z,2022-01-01T01:00+00:00Z,MAW,B09,26,DE
4,2022-01-01,2022-01-01,01,2022-01-01T01:00+00:00Z,2022-01-01T01:15+00:00Z,MAW,B09,26,DE
...,...,...,...,...,...,...,...,...,...
995,2022-01-11,2022-01-11,08,2022-01-11T08:45+00:00Z,2022-01-11T09:00+00:00Z,MAW,B09,27,DE
996,2022-01-11,2022-01-11,09,2022-01-11T09:00+00:00Z,2022-01-11T09:15+00:00Z,MAW,B09,27,DE
997,2022-01-11,2022-01-11,09,2022-01-11T09:15+00:00Z,2022-01-11T09:30+00:00Z,MAW,B09,27,DE
998,2022-01-11,2022-01-11,09,2022-01-11T09:30+00:00Z,2022-01-11T09:45+00:00Z,MAW,B09,27,DE
