In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def convert_to_float(data):
    """Convert a number written with "." thousands and "," decimals to a float.

    Dots are removed and the comma is replaced by a dot before parsing, so
    ``"1.234,56"`` becomes ``1234.56``.

    Args:
        data (str | int | float): The value to convert. Floats are returned
            unchanged; other numeric scalars (Python ints, NumPy integer and
            floating scalars) are cast to ``float``.

    Returns:
        float | None: The parsed value, or ``None`` when ``data`` is the
        placeholder ``"-"`` or cannot be converted. A message is printed
        when the value cannot be converted.
    """
    # Already a float (this includes NaN): nothing to parse.
    if isinstance(data, float):
        return data

    # Other numeric scalars have no ``.replace``; they used to fall into the
    # error branch and were turned into None instead of being kept.
    if isinstance(data, (int, np.integer, np.floating)):
        return float(data)

    if not isinstance(data, str):
        print(
            f"Error converting {data} to float: "
            f"unsupported type {type(data).__name__}"
        )
        return None

    text = data.strip()
    # "-" marks a missing value in the source files; it is not an error.
    if text == "-":
        return None

    try:
        return float(text.replace(".", "").replace(",", "."))
    except ValueError as e:
        print(f"Error converting {data} to float: {e}")
        return None


In [3]:
# include sine and cosine transformations for the hour and day of the week
def encode_feature(data, column_name: str, max_value: int):
    """Add sine and cosine encodings of a periodic column to a dataframe.

    The column is scaled to an angle ``2 * pi * value / max_value`` and the
    sine and cosine of that angle are stored in two new columns named
    ``<column_name>_sin`` and ``<column_name>_cos``.

    Args:
        data (dataframe): the frame holding the column; it is modified in place.
        column_name (str): the name of the column to encode.
        max_value (int): the divisor of the angle, i.e. the value at which
            the encoding completes one full turn.

    Returns:
        dataframe: the same ``data`` object, with the two new columns added.
    """
    for suffix, trig in (("_sin", np.sin), ("_cos", np.cos)):
        data[column_name + suffix] = trig(2 * np.pi * data[column_name] / max_value)
    return data



In [4]:
# Load the two semicolon-separated forecast exports (consumption and
# day-ahead generation, going by the file names).
# NOTE(review): absolute, machine-specific paths; consider a configurable
# data directory so the notebook runs elsewhere.
smard_pred_consumption = pd.read_csv(
    r"C:\Repositories\electricity_lstm\electricity_data\consumption_forecast_hour.csv",
    sep=";",
)
smard_pred_production = pd.read_csv(
    r"C:\Repositories\electricity_lstm\electricity_data\creation_forecast_day_ahead_hour.csv",
    sep=";",
)

In [5]:
# Load the already-transformed dataset. No date parsing is requested, so any
# timestamp columns arrive as plain strings.
# NOTE(review): absolute, machine-specific path.
complete_data = pd.read_csv(
    r"C:\Repositories\electricity_lstm\data\transformed\transformed_hour_lstm_seq2seq_additive_corrected.csv"
)

In [6]:

# Load the stored model predictions.
# NOTE(review): absolute, machine-specific path.
lstm_seq2seq_additive_corrected_hour_data = pd.read_csv(
    r"C:\Repositories\electricity_lstm\results\predictions_lstm_seq2seq_additive_corrected_hour.csv"
)

In [7]:
# Keep only the first three columns and give them the names used in the rest
# of the notebook.
smard_pred_consumption = smard_pred_consumption.iloc[:, :3].set_axis(
    ["start_time", "end_time", "smard_total_load_mwh"],
    axis=1,
)

In [65]:
# Show (rows, columns) of the trimmed consumption frame.
smard_pred_consumption.shape

(83255, 3)

In [8]:
# Keep only the first three columns and rename them; the remaining columns
# of the export are discarded.
smard_pred_production = smard_pred_production.iloc[:, :3].set_axis(
    ["start_time", "end_time", "smard_total_production_mwh"],
    axis=1,
)

In [66]:
# Show (rows, columns) of the trimmed production frame.
smard_pred_production.shape

(83255, 3)

In [9]:
# Join the production and consumption forecasts on their shared time window
# (inner join, the merge default).
# NOTE(review): the recorded output has 83273 rows although each input has
# 83255, which suggests repeated (start_time, end_time) keys are being
# cross-joined. TODO confirm, e.g. by checking for duplicated keys.
merge_smard = smard_pred_production.merge(
    smard_pred_consumption,
    on=["start_time", "end_time"],
)

In [67]:
# Show (rows, columns) of the merged forecast frame.
merge_smard.shape

(83273, 4)

In [10]:
# Parse every column after the two timestamp columns from text into floats;
# convert_to_float returns None for the "-" placeholder.
for name, values in merge_smard.iloc[:, 2:].items():
    merge_smard[name] = values.apply(convert_to_float)

In [11]:
# Parse both timestamp columns, which are written as "day.month.year hour:minute".
for time_column in ("start_time", "end_time"):
    merge_smard[time_column] = pd.to_datetime(
        merge_smard[time_column], format="%d.%m.%Y %H:%M"
    )


In [14]:
# Print both shapes, one per line, to compare the row counts of the two frames.
print(complete_data.shape, merge_smard.shape, sep="\n")

(83273, 33)
(83273, 4)


In [20]:
# Put the two frames side by side, then keep only the first occurrence of each
# column name (so start_time/end_time come from complete_data).
# NOTE(review): concat with axis=1 aligns rows by index label, not by
# timestamp; this assumes both frames list the same hours in the same order.
# TODO confirm.
smard_and_complete_data = pd.concat(
    [complete_data, merge_smard], axis="columns"
).pipe(lambda frame: frame.loc[:, ~frame.columns.duplicated()])

In [22]:
# Show (rows, columns) of the combined frame.
smard_and_complete_data.shape

(83273, 35)

In [23]:
# Show (rows, columns) of the prediction frame.
lstm_seq2seq_additive_corrected_hour_data.shape

(13108, 26)

In [43]:
# Columns present in both frames, listed in the prediction frame's column
# order. A set intersection would return the same names, but in an order that
# can change from one kernel session to the next.
common_columns = [
    column
    for column in lstm_seq2seq_additive_corrected_hour_data.columns
    if column in smard_and_complete_data.columns
]

In [33]:
# List the column names of the prediction frame.
lstm_seq2seq_additive_corrected_hour_data.columns

Index(['pred_total_production_mwh', 'pred_total_load_mwh',
       'total_production_mwh', 'total_load_mwh', 'biomass_mwh',
       'hydropower_mwh', 'wind_offshore_mwh', 'wind_onshore_mwh', 'solar_mwh',
       'other_renewables_mwh', 'nuclear_mwh', 'lignite_mwh', 'hard_coal_mwh',
       'natural_gas_mwh', 'pumped_storage_mwh', 'other_conventional_mwh',
       'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'day_of_week_sin',
       'day_of_week_cos', 'day_of_year_sin', 'day_of_year_cos',
       'week_of_year_sin', 'week_of_year_cos'],
      dtype='object')

In [34]:
# List the column names of the combined frame.
smard_and_complete_data.columns

Index(['start_time', 'end_time', 'biomass_mwh', 'hydropower_mwh',
       'wind_offshore_mwh', 'wind_onshore_mwh', 'solar_mwh',
       'other_renewables_mwh', 'nuclear_mwh', 'lignite_mwh', 'hard_coal_mwh',
       'natural_gas_mwh', 'pumped_storage_mwh', 'other_conventional_mwh',
       'total_load_mwh', 'residual_load_mwh', 'saved_pumped_storage_mwh',
       'total_production_mwh', 'hour', 'minute', 'day_of_week', 'day_of_year',
       'week_of_year', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos',
       'day_of_week_sin', 'day_of_week_cos', 'day_of_year_sin',
       'day_of_year_cos', 'week_of_year_sin', 'week_of_year_cos',
       'smard_total_production_mwh', 'smard_total_load_mwh'],
      dtype='object')

In [59]:
# Keep the rows whose start_time is on or after 2023-01-02.
# NOTE(review): start_time here presumably is the string column read from the
# transformed CSV, so the comparison would be lexicographic. TODO confirm dtype.
test_data = smard_and_complete_data.loc[
    lambda frame: frame["start_time"] >= "2023-01-02"
]

In [64]:
# Display the filtered frame.
test_data

Unnamed: 0,start_time,end_time,biomass_mwh,hydropower_mwh,wind_offshore_mwh,wind_onshore_mwh,solar_mwh,other_renewables_mwh,nuclear_mwh,lignite_mwh,...,minute_sin,minute_cos,day_of_week_sin,day_of_week_cos,day_of_year_sin,day_of_year_cos,week_of_year_sin,week_of_year_cos,smard_total_production_mwh,smard_total_load_mwh
70168,2023-01-02 00:00:00,2023-01-02 01:00:00,4355.25,1246.00,2881.00,18602.25,1.50,130.00,2462.75,4722.00,...,0.0,1.0,0.000000,1.00000,0.034422,0.999407,1.205367e-01,0.992709,44218.0,43966.50
70169,2023-01-02 01:00:00,2023-01-02 02:00:00,4327.25,1203.75,1745.25,19121.75,1.00,130.00,2461.25,4752.25,...,0.0,1.0,0.000000,1.00000,0.034422,0.999407,1.205367e-01,0.992709,43158.0,42313.75
70170,2023-01-02 02:00:00,2023-01-02 03:00:00,4315.00,1237.25,1183.75,20986.75,1.50,130.00,2461.00,4768.75,...,0.0,1.0,0.000000,1.00000,0.034422,0.999407,1.205367e-01,0.992709,43662.0,41668.00
70171,2023-01-02 03:00:00,2023-01-02 04:00:00,4329.25,1246.75,851.00,22609.25,1.25,130.25,2460.50,4813.50,...,0.0,1.0,0.000000,1.00000,0.034422,0.999407,1.205367e-01,0.992709,42952.0,41994.25
70172,2023-01-02 04:00:00,2023-01-02 05:00:00,4372.00,1254.00,682.00,24661.00,1.25,130.25,2459.25,5608.25,...,0.0,1.0,0.000000,1.00000,0.034422,0.999407,1.205367e-01,0.992709,43866.0,43612.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83268,2024-06-30 19:00:00,2024-06-30 20:00:00,4084.75,2527.00,1595.50,11676.75,3420.50,112.00,0.00,7976.25,...,0.0,1.0,-0.781831,0.62349,0.008607,-0.999963,-3.216245e-16,-1.000000,41799.0,46839.00
83269,2024-06-30 20:00:00,2024-06-30 21:00:00,4114.50,2801.50,1469.00,10118.25,1231.00,112.00,0.00,8468.00,...,0.0,1.0,-0.781831,0.62349,0.008607,-0.999963,-3.216245e-16,-1.000000,37720.0,46174.25
83270,2024-06-30 21:00:00,2024-06-30 22:00:00,4121.75,2679.00,1446.00,7890.50,125.00,112.00,0.00,8693.00,...,0.0,1.0,-0.781831,0.62349,0.008607,-0.999963,-3.216245e-16,-1.000000,35829.0,44956.00
83271,2024-06-30 22:00:00,2024-06-30 23:00:00,4115.00,2788.75,1357.25,6051.25,0.25,112.00,0.00,9058.75,...,0.0,1.0,-0.781831,0.62349,0.008607,-0.999963,-3.216245e-16,-1.000000,34410.0,44442.25


In [61]:
# Display the filtered frame.
# NOTE(review): this cell repeats an identical display cell elsewhere in the
# notebook; one of the two can probably be removed.
test_data

Unnamed: 0,start_time,end_time,biomass_mwh,hydropower_mwh,wind_offshore_mwh,wind_onshore_mwh,solar_mwh,other_renewables_mwh,nuclear_mwh,lignite_mwh,...,minute_sin,minute_cos,day_of_week_sin,day_of_week_cos,day_of_year_sin,day_of_year_cos,week_of_year_sin,week_of_year_cos,smard_total_production_mwh,smard_total_load_mwh
70168,2023-01-02 00:00:00,2023-01-02 01:00:00,4355.25,1246.00,2881.00,18602.25,1.50,130.00,2462.75,4722.00,...,0.0,1.0,0.000000,1.00000,0.034422,0.999407,1.205367e-01,0.992709,44218.0,43966.50
70169,2023-01-02 01:00:00,2023-01-02 02:00:00,4327.25,1203.75,1745.25,19121.75,1.00,130.00,2461.25,4752.25,...,0.0,1.0,0.000000,1.00000,0.034422,0.999407,1.205367e-01,0.992709,43158.0,42313.75
70170,2023-01-02 02:00:00,2023-01-02 03:00:00,4315.00,1237.25,1183.75,20986.75,1.50,130.00,2461.00,4768.75,...,0.0,1.0,0.000000,1.00000,0.034422,0.999407,1.205367e-01,0.992709,43662.0,41668.00
70171,2023-01-02 03:00:00,2023-01-02 04:00:00,4329.25,1246.75,851.00,22609.25,1.25,130.25,2460.50,4813.50,...,0.0,1.0,0.000000,1.00000,0.034422,0.999407,1.205367e-01,0.992709,42952.0,41994.25
70172,2023-01-02 04:00:00,2023-01-02 05:00:00,4372.00,1254.00,682.00,24661.00,1.25,130.25,2459.25,5608.25,...,0.0,1.0,0.000000,1.00000,0.034422,0.999407,1.205367e-01,0.992709,43866.0,43612.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83268,2024-06-30 19:00:00,2024-06-30 20:00:00,4084.75,2527.00,1595.50,11676.75,3420.50,112.00,0.00,7976.25,...,0.0,1.0,-0.781831,0.62349,0.008607,-0.999963,-3.216245e-16,-1.000000,41799.0,46839.00
83269,2024-06-30 20:00:00,2024-06-30 21:00:00,4114.50,2801.50,1469.00,10118.25,1231.00,112.00,0.00,8468.00,...,0.0,1.0,-0.781831,0.62349,0.008607,-0.999963,-3.216245e-16,-1.000000,37720.0,46174.25
83270,2024-06-30 21:00:00,2024-06-30 22:00:00,4121.75,2679.00,1446.00,7890.50,125.00,112.00,0.00,8693.00,...,0.0,1.0,-0.781831,0.62349,0.008607,-0.999963,-3.216245e-16,-1.000000,35829.0,44956.00
83271,2024-06-30 22:00:00,2024-06-30 23:00:00,4115.00,2788.75,1357.25,6051.25,0.25,112.00,0.00,9058.75,...,0.0,1.0,-0.781831,0.62349,0.008607,-0.999963,-3.216245e-16,-1.000000,34410.0,44442.25


In [44]:
# Show the column names shared by both frames (used as merge keys).
common_columns

['week_of_year_sin',
 'week_of_year_cos',
 'day_of_week_cos',
 'lignite_mwh',
 'hard_coal_mwh',
 'day_of_year_sin',
 'natural_gas_mwh',
 'biomass_mwh',
 'hydropower_mwh',
 'solar_mwh',
 'minute_sin',
 'wind_offshore_mwh',
 'pumped_storage_mwh',
 'hour_sin',
 'other_conventional_mwh',
 'day_of_year_cos',
 'total_load_mwh',
 'other_renewables_mwh',
 'minute_cos',
 'nuclear_mwh',
 'day_of_week_sin',
 'total_production_mwh',
 'wind_onshore_mwh',
 'hour_cos']

In [53]:
# Attach the predictions to the test rows by matching on every shared column;
# how="right" keeps all rows of test_data.
# NOTE(review): the keys are numeric feature columns, so rows only match on
# exact value equality, and the recorded output (13129 rows) differs from the
# prediction frame (13108 rows). TODO confirm the match is one-to-one.
merged_test = lstm_seq2seq_additive_corrected_hour_data.merge(
    test_data,
    on=common_columns,
    how="right",
)

In [56]:
# Print both shapes, one per line, to compare the merged result with the
# prediction frame.
print(merged_test.shape, lstm_seq2seq_additive_corrected_hour_data.shape, sep="\n")

(13129, 37)
(13108, 26)
