In [3]:
import pandas as pd
import os
from utils.from_mongodb import MongoData
from config import MongoConfig

### HELPER FUNCTIONS
def load_data(path: str, mongo_connect_config: MongoConfig):
    """
    Load a dataset, using the .parquet file at ``path`` as a local cache.

    If ``path`` exists, it is read with ``pd.read_parquet``. Otherwise the data is
    fetched with ``MongoData(mongo_connect_config).load_mongo_document_to_df()``
    and written to ``path``, so that later calls read the file instead.

    NOTE: an existing file always wins - ``mongo_connect_config`` is not used at
    all when ``path`` exists, so changing the configuration has no effect until
    the file is removed.

    :param path: str - path to the .parquet data file
    :param mongo_connect_config: MongoConfig - configuration object to connect to MongoDB.
    :return: the loaded data; a pandas DataFrame when read from the file
        (presumably also a DataFrame when fetched from MongoDB - confirm against
        ``MongoData.load_mongo_document_to_df``).
    """
    if os.path.exists(path):
        return pd.read_parquet(path)

    df_loaded = MongoData(mongo_connect_config).load_mongo_document_to_df()

    # Create the target directory first, so a missing folder does not make the
    # write fail after the data has already been fetched.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df_loaded.to_parquet(path)

    return df_loaded

def create_date_time_with_timezone(df, dst_end_date='2023-10-29'):
    """
    Return a copy of ``df`` with a timezone-aware 'Data Godzina' column.

    The timestamp is built from the 'date_cet' and 'hour' columns, where 'hour'
    is 1-based: hour ``h`` becomes ``date_cet + (h - 1)`` hours, localized to
    'Europe/Warsaw'.

    On ``dst_end_date`` (the day DST ends) every hour greater than 3 is moved
    back by one first. The repeated clock hour then appears twice in a row and
    ``tz_localize(..., ambiguous='infer')`` can distinguish the two occurrences.

    The input frame is left untouched, so calling the function again on the same
    frame gives the same result instead of shifting the hours a second time.

    :param df: DataFrame with 'date_cet' and numeric 'hour' columns. Rows must be
        in chronological order, because ``ambiguous='infer'`` relies on row order.
    :param dst_end_date: value compared (with ``==``) against 'date_cet' to pick
        the day on which DST ends; defaults to the 2023 date.
    :return: a new DataFrame with the adjusted 'hour' column and the added
        'Data Godzina' column.
    """
    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    after_repeated_hour = (df['date_cet'] == dst_end_date) & (df['hour'] > 3)
    df.loc[after_repeated_hour, 'hour'] -= 1

    naive_local = pd.to_datetime(df['date_cet']) + pd.to_timedelta(df['hour'] - 1, unit='h')
    df['Data Godzina'] = naive_local.dt.tz_localize('Europe/Warsaw', ambiguous='infer')

    return df

## PSE - BPKD (Krajowe zapotrzebowanie, PV, wiatr)

In [4]:
from config import mongo_connect_config_pse_bpkd


# Raw PSE BPKD data: load_data reads this .parquet file when it exists,
# otherwise it fetches the data from MongoDB and writes the file.
path = 'data/2023/raw_PSE_BPKD.parquet'
# Alternative raw file, currently disabled:
# path = 'data/2023/raw_PSE_PK5.parquet'
df_pse = load_data(path, mongo_connect_config_pse_bpkd)

In [5]:
# On 2023-03-26 relabel hour 3 as hour 2 before the timestamps are built.
is_spring_dst_hour = (df_pse['date_cet'] == '2023-03-26') & (df_pse['hour'] == 3)
df_pse.loc[is_spring_dst_hour, 'hour'] = 2
df_pse = create_date_time_with_timezone(df_pse)

# Source column -> output column; only these three series are kept.
pse_column_names = {
    'krajowe_zapotrzebowanie_na_moc[MW]': 'Godzinowe zużycie krajowe MWh',
    'generacja_zrodel_wiatrowych[MWh]': 'Generacja ze źródeł wiatrowych MWh',
    'generacja_zrodel_fotowoltaicznych[MWh]': 'Generacja ze źródeł fotowoltaicznych MWh',
}
df_pse = (
    df_pse
    .set_index('Data Godzina')[list(pse_column_names)]
    .copy()
    .rename(columns=pse_column_names)
    .reset_index()
)

# Store the processed frame.
df_pse.to_parquet('data/2023/processed_PSE_BPKD.parquet')

## PSE - RCE

In [6]:
from config import mongo_connect_config_rce

# Raw RCE prices: read from the .parquet file when present, otherwise fetched
# from MongoDB by load_data.
path_fix = 'data/2023/raw_rce.parquet'
df_rce = load_data(path_fix, mongo_connect_config_rce)

# On 2023-03-26 relabel hour 3 as hour 2 before the timestamps are built.
df_rce.loc[(df_rce['date_cet'] == '2023-03-26') & (df_rce['hour'] == 3), 'hour'] = 2
df_rce = create_date_time_with_timezone(df_rce)

# 'Data Godzina' is already created by create_date_time_with_timezone, so only
# the price column is renamed. Renaming another column to 'Data Godzina' as well
# would produce duplicate column names and break the selection and the parquet
# write below.
df_rce = df_rce.rename(columns={'RCE[PLN/MWh]': 'Cena PLN/MWh'})
df_rce = df_rce[['Data Godzina', 'Cena PLN/MWh']].copy()

df_rce.to_parquet('data/2023/processed_rce.parquet')

## Fix1 historia

In [7]:
# df_ceny_rdn = pd.read_parquet('data/surowe_Ceny_2020_24.parquet')
from PyXoog import convert_datetime_timezone_datatype
from config import mongo_connect_config_ceny_fix1

# Set the date range on the imported config object. It is only used when the
# raw file below does not exist yet and load_data has to query MongoDB.
mongo_connect_config_ceny_fix1.START = '2016-01-01'
mongo_connect_config_ceny_fix1.END = '2024-12-31'

path_fix1 = 'data/2023/raw_fix1_history.parquet'
df_fix1_history = convert_datetime_timezone_datatype(
    load_data(path_fix1, mongo_connect_config_ceny_fix1)
)

# Keep only the timestamp and the price, under the output column names.
fix1_column_names = {
    'fix1_price[PLN/MWh]': 'Cena PLN/MWh',
    'datetime_timezone': 'Data Godzina',
}
df_fix1_history = (
    df_fix1_history
    .rename(columns=fix1_column_names)[['Data Godzina', 'Cena PLN/MWh']]
    .copy()
)
df_fix1_history.to_parquet('data/obrobione_Ceny_2016_24.parquet')

In [8]:
# Display the processed price history ('Data Godzina', 'Cena PLN/MWh') as a
# visual check. The row labels shown are the ones carried over from loading;
# the index is not reset in the previous cell.
df_fix1_history

Unnamed: 0,Data Godzina,Cena PLN/MWh
78888,2016-01-01 00:00:00+01:00,108.27
78889,2016-01-01 01:00:00+01:00,94.74
78890,2016-01-01 02:00:00+01:00,85.05
78891,2016-01-01 03:00:00+01:00,79.35
78892,2016-01-01 04:00:00+01:00,75.17
...,...,...
19,2024-12-31 19:00:00+01:00,301.99
20,2024-12-31 20:00:00+01:00,215.00
21,2024-12-31 21:00:00+01:00,50.00
22,2024-12-31 22:00:00+01:00,0.01


## Pogoda

In [9]:
from config import mongo_connect_config_pogoda

path_weather = 'data/2023/raw_weather.parquet'
df_weather = load_data(path_weather, mongo_connect_config_pogoda)

# Leave out the stations treated as inconsistent with the rest.
excluded_stations = ['Kasprowy Wierch', 'sniezka']
df_weather = df_weather.loc[~df_weather['imgw_station_name'].isin(excluded_stations)]
df_weather = create_date_time_with_timezone(df_weather)

# Average the numeric columns over all remaining rows sharing a timestamp,
# then keep only the temperature under its output name.
df_weather = (
    df_weather
    .groupby('Data Godzina')
    .mean(numeric_only=True)
    .reset_index()
    .rename(columns={'temp_C': 'Temperatura C'})[['Data Godzina', 'Temperatura C']]
    .copy()
)

df_weather.to_parquet('data/2023/processed_weather.parquet')

## Standardowe profile zużycia

In [10]:
# Read the G11 sheet and reshape it from wide to long: every column other than
# 'Data' and 'Dzień' becomes a ('Godzina', 'Wskaźnik zużycia %') row per date.
# Rows with any missing value are dropped.
df_profile = (
    pd.read_excel('data/2023/raw_standardowe_profile_enea.xlsx', sheet_name='G11')
    .set_index('Data')
    .drop(columns=['Dzień'])
    .melt(ignore_index=False, var_name='Godzina', value_name='Wskaźnik zużycia %')
    .reset_index()
    .dropna()
)

# Hour 3 does not exist on this day; it is an error in the source data.
is_spring_dst_hour = (df_profile['Data'] == '2023-03-26') & (df_profile['Godzina'] == 3)
df_profile.loc[is_spring_dst_hour, 'Godzina'] = 2
# Hour "2a": relabelled as hour 3.
df_profile = df_profile.dropna(subset='Wskaźnik zużycia %', axis=0)
df_profile.loc[df_profile['Godzina'] == '2a', 'Godzina'] = 3
df_profile = df_profile.sort_values(['Data', 'Godzina'])

# Build the timezone-aware timestamp; 'Godzina' is 1-based, and
# ambiguous='infer' relies on the chronological order established above.
profile_naive_local = (
    pd.to_datetime(df_profile['Data'])
    + pd.to_timedelta(df_profile['Godzina'] - 1, unit='h')
)
df_profile['Data Godzina'] = profile_naive_local.dt.tz_localize('Europe/Warsaw', ambiguous='infer')

# NOTE(review): the indicator is scaled by 1/1000 although the column name says
# '%' - confirm the unit used in the source sheet.
df_profile['Wskaźnik zużycia %'] = df_profile['Wskaźnik zużycia %'] / 1000
df_profile = df_profile.reset_index(drop=True)

# Data from GUS in GWh (file "roczne uzycie ee gosp domowe 2023.pdf").
roczne_zuzycie_gp = 29774.5
df_profile['Roczne zużycie gd MWh'] = roczne_zuzycie_gp * 1000

# Model the hourly electricity consumption of households.
df_profile['Godzinowe zużycie gd MWh'] = (
    df_profile['Roczne zużycie gd MWh'] * df_profile['Wskaźnik zużycia %']
).round(4)

df_profile['Miesiąc'] = df_profile['Data Godzina'].dt.month
# df_profile['Dzień tygodnia'] = df_profile['Data Godzina'].dt.weekday
grouper = [
    'Godzina',
    'Miesiąc',
    # 'Dzień tygodnia'
]

# Mean hourly consumption per (hour, month), attached back to every row.
df_profile_model = (
    df_profile
    .groupby(grouper)
    .mean(numeric_only=True)
    .rename(columns={'Godzinowe zużycie gd MWh': 'Modelowe godzinowe zużycie gd MWh'})
    [['Modelowe godzinowe zużycie gd MWh']]
)
df_profile = df_profile.reset_index(drop=True).merge(df_profile_model, on=grouper)

# Deviation of each hour from its (hour, month) mean.
df_profile['Delta godzinowe zużycie gd MWh'] = (
    df_profile['Godzinowe zużycie gd MWh']
    - df_profile['Modelowe godzinowe zużycie gd MWh']
)

df_profile.to_parquet('data/2023/processed_standardowe_profile_enea.parquet')