## Date Handling of Input Files

> The data contains periods in CET/CEST format.
	

* We need the start and end timestamps in one consistent time zone.
* We need to split each period string into a separate start and end timestamp.

Example of a raw period string: "01/01/2015 00:00:00 - 01/01/2015 00:15:00"


## Setup

All the manipulations and plots in this notebook can be created with standard libraries such as matplotlib, statsmodels etc. 

In [1]:
# Main data packages. 
import numpy as np
import pandas as pd

# Data Viz. 
import statsmodels.formula.api as smf
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy.ndimage import gaussian_filter
from calendar import monthrange
from calendar import month_name



## Import Data 

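The data for this notebook was downloaded from the [meteoblue website](https://www.meteoblue.com/en/weather/archive/export/basel_switzerland_2661604) and consists of weather data for the city of Basel from 2008 till 2020. **Note:** the cells below load the cleaned per-source energy CSV files (prices, load, offshore/onshore wind, solar, total generation) from `../../data_cleaned/by_source/`, so the weather-data description above looks like a leftover from a template and should be confirmed or replaced.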

In [2]:

# Load the cleaned energy-price table; this frame is later the left side of the merge.
df_price = pd.read_csv(
    "../../data_cleaned/by_source/01_ENERGY_PRICES.csv",
    sep=",",
)


In [3]:
# (rows, columns) of the price table.
df_price.shape

(96336, 12)

In [4]:
# Load the cleaned load table (forecast and actual load columns are used in the merge).
df_load = pd.read_csv(
    "../../data_cleaned/by_source/02_LOAD.csv",
    sep=",",
)

In [5]:
# Load the cleaned offshore wind table.
df_res_offshore = pd.read_csv(
    "../../data_cleaned/by_source/03_RES_WIND_OFFSHORE.csv",
    sep=",",
)



In [6]:
# Load the cleaned onshore wind table.
df_res_onshore = pd.read_csv(
    "../../data_cleaned/by_source/04_RES_WIND_ONSHORE.csv",
    sep=",",
)

In [7]:
# (rows, columns) of the offshore wind table.
# NOTE(review): the preceding cell loads the onshore table, yet this inspects the
# offshore one -- confirm which frame was meant to be checked here.
df_res_offshore.shape

(96432, 13)

In [8]:

# Load the cleaned solar table.
df_res_solar = pd.read_csv(
    "../../data_cleaned/by_source/05_RES_SOLAR.csv",
    sep=",",
)

In [9]:
# (rows, columns) of the solar table.
df_res_solar.shape

(96432, 13)

In [10]:

# Load the cleaned total-generation table.
df_tot_gen = pd.read_csv(
    "../../data_cleaned/by_source/06_TOTAL_GEN.csv",
    sep=",",
)

In [11]:
# Column names of the offshore wind table.
df_res_offshore.columns

Index(['date', 'year', 'month', 'day', 'dayofyear', 'hour', 'week',
       'dayofweek', 'off_wind_da', 'off_wind_act', 'period_start_utc',
       'period_end_utc', 'c_by_hour'],
      dtype='object')

In [12]:
# Column names of the total-generation table.
df_tot_gen.columns

Index(['date', 'year', 'month', 'day', 'dayofyear', 'hour', 'week',
       'dayofweek', 'gen_forecast_da', 'gen_actual', 'period_start_utc',
       'period_end_utc', 'c_by_hour'],
      dtype='object')

In [13]:
# Column names of the onshore wind table.
df_res_onshore.columns

Index(['date', 'year', 'month', 'day', 'dayofyear', 'hour', 'week',
       'dayofweek', 'on_wind_da', 'on_wind_act', 'period_start_utc',
       'period_end_utc', 'c_by_hour'],
      dtype='object')

In [14]:
from functools import reduce

# Frames to combine, in merge order. The price table comes first and keeps all of
# its columns (calendar fields, period_start_utc / period_end_utc, ...); every other
# source contributes only the join key plus its own value columns, so the calendar
# columns are not duplicated with _x / _y suffixes.
dfs = [
    df_price,
    df_load[["period_start_utc", "load_forecast_da", "load_actual"]],
    df_res_offshore[["period_start_utc", "off_wind_da", "off_wind_act"]],
    df_res_onshore[["period_start_utc", "on_wind_da", "on_wind_act"]],
    df_res_solar[["period_start_utc", "solar_da", "solar_act"]],
    df_tot_gen[["period_start_utc", "gen_forecast_da", "gen_actual"]],
]

# Successive left joins on the period start: the result keeps exactly the rows of
# the price table; timestamps that are missing in another source become NaN there.
df_merged = reduce(
    lambda left, right: pd.merge(left, right, on="period_start_utc", how="left"),
    dfs,
)

# A duplicated period_start_utc in any right-hand frame would silently multiply
# rows in a left join, so fail loudly if the row count drifts from the price table.
assert len(df_merged) == len(df_price), (
    f"merge changed the row count: {len(df_price)} -> {len(df_merged)}; "
    "check the source tables for duplicated period_start_utc values"
)

In [15]:
# (rows, columns) of the merged table; with left joins on the price table the row
# count is expected to equal that of df_price.
df_merged.shape

(96336, 22)

In [16]:
# Preview the merged table (pandas renders a truncated head/tail view for a frame this large).
df_merged

Unnamed: 0,date,year,month,day,dayofyear,hour,week,dayofweek,price,period_start_utc,...,load_forecast_da,load_actual,off_wind_da,off_wind_act,on_wind_da,on_wind_act,solar_da,solar_act,gen_forecast_da,gen_actual
0,2015-01-04,2015,1,4,4,23,1,6,22.3400,2015-01-04 23:00:00,...,50326.4700,53613.9750,192.1250,478.0200,11675.5250,14223.2775,0.0,0.1700,,57028.5925
1,2015-01-05,2015,1,5,5,0,2,0,17.9300,2015-01-05 00:00:00,...,48599.1775,51367.7075,192.3750,468.5525,11924.5575,14207.5025,0.0,0.2075,,56318.8525
2,2015-01-05,2015,1,5,5,1,2,0,15.1700,2015-01-05 01:00:00,...,47364.1200,50369.3225,195.0000,465.5550,12000.4075,14439.8025,0.0,0.1800,,56216.6000
3,2015-01-05,2015,1,5,5,2,2,0,16.3800,2015-01-05 02:00:00,...,47292.0000,50171.3625,193.6250,454.0175,12108.2650,14584.6825,0.0,0.2000,,56394.7350
4,2015-01-05,2015,1,5,5,3,2,0,17.3800,2015-01-05 03:00:00,...,48370.0200,51393.2750,187.6250,457.4750,12196.1175,15071.2450,0.0,0.1950,,57670.7700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96331,2025-12-31,2025,12,31,365,18,1,2,95.9900,2025-12-31 18:00:00,...,61429.2525,55093.5625,5430.0475,5418.4900,25289.8075,26949.3850,0.0,0.0675,58900.9800,60292.3475
96332,2025-12-31,2025,12,31,365,19,1,2,86.8025,2025-12-31 19:00:00,...,58695.5025,52129.1650,5494.9900,5015.0525,26179.9500,28002.9700,0.0,0.0525,58562.3375,58865.2100
96333,2025-12-31,2025,12,31,365,20,1,2,79.7975,2025-12-31 20:00:00,...,55991.9725,50470.0175,5359.1000,4390.0575,27294.2850,29909.6650,0.0,0.0575,57650.8775,59417.6050
96334,2025-12-31,2025,12,31,365,21,1,2,81.3900,2025-12-31 21:00:00,...,53693.2025,49173.3875,5389.0800,3363.9350,27876.2850,29905.7425,0.0,0.0400,56819.4225,59089.8875


In [17]:
# Column names of the merged table.
df_merged.columns

Index(['date', 'year', 'month', 'day', 'dayofyear', 'hour', 'week',
       'dayofweek', 'price', 'period_start_utc', 'period_end_utc', 'c_by_hour',
       'load_forecast_da', 'load_actual', 'off_wind_da', 'off_wind_act',
       'on_wind_da', 'on_wind_act', 'solar_da', 'solar_act', 'gen_forecast_da',
       'gen_actual'],
      dtype='object')

In [18]:
# Sanity checks on the time axis of the merged table:
#   1) rows are ordered by period start,
#   2) every period starts and ends on a full hour,
#   3) every period is exactly one hour long,
#   4) the period starts form a gap-free hourly sequence,
#   5) no period start occurs more than once.
ONE_HOUR = pd.Timedelta(hours=1)

# Parse BEFORE sorting: sorting the raw text column is only chronological when the
# strings happen to be ISO-formatted; sorting real datetimes is always chronological.
# Unparseable values become NaT (errors="coerce") and then fail checks 2) and 3).
df_chk = df_merged.copy()
df_chk["period_start_utc"] = pd.to_datetime(df_chk["period_start_utc"], errors="coerce")
df_chk["period_end_utc"] = pd.to_datetime(df_chk["period_end_utc"], errors="coerce")

# 1) sort by period_start_utc
df_chk = df_chk.sort_values("period_start_utc").reset_index(drop=True)

start_ts = df_chk["period_start_utc"]
end_ts = df_chk["period_end_utc"]

# 2) only full-hour timestamps
full_hour_start = (start_ts.dt.minute == 0) & (start_ts.dt.second == 0)
full_hour_end = (end_ts.dt.minute == 0) & (end_ts.dt.second == 0)

# 3) exactly one hour difference
one_hour = (end_ts - start_ts) == ONE_HOUR

# 4) no missing timestamps (continuous hourly sequence).
# A Timedelta is passed as the frequency instead of the "H" alias, which is
# deprecated in recent pandas releases.
expected = pd.date_range(
    start=start_ts.min(),
    end=start_ts.max(),
    freq=ONE_HOUR,
)
missing = expected.difference(start_ts)

# 5) duplicated period starts -- these are invisible to the gap check above
n_duplicates = int(start_ts.duplicated().sum())

# rows failing any row-wise checks
bad_rows = df_chk[~(full_hour_start & full_hour_end & one_hour)]

print("Full-hour start:", full_hour_start.all())
print("Full-hour end:", full_hour_end.all())
print("Exactly 1 hour diff:", one_hour.all())
print("Missing timestamps:", len(missing))
print("Duplicate timestamps:", n_duplicates)

# Inspect
bad_rows.head(), missing[:10]


Full-hour start: True
Full-hour end: True
Exactly 1 hour diff: True
Missing timestamps: 0


(Empty DataFrame
 Columns: [date, year, month, day, dayofyear, hour, week, dayofweek, price, period_start_utc, period_end_utc, c_by_hour, load_forecast_da, load_actual, off_wind_da, off_wind_act, on_wind_da, on_wind_act, solar_da, solar_act, gen_forecast_da, gen_actual]
 Index: []
 
 [0 rows x 22 columns],
 DatetimeIndex([], dtype='datetime64[ns]', freq='H'))

In [19]:
# Work on a copy so the checked frame (df_chk) stays untouched.
df_sorted = df_chk.copy()

# Total renewable feed-in = offshore wind + onshore wind + solar, for the day-ahead
# forecast (res_sum_da) and for the actual values (res_sum_act).
# min_count=1: a row in which ALL three components are missing yields NaN instead of
# a misleading 0.0 (the default for an all-NaN sum). Rows with only some components
# missing still sum the available values, as before.
df_sorted["res_sum_da"] = df_sorted[["off_wind_da", "on_wind_da", "solar_da"]].sum(
    axis=1, min_count=1
)
df_sorted["res_sum_act"] = df_sorted[["off_wind_act", "on_wind_act", "solar_act"]].sum(
    axis=1, min_count=1
)

In [20]:
# Column names after adding the res_sum_da / res_sum_act totals.
df_sorted.columns

Index(['date', 'year', 'month', 'day', 'dayofyear', 'hour', 'week',
       'dayofweek', 'price', 'period_start_utc', 'period_end_utc', 'c_by_hour',
       'load_forecast_da', 'load_actual', 'off_wind_da', 'off_wind_act',
       'on_wind_da', 'on_wind_act', 'solar_da', 'solar_act', 'gen_forecast_da',
       'gen_actual', 'res_sum_da', 'res_sum_act'],
      dtype='object')

In [21]:
# Write the sorted frame (not df_merged) so the export is ordered by period start and
# includes the res_sum_da / res_sum_act columns; the positional index is not written.
df_sorted.to_csv(
    "../../data_cleaned/merged/Merge_all_prices_load_gen_res.csv",
    index=False,
)