## Date Handling of Input Files

> The data contains periods in CET/CEST format.
	

* we need to have start and end date in one consitent time zone
* we need to split the period into start and end date of the period 

"01/01/2015 00:00:00 - 01/01/2015 00:15:00"


## Setup

All the manipulations and plots in this notebook can be created with standard libraries such as matplotlib, statsmodels etc. 

In [29]:
# Main data packages. 
import numpy as np
import pandas as pd


## Import Data 

The data for this notebook was downloaded from the [meteoblue website](https://www.meteoblue.com/en/weather/archive/export/basel_switzerland_2661604) and consits of weather data for the city of Basel from 2008 till 2020. 

In [30]:
files = [
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_201412312300-201512312300.csv",
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_201512312300-201612312300.csv",
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_201612312300-201712312300.csv",
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_201712312300-201812312300 - 1.csv",
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_201712312300-201812312300 - 2.csv",
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_201812312300-201912312300.csv",
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_201912312300-202012312300.csv",
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_202012312300-202112312300.csv",
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_202112312300-202212312300.csv",
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_202112312300-202212312300.csv",
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_202312312300-202412312300.csv",
    "../../data/res/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_202412312300-202512312300.csv",
    # "../../data/ree/on_wind/GUI_WIND_SOLAR_GENERATION_FORECAST_ONSHORE_202512312300-202612312300.csv",
]

raw_df = pd.concat((pd.read_csv(f, delimiter=",") for f in files), ignore_index=True)


In [31]:
raw_df.shape  #(541275, 6)

(420768, 6)

In [32]:
raw_df.columns

Index(['MTU (CET/CEST)', 'Area', 'Day-ahead (MW)', 'Intraday (MW)',
       'Current (MW)', 'Actual (MW)'],
      dtype='object')

In [33]:
# Create working copy of dataframe
df_utc_op = raw_df.copy()
df_utc_op = df_utc_op[["MTU (CET/CEST)","Day-ahead (MW)", "Actual (MW)"]].rename(  #"Intraday (MW)","Current (MW)",
    columns={
        'MTU (CET/CEST)': 'period',  
        "Day-ahead (MW)": "on_wind_da",
        # "Intraday (MW)": "OffSh Intraday (MW)",
        # "Current (MW)": "OffSh Current (MW)",
        "Actual (MW)": "on_wind_act",
    }
)
df_utc_op.head()

Unnamed: 0,period,on_wind_da,on_wind_act
0,01/01/2015 00:00:00 - 01/01/2015 00:15:00,7517.7,8866.33
1,01/01/2015 00:15:00 - 01/01/2015 00:30:00,7603.26,8867.96
2,01/01/2015 00:30:00 - 01/01/2015 00:45:00,7695.36,8986.9
3,01/01/2015 00:45:00 - 01/01/2015 01:00:00,7779.2,8980.77
4,01/01/2015 01:00:00 - 01/01/2015 01:15:00,8108.18,9045.36


In [34]:
df_utc_op.shape

(420768, 3)

In [35]:
# Split "period" into start/end strings
df_utc_op[["period_start", "period_end"]] = df_utc_op["period"].str.split(" - ", n=1, expand=True)
df_utc_op.head()

Unnamed: 0,period,on_wind_da,on_wind_act,period_start,period_end
0,01/01/2015 00:00:00 - 01/01/2015 00:15:00,7517.7,8866.33,01/01/2015 00:00:00,01/01/2015 00:15:00
1,01/01/2015 00:15:00 - 01/01/2015 00:30:00,7603.26,8867.96,01/01/2015 00:15:00,01/01/2015 00:30:00
2,01/01/2015 00:30:00 - 01/01/2015 00:45:00,7695.36,8986.9,01/01/2015 00:30:00,01/01/2015 00:45:00
3,01/01/2015 00:45:00 - 01/01/2015 01:00:00,7779.2,8980.77,01/01/2015 00:45:00,01/01/2015 01:00:00
4,01/01/2015 01:00:00 - 01/01/2015 01:15:00,8108.18,9045.36,01/01/2015 01:00:00,01/01/2015 01:15:00


In [36]:

def add_timezone_and_utc(df, col): #="period_start"):
    # State: start in CET
    tz_state = "CET"
    tz_list = []

    # Walk rows in order and flip state if original string contains CET/CEST
    for val in df[col].astype(str):
        if "CEST" in val:
            tz_state = "CEST"
        elif "CET" in val:
            tz_state = "CET"
        tz_list.append(tz_state)

    # Parse datetime from the column (remove any existing timezone text)
    base_dt = pd.to_datetime(
        df[col].astype(str).str.replace(r"\s*\(.*\)$", "", regex=True),
        dayfirst=True,
        errors="coerce"
    )

    # Add labeled version
    # df["period_start_labeled"] = (
    df[col + "_labeled"] = (
        base_dt.dt.strftime("%d/%m/%Y %H:%M:%S") + " (" + pd.Series(tz_list, index=df.index) + ")"
    )

    # Compute UTC by subtracting 1 or 2 hours depending on CET/CEST
    offset_hours = pd.Series(tz_list, index=df.index).map({"CET": 1, "CEST": 2})
    df[col + "_utc"] = base_dt - pd.to_timedelta(offset_hours, unit="h")
    # df["tsp_start_utc"] = base_dt - pd.to_timedelta(offset_hours, unit="h")

    return df


In [37]:
df_utc_op = add_timezone_and_utc(df_utc_op, col="period_start")
df_utc_op = add_timezone_and_utc(df_utc_op, col="period_end")

In [38]:
df_utc_op

Unnamed: 0,period,on_wind_da,on_wind_act,period_start,period_end,period_start_labeled,period_start_utc,period_end_labeled,period_end_utc
0,01/01/2015 00:00:00 - 01/01/2015 00:15:00,7517.7,8866.33,01/01/2015 00:00:00,01/01/2015 00:15:00,01/01/2015 00:00:00 (CET),2014-12-31 23:00:00,01/01/2015 00:15:00 (CET),2014-12-31 23:15:00
1,01/01/2015 00:15:00 - 01/01/2015 00:30:00,7603.26,8867.96,01/01/2015 00:15:00,01/01/2015 00:30:00,01/01/2015 00:15:00 (CET),2014-12-31 23:15:00,01/01/2015 00:30:00 (CET),2014-12-31 23:30:00
2,01/01/2015 00:30:00 - 01/01/2015 00:45:00,7695.36,8986.9,01/01/2015 00:30:00,01/01/2015 00:45:00,01/01/2015 00:30:00 (CET),2014-12-31 23:30:00,01/01/2015 00:45:00 (CET),2014-12-31 23:45:00
3,01/01/2015 00:45:00 - 01/01/2015 01:00:00,7779.2,8980.77,01/01/2015 00:45:00,01/01/2015 01:00:00,01/01/2015 00:45:00 (CET),2014-12-31 23:45:00,01/01/2015 01:00:00 (CET),2015-01-01 00:00:00
4,01/01/2015 01:00:00 - 01/01/2015 01:15:00,8108.18,9045.36,01/01/2015 01:00:00,01/01/2015 01:15:00,01/01/2015 01:00:00 (CET),2015-01-01 00:00:00,01/01/2015 01:15:00 (CET),2015-01-01 00:15:00
...,...,...,...,...,...,...,...,...,...
420763,31/12/2025 22:45:00 - 31/12/2025 23:00:00,28246.54,30309.09,31/12/2025 22:45:00,31/12/2025 23:00:00,31/12/2025 22:45:00 (CET),2025-12-31 21:45:00,31/12/2025 23:00:00 (CET),2025-12-31 22:00:00
420764,31/12/2025 23:00:00 - 31/12/2025 23:15:00,28538.46,30296.55,31/12/2025 23:00:00,31/12/2025 23:15:00,31/12/2025 23:00:00 (CET),2025-12-31 22:00:00,31/12/2025 23:15:00 (CET),2025-12-31 22:15:00
420765,31/12/2025 23:15:00 - 31/12/2025 23:30:00,29009.41,30638.98,31/12/2025 23:15:00,31/12/2025 23:30:00,31/12/2025 23:15:00 (CET),2025-12-31 22:15:00,31/12/2025 23:30:00 (CET),2025-12-31 22:30:00
420766,31/12/2025 23:30:00 - 31/12/2025 23:45:00,29471.1,31365.92,31/12/2025 23:30:00,31/12/2025 23:45:00,31/12/2025 23:30:00 (CET),2025-12-31 22:30:00,31/12/2025 23:45:00 (CET),2025-12-31 22:45:00


In [39]:
df_utc_op.columns

Index(['period', 'on_wind_da', 'on_wind_act', 'period_start', 'period_end',
       'period_start_labeled', 'period_start_utc', 'period_end_labeled',
       'period_end_utc'],
      dtype='object')

In [40]:
# Create working copy of dataframe
df_utc_op_temp = df_utc_op.copy()
# select only those values
# df_price = df_price[df_price["Sequence"] != "Sequence Sequence 2"]
 # drop columns
df_utc_op.drop(columns=['period_start', 'period_end', 'period_start_labeled', 'period_end_labeled'], inplace=True)

df_utc_op = df_utc_op.assign(
    date1=lambda x: x["period_start_utc"].dt.date,
    date=lambda x: pd.to_datetime(
        x["date1"],
        format="mixed",
        dayfirst=True,
        errors="coerce"
    ),
    year=lambda x: x["period_start_utc"].dt.year,
    month=lambda x: x["period_start_utc"].dt.month,
    #monthname=lambda x: x["period_start_utc"].dt.month_name(),
    day=lambda x: x["period_start_utc"].dt.day,
    dayofyear=lambda x: x["period_start_utc"].dt.dayofyear,
    hour=lambda x: x["period_start_utc"].dt.hour,
    #week=lambda x: x["period_start_utc"].dt.isocalendar().week.astype(int),
    week=lambda x: x["period_start_utc"].dt.isocalendar().week.astype("Int64"),
    dayofweek=lambda x: x["period_start_utc"].dt.dayofweek,   # 0=Mon ... 6=Sun
    #dayname=lambda x: x["period_start_utc"].dt.day_name(),   

)
df_utc_op.drop(columns=['date1'], inplace=True)

df_utc_op.head()

Unnamed: 0,period,on_wind_da,on_wind_act,period_start_utc,period_end_utc,date,year,month,day,dayofyear,hour,week,dayofweek
0,01/01/2015 00:00:00 - 01/01/2015 00:15:00,7517.7,8866.33,2014-12-31 23:00:00,2014-12-31 23:15:00,2014-12-31,2014,12,31,365,23,1,2
1,01/01/2015 00:15:00 - 01/01/2015 00:30:00,7603.26,8867.96,2014-12-31 23:15:00,2014-12-31 23:30:00,2014-12-31,2014,12,31,365,23,1,2
2,01/01/2015 00:30:00 - 01/01/2015 00:45:00,7695.36,8986.9,2014-12-31 23:30:00,2014-12-31 23:45:00,2014-12-31,2014,12,31,365,23,1,2
3,01/01/2015 00:45:00 - 01/01/2015 01:00:00,7779.2,8980.77,2014-12-31 23:45:00,2015-01-01 00:00:00,2014-12-31,2014,12,31,365,23,1,2
4,01/01/2015 01:00:00 - 01/01/2015 01:15:00,8108.18,9045.36,2015-01-01 00:00:00,2015-01-01 00:15:00,2015-01-01,2015,1,1,1,0,1,3


In [41]:
df_utc_op.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420768 entries, 0 to 420767
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   period            420768 non-null  object        
 1   on_wind_da        420760 non-null  object        
 2   on_wind_act       420575 non-null  object        
 3   period_start_utc  420768 non-null  datetime64[ns]
 4   period_end_utc    420768 non-null  datetime64[ns]
 5   date              420768 non-null  datetime64[ns]
 6   year              420768 non-null  int32         
 7   month             420768 non-null  int32         
 8   day               420768 non-null  int32         
 9   dayofyear         420768 non-null  int32         
 10  hour              420768 non-null  int32         
 11  week              420768 non-null  Int64         
 12  dayofweek         420768 non-null  int32         
dtypes: Int64(1), datetime64[ns](3), int32(6), object(3)
memory 

In [42]:
# Check NaN only in one column ('price')
rows_with_missing_price = df_utc_op[df_utc_op["on_wind_da"].isna()]
rows_with_missing_price.tail(100)

Unnamed: 0,period,on_wind_da,on_wind_act,period_start_utc,period_end_utc,date,year,month,day,dayofyear,hour,week,dayofweek
379388,27/10/2024 00:00:00 - 27/10/2024 00:15:00,,13274.82,2024-10-26 22:00:00,2024-10-26 22:15:00,2024-10-26,2024,10,26,300,22,43,5
379389,27/10/2024 00:15:00 - 27/10/2024 00:30:00,,13322.28,2024-10-26 22:15:00,2024-10-26 22:30:00,2024-10-26,2024,10,26,300,22,43,5
379390,27/10/2024 00:30:00 - 27/10/2024 00:45:00,,13214.21,2024-10-26 22:30:00,2024-10-26 22:45:00,2024-10-26,2024,10,26,300,22,43,5
379391,27/10/2024 00:45:00 - 27/10/2024 01:00:00,,13103.85,2024-10-26 22:45:00,2024-10-26 23:00:00,2024-10-26,2024,10,26,300,22,43,5
414332,26/10/2025 00:00:00 - 26/10/2025 00:15:00,,31320.38,2025-10-25 22:00:00,2025-10-25 22:15:00,2025-10-25,2025,10,25,298,22,43,5
414333,26/10/2025 00:15:00 - 26/10/2025 00:30:00,,31109.36,2025-10-25 22:15:00,2025-10-25 22:30:00,2025-10-25,2025,10,25,298,22,43,5
414334,26/10/2025 00:30:00 - 26/10/2025 00:45:00,,31415.02,2025-10-25 22:30:00,2025-10-25 22:45:00,2025-10-25,2025,10,25,298,22,43,5
414335,26/10/2025 00:45:00 - 26/10/2025 01:00:00,,31504.98,2025-10-25 22:45:00,2025-10-25 23:00:00,2025-10-25,2025,10,25,298,22,43,5


In [43]:
# Check NaN only in one column ('price')
rows_with_missing_price = df_utc_op[df_utc_op["on_wind_act"].isna()]
rows_with_missing_price

Unnamed: 0,period,on_wind_da,on_wind_act,period_start_utc,period_end_utc,date,year,month,day,dayofyear,hour,week,dayofweek
49624,31/05/2016 23:00:00 - 31/05/2016 23:15:00,4037.27,,2016-05-31 21:00:00,2016-05-31 21:15:00,2016-05-31,2016,5,31,152,21,22,1
49625,31/05/2016 23:15:00 - 31/05/2016 23:30:00,4105.33,,2016-05-31 21:15:00,2016-05-31 21:30:00,2016-05-31,2016,5,31,152,21,22,1
49626,31/05/2016 23:30:00 - 31/05/2016 23:45:00,4170.27,,2016-05-31 21:30:00,2016-05-31 21:45:00,2016-05-31,2016,5,31,152,21,22,1
49627,31/05/2016 23:45:00 - 01/06/2016 00:00:00,4238.37,,2016-05-31 21:45:00,2016-05-31 22:00:00,2016-05-31,2016,5,31,152,21,22,1
49628,01/06/2016 00:00:00 - 01/06/2016 00:15:00,4809.9,,2016-05-31 22:00:00,2016-05-31 22:15:00,2016-05-31,2016,5,31,152,22,22,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64024,28/10/2016 23:00:00 - 28/10/2016 23:15:00,9385.66,,2016-10-28 21:00:00,2016-10-28 21:15:00,2016-10-28,2016,10,28,302,21,43,4
64025,28/10/2016 23:15:00 - 28/10/2016 23:30:00,9325.19,,2016-10-28 21:15:00,2016-10-28 21:30:00,2016-10-28,2016,10,28,302,21,43,4
64026,28/10/2016 23:30:00 - 28/10/2016 23:45:00,9261.69,,2016-10-28 21:30:00,2016-10-28 21:45:00,2016-10-28,2016,10,28,302,21,43,4
64027,28/10/2016 23:45:00 - 29/10/2016 00:00:00,9195.97,,2016-10-28 21:45:00,2016-10-28 22:00:00,2016-10-28,2016,10,28,302,21,43,4


In [44]:
# Convert to numeric, turn non-numeric strings into NaN
df_utc_op["on_wind_act"] = pd.to_numeric(df_utc_op["on_wind_act"], errors="coerce")
df_utc_op["on_wind_da"] = pd.to_numeric(df_utc_op["on_wind_da"], errors="coerce")

rows_with_missing_price = df_utc_op[df_utc_op["on_wind_act"].isna()]
rows_with_missing_price

Unnamed: 0,period,on_wind_da,on_wind_act,period_start_utc,period_end_utc,date,year,month,day,dayofyear,hour,week,dayofweek
49624,31/05/2016 23:00:00 - 31/05/2016 23:15:00,4037.27,,2016-05-31 21:00:00,2016-05-31 21:15:00,2016-05-31,2016,5,31,152,21,22,1
49625,31/05/2016 23:15:00 - 31/05/2016 23:30:00,4105.33,,2016-05-31 21:15:00,2016-05-31 21:30:00,2016-05-31,2016,5,31,152,21,22,1
49626,31/05/2016 23:30:00 - 31/05/2016 23:45:00,4170.27,,2016-05-31 21:30:00,2016-05-31 21:45:00,2016-05-31,2016,5,31,152,21,22,1
49627,31/05/2016 23:45:00 - 01/06/2016 00:00:00,4238.37,,2016-05-31 21:45:00,2016-05-31 22:00:00,2016-05-31,2016,5,31,152,21,22,1
49628,01/06/2016 00:00:00 - 01/06/2016 00:15:00,4809.90,,2016-05-31 22:00:00,2016-05-31 22:15:00,2016-05-31,2016,5,31,152,22,22,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
166456,30/09/2018 23:00:00 - 30/09/2018 23:15:00,,,2018-09-30 21:00:00,2018-09-30 21:15:00,2018-09-30,2018,9,30,273,21,39,6
166457,30/09/2018 23:15:00 - 30/09/2018 23:30:00,,,2018-09-30 21:15:00,2018-09-30 21:30:00,2018-09-30,2018,9,30,273,21,39,6
166458,30/09/2018 23:30:00 - 30/09/2018 23:45:00,,,2018-09-30 21:30:00,2018-09-30 21:45:00,2018-09-30,2018,9,30,273,21,39,6
166459,30/09/2018 23:45:00 - 01/10/2018 00:00:00,,,2018-09-30 21:45:00,2018-09-30 22:00:00,2018-09-30,2018,9,30,273,21,39,6


In [45]:
df_utc_op.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420768 entries, 0 to 420767
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   period            420768 non-null  object        
 1   on_wind_da        385720 non-null  float64       
 2   on_wind_act       385535 non-null  float64       
 3   period_start_utc  420768 non-null  datetime64[ns]
 4   period_end_utc    420768 non-null  datetime64[ns]
 5   date              420768 non-null  datetime64[ns]
 6   year              420768 non-null  int32         
 7   month             420768 non-null  int32         
 8   day               420768 non-null  int32         
 9   dayofyear         420768 non-null  int32         
 10  hour              420768 non-null  int32         
 11  week              420768 non-null  Int64         
 12  dayofweek         420768 non-null  int32         
dtypes: Int64(1), datetime64[ns](3), float64(2), int32(6), objec

In [58]:
group_cols = [
    "date", "year", "month", "day", "dayofyear", "hour", "week", "dayofweek"
]

df_utc_h = (
    df_utc_op
    .groupby(group_cols, as_index=False)
    .agg(
        on_wind_da=("on_wind_da", "mean"),
        on_wind_act=("on_wind_act", "mean"),
        period_start_utc=("period_start_utc", "min"),
        period_end_utc=("period_end_utc", "max"),
        c_by_hour=("year", "size"),   # count rows per group
    )
)
df_utc_h.head()

Unnamed: 0,date,year,month,day,dayofyear,hour,week,dayofweek,on_wind_da,on_wind_act,period_start_utc,period_end_utc,c_by_hour
0,2014-12-31,2014,12,31,365,23,1,2,7648.88,8925.49,2014-12-31 23:00:00,2015-01-01 00:00:00,4
1,2015-01-01,2015,1,1,1,0,1,3,8212.41,9129.9925,2015-01-01 00:00:00,2015-01-01 01:00:00,4
2,2015-01-01,2015,1,1,1,1,1,3,8340.6575,9287.08,2015-01-01 01:00:00,2015-01-01 02:00:00,4
3,2015-01-01,2015,1,1,1,2,1,3,8408.77,9232.5225,2015-01-01 02:00:00,2015-01-01 03:00:00,4
4,2015-01-01,2015,1,1,1,3,1,3,8581.785,9319.295,2015-01-01 03:00:00,2015-01-01 04:00:00,4


In [59]:
df_utc_h.tail()

Unnamed: 0,date,year,month,day,dayofyear,hour,week,dayofweek,on_wind_da,on_wind_act,period_start_utc,period_end_utc,c_by_hour
87667,2025-12-31,2025,12,31,365,18,1,2,25289.8075,26949.385,2025-12-31 18:00:00,2025-12-31 19:00:00,4
87668,2025-12-31,2025,12,31,365,19,1,2,26179.95,28002.97,2025-12-31 19:00:00,2025-12-31 20:00:00,4
87669,2025-12-31,2025,12,31,365,20,1,2,27294.285,29909.665,2025-12-31 20:00:00,2025-12-31 21:00:00,4
87670,2025-12-31,2025,12,31,365,21,1,2,27876.285,29905.7425,2025-12-31 21:00:00,2025-12-31 22:00:00,4
87671,2025-12-31,2025,12,31,365,22,1,2,29243.185,31084.6075,2025-12-31 22:00:00,2025-12-31 23:00:00,4


In [60]:
df_utc_h["c_by_hour"].value_counts()

c_by_hour
4    70152
8    17520
Name: count, dtype: int64

In [61]:
df_utc_h[df_utc_h["c_by_hour"] > 4]

Unnamed: 0,date,year,month,day,dayofyear,hour,week,dayofweek,on_wind_da,on_wind_act,period_start_utc,period_end_utc,c_by_hour
26304,2017-12-31,2017,12,31,365,23,52,6,29244.4275,29647.5750,2017-12-31 23:00:00,2018-01-01 00:00:00,8
26305,2018-01-01,2018,1,1,1,0,1,0,30310.5925,30637.2375,2018-01-01 00:00:00,2018-01-01 01:00:00,8
26306,2018-01-01,2018,1,1,1,1,1,0,31520.8025,31217.9950,2018-01-01 01:00:00,2018-01-01 02:00:00,8
26307,2018-01-01,2018,1,1,1,2,1,0,32588.2250,31861.7125,2018-01-01 02:00:00,2018-01-01 03:00:00,8
26308,2018-01-01,2018,1,1,1,3,1,0,33411.4250,31610.9025,2018-01-01 03:00:00,2018-01-01 04:00:00,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
70123,2022-12-31,2022,12,31,365,18,52,5,34793.9975,31062.9850,2022-12-31 18:00:00,2022-12-31 19:00:00,8
70124,2022-12-31,2022,12,31,365,19,52,5,35025.0125,31046.3725,2022-12-31 19:00:00,2022-12-31 20:00:00,8
70125,2022-12-31,2022,12,31,365,20,52,5,35161.5975,31300.0900,2022-12-31 20:00:00,2022-12-31 21:00:00,8
70126,2022-12-31,2022,12,31,365,21,52,5,34383.6450,30941.0800,2022-12-31 21:00:00,2022-12-31 22:00:00,8


In [62]:
group_cols = [
    "date", "year", "month", "day", "dayofyear", "week", "dayofweek"
]

df_utc_d = (
    df_utc_h
    .groupby(group_cols, as_index=False)
    .agg(
        on_wind_da=("on_wind_da", "mean"),
        on_wind_act=("on_wind_act", "mean"),
        period_start_utc=("period_start_utc", "min"),
        period_end_utc=("period_end_utc", "max"),
        c_by_day=("year", "size"),   # count rows per group
    )
)
df_utc_d.head()

Unnamed: 0,date,year,month,day,dayofyear,week,dayofweek,on_wind_da,on_wind_act,period_start_utc,period_end_utc,c_by_day
0,2014-12-31,2014,12,31,365,1,2,7648.88,8925.49,2014-12-31 23:00:00,2015-01-01,1
1,2015-01-01,2015,1,1,1,1,3,12042.876667,13866.128438,2015-01-01 00:00:00,2015-01-02,24
2,2015-01-02,2015,1,2,2,1,4,26106.059375,25339.485729,2015-01-02 00:00:00,2015-01-03,24
3,2015-01-03,2015,1,3,3,1,5,18343.128229,19561.925625,2015-01-03 00:00:00,2015-01-04,24
4,2015-01-04,2015,1,4,4,1,6,17133.397812,17753.627708,2015-01-04 00:00:00,2015-01-05,24


In [63]:
df_utc_d.tail()

Unnamed: 0,date,year,month,day,dayofyear,week,dayofweek,on_wind_da,on_wind_act,period_start_utc,period_end_utc,c_by_day
3650,2025-12-27,2025,12,27,361,52,5,12727.308333,12828.157708,2025-12-27,2025-12-28 00:00:00,24
3651,2025-12-28,2025,12,28,362,52,6,10650.399167,9942.153542,2025-12-28,2025-12-29 00:00:00,24
3652,2025-12-29,2025,12,29,363,1,0,19306.082708,18193.835938,2025-12-29,2025-12-30 00:00:00,24
3653,2025-12-30,2025,12,30,364,1,1,16268.198542,16096.952812,2025-12-30,2025-12-31 00:00:00,24
3654,2025-12-31,2025,12,31,365,1,2,23525.647717,26618.26413,2025-12-31,2025-12-31 23:00:00,23


In [64]:
df_utc_d["c_by_day"].value_counts()

c_by_day
24    3651
1        2
23       2
Name: count, dtype: int64

In [66]:
group_cols = [
    "year"
]

df_utc_y = (
    df_utc_d
    .groupby(group_cols, as_index=False)
    .agg(
        on_wind_da=("on_wind_da", "mean"),
        on_wind_act=("on_wind_act", "mean"),
        period_start_utc=("period_start_utc", "min"),
        period_end_utc=("period_end_utc", "max"),
        c_by_year=("year", "size"),   # count rows per group
    )
)
df_utc_y.head(20)

Unnamed: 0,year,on_wind_da,on_wind_act,period_start_utc,period_end_utc,c_by_year
0,2014,7648.88,8925.49,2014-12-31 23:00:00,2015-01-01 00:00:00,1
1,2015,8443.835543,8696.355618,2015-01-01 00:00:00,2016-01-01 00:00:00,365
2,2016,7907.393071,8156.036882,2016-01-01 00:00:00,2017-01-01 00:00:00,366
3,2017,10383.917061,10610.234501,2017-01-01 00:00:00,2018-01-01 00:00:00,365
4,2018,10569.235731,10714.986315,2018-01-01 00:00:00,2019-01-01 00:00:00,365
5,2019,11230.842983,11415.052499,2019-01-01 00:00:00,2020-01-01 00:00:00,365
6,2020,11662.338829,11773.524047,2020-01-01 00:00:00,2021-01-01 00:00:00,366
7,2021,10201.293727,10261.965281,2021-01-01 00:00:00,2022-01-01 00:00:00,365
8,2022,11341.964633,11563.487552,2022-01-01 00:00:00,2022-12-31 23:00:00,365
9,2023,30113.93,29754.0975,2023-12-31 23:00:00,2024-01-01 00:00:00,1


In [67]:
# Check NaN only in one column ('price')
rows_with_missing_price = df_utc_op[df_utc_op["on_wind_da"].isna()]
rows_with_missing_price

Unnamed: 0,period,on_wind_da,on_wind_act,period_start_utc,period_end_utc,date,year,month,day,dayofyear,hour,week,dayofweek
131420,01/10/2018 00:00:00 - 01/10/2018 00:15:00,,,2018-09-30 22:00:00,2018-09-30 22:15:00,2018-09-30,2018,9,30,273,22,39,6
131421,01/10/2018 00:15:00 - 01/10/2018 00:30:00,,,2018-09-30 22:15:00,2018-09-30 22:30:00,2018-09-30,2018,9,30,273,22,39,6
131422,01/10/2018 00:30:00 - 01/10/2018 00:45:00,,,2018-09-30 22:30:00,2018-09-30 22:45:00,2018-09-30,2018,9,30,273,22,39,6
131423,01/10/2018 00:45:00 - 01/10/2018 01:00:00,,,2018-09-30 22:45:00,2018-09-30 23:00:00,2018-09-30,2018,9,30,273,22,39,6
131424,01/10/2018 01:00:00 - 01/10/2018 01:15:00,,,2018-09-30 23:00:00,2018-09-30 23:15:00,2018-09-30,2018,9,30,273,23,39,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
379391,27/10/2024 00:45:00 - 27/10/2024 01:00:00,,13103.85,2024-10-26 22:45:00,2024-10-26 23:00:00,2024-10-26,2024,10,26,300,22,43,5
414332,26/10/2025 00:00:00 - 26/10/2025 00:15:00,,31320.38,2025-10-25 22:00:00,2025-10-25 22:15:00,2025-10-25,2025,10,25,298,22,43,5
414333,26/10/2025 00:15:00 - 26/10/2025 00:30:00,,31109.36,2025-10-25 22:15:00,2025-10-25 22:30:00,2025-10-25,2025,10,25,298,22,43,5
414334,26/10/2025 00:30:00 - 26/10/2025 00:45:00,,31415.02,2025-10-25 22:30:00,2025-10-25 22:45:00,2025-10-25,2025,10,25,298,22,43,5


In [68]:
# Check NaN only in one column ('price')
rows_with_missing_price = df_utc_op[df_utc_op["on_wind_act"].isna()]
rows_with_missing_price

Unnamed: 0,period,on_wind_da,on_wind_act,period_start_utc,period_end_utc,date,year,month,day,dayofyear,hour,week,dayofweek
49624,31/05/2016 23:00:00 - 31/05/2016 23:15:00,4037.27,,2016-05-31 21:00:00,2016-05-31 21:15:00,2016-05-31,2016,5,31,152,21,22,1
49625,31/05/2016 23:15:00 - 31/05/2016 23:30:00,4105.33,,2016-05-31 21:15:00,2016-05-31 21:30:00,2016-05-31,2016,5,31,152,21,22,1
49626,31/05/2016 23:30:00 - 31/05/2016 23:45:00,4170.27,,2016-05-31 21:30:00,2016-05-31 21:45:00,2016-05-31,2016,5,31,152,21,22,1
49627,31/05/2016 23:45:00 - 01/06/2016 00:00:00,4238.37,,2016-05-31 21:45:00,2016-05-31 22:00:00,2016-05-31,2016,5,31,152,21,22,1
49628,01/06/2016 00:00:00 - 01/06/2016 00:15:00,4809.90,,2016-05-31 22:00:00,2016-05-31 22:15:00,2016-05-31,2016,5,31,152,22,22,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
166456,30/09/2018 23:00:00 - 30/09/2018 23:15:00,,,2018-09-30 21:00:00,2018-09-30 21:15:00,2018-09-30,2018,9,30,273,21,39,6
166457,30/09/2018 23:15:00 - 30/09/2018 23:30:00,,,2018-09-30 21:15:00,2018-09-30 21:30:00,2018-09-30,2018,9,30,273,21,39,6
166458,30/09/2018 23:30:00 - 30/09/2018 23:45:00,,,2018-09-30 21:30:00,2018-09-30 21:45:00,2018-09-30,2018,9,30,273,21,39,6
166459,30/09/2018 23:45:00 - 01/10/2018 00:00:00,,,2018-09-30 21:45:00,2018-09-30 22:00:00,2018-09-30,2018,9,30,273,21,39,6


## saving our files by hour

In [69]:
df_utc_h.to_csv("../../data_cleaned/by_source/04_RES_WIND_ONSHORE.csv", index=False)