In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [230]:
data = pd.read_csv('CSV_Data_3_min.csv', header=0, sep=';')
data.head()

Unnamed: 0,startTime,endTime,Wind power production - real-time data,Nuclear power production - real-time data,Hydro power production - real-time data,Electricity production in Finland - real-time data,"Electricity production, surplus/deficit - real-time data","Electricity production, reserve power plants and small-scale production - real-time data"
0,2022-12-31T23:01:00.000Z,2022-12-31T23:01:00.000Z,3442.0,4373.19999,634.5,10603.0,675.5,70.0
1,2022-12-31T23:04:00.000Z,2022-12-31T23:04:00.000Z,3441.0,4372.0,655.5,10608.0,570.5,70.0
2,2022-12-31T23:07:00.000Z,2022-12-31T23:07:00.000Z,3433.59999,4378.8,666.15999,10629.0,508.0,70.0
3,2022-12-31T23:10:00.000Z,2022-12-31T23:10:00.000Z,3445.9,4371.3,675.67999,10630.0,412.19999,70.0
4,2022-12-31T23:13:00.000Z,2022-12-31T23:13:00.000Z,3465.0,4377.3,684.49,10659.0,451.3,70.0


In [231]:
data.dtypes

startTime                                                                                       str
endTime                                                                                         str
Wind power production - real-time data                                                      float64
Nuclear power production - real-time data                                                   float64
Hydro power production - real-time data                                                     float64
Electricity production in Finland - real-time data                                          float64
Electricity production, surplus/deficit - real-time data                                    float64
Electricity production, reserve power plants and small-scale production - real-time data    float64
dtype: object

In [232]:
data.shape

(548153, 8)

In [233]:
data.columns

Index(['startTime', 'endTime', 'Wind power production - real-time data',
       'Nuclear power production - real-time data',
       'Hydro power production - real-time data',
       'Electricity production in Finland - real-time data',
       'Electricity production, surplus/deficit - real-time data',
       'Electricity production, reserve power plants and small-scale production - real-time data'],
      dtype='str')

In [234]:
data.isnull().sum()

startTime                                                                                      0
endTime                                                                                        0
Wind power production - real-time data                                                      6234
Nuclear power production - real-time data                                                   2144
Hydro power production - real-time data                                                     2076
Electricity production in Finland - real-time data                                          3513
Electricity production, surplus/deficit - real-time data                                    2119
Electricity production, reserve power plants and small-scale production - real-time data    2127
dtype: int64

In [235]:
# Drop the 'endTime' column and rename the columns
data = data.drop(columns=['endTime'])

# Rename the columns
data = data.rename(columns={
    'startTime': 'start_time',
    'Wind power production - real-time data': 'wind_production',
    'Nuclear power production - real-time data': 'nuclear_production',
    'Hydro power production - real-time data': 'hydro_production',
    'Electricity production in Finland - real-time data': 'total_production',
    'Electricity production, surplus/deficit - real-time data': 'surplus_deficit',
    'Electricity production, reserve power plants and small-scale production - real-time data': 'reserve_small_scale_production'
})

In [236]:
# Ensure 'start_time' is in datetime format
data['start_time'] = pd.to_datetime(data['start_time'])

# Set 'start_time' as the index for resampling
data = data.set_index('start_time')

In [237]:
data.head()

Unnamed: 0_level_0,wind_production,nuclear_production,hydro_production,total_production,surplus_deficit,reserve_small_scale_production
start_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-31 23:01:00+00:00,3442.0,4373.19999,634.5,10603.0,675.5,70.0
2022-12-31 23:04:00+00:00,3441.0,4372.0,655.5,10608.0,570.5,70.0
2022-12-31 23:07:00+00:00,3433.59999,4378.8,666.15999,10629.0,508.0,70.0
2022-12-31 23:10:00+00:00,3445.9,4371.3,675.67999,10630.0,412.19999,70.0
2022-12-31 23:13:00+00:00,3465.0,4377.3,684.49,10659.0,451.3,70.0


In [238]:
# Ensure datetime index
if not isinstance(data.index, pd.DatetimeIndex):
    data.index = pd.to_datetime(data.index)

# 1-hour moving average (robust to missing values)
rolling_1h = data.rolling('1h', min_periods=1).mean()

# keep one value per hour (hour-end rolling average)
data_hourly = rolling_1h.resample('1h').last()

data_hourly.head()

Unnamed: 0_level_0,wind_production,nuclear_production,hydro_production,total_production,surplus_deficit,reserve_small_scale_production
start_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-31 23:00:00+00:00,3447.92,4371.914998,637.046496,10602.2,478.399996,70.0
2023-01-01 00:00:00+00:00,3558.065,4371.744998,517.146997,10593.15,437.404996,70.0
2023-01-01 01:00:00+00:00,3635.24,4373.079996,538.334996,10706.45,658.064998,71.1
2023-01-01 02:00:00+00:00,3627.525,4373.309997,523.641995,10692.35,694.119996,72.0
2023-01-01 03:00:00+00:00,3503.089999,4372.399997,559.133995,10607.7,669.764997,72.0


In [239]:
data_hourly.isnull().sum()

wind_production                   45
nuclear_production                47
hydro_production                  45
total_production                  47
surplus_deficit                   45
reserve_small_scale_production    46
dtype: int64

In [240]:
data_hourly.dtypes

wind_production                   float64
nuclear_production                float64
hydro_production                  float64
total_production                  float64
surplus_deficit                   float64
reserve_small_scale_production    float64
dtype: object

In [244]:
# Fill missing values using neighboring values (forward fill, then backward fill)
data_hourly = data_hourly.fillna("ffill").fillna("bfill")
data_hourly.isnull().sum()

wind_production                   0
nuclear_production                0
hydro_production                  0
total_production                  0
surplus_deficit                   0
reserve_small_scale_production    0
dtype: int64

In [245]:
data_hourly.dtypes

wind_production                   object
nuclear_production                object
hydro_production                  object
total_production                  object
surplus_deficit                   object
reserve_small_scale_production    object
dtype: object

In [246]:
data_hourly.head()

Unnamed: 0_level_0,wind_production,nuclear_production,hydro_production,total_production,surplus_deficit,reserve_small_scale_production
start_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-31 23:00:00+00:00,3447.92,4371.914998,637.046496,10602.2,478.399996,70.0
2023-01-01 00:00:00+00:00,3558.065,4371.744998,517.146997,10593.15,437.404996,70.0
2023-01-01 01:00:00+00:00,3635.24,4373.079996,538.334996,10706.45,658.064998,71.1
2023-01-01 02:00:00+00:00,3627.525,4373.309997,523.641995,10692.35,694.119996,72.0
2023-01-01 03:00:00+00:00,3503.089999,4372.399997,559.133995,10607.7,669.764997,72.0


In [None]:
#data_hourly.to_csv('cleaned_fingrid_data_with_missing.csv', sep=';', index=True)
data_hourly.to_csv('cleaned_fingrid_data.csv', sep=';', index=True)