<span style="font-weight:bold; font-size: 3rem; color:#0047AB;">Feature pipeline updating hourly</span>

## 🗒️ This notebook is divided into the following sections:

1. Retrieve real-time weather and electricity data and insert the new data into their respective feature groups


## <span style='color:#ff5f27'> Imports</span>

In [1]:
import pandas as pd
import os
import numpy as np
import requests
from datetime import datetime, timedelta
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime, timedelta
import time
import json
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import MultipleLocator
import openmeteo_requests
import requests_cache
from retry_requests import retry
import hsfs
from pathlib import Path
from dotenv import load_dotenv
import hopsworks
import sys

# Put the parent of the current working directory on the import path before the
# project-local imports that follow (presumably those modules live there — confirm).
# The membership check keeps a re-run of this cell from appending a duplicate entry.
root_dir = Path().resolve().parent
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

from format_data import format_weather_data, format_price_data, process_weather_data
from get_electricity_prices import get_data
from get_weather_data import get_historical_weather, get_weather_forecast
from entsoe_data import fetch_historical_data, ensure_valid_series
from util import modify_weather_df, modify_entsoe_df, create_lagging_columns, add_future_price_column




### Connecting to Hopsworks and the ENTSO-E API

In [2]:
# Outside CI the secrets are read from a local .env file; when the CI variable
# is set (e.g. by GitHub Actions) they are expected to be in the environment already.
if not os.environ.get("CI"):
    load_dotenv()

hopsworks_api = os.environ.get("HOPSWORKS_API_KEY")
entose_api = os.environ.get("ENTSOE_API")

# Fail fast with a clear message when a credential is missing or empty.
if not hopsworks_api:
    raise ValueError("HOPSWORKS_API_KEY is not set.")
if not entose_api:
    raise ValueError("ENTSOE_API is not set.")

# Write both keys back into the process environment for anything that reads them later.
os.environ.update(
    {
        "ENTSOE_API": entose_api,
        "HOPSWORKS_API_KEY": hopsworks_api,
    }
)

# Log in and grab the project's feature store handle used by the rest of the notebook.
project = hopsworks.login()
fs = project.get_feature_store()
print(f"Connected to project: {project.name}")


2025-01-07 23:43:45,208 INFO: Initializing external client
2025-01-07 23:43:45,210 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-07 23:43:48,239 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1164446
Connected to project: oskaralf


In [3]:
# Derive both bounds from one timestamp: calling datetime.now() separately for each
# could straddle midnight and produce a window that is two days wide instead of one.
today = datetime.now()
start_date = today.strftime('%Y-%m-%d')                        # today, YYYY-MM-DD
end_date = (today + timedelta(days=1)).strftime('%Y-%m-%d')    # tomorrow, YYYY-MM-DD

## <span style='color:#ff5f27'> Get latest weather and electricity data</span>

### Get weather data and format it

In [4]:
# Fetch the weather forecast for Stockholm (lat 59.3294, lon 18.0687).
# NOTE(review): the two date strings are hard-coded and do not use the
# start_date / end_date computed earlier in this notebook — confirm whether
# get_weather_forecast actually uses them or whether they are leftovers.
forecast = get_weather_forecast(
    "Stockholm",
    "2022-11-01",
    "2025-01-03",
    59.3294,
    18.0687,
)

# Run the raw response through the formatting helpers in order:
# process_weather_data -> modify_weather_df -> create_lagging_columns.
formatted_forecast_df = create_lagging_columns(
    modify_weather_df(process_weather_data(forecast))
)
print(formatted_forecast_df.dtypes)

Coordinates 59.32889938354492°N 18.072357177734375°E
Elevation 24.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
number of rows before dropping na:  168
number of rows after dropping na:  167
date                   datetime64[ns]
temperature_2m                float32
precipitation                 float32
wind_speed_100m               float32
temperature_2m_lag            float32
precipitation_lag             float32
wind_speed_100m_lag           float32
dtype: object


### Retrieve feature groups

In [5]:

# Look up the existing version-1 feature groups that this notebook inserts into.
weather_fg = fs.get_feature_group(name='weather_data_3', version=1)
entsoe_fg = fs.get_feature_group(name='entsoe_data_3', version=1)

In [6]:
# Per-column count of missing values in the formatted forecast, checked before the insert.
print(formatted_forecast_df.isna().sum())

date                   0
temperature_2m         0
precipitation          0
wind_speed_100m        0
temperature_2m_lag     0
precipitation_lag      0
wind_speed_100m_lag    0
dtype: int64


In [None]:
# Keep only fully populated rows, then write them to the weather feature group.
# The "wait_for_job" write option is set so the call waits on the ingestion job.
formatted_forecast_df = formatted_forecast_df.dropna(axis=0, how="any")
weather_fg.insert(
    formatted_forecast_df,
    write_options={"wait_for_job": True},
)

2025-01-07 23:43:52,199 INFO: 	6 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1393654


Uploading Dataframe: 100.00% |██████████| Rows 167/167 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_data_3_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164446/jobs/named/weather_data_3_1_offline_fg_materialization/executions


(Job('weather_data_3_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "temperature_2m_lag",
           "min_value": -100.0,
           "max_value": 500.0
         },
         "meta": {
           "expectationId": 696123
         }
       },
       "result": {
         "observed_value": -10.350000381469727,
         "element_count": 167,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-01-07T10:43:52.000198Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_

### Fetch the latest electricity data from ENTSO-E and insert it into the feature group

In [8]:
# Pull ENTSO-E data for the start_date..end_date window computed earlier,
# authenticating with the API key loaded from the environment.
entsoe_df = fetch_historical_data(entose_api, start_date, end_date)

[DEBUG] Successfully fetched load data for finland.
[DEBUG] Successfully fetched load data for norway.
[DEBUG] Successfully fetched load data for denmark.
[DEBUG] Successfully fetched cross-border flows SE3 to finland.
[DEBUG] Successfully fetched cross-border flows finland to SE3.
[DEBUG] Successfully fetched cross-border flows SE3 to norway.
[DEBUG] Successfully fetched cross-border flows norway to SE3.
[DEBUG] Successfully fetched cross-border flows SE3 to denmark.
[DEBUG] Successfully fetched cross-border flows denmark to SE3.


In [9]:
# Display the frame exactly as returned by fetch_historical_data, before any renaming.
entsoe_df

Unnamed: 0_level_0,load_se3,load_finland,load_norway,load_denmark,prices,total_generation_Biomass,total_generation_Fossil Gas,total_generation_Fossil Hard coal,total_generation_Fossil Oil,total_generation_Hydro Run-of-river and poundage,...,total_generation_Waste,total_generation_Wind Offshore,total_generation_Wind Onshore,flows_se3_to_finland,flows_finland_to_se3,flows_se3_to_norway,flows_norway_to_se3,flows_se3_to_denmark,flows_denmark_to_se3,date
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-01-06 23:00:00+00:00,10645.0,11400.0,5873.0,3050.0,13.6,154.0,58.0,89.0,6.0,2.0,...,23.0,922.0,3208.0,1178.0,0.0,1105.0,0.0,658.0,0.0,2025-01-07 00:00:00
2025-01-07 00:00:00+00:00,10529.0,11281.75,5665.0,3219.0,8.71,146.0,58.0,89.0,6.0,2.0,...,26.0,827.0,3190.0,1194.0,0.0,1284.0,0.0,650.0,0.0,2025-01-07 01:00:00
2025-01-07 01:00:00+00:00,10467.0,11288.75,5547.0,3130.0,8.9,153.0,57.0,91.0,7.0,1.0,...,24.0,679.0,3120.0,1194.0,0.0,1092.0,0.0,646.0,0.0,2025-01-07 02:00:00
2025-01-07 02:00:00+00:00,10285.0,11426.25,5434.0,3179.0,5.01,155.0,57.0,91.0,7.0,2.0,...,24.0,641.0,3009.0,1194.0,0.0,970.0,0.0,646.0,0.0,2025-01-07 03:00:00
2025-01-07 03:00:00+00:00,10415.0,11629.5,5364.0,3079.0,5.13,146.0,58.0,93.0,7.0,2.0,...,25.0,636.0,2903.0,1194.0,0.0,1170.0,0.0,646.0,0.0,2025-01-07 04:00:00
2025-01-07 04:00:00+00:00,10833.0,12158.0,5467.0,3094.0,5.8,150.0,67.0,103.0,9.0,2.0,...,27.0,617.0,2881.0,1194.0,0.0,1210.0,0.0,646.0,0.0,2025-01-07 05:00:00
2025-01-07 05:00:00+00:00,11649.0,12387.5,5858.0,3199.0,44.81,272.0,92.0,106.0,10.0,2.0,...,38.0,502.0,2873.0,1004.0,0.0,947.0,0.0,619.0,0.0,2025-01-07 06:00:00
2025-01-07 06:00:00+00:00,12482.0,12764.5,6270.0,3477.0,38.03,300.0,98.0,106.0,10.0,2.0,...,38.0,843.0,2793.0,1040.0,0.0,586.0,0.0,259.0,0.0,2025-01-07 07:00:00
2025-01-07 07:00:00+00:00,12961.0,12699.25,6430.0,3642.0,37.2,280.0,93.0,115.0,9.0,3.0,...,39.0,960.0,2842.0,680.0,0.0,549.0,0.0,260.0,0.0,2025-01-07 08:00:00
2025-01-07 08:00:00+00:00,13098.0,12786.0,6421.0,3628.0,35.01,292.0,64.0,173.0,7.0,2.0,...,38.0,889.0,2810.0,334.0,0.0,611.0,0.0,329.0,0.0,2025-01-07 09:00:00


In [10]:
# Column dtypes of the raw frame, checked before the name and date normalisation.
print(entsoe_df.dtypes)

load_se3                                            float64
load_finland                                        float64
load_norway                                         float64
load_denmark                                        float64
prices                                              float64
total_generation_Biomass                            float64
total_generation_Fossil Gas                         float64
total_generation_Fossil Hard coal                   float64
total_generation_Fossil Oil                         float64
total_generation_Hydro Run-of-river and poundage    float64
total_generation_Other renewable                    float64
total_generation_Solar                              float64
total_generation_Waste                              float64
total_generation_Wind Offshore                      float64
total_generation_Wind Onshore                       float64
flows_se3_to_finland                                float64
flows_finland_to_se3                    

In [11]:

# Normalise column names to snake_case: lower-case, with spaces and hyphens
# replaced by underscores. regex=False makes it explicit that these are literal
# replacements regardless of the installed pandas version's default.
entsoe_df.columns = (
    entsoe_df.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '_', regex=False)
)

# Parse `date` once: naive values are interpreted as UTC and tz-aware values are
# converted to UTC, then the timezone is dropped to leave naive timestamps.
# (The previous tz_localize('UTC') step raised on tz-aware input.)
# NOTE(review): in the displayed frame `date` is one hour ahead of the UTC index,
# so it looks like local time being labelled as UTC here — confirm that is intended.
entsoe_df['date'] = pd.to_datetime(entsoe_df['date'], utc=True).dt.tz_convert(None)

print(entsoe_df.dtypes)
print(entsoe_df.tail(5))

load_se3                                                   float64
load_finland                                               float64
load_norway                                                float64
load_denmark                                               float64
prices                                                     float64
total_generation_biomass                                   float64
total_generation_fossil_gas                                float64
total_generation_fossil_hard_coal                          float64
total_generation_fossil_oil                                float64
total_generation_hydro_run_of_river_and_poundage           float64
total_generation_other_renewable                           float64
total_generation_solar                                     float64
total_generation_waste                                     float64
total_generation_wind_offshore                             float64
total_generation_wind_onshore                              flo

In [12]:
# Re-display the frame to check the normalised (snake_case) column names.
entsoe_df

Unnamed: 0_level_0,load_se3,load_finland,load_norway,load_denmark,prices,total_generation_biomass,total_generation_fossil_gas,total_generation_fossil_hard_coal,total_generation_fossil_oil,total_generation_hydro_run_of_river_and_poundage,...,total_generation_waste,total_generation_wind_offshore,total_generation_wind_onshore,flows_se3_to_finland,flows_finland_to_se3,flows_se3_to_norway,flows_norway_to_se3,flows_se3_to_denmark,flows_denmark_to_se3,date
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-01-06 23:00:00+00:00,10645.0,11400.0,5873.0,3050.0,13.6,154.0,58.0,89.0,6.0,2.0,...,23.0,922.0,3208.0,1178.0,0.0,1105.0,0.0,658.0,0.0,2025-01-07 00:00:00
2025-01-07 00:00:00+00:00,10529.0,11281.75,5665.0,3219.0,8.71,146.0,58.0,89.0,6.0,2.0,...,26.0,827.0,3190.0,1194.0,0.0,1284.0,0.0,650.0,0.0,2025-01-07 01:00:00
2025-01-07 01:00:00+00:00,10467.0,11288.75,5547.0,3130.0,8.9,153.0,57.0,91.0,7.0,1.0,...,24.0,679.0,3120.0,1194.0,0.0,1092.0,0.0,646.0,0.0,2025-01-07 02:00:00
2025-01-07 02:00:00+00:00,10285.0,11426.25,5434.0,3179.0,5.01,155.0,57.0,91.0,7.0,2.0,...,24.0,641.0,3009.0,1194.0,0.0,970.0,0.0,646.0,0.0,2025-01-07 03:00:00
2025-01-07 03:00:00+00:00,10415.0,11629.5,5364.0,3079.0,5.13,146.0,58.0,93.0,7.0,2.0,...,25.0,636.0,2903.0,1194.0,0.0,1170.0,0.0,646.0,0.0,2025-01-07 04:00:00
2025-01-07 04:00:00+00:00,10833.0,12158.0,5467.0,3094.0,5.8,150.0,67.0,103.0,9.0,2.0,...,27.0,617.0,2881.0,1194.0,0.0,1210.0,0.0,646.0,0.0,2025-01-07 05:00:00
2025-01-07 05:00:00+00:00,11649.0,12387.5,5858.0,3199.0,44.81,272.0,92.0,106.0,10.0,2.0,...,38.0,502.0,2873.0,1004.0,0.0,947.0,0.0,619.0,0.0,2025-01-07 06:00:00
2025-01-07 06:00:00+00:00,12482.0,12764.5,6270.0,3477.0,38.03,300.0,98.0,106.0,10.0,2.0,...,38.0,843.0,2793.0,1040.0,0.0,586.0,0.0,259.0,0.0,2025-01-07 07:00:00
2025-01-07 07:00:00+00:00,12961.0,12699.25,6430.0,3642.0,37.2,280.0,93.0,115.0,9.0,3.0,...,39.0,960.0,2842.0,680.0,0.0,549.0,0.0,260.0,0.0,2025-01-07 08:00:00
2025-01-07 08:00:00+00:00,13098.0,12786.0,6421.0,3628.0,35.01,292.0,64.0,173.0,7.0,2.0,...,38.0,889.0,2810.0,334.0,0.0,611.0,0.0,329.0,0.0,2025-01-07 09:00:00


In [13]:
# Apply the feature-engineering helpers in order:
# modify_entsoe_df -> create_lagging_columns -> add_future_price_column.
entsoe_df = add_future_price_column(
    create_lagging_columns(modify_entsoe_df(entsoe_df))
)

number of rows before dropping na:  21
number of rows after dropping na:  19


In [14]:
# Inspect the engineered frame (dtypes first, then contents) before it is
# inserted into the feature group.
print(entsoe_df.dtypes)
entsoe_df

prices                            float64
date                       datetime64[ns]
imported_energy                   float64
total_load                        float64
total_generation_se               float64
prices_lag                        float64
imported_energy_lag               float64
total_load_lag                    float64
total_generation_se_lag           float64
future_price                      float64
dtype: object


Unnamed: 0_level_0,prices,date,imported_energy,total_load,total_generation_se,prices_lag,imported_energy_lag,total_load_lag,total_generation_se_lag,future_price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2025-01-07 00:00:00+00:00,8.71,2025-01-07 01:00:00,-3128.0,30694.75,4345.0,13.6,-2941.0,30968.0,4463.0,8.9
2025-01-07 01:00:00+00:00,8.9,2025-01-07 02:00:00,-2932.0,30432.75,4133.0,8.71,-3128.0,30694.75,4345.0,5.01
2025-01-07 02:00:00+00:00,5.01,2025-01-07 03:00:00,-2810.0,30324.25,3987.0,8.9,-2932.0,30432.75,4133.0,5.13
2025-01-07 03:00:00+00:00,5.13,2025-01-07 04:00:00,-3010.0,30487.5,3871.0,5.01,-2810.0,30324.25,3987.0,5.8
2025-01-07 04:00:00+00:00,5.8,2025-01-07 05:00:00,-3050.0,31552.0,3857.0,5.13,-3010.0,30487.5,3871.0,44.81
2025-01-07 05:00:00+00:00,44.81,2025-01-07 06:00:00,-2570.0,33093.5,3896.0,5.8,-3050.0,31552.0,3857.0,38.03
2025-01-07 06:00:00+00:00,38.03,2025-01-07 07:00:00,-1885.0,34993.5,4191.0,44.81,-2570.0,33093.5,3896.0,37.2
2025-01-07 07:00:00+00:00,37.2,2025-01-07 08:00:00,-1489.0,35732.25,4342.0,38.03,-1885.0,34993.5,4191.0,35.01
2025-01-07 08:00:00+00:00,35.01,2025-01-07 09:00:00,-1274.0,35933.0,4304.0,37.2,-1489.0,35732.25,4342.0,34.27
2025-01-07 09:00:00+00:00,34.27,2025-01-07 10:00:00,-1159.0,36067.5,4668.0,35.01,-1274.0,35933.0,4304.0,34.65


In [None]:
# Write the engineered ENTSO-E rows to their feature group; the "wait_for_job"
# write option is set so the call waits on the ingestion job.
entsoe_fg.insert(
    entsoe_df,
    write_options={"wait_for_job": True},
)

2025-01-07 23:44:21,022 INFO: 	9 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1393655


Uploading Dataframe: 100.00% |██████████| Rows 19/19 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: entsoe_data_3_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164446/jobs/named/entsoe_data_3_1_offline_fg_materialization/executions


(Job('entsoe_data_3_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "prices_lag",
           "min_value": -1000,
           "max_value": 1000,
           "strict_min": true
         },
         "meta": {
           "expectationId": 696127
         }
       },
       "result": {
         "observed_value": 5.01,
         "element_count": 19,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-01-07T10:44:21.000021Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_

In [16]:
# Same inspection repeated after the insert; no statement between the two
# displays reassigns entsoe_df.
print(entsoe_df.dtypes)
entsoe_df

prices                            float64
date                       datetime64[ns]
imported_energy                   float64
total_load                        float64
total_generation_se               float64
prices_lag                        float64
imported_energy_lag               float64
total_load_lag                    float64
total_generation_se_lag           float64
future_price                      float64
dtype: object


Unnamed: 0_level_0,prices,date,imported_energy,total_load,total_generation_se,prices_lag,imported_energy_lag,total_load_lag,total_generation_se_lag,future_price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2025-01-07 00:00:00+00:00,8.71,2025-01-07 01:00:00,-3128.0,30694.75,4345.0,13.6,-2941.0,30968.0,4463.0,8.9
2025-01-07 01:00:00+00:00,8.9,2025-01-07 02:00:00,-2932.0,30432.75,4133.0,8.71,-3128.0,30694.75,4345.0,5.01
2025-01-07 02:00:00+00:00,5.01,2025-01-07 03:00:00,-2810.0,30324.25,3987.0,8.9,-2932.0,30432.75,4133.0,5.13
2025-01-07 03:00:00+00:00,5.13,2025-01-07 04:00:00,-3010.0,30487.5,3871.0,5.01,-2810.0,30324.25,3987.0,5.8
2025-01-07 04:00:00+00:00,5.8,2025-01-07 05:00:00,-3050.0,31552.0,3857.0,5.13,-3010.0,30487.5,3871.0,44.81
2025-01-07 05:00:00+00:00,44.81,2025-01-07 06:00:00,-2570.0,33093.5,3896.0,5.8,-3050.0,31552.0,3857.0,38.03
2025-01-07 06:00:00+00:00,38.03,2025-01-07 07:00:00,-1885.0,34993.5,4191.0,44.81,-2570.0,33093.5,3896.0,37.2
2025-01-07 07:00:00+00:00,37.2,2025-01-07 08:00:00,-1489.0,35732.25,4342.0,38.03,-1885.0,34993.5,4191.0,35.01
2025-01-07 08:00:00+00:00,35.01,2025-01-07 09:00:00,-1274.0,35933.0,4304.0,37.2,-1489.0,35732.25,4342.0,34.27
2025-01-07 09:00:00+00:00,34.27,2025-01-07 10:00:00,-1159.0,36067.5,4668.0,35.01,-1274.0,35933.0,4304.0,34.65
