In [1]:
import pandas as pd
import os
import numpy as np
import requests
import datetime
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import datetime
import time
import json
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import MultipleLocator
import openmeteo_requests
import requests_cache
from retry_requests import retry
import hsfs
from pathlib import Path
from dotenv import load_dotenv
import hopsworks
import sys

root_dir = Path().resolve().parent
sys.path.append(str(root_dir))

from format_data import format_weather_data, format_price_data, process_weather_data
from get_electricity_prices import get_data
from get_weather_data import get_historical_weather, get_weather_forecast



In [2]:
import hsfs

In [3]:
import hopsworks

In [4]:
load_dotenv()
entose_api = os.getenv("ENTSOE_API")
hopsworks_api = os.getenv("HOPSWORKS_API_KEY")

os.environ["HOPSWORKS_API_KEY"] = hopsworks_api

project = hopsworks.login()
print(f"Connected to project: {project.name}")

2025-01-04 20:40:35,829 INFO: Initializing external client
2025-01-04 20:40:35,830 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-04 20:40:37,488 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1164446
Connected to project: oskaralf


In [5]:
start_date = "2022-11-01"
end_date = datetime.date.today().strftime('%Y-%m-%d')
hist_weather_df = get_historical_weather("Stockhom", "2022-11-01", "2025-01-03", 59.3294, 18.0687)
formatted_hist_weather_df = process_weather_data(hist_weather_df)

Coordinates 59.29701232910156°N 18.163265228271484°E
Elevation 24.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


In [6]:
entsoe_df = pd.read_csv('../data/entose_data.csv')

In [7]:
print(formatted_hist_weather_df.head())
print(entsoe_df.head())

                 date  temperature_2m  precipitation  snow_depth  \
0 2022-11-01 00:00:00        9.165000            0.0         0.0   
1 2022-11-01 01:00:00        8.764999            0.0         0.0   
2 2022-11-01 02:00:00        8.615000            0.0         0.0   
3 2022-11-01 03:00:00        8.565000            0.0         0.0   
4 2022-11-01 04:00:00        8.365000            0.0         0.0   

   pressure_msl  cloud_cover  wind_speed_10m  wind_speed_100m  \
0   1017.099976        100.0        9.114471        17.698677   
1   1017.200012        100.0        8.427383        16.622490   
2   1017.200012        100.0        8.427383        16.454008   
3   1017.099976        100.0        8.759178        17.283749   
4   1016.900024        100.0        8.640000        17.106628   

   wind_direction_10m  wind_direction_100m      city  
0           260.90979           274.666779  Stockhom  
1           250.01680           265.030334  Stockhom  
2           250.01680           259

In [None]:
price_df = get_data("SE3")
print(price_df.dtypes)
print(price_df)

date          datetime64[ns]
time_start            object
price                float64
dtype: object
            date                 time_start    price
0     2022-11-01  2022-11-01T00:00:00+01:00  0.37995
1     2022-11-01  2022-11-01T01:00:00+01:00  0.37995
2     2022-11-01  2022-11-01T02:00:00+01:00  0.38430
3     2022-11-01  2022-11-01T03:00:00+01:00  0.39301
4     2022-11-01  2022-11-01T04:00:00+01:00  0.41173
...          ...                        ...      ...
18810 2024-12-23  2024-12-23T19:00:00+01:00  1.12735
18811 2024-12-23  2024-12-23T20:00:00+01:00  1.04068
18812 2024-12-23  2024-12-23T21:00:00+01:00  1.01973
18813 2024-12-23  2024-12-23T22:00:00+01:00  0.73231
18814 2024-12-23  2024-12-23T23:00:00+01:00  0.54411

[18815 rows x 3 columns]


In [8]:
price_df['time_start'] = price_df['time_start'].str.replace(r'\+\d{2}:\d{2}$', '', regex=True)
print(price_df)
print(price_df.dtypes)

            date           time_start    price
0     2022-11-01  2022-11-01T00:00:00  0.37995
1     2022-11-01  2022-11-01T01:00:00  0.37995
2     2022-11-01  2022-11-01T02:00:00  0.38430
3     2022-11-01  2022-11-01T03:00:00  0.39301
4     2022-11-01  2022-11-01T04:00:00  0.41173
...          ...                  ...      ...
18810 2024-12-23  2024-12-23T19:00:00  1.12735
18811 2024-12-23  2024-12-23T20:00:00  1.04068
18812 2024-12-23  2024-12-23T21:00:00  1.01973
18813 2024-12-23  2024-12-23T22:00:00  0.73231
18814 2024-12-23  2024-12-23T23:00:00  0.54411

[18815 rows x 3 columns]
date          datetime64[ns]
time_start            object
price                float64
dtype: object


In [9]:
formatted_hist_weather_df['time_start'] = formatted_hist_weather_df['time_start'].str.replace(r'\+\d{2}:\d{2}$', '', regex=True)
print(formatted_hist_weather_df)
print(formatted_hist_weather_df.dtypes)


                time_start  temperature_2m  precipitation  snow_depth  \
0      2022-11-01T00:00:00       -0.039000            0.0        0.00   
1      2022-11-01T01:00:00       -0.439000            0.0        0.00   
2      2022-11-01T02:00:00       -0.339000            0.0        0.00   
3      2022-11-01T03:00:00        0.161000            0.0        0.00   
4      2022-11-01T04:00:00       -0.439000            0.0        0.00   
...                    ...             ...            ...         ...   
18667  2024-12-17T19:00:00      -10.289000            0.0        0.12   
18668  2024-12-17T20:00:00       -9.889000            0.0        0.12   
18669  2024-12-17T21:00:00      -10.188999            0.0        0.12   
18670  2024-12-17T22:00:00      -10.789000            0.0        0.12   
18671  2024-12-17T23:00:00      -11.889000            0.0        0.12   

       pressure_msl  cloud_cover  wind_speed_10m  wind_speed_100m  \
0       1009.299988         38.0       15.530151      

In [10]:
print(price_df)
print(price_df.dtypes)

            date           time_start    price
0     2022-11-01  2022-11-01T00:00:00  0.37995
1     2022-11-01  2022-11-01T01:00:00  0.37995
2     2022-11-01  2022-11-01T02:00:00  0.38430
3     2022-11-01  2022-11-01T03:00:00  0.39301
4     2022-11-01  2022-11-01T04:00:00  0.41173
...          ...                  ...      ...
18810 2024-12-23  2024-12-23T19:00:00  1.12735
18811 2024-12-23  2024-12-23T20:00:00  1.04068
18812 2024-12-23  2024-12-23T21:00:00  1.01973
18813 2024-12-23  2024-12-23T22:00:00  0.73231
18814 2024-12-23  2024-12-23T23:00:00  0.54411

[18815 rows x 3 columns]
date          datetime64[ns]
time_start            object
price                float64
dtype: object


In [12]:

print(price_df.head())
print(price_df.dtypes)

        date           time_start    price
0 2022-11-01  2022-11-01T00:00:00  0.37995
1 2022-11-01  2022-11-01T01:00:00  0.37995
2 2022-11-01  2022-11-01T02:00:00  0.38430
3 2022-11-01  2022-11-01T03:00:00  0.39301
4 2022-11-01  2022-11-01T04:00:00  0.41173
date          datetime64[ns]
time_start            object
price                float64
dtype: object


In [11]:
import great_expectations as ge

weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"temperature_2m",
            "min_value":-100.0,
            "max_value":500.0,
        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"precipitation",
            "min_value":-5.0,
            "max_value":500.0,

        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"pressure_msl",
            "min_value":0.0,
            "max_value":2000,
        }
    )
)



weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"cloud_cover",
            "min_value":-0.1,
            "max_value":100.0,
            "strict_min":True
        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"wind_speed_10m",
            "min_value":-0.1,
            "max_value":1000,
            "strict_min":True
        }
    )
)



{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "wind_speed_10m", "min_value": -0.1, "max_value": 1000, "strict_min": true}, "meta": {}}

In [10]:
import great_expectations as ge

prices_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="price_expectation_suite"
)

prices_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"price",
            "min_value":-0.1,
            "max_value":5000,
            "strict_min":True
        }
    )
)

{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "price", "min_value": -0.1, "max_value": 5000, "strict_min": true}, "meta": {}}

In [12]:
import great_expectations as ge

data_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="data_expectation_suite"
)

data_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "load_se",
            "min_value": 0,
            "max_value": 20000,
            "strict_min": True,
        }
    )
)

data_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "price_se",
            "min_value": -1000,
            "max_value": 5000,
        }
    )
)

data_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "flows_se_finland",
            "min_value": -5000,
            "max_value": 10000,  
        }
    )
)

data_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "flows_se_norway",
            "min_value": -5000,
            "max_value": 10000,
        }
    )
)

data_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "flows_se_denmark",
            "min_value": -5000,
            "max_value": 10000,
        }
    )
)

data_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "load_finland",
            "min_value": 0,
            "max_value": 20000,
        }
    )
)

data_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "load_norway",
            "min_value": 0,
            "max_value": 20000,
        }
    )
)

data_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "load_germany",
            "min_value": 0,
            "max_value": 100000,
        }
    )
)

data_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "load_denmark",
            "min_value": 0,
            "max_value": 20000,
        }
    )
)

data_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "hydro_storage_se",
            "min_value": 0,
            "max_value": 4000000,
        }
    )
)

print("Expectation suite created!")


Expectation suite created!


In [13]:
try:
    fs = project.get_feature_store()
    print(f"Connected to feature store: {fs.name}")
except Exception as e:
    print(f"Failed to get feature store: {e}")

Connected to feature store: oskaralf_featurestore


In [14]:
weather_fg = fs.get_or_create_feature_group(
    name='weather_data_2',
    description='Weather data for SE3',
    version=1,
    primary_key=['date'],
    expectation_suite=weather_expectation_suite
)

In [15]:
entsoe_fg = fs.get_or_create_feature_group(
    name='entsoe_data',
    description='Entsoe data for SE3',
    version=1,
    primary_key=['date'],
    expectation_suite=data_expectation_suite
)

In [None]:
price_fg = fs.get_or_create_feature_group(
    name='electricity_price_data_3',
    description='Electricity price data for SE3',
    version=1,
    primary_key=['time_start'],
    expectation_suite=prices_expectation_suite
)

In [16]:
weather_fg.insert(formatted_hist_weather_df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1394547
2025-01-04 20:42:27,069 INFO: 	5 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1394547


Uploading Dataframe: 100.00% |██████████| Rows 18456/18456 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_data_2_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164446/jobs/named/weather_data_2_1_offline_fg_materialization/executions


(Job('weather_data_2_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "pressure_msl",
           "min_value": 0.0,
           "max_value": 2000
         },
         "meta": {
           "expectationId": 694345
         }
       },
       "result": {
         "observed_value": 962.7999877929688,
         "element_count": 18456,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-01-04T07:42:27.000068Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_betw

In [17]:
entsoe_df.columns = entsoe_df.columns.str.lower()
entsoe_fg.insert(entsoe_df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1393532
2025-01-04 20:42:47,235 INFO: 	10 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1393532


Uploading Dataframe: 100.00% |██████████| Rows 19057/19057 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: entsoe_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164446/jobs/named/entsoe_data_1_offline_fg_materialization/executions


(Job('entsoe_data_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "flows_se_finland",
           "min_value": -5000,
           "max_value": 10000
         },
         "meta": {
           "expectationId": 696021
         }
       },
       "result": {
         "observed_value": 0.0,
         "element_count": 19057,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-01-04T07:42:47.000234Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
    

In [22]:
price_fg.insert(price_df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1393146
2024-12-23 12:41:23,583 INFO: 	1 expectation(s) included in expectation_suite.
Validation failed.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1393146


Uploading Dataframe: 100.00% |██████████| Rows 18815/18815 | Elapsed Time: 00:06 | Remaining Time: 00:00


Launching job: electricity_price_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164446/jobs/named/electricity_price_data_1_offline_fg_materialization/executions


(Job('electricity_price_data_1_offline_fg_materialization', 'SPARK'),
 {
   "success": false,
   "results": [
     {
       "success": false,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "price",
           "min_value": -0.1,
           "max_value": 5000,
           "strict_min": true
         },
         "meta": {
           "expectationId": 695305
         }
       },
       "result": {
         "observed_value": -0.69112,
         "element_count": 18815,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2024-12-23T11:41:23.000582Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     }
   ],
   "evaluation_parameters": {},
   "statistics": {
     "evaluated_expectations": 1,
    