In [1]:
import pandas as pd
import os
import numpy as np
import requests
import datetime
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import datetime
import time
import json
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import MultipleLocator
import openmeteo_requests
import requests_cache
from retry_requests import retry
import hsfs
from pathlib import Path
from dotenv import load_dotenv
import hopsworks
import sys

root_dir = Path().resolve().parent
sys.path.append(str(root_dir))

from format_data import format_weather_data, format_price_data
from get_electricity_prices import get_data
from get_weather_data import get_historical_weather, get_weather_forecast



In [2]:
import hsfs

In [3]:
import hopsworks

In [2]:
load_dotenv()
hopsworks_api = os.getenv("HOPSWORKS_API_KEY")

os.environ["HOPSWORKS_API_KEY"] = hopsworks_api

project = hopsworks.login()
print(f"Connected to project: {project.name}")

2024-12-23 12:29:23,177 INFO: Initializing external client
2024-12-23 12:29:23,178 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-23 12:29:24,858 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1164446
Connected to project: oskaralf


In [6]:
start_date = "2022-11-01"
end_date = datetime.date.today().strftime('%Y-%m-%d')
hist_weather_df = get_historical_weather("Luleå", start_date, end_date, 65.5841, 22.1547)
formatted_hist_weather_df = format_weather_data(hist_weather_df)

Coordinates 65.55360412597656°N 22.24431800842285°E
Elevation 15.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


In [7]:
price_df = get_data("SE4")
print(price_df.dtypes)
print(price_df)

date          datetime64[ns]
time_start            object
price                float64
dtype: object
            date                 time_start    price
0     2022-11-01  2022-11-01T00:00:00+01:00  0.37995
1     2022-11-01  2022-11-01T01:00:00+01:00  0.37995
2     2022-11-01  2022-11-01T02:00:00+01:00  0.38430
3     2022-11-01  2022-11-01T03:00:00+01:00  0.39301
4     2022-11-01  2022-11-01T04:00:00+01:00  0.41173
...          ...                        ...      ...
18810 2024-12-23  2024-12-23T19:00:00+01:00  1.12735
18811 2024-12-23  2024-12-23T20:00:00+01:00  1.04068
18812 2024-12-23  2024-12-23T21:00:00+01:00  1.01973
18813 2024-12-23  2024-12-23T22:00:00+01:00  0.73231
18814 2024-12-23  2024-12-23T23:00:00+01:00  0.54411

[18815 rows x 3 columns]


In [8]:
price_df['time_start'] = price_df['time_start'].str.replace(r'\+\d{2}:\d{2}$', '', regex=True)
print(price_df)
print(price_df.dtypes)

            date           time_start    price
0     2022-11-01  2022-11-01T00:00:00  0.37995
1     2022-11-01  2022-11-01T01:00:00  0.37995
2     2022-11-01  2022-11-01T02:00:00  0.38430
3     2022-11-01  2022-11-01T03:00:00  0.39301
4     2022-11-01  2022-11-01T04:00:00  0.41173
...          ...                  ...      ...
18810 2024-12-23  2024-12-23T19:00:00  1.12735
18811 2024-12-23  2024-12-23T20:00:00  1.04068
18812 2024-12-23  2024-12-23T21:00:00  1.01973
18813 2024-12-23  2024-12-23T22:00:00  0.73231
18814 2024-12-23  2024-12-23T23:00:00  0.54411

[18815 rows x 3 columns]
date          datetime64[ns]
time_start            object
price                float64
dtype: object


In [9]:
formatted_hist_weather_df['time_start'] = formatted_hist_weather_df['time_start'].str.replace(r'\+\d{2}:\d{2}$', '', regex=True)
print(formatted_hist_weather_df)
print(formatted_hist_weather_df.dtypes)


                time_start  temperature_2m  precipitation  snow_depth  \
0      2022-11-01T00:00:00       -0.039000            0.0        0.00   
1      2022-11-01T01:00:00       -0.439000            0.0        0.00   
2      2022-11-01T02:00:00       -0.339000            0.0        0.00   
3      2022-11-01T03:00:00        0.161000            0.0        0.00   
4      2022-11-01T04:00:00       -0.439000            0.0        0.00   
...                    ...             ...            ...         ...   
18667  2024-12-17T19:00:00      -10.289000            0.0        0.12   
18668  2024-12-17T20:00:00       -9.889000            0.0        0.12   
18669  2024-12-17T21:00:00      -10.188999            0.0        0.12   
18670  2024-12-17T22:00:00      -10.789000            0.0        0.12   
18671  2024-12-17T23:00:00      -11.889000            0.0        0.12   

       pressure_msl  cloud_cover  wind_speed_10m  wind_speed_100m  \
0       1009.299988         38.0       15.530151      

In [10]:
print(price_df)
print(price_df.dtypes)

            date           time_start    price
0     2022-11-01  2022-11-01T00:00:00  0.37995
1     2022-11-01  2022-11-01T01:00:00  0.37995
2     2022-11-01  2022-11-01T02:00:00  0.38430
3     2022-11-01  2022-11-01T03:00:00  0.39301
4     2022-11-01  2022-11-01T04:00:00  0.41173
...          ...                  ...      ...
18810 2024-12-23  2024-12-23T19:00:00  1.12735
18811 2024-12-23  2024-12-23T20:00:00  1.04068
18812 2024-12-23  2024-12-23T21:00:00  1.01973
18813 2024-12-23  2024-12-23T22:00:00  0.73231
18814 2024-12-23  2024-12-23T23:00:00  0.54411

[18815 rows x 3 columns]
date          datetime64[ns]
time_start            object
price                float64
dtype: object


In [12]:

print(price_df.head())
print(price_df.dtypes)

        date           time_start    price
0 2022-11-01  2022-11-01T00:00:00  0.37995
1 2022-11-01  2022-11-01T01:00:00  0.37995
2 2022-11-01  2022-11-01T02:00:00  0.38430
3 2022-11-01  2022-11-01T03:00:00  0.39301
4 2022-11-01  2022-11-01T04:00:00  0.41173
date          datetime64[ns]
time_start            object
price                float64
dtype: object


In [13]:
import great_expectations as ge

weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"temperature_2m",
            "min_value":-100.0,
            "max_value":500.0,
            "strict_min":True
        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"precipitation",
            "min_value":-0.0,
            "max_value":500.0,
            "strict_min":True
        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"pressure_msl",
            "min_value":800,
            "max_value":2000,
            "strict_min":True
        }
    )
)



weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"cloud_cover",
            "min_value":-0.1,
            "max_value":100,
            "strict_min":True
        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"wind_speed_10m",
            "min_value":-0.1,
            "max_value":1000,
            "strict_min":True
        }
    )
)



{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "wind_speed_10m", "min_value": -0.1, "max_value": 1000, "strict_min": true}, "meta": {}}

In [14]:
import great_expectations as ge

prices_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="price_expectation_suite"
)

prices_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"price",
            "min_value":-0.1,
            "max_value":5000,
            "strict_min":True
        }
    )
)

{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "price", "min_value": -0.1, "max_value": 5000, "strict_min": true}, "meta": {}}

In [15]:
try:
    fs = project.get_feature_store()
    print(f"Connected to feature store: {fs.name}")
except Exception as e:
    print(f"Failed to get feature store: {e}")

Connected to feature store: oskaralf_featurestore


In [19]:
weather_fg = fs.get_or_create_feature_group(
    name='weather_data',
    description='Weather data for SE4',
    version=1,
    primary_key=['time_start'],
    expectation_suite=weather_expectation_suite
)

In [20]:
price_fg = fs.get_or_create_feature_group(
    name='electricity_price_data',
    description='Electricity price data for SE4',
    version=1,
    primary_key=['time_start'],
    expectation_suite=prices_expectation_suite
)

In [21]:
weather_fg.insert(formatted_hist_weather_df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1393145
2024-12-23 12:40:28,705 INFO: 	5 expectation(s) included in expectation_suite.
Validation failed.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1393145


Uploading Dataframe: 100.00% |██████████| Rows 18168/18168 | Elapsed Time: 00:06 | Remaining Time: 00:00


Launching job: weather_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164446/jobs/named/weather_data_1_offline_fg_materialization/executions


(Job('weather_data_1_offline_fg_materialization', 'SPARK'),
 {
   "success": false,
   "results": [
     {
       "success": false,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "precipitation",
           "min_value": -0.0,
           "max_value": 500.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 695303
         }
       },
       "result": {
         "observed_value": 0.0,
         "element_count": 18168,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2024-12-23T11:40:28.000705Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_c

In [22]:
price_fg.insert(price_df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1393146
2024-12-23 12:41:23,583 INFO: 	1 expectation(s) included in expectation_suite.
Validation failed.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1393146


Uploading Dataframe: 100.00% |██████████| Rows 18815/18815 | Elapsed Time: 00:06 | Remaining Time: 00:00


Launching job: electricity_price_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164446/jobs/named/electricity_price_data_1_offline_fg_materialization/executions


(Job('electricity_price_data_1_offline_fg_materialization', 'SPARK'),
 {
   "success": false,
   "results": [
     {
       "success": false,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "price",
           "min_value": -0.1,
           "max_value": 5000,
           "strict_min": true
         },
         "meta": {
           "expectationId": 695305
         }
       },
       "result": {
         "observed_value": -0.69112,
         "element_count": 18815,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2024-12-23T11:41:23.000582Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     }
   ],
   "evaluation_parameters": {},
   "statistics": {
     "evaluated_expectations": 1,
    