In [5]:
import pandas as pd
import os
import numpy as np
import requests
import datetime
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import datetime
import time
import json
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import MultipleLocator
import openmeteo_requests
import requests_cache
from retry_requests import retry
import hsfs
from pathlib import Path
from dotenv import load_dotenv
import hopsworks



In [1]:
import hsfs

In [2]:
import hopsworks

In [6]:
load_dotenv()
hopsworks_api = os.getenv("HOPSWORKS_API_KEY")

os.environ["HOPSWORKS_API_KEY"] = hopsworks_api

project = hopsworks.login()
print(f"Connected to project: {project.name}")

2024-12-19 12:35:26,021 INFO: Initializing external client
2024-12-19 12:35:26,021 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-19 12:35:27,648 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1164446
Connected to project: oskaralf


In [19]:
df_weather = pd.read_csv("../data/formatted_weather_data_SE4.csv")
df_weather = df_weather.sort_values(by="time_start")
df_weather['time_start'] = pd.to_datetime(df_weather['time_start'])

In [30]:
df_prices = pd.read_csv("../data/formatted_price_data_SE4.csv")
df_prices = df_prices.sort_values(by="time_start")
df_prices['time_start'] = pd.to_datetime(df_prices['time_start'])

In [20]:
import great_expectations as ge

weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="aq_expectation_suite"
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"temperature_2m",
            "min_value":-100.0,
            "max_value":500.0,
            "strict_min":True
        }
    )
)

{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "temperature_2m", "min_value": -100.0, "max_value": 500.0, "strict_min": true}, "meta": {}}

In [28]:
import great_expectations as ge

prices_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="price_expectation"
)

prices_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"price",
            "min_value":-0.1,
            "max_value":5000,
            "strict_min":True
        }
    )
)

{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "price", "min_value": -0.1, "max_value": 5000, "strict_min": true}, "meta": {}}

In [21]:
try:
    fs = project.get_feature_store()
    print(f"Connected to feature store: {fs.name}")
except Exception as e:
    print(f"Failed to get feature store: {e}")

Connected to feature store: oskaralf_featurestore


In [24]:
weather_fg = fs.get_or_create_feature_group(
    name='weather_data',
    description='Weather data for SE4',
    version=1,
    primary_key=['time_start'],
    event_time="time_start",
    expectation_suite=weather_expectation_suite
)

In [29]:
price_fg = fs.get_or_create_feature_group(
    name='electricity_price_data',
    description='Electricity price data for SE4',
    version=1,
    primary_key=['time_start'],
    event_time="time_start",
    expectation_suite=prices_expectation_suite
)

In [25]:
weather_fg.insert(df_weather)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1394005
2024-12-19 12:44:16,349 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1394005


Uploading Dataframe: 100.00% |██████████| Rows 18048/18048 | Elapsed Time: 00:03 | Remaining Time: 00:00







Launching job: weather_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164446/jobs/named/weather_data_1_offline_fg_materialization/executions
2024-12-19 12:44:31,493 INFO: Materialisation job was not scheduled.


(Job('weather_data_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "temperature_2m",
           "min_value": -100.0,
           "max_value": 500.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 694273
         }
       },
       "result": {
         "observed_value": -31.789,
         "element_count": 18048,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2024-12-19T11:44:16.000345Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     }
   ],
   "evaluation_parameters": {},
   "statistics": {
     "evaluated_expectations": 1,
     

In [31]:
price_fg.insert(df_prices)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1394007
2024-12-19 12:49:30,452 INFO: 	1 expectation(s) included in expectation_suite.
Validation failed.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fg/1394007


Uploading Dataframe: 100.00% |██████████| Rows 18695/18695 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: electricity_price_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164446/jobs/named/electricity_price_data_1_offline_fg_materialization/executions


(Job('electricity_price_data_1_offline_fg_materialization', 'SPARK'),
 {
   "success": false,
   "results": [
     {
       "success": false,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "price",
           "min_value": -0.1,
           "max_value": 5000,
           "strict_min": true
         },
         "meta": {
           "expectationId": 694274
         }
       },
       "result": {
         "observed_value": -0.69112,
         "element_count": 18695,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2024-12-19T11:49:30.000448Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     }
   ],
   "evaluation_parameters": {},
   "statistics": {
     "evaluated_expectations": 1,
    