In [1]:
import pandas as pd
import os
import numpy as np
import requests
import datetime
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import datetime
import time
import json
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import MultipleLocator
import openmeteo_requests
import requests_cache
from retry_requests import retry
import hsfs
from pathlib import Path
from dotenv import load_dotenv
import hopsworks
import sys

# Put the parent of the current working directory on sys.path so the
# modules imported right below (format_data, get_electricity_prices,
# get_weather_data) can be resolved — presumably they live in that directory.
root_dir = Path().resolve().parent
sys.path.append(str(root_dir))

from format_data import format_weather_data, format_price_data, merge_data
from get_electricity_prices import get_data
from get_weather_data import get_historical_weather, get_weather_forecast

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
hopsworks_api = os.getenv("HOPSWORKS_API_KEY")
if not hopsworks_api:
    # Fail early with an actionable message: assigning None to os.environ
    # below would otherwise raise an opaque TypeError.
    raise RuntimeError(
        "HOPSWORKS_API_KEY is not set; define it in the environment or in a .env file."
    )

# Make sure the key is visible to the Hopsworks client through the environment.
os.environ["HOPSWORKS_API_KEY"] = hopsworks_api

# `project` is the handle every later cell uses to reach the feature store
# and the model registry.
project = hopsworks.login()

print(f"Connected to project: {project.name}")

2024-12-23 13:38:58,326 INFO: Initializing external client
2024-12-23 13:38:58,326 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-23 13:39:00,009 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1164446
Connected to project: oskaralf


In [3]:
def get_feature_group_as_dataframe(project_name, feature_group_name, version=1):
    """Read one feature group from the feature store and return its contents.

    Args:
        project_name: Accepted for call-site compatibility; the body does not
            use it and instead reads the module-level ``project`` handle.
        feature_group_name: Name of the feature group to look up.
        version: Feature group version to read (defaults to 1).

    Returns:
        Whatever ``read()`` on the feature group returns.
    """
    return (
        project.get_feature_store()
        .get_feature_group(name=feature_group_name, version=version)
        .read()
    )

In [4]:
# Feature store handle for the logged-in project; later cells use it to
# look up feature groups and to create the feature view.
fs = project.get_feature_store()

In [None]:
# Pull the three raw feature groups (version 1 of each) into local frames.
price_df = get_feature_group_as_dataframe(
    project_name=project.name, feature_group_name="electricity_price_data", version=1
)
weather_df = get_feature_group_as_dataframe(
    project_name=project.name, feature_group_name="weather_data", version=1
)
entsoe_df = get_feature_group_as_dataframe(
    project_name=project.name, feature_group_name="entsoe_data", version=1
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (6.13s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (5.43s) 


In [None]:
# Combine the ENTSO-E frame with the weather frame and order the rows by date.
# `entsoe_df` is the frame read from the "entsoe_data" feature group; the bare
# name `entsoe` is not defined anywhere in this notebook and raised NameError.
# NOTE(review): merge_data is a project helper whose join semantics are not
# visible here — confirm the argument order matches its signature.
merged_df = merge_data(entsoe_df, weather_df).sort_values(by='date')

In [None]:
# Handles to version 1 of the two feature groups that are joined into the
# training query further down.
price_fg = fs.get_feature_group(name='electricity_price_data_3', version=1)
weather_fg = fs.get_feature_group(name='weather_data_2', version=1)

In [9]:
# Build the training query: the price column joined with every weather
# feature on the shared time_start key, then preview the first ten rows.
price_query = price_fg.select(['price'])
weather_query = weather_fg.select_all()
selected_features = price_query.join(weather_query, on=['time_start'])
selected_features.show(10)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.24s) 


Unnamed: 0,price,time_start,temperature_2m,precipitation,snow_depth,pressure_msl,cloud_cover,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,city
0,1.3116,2024-05-25T19:00:00,10.911,0.0,0.0,1029.099976,89.0,8.913181,13.70839,223.363464,209.931427,Luleå
1,-0.02008,2024-01-29T03:00:00,-0.239,0.0,0.51,995.900024,100.0,30.821552,49.130436,202.671387,204.687149,Luleå
2,0.24027,2023-11-22T13:00:00,0.311,0.4,0.1,997.0,100.0,41.705654,61.145973,186.940887,188.463959,Luleå
3,1.1391,2022-12-18T23:00:00,-9.988999,0.0,0.2,1007.799988,1.0,6.87779,17.81909,263.991089,278.130005,Luleå
4,0.62367,2023-10-24T19:00:00,-3.289,0.0,0.0,1022.099976,100.0,3.319036,6.763786,49.398785,64.79892,Luleå
5,0.33147,2023-12-28T04:00:00,-15.839,0.0,0.44,1004.400024,69.0,5.600286,3.706427,315.000092,29.054508,Luleå
6,0.67063,2023-12-09T02:00:00,-5.089,0.1,0.32,1028.0,100.0,20.674158,31.1686,172.998825,173.367584,Luleå
7,2.44096,2024-10-29T16:00:00,2.911,0.2,0.0,1000.700012,100.0,24.107906,36.662163,203.310959,204.048172,Luleå
8,0.85157,2024-01-20T08:00:00,-23.889,0.0,0.47,1008.400024,0.0,7.895416,10.739833,294.227722,283.570435,Luleå
9,0.87142,2024-01-02T12:00:00,-24.338999,0.0,0.44,1029.199951,97.0,5.904439,3.706427,322.431335,29.054508,Luleå


In [11]:
# Fetch version 1 of the feature view, creating it if it does not exist yet.
# The view wraps the joined query and marks `price` as the label column, so
# the split below can return features and labels separately.
feature_view = fs.get_or_create_feature_view(
    name='electricity_price_fv',
    description="weather features with electricity prices as the target",
    version=1,
    labels=['price'],
    query=selected_features,
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1164446/fs/1155149/fv/electricity_price_fv/version/1


In [14]:
# Split the feature view into train and test sets, holding out 20% for testing.
# Features and labels come back as separate objects for each split.
# NOTE(review): no seed is passed, so whether this split is reproducible
# across runs cannot be told from this cell — confirm.
X_train, X_test, y_train, y_test = feature_view.train_test_split(
    test_size = 0.2
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.23s) 



In [22]:
# Strip the timestamp and city columns so only the weather measurements are
# handed to the regressor; the same columns are removed from both splits.
non_feature_cols = ['time_start', 'city']
train_features = X_train.drop(columns=non_feature_cols)
test_features = X_test.drop(columns=non_feature_cols)

In [19]:
# Remove the identifier columns from the merged frame. errors='ignore' keeps
# this cell idempotent: it reassigns merged_df from itself, so a second run
# would otherwise raise KeyError because the columns are already gone.
merged_df = merged_df.drop(columns=['date', 'time_start', 'city'], errors='ignore')

In [None]:
# Create a local 'model' directory if it is missing.
# NOTE(review): nothing visible in this notebook writes to 'model' — the
# artifacts are saved under "SE4_model" — so confirm this is still needed.
os.makedirs('model', exist_ok=True)

In [24]:
# Inspect the feature matrix that is passed to the regressor.
# NOTE(review): the recorded output ends with rows that are entirely NaN —
# confirm the join/split is not producing empty feature rows.
train_features

Unnamed: 0,temperature_2m,precipitation,snow_depth,pressure_msl,cloud_cover,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m
1,-0.239000,0.0,0.51,995.900024,100.0,30.821552,49.130436,202.671387,204.687149
3,-9.988999,0.0,0.20,1007.799988,1.0,6.877790,17.819090,263.991089,278.130005
4,-3.289000,0.0,0.00,1022.099976,100.0,3.319036,6.763786,49.398785,64.798920
5,-15.839000,0.0,0.44,1004.400024,69.0,5.600286,3.706427,315.000092,29.054508
6,-5.089000,0.1,0.32,1028.000000,100.0,20.674158,31.168600,172.998825,173.367584
...,...,...,...,...,...,...,...,...,...
18808,,,,,,,,,
18809,,,,,,,,,
18810,,,,,,,,,
18811,,,,,,,,,


In [27]:
# Inspect the held-out split; unlike test_features it still carries the
# time_start and city columns.
X_test

Unnamed: 0,time_start,temperature_2m,precipitation,snow_depth,pressure_msl,cloud_cover,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,city
0,2024-05-25T19:00:00,10.911,0.0,0.00,1029.099976,89.0,8.913181,13.708390,223.363464,209.931427,Luleå
2,2023-11-22T13:00:00,0.311,0.4,0.10,997.000000,100.0,41.705654,61.145973,186.940887,188.463959,Luleå
20,2022-11-28T00:00:00,1.111,0.0,0.03,1028.599976,100.0,16.595179,26.717722,192.528793,194.036270,Luleå
21,2023-09-17T13:00:00,11.711,0.0,0.00,1014.900024,67.0,29.035288,40.877045,340.441650,340.447906,Luleå
36,2023-11-02T02:00:00,-3.889,0.0,0.04,1004.299988,100.0,27.416782,40.310356,13.671325,17.140432,Luleå
...,...,...,...,...,...,...,...,...,...,...,...
18773,2023-01-26T03:00:00,-0.339,0.0,0.46,1009.000000,27.0,14.186923,27.059933,305.706787,310.683899,Luleå
18794,2023-04-14T17:00:00,-2.139,0.0,0.46,1032.000000,0.0,8.089993,18.888851,69.145462,97.666718,Luleå
18796,2022-11-14T04:00:00,-3.189,0.0,0.00,1024.599976,100.0,5.351785,9.387651,317.726379,327.528839,Luleå
18798,2024-07-01T14:00:00,15.361,0.2,0.00,1000.900024,100.0,12.096214,15.629971,36.528946,38.453735,Luleå


In [25]:

# Fit a gradient-boosted regressor on the training features.
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
)
xgb_regressor.fit(train_features, y_train)

# Score the held-out split; the label frame's first column is the target.
y_pred = xgb_regressor.predict(test_features)
y_true = y_test.iloc[:, 0]

mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print("MSE:", mse)
print("R squared:", r2)


MSE: 0.26828626381666415
R squared: 0.603119652617496


In [26]:
# Build the evaluation frame on a copy of the test labels. A plain
# `df = y_test` only aliases the frame, so adding the prediction column
# would silently mutate y_test as well.
df = y_test.copy()
df['predicted_price'] = y_pred

In [29]:
# Attach the timestamp of each test row (pandas aligns the assigned Series
# on the row index), sort by it, and preview the first five rows.
df['date'] = X_test['time_start']
df = df.sort_values(by=['date'])
df.head(5)

Unnamed: 0,price,predicted_price,date
8109,0.3843,0.590861,2022-11-01T02:00:00
4833,0.39301,0.591139,2022-11-01T03:00:00
2379,0.84882,0.738401,2022-11-01T07:00:00
6764,0.26241,0.513541,2022-11-01T23:00:00
5189,0.46478,0.526345,2022-11-02T11:00:00


In [30]:
# Local directory tree that holds the artifacts uploaded to the model registry.
model_dir = "SE4_model"
images_dir = model_dir + "/images"
# makedirs creates model_dir along the way, and exist_ok=True makes the cell
# safe to re-run without the separate exists()-then-mkdir() checks.
os.makedirs(images_dir, exist_ok=True)

In [31]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Describe the model's inputs and outputs. The input schema is derived from
# train_features, the exact frame the regressor was fitted on, rather than
# X_train, which still contains the time_start and city columns the model
# never sees.
input_schema = Schema(train_features)
output_schema = Schema(y_train)

# Bundle both schemas into the model schema attached to the registered model.
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Dictionary representation of the model schema.
schema_dict = model_schema.to_dict()

In [32]:
# Write the fitted regressor to model.json inside the local model directory.
xgb_regressor.save_model(model_dir + "/model.json")

In [33]:
# Evaluation metrics, stringified, to be attached to the registered model.
res_dict = {
    metric_name: str(metric_value)
    for metric_name, metric_value in (("MSE", mse), ("R squared", r2))
}

In [36]:
mr = project.get_model_registry()

# Register a Python model named 'electricity_xgboost_model' in the model registry.
# The input example is sampled from test_features so that it has the columns
# the regressor was trained on; X_test still carries time_start and city.
# NOTE(review): the description says SE4 while the rows displayed earlier in
# this notebook are for Luleå — confirm the bidding zone.
se4_model = mr.python.create_model(
    name="electricity_xgboost_model",
    metrics=res_dict,
    model_schema=model_schema,
    input_example=test_features.sample().values,
    description="Electricity prices predictor for SE4",
)

# Upload the contents of the local model_dir directory as the model's artifacts.
se4_model.save(model_dir)

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/351260 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/159 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/1012 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1164446/models/electricity_xgboost_model/1


Model(name: 'electricity_xgboost_model', version: 1)