# Feature pipeline
* Query new weather data
* Feature Group Insertion

In [14]:
import pandas as pd
import hopsworks
import matplotlib.pyplot as plt
import great_expectations as ge

from weather_utils import *

import sys
# '..' is a relative entry, so it is resolved against the current working
# directory: run the notebook from its own folder so that the parent directory
# (the project root, where config is expected to live) becomes importable.
sys.path.append('..')  # Add the parent directory (project root) to the Python path
from config import *

# Silence all warnings for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries imported above.
import warnings
warnings.filterwarnings("ignore")

# 1) Query Weather Data

In [15]:
# Request parameters: daily aggregates for the configured location, limited to
# one past day and no forecast days.
params = dict(
    latitude=LATITUDE,
    longitude=LONGITUDE,
    daily=[
        "weather_code",
        "temperature_2m_min",
        "precipitation_sum",
        "wind_gusts_10m_max",
    ],
    timezone=TIMEZONE,
    past_days=1,
    forecast_days=0,
)

# Open the Open-Meteo client and send the request
openmeteo = get_openmeteo_connection()
responses = openmeteo.weather_api(BASELINE_URL_OPEN_METEO, params=params)

# A request can cover several locations, so the result is a sequence;
# only one coordinate pair is queried here, hence the first entry.
response = responses[0]

# 3) Feature Engineering

### 3a) Convert request to dataframe

In [16]:
# Turn the Open-Meteo response into a pandas DataFrame holding yesterday's values.
# process_weather_request is brought in by one of the wildcard imports at the top
# (presumably weather_utils -- TODO confirm).
df_weather_yesterday = process_weather_request(response)

In [17]:
# Show the freshly built frame so the raw values can be checked before any transformation
display(df_weather_yesterday)

Unnamed: 0,date,weather_code_wmo,temperature_min,precipitation_sum,wind_gusts_max
0,2023-11-14,3.0,0.7175,0.0,28.08


### 3b) Inspect data

In [18]:
# Cast the WMO weather code column from float to a whole-number dtype
df_weather_yesterday = df_weather_yesterday.astype({'weather_code_wmo': int})

# Print dtypes and non-null counts to spot any missing values
df_weather_yesterday.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               1 non-null      object 
 1   weather_code_wmo   1 non-null      int64  
 2   temperature_min    1 non-null      float32
 3   precipitation_sum  1 non-null      float32
 4   wind_gusts_max     1 non-null      float32
dtypes: float32(3), int64(1), object(1)
memory usage: 160.0+ bytes


### 3c) Format values

In [19]:
# Derive the calendar month (as an integer) from the date column
df_weather_yesterday = df_weather_yesterday.assign(
    month=pd.to_datetime(df_weather_yesterday['date']).dt.month
)

In [20]:
# Collapse the raw WMO codes into broader groups and add the grouped code to the frame.
# The helper is defined outside this notebook; judging by the frame displayed further
# down it presumably produces the `weather_code` column -- TODO confirm in the helper.
df_weather_yesterday = group_wmo_weather_codes(df_weather_yesterday)

In [21]:
# Attach human-readable descriptions of the weather codes.
# The helper is defined outside this notebook; the `weather_code_desc` and
# `weather_code_desc_short` columns in the displayed frame are presumably added
# here -- TODO confirm in the helper.
df_weather_yesterday = add_weather_code_labels(df_weather_yesterday)

In [22]:
# Show the final frame, including the engineered columns, before it is uploaded
display(df_weather_yesterday)

Unnamed: 0,date,weather_code_wmo,temperature_min,precipitation_sum,wind_gusts_max,month,weather_code,weather_code_desc,weather_code_desc_short
0,2023-11-14,3,0.7175,0.0,28.08,11,2,"Mainly clear, partly cloudy, and overcast",Clear


# 4) Upload to Hopsworks Feature Store

In [23]:
# Log in to Hopsworks. No credentials are passed here, so login() has to obtain
# them on its own (presumably from the environment or a cached API key -- confirm
# against the Hopsworks documentation for the deployment in use).
project = hopsworks.login()

# Get a handle to the project's feature store
fs = project.get_feature_store()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/178324
Connected. Call `.close()` to terminate connection gracefully.


In [None]:
# Get the historical-weather feature group, creating it if it does not exist yet
# (it already exists when the backfill ran or when past days were already inserted).
historical_weather_fg = fs.get_or_create_feature_group(
    name=FG_HISTORY_NAME,
    description=FG_HISTORY_DESC,
    version=FG_HISTORY_V,
    primary_key=FG_HISTORY_PK,
    # event_time is the name of a single feature, so it is passed as a string;
    # the list form is not the documented type for this argument.
    event_time="date",
    statistics_config={"enabled": True,
                       "histograms": True,
                       "correlations": True}
)

# Insert yesterday's row and wait for the ingestion job to finish, so the new
# data is processed in Hopsworks before the next pipeline uses it.
historical_weather_fg.insert(
    df_weather_yesterday,
    write_options={"wait_for_job": True},
)