# Backfill Pipeline
* Fetch historical data from Open-Meteo
* Connect to the Hopsworks feature store
* Create a feature group and insert the historical data into the feature store

In [None]:
import pandas as pd
import hopsworks
import openmeteo_requests
import requests_cache
from retry_requests import retry
import matplotlib.pyplot as plt
from weather_utils import *

import sys
# Put the parent directory (the project root) on the module search path;
# presumably this is what makes the `config` import below resolvable.
sys.path.append('..')
from config import *

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

# Query Weather Data

In [None]:
# Query parameters for the Open-Meteo request.
# Can query historical data up to 3 months
params = {
    "latitude": LATITUDE,
    "longitude": LONGITUDE,
    # Daily variables requested for every day of the range
    "daily": [
        "weather_code",
        "temperature_2m_min",
        "precipitation_sum",
        "wind_gusts_10m_max",
    ],
    "timezone": TIMEZONE,
    "start_date": "2023-12-10",
    "end_date": "2024-01-18",  # last day is not included
}

# Open the Open-Meteo client and run the query against the configured endpoint
openmeteo = get_openmeteo_connection()
responses = openmeteo.weather_api(BASELINE_URL_OPEN_METEO, params=params)

# The API can answer for several locations at once; only one was requested,
# so keep the first entry
response = responses[0]

# Convert the response to a dataframe
df_hist_data = process_weather_request(response)

# Feature Engineering

In [None]:
# Process daily data into a Pandas dataframe.
# NOTE(review): the query cell above already makes this exact call on the same
# `response`; this line repeats it, so one of the two is likely redundant.
df_hist_data = process_weather_request(response)

### Remove missing values

In [None]:
# Print the dataframe summary (column dtypes and non-null counts) to spot
# columns with missing data
df_hist_data.info()

In [None]:
# Drop the rows with missing values (days the query could not provide) and,
# in the same chain, cast the WMO weather code column to integers
df_hist_data = df_hist_data.dropna().astype({'weather_code_wmo': int})

# Print the summary again to confirm that no missing values remain
df_hist_data.info()

### Format values

In [None]:
# Add a new `month` column holding the month number of each row's `date`
# (the column is parsed with `pd.to_datetime` first, so string dates work too)
df_hist_data['month'] = pd.to_datetime(df_hist_data['date']).dt.month

In [None]:
# Group WMO codes into labels and new group code label.
# NOTE(review): the helper comes from `weather_utils`; presumably it adds the
# `weather_code` column that the plotting cell reads — confirm there.
df_hist_data = group_wmo_weather_codes(df_hist_data)

In [None]:
# Add weather code descriptions.
# NOTE(review): presumably this is where the `weather_code_desc` column used
# as y-tick labels in the plotting cell is created — confirm in `weather_utils`.
df_hist_data = add_weather_code_labels(df_hist_data)

# Visualize

### Manual inspection

In [None]:
# Show both ends of the dataframe: the tail (labelled newest) first, then the
# head (labelled oldest). Bound methods are stored so each preview is computed
# right before it is displayed.
_previews = (
    ('Newest values:', df_hist_data.tail),
    ('Oldest values:', df_hist_data.head),
)
for _title, _preview in _previews:
    print(_title)
    display(_preview())

### Plot trends

In [None]:
# Create a figure with four vertically aligned subplots sharing the x-axis
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(10, 10), sharex=True)
all_axes = (ax1, ax2, ax3, ax4)

# First subplot: Weather Code
ax1.plot(df_hist_data['date'], df_hist_data['weather_code'], marker='o', linestyle='-', markersize=2)
ax1.set_ylabel('Weather')
ax1.set_title('Weather Codes Trend')

# Customize y-axis labels with weather code labels instead of numbers.
# Use one tick per distinct code: passing the full columns would stack one
# duplicate tick and label per row on top of each other.
# Assumes every code maps to a single description (the first one seen is kept).
code_labels = (
    df_hist_data[['weather_code', 'weather_code_desc']]
    .drop_duplicates(subset='weather_code')
    .sort_values('weather_code')
)
ax1.set_yticks(code_labels['weather_code'].tolist())
ax1.set_yticklabels(code_labels['weather_code_desc'].tolist())

# Second subplot: Min Temperature
ax2.plot(df_hist_data['date'], df_hist_data['temperature_min'], marker='o', linestyle='-', markersize=2)
ax2.set_ylabel('Min Temperature (°C)')
ax2.set_title('Min Temperature Trend')

# Third subplot: Precipitations Sum
ax3.plot(df_hist_data['date'], df_hist_data['precipitation_sum'], marker='o', linestyle='-', markersize=2)
ax3.set_ylabel('Precipitations Sum (mm)')
ax3.set_title('Precipitations Trend')

# Fourth subplot: Max Wind Gusts
ax4.plot(df_hist_data['date'], df_hist_data['wind_gusts_max'], marker='o', linestyle='-', markersize=2)
ax4.set_xlabel('Date')
ax4.set_ylabel('Max Wind Gusts (km/h)')
ax4.set_title('Max Wind Gusts Trend')

# Repeat the (rotated) x tick labels in all subplots. With sharex=True only
# the bottom subplot keeps its tick labels, so they have to be switched back
# on explicitly with labelbottom=True; rotation alone leaves them hidden.
for ax in all_axes:
    ax.tick_params(axis='x', labelbottom=True, rotation=45)

# Display vertical lines aligned with x-ticks dates in all subplots
for tick in ax1.get_xticks():
    for ax in all_axes:
        ax.axvline(tick, color='gray', linestyle='--', alpha=0.5)

# Display the plot
plt.tight_layout()  # Ensures proper spacing between subplots
plt.show()

# Upload to Hopsworks Feature Store

In [None]:
# Connect to Hopsworks; `login()` returns the project handle used below.
# NOTE(review): no credentials are passed here, so they are presumably taken
# from the environment or an interactive prompt — confirm for scheduled runs.
project = hopsworks.login()

# Retrieve the Feature Store handle of that project
fs = project.get_feature_store() 

In [None]:
# Fetch the historical-weather feature group, creating it when it does not
# exist yet. Name, description, version and primary key come from `config`.
historical_weather_fg = fs.get_or_create_feature_group(
    name=FG_HISTORY_NAME,
    description=FG_HISTORY_DESC,
    version=FG_HISTORY_V,
    primary_key=FG_HISTORY_PK,
    # `event_time` expects the name of a single feature as a string, not a
    # list of names.
    event_time="date",
    # Compute descriptive statistics, histograms and correlations on insert
    statistics_config={"enabled": True,
                       "histograms": True,
                       "correlations": True}
)

In [None]:
# Upload the dataframe into the feature group.
# `wait_for_job: False` presumably makes the call return without blocking
# until the ingestion job has finished — confirm against the Hopsworks docs.
historical_weather_fg.insert(df_hist_data,
                             write_options={"wait_for_job": False})