# <span style="color:#ff5f27"> 📝 Imports

In [1]:
import os
import datetime
import requests
import json
import pandas as pd

# <span style="color:#ff5f27"> ⚙️ Functions

In [2]:
def convert_date_to_unix(x):
    """
    Convert datetime to unix time in milliseconds.
    """
    dt_obj = datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    dt_obj = int(dt_obj.timestamp() * 1000)
    return dt_obj

In [3]:
def get_weather_data(city_name: str,
                     coordinates: list,
                     start_date: str = None,
                     end_date: str = None,
                     forecast: bool = False):
    """
    Takes city name, coordinates and returns pandas DataFrame with weather data.
    
    'start_date' and 'end_date' are required if you parse historical observations data. (forecast=False)
    
    If forecast=True - returns 7 days forecast data by default (without specifying daterange).
    """
    
    latitude, longitude = coordinates
    
    params = {
        'latitude': latitude,
        'longitude': longitude,
        'hourly': ['temperature_2m','relativehumidity_2m','precipitation',
                   'weathercode','windspeed_10m','winddirection_10m'],
        'start_date': start_date,
        'end_date': end_date
    }
    
    if forecast:
        # historical forecast endpoint
        base_url = 'https://api.open-meteo.com/v1/forecast' 
    else:
        # historical observations endpoint
        base_url = 'https://archive-api.open-meteo.com/v1/archive?' 
        
    response = requests.get(base_url, params=params)

    response_json = response.json()

    some_metadata = {key: response_json[key] for key in ('latitude', 'longitude',
                                                         'timezone', 'hourly_units')}
    
    
    res_df = pd.DataFrame(response_json["hourly"])
    
    res_df["forecast_hr"] = 0
    
    if forecast:
        res_df["forecast_hr"] = res_df.index
    
    some_metadata["city_name"] = city_name
    res_df["city_name"] = city_name
    
    # rename columns
    res_df = res_df.rename(columns={
        "time": "base_time",
        "temperature_2m": "temperature",
        "relativehumidity_2m": "relative_humidity",
        "weathercode": "weather_code",
        "windspeed_10m": "wind_speed",
        "winddirection_10m": "wind_direction"
    })
    
    # change columns order
    res_df = res_df[["city_name", "base_time", "forecast_hr", "temperature", "precipitation",
                     "relative_humidity", "weather_code", "wind_speed", "wind_direction"]]
    
    # convert dates in 'base_time' column
    res_df["base_time"] = pd.to_datetime(res_df["base_time"])
    
    # create 'unix' columns
    res_df["unix_time"] = res_df["base_time"].apply(convert_date_to_unix)
    
    return res_df, some_metadata

# <span style="color:#ff5f27"> 🔮 Data Parsing

In [4]:
weather_df, metadata = get_weather_data("Paris", (48.85, 2.35),
                                        forecast=False, 
                                        start_date="2023-02-10", end_date="2023-02-12")

In [5]:
weather_df

Unnamed: 0,city_name,base_time,forecast_hr,temperature,precipitation,relative_humidity,weather_code,wind_speed,wind_direction,unix_time
0,Paris,2023-02-10 00:00:00,0,-0.4,0.0,93,2,3.7,241,1675983600000
1,Paris,2023-02-10 01:00:00,0,-0.2,0.0,95,3,3.6,225,1675987200000
2,Paris,2023-02-10 02:00:00,0,0.1,0.0,96,3,5.5,212,1675990800000
3,Paris,2023-02-10 03:00:00,0,0.6,0.0,96,3,5.5,212,1675994400000
4,Paris,2023-02-10 04:00:00,0,1.0,0.0,96,3,6.3,211,1675998000000
...,...,...,...,...,...,...,...,...,...,...
67,Paris,2023-02-12 19:00:00,0,5.9,0.0,86,1,7.9,60,1676224800000
68,Paris,2023-02-12 20:00:00,0,4.9,0.0,90,0,7.9,60,1676228400000
69,Paris,2023-02-12 21:00:00,0,3.9,0.0,92,0,7.6,65,1676232000000
70,Paris,2023-02-12 22:00:00,0,3.0,0.0,94,0,7.9,60,1676235600000


In [6]:
weather_df, metadata = get_weather_data("Paris", (48.85, 2.35),
                                        forecast=True, 
                                        start_date="2023-02-10", end_date="2023-02-12")

In [7]:
weather_df

Unnamed: 0,city_name,base_time,forecast_hr,temperature,precipitation,relative_humidity,weather_code,wind_speed,wind_direction,unix_time
0,Paris,2023-02-10 00:00:00,0,1.4,0.0,82,3,3.1,135,1675983600000
1,Paris,2023-02-10 01:00:00,1,1.4,0.0,86,3,4.1,128,1675987200000
2,Paris,2023-02-10 02:00:00,2,1.4,0.0,86,3,3.0,104,1675990800000
3,Paris,2023-02-10 03:00:00,3,2.1,0.0,84,3,2.6,124,1675994400000
4,Paris,2023-02-10 04:00:00,4,2.1,0.0,84,3,5.9,133,1675998000000
...,...,...,...,...,...,...,...,...,...,...
67,Paris,2023-02-12 19:00:00,67,6.6,0.0,79,0,2.8,50,1676224800000
68,Paris,2023-02-12 20:00:00,68,5.4,0.0,84,0,2.8,40,1676228400000
69,Paris,2023-02-12 21:00:00,69,3.9,0.0,89,0,3.1,45,1676232000000
70,Paris,2023-02-12 22:00:00,70,3.3,0.0,91,0,3.3,13,1676235600000


In [8]:
metadata

{'latitude': 48.84,
 'longitude': 2.3599997,
 'timezone': 'GMT',
 'hourly_units': {'time': 'iso8601',
  'temperature_2m': '°C',
  'relativehumidity_2m': '%',
  'precipitation': 'mm',
  'weathercode': 'wmo code',
  'windspeed_10m': 'km/h',
  'winddirection_10m': '°'},
 'city_name': 'Paris'}

---
# <span style="color:#ff5f27"> 👩🏻‍🔬 Backfill Pipeline

In [36]:
today = datetime.date.today() # datetime object

day7next = str(today + datetime.timedelta(7))
day7ago = str(today - datetime.timedelta(7))
tomorrow = str(today + datetime.timedelta(1))

In [23]:
str(today)

'2023-05-02'

In [24]:
with open('target_cities.json') as json_file:
    target_cities = json.load(json_file)

In [25]:
target_cities

{'London': [51.51, -0.13],
 'Paris': [48.85, 2.35],
 'Stockholm': [59.33, 18.07],
 'New York': [40.71, -74.01],
 'Los Angeles': [34.05, -118.24],
 'Singapore': [1.36, 103.82],
 'Sydney': [-33.87, 151.21],
 'Hong Kong': [22.28, 114.16],
 'Rome': [41.89, 12.48],
 'Kyiv': [50.45, 30.52]}

### <span style="color:#ff5f27"> 🧙🏼‍♂️ Parsing historical weather observations from January 1 2000 till 7 days before today (thats the restrictions of this particular API.)

In [37]:
observations_df = pd.DataFrame()

for city_name in target_cities:
    weather_df_temp, metadata_temp = get_weather_data(city_name, target_cities[city_name],
                                                      start_date="2023-01-01", end_date=day7ago)
    observations_df = pd.concat([observations_df, weather_df_temp])

In [38]:
observations_df

Unnamed: 0,city_name,base_time,forecast_hr,temperature,precipitation,relative_humidity,weather_code,wind_speed,wind_direction,unix_time
0,London,2023-01-01 00:00:00,0,11.4,0.2,87,51,29.1,220,1672527600000
1,London,2023-01-01 01:00:00,0,11.0,0.1,87,51,32.6,228,1672531200000
2,London,2023-01-01 02:00:00,0,10.5,0.0,84,1,29.8,228,1672534800000
3,London,2023-01-01 03:00:00,0,10.0,0.0,83,2,26.6,229,1672538400000
4,London,2023-01-01 04:00:00,0,9.8,0.0,84,2,25.7,227,1672542000000
...,...,...,...,...,...,...,...,...,...,...
2755,Kyiv,2023-04-25 19:00:00,0,12.1,0.0,61,1,8.7,150,1682442000000
2756,Kyiv,2023-04-25 20:00:00,0,11.1,0.0,64,2,8.6,182,1682445600000
2757,Kyiv,2023-04-25 21:00:00,0,10.4,0.0,67,0,8.8,199,1682449200000
2758,Kyiv,2023-04-25 22:00:00,0,9.4,0.0,71,0,8.4,205,1682452800000


In [33]:
# observations_df.to_csv("observations_df.csv", index=False)

### <span style="color:#ff5f27"> 🧙🏼‍♂️ Parsing historical weather forecasts from 7 days before today till today (process it as observations).

In [62]:
forecast_batch_df = pd.DataFrame()

for city_name in target_cities:
    weather_df_temp, metadata_temp = get_weather_data(city_name, target_cities[city_name],
                                                      start_date=day7ago, end_date=str(today),
                                                      forecast=True)
    forecast_batch_df = pd.concat([forecast_batch_df, weather_df_temp])

forecast_batch_df["forecast_hr"] = 0

In [63]:
forecast_batch_df

Unnamed: 0,city_name,base_time,forecast_hr,temperature,precipitation,relative_humidity,weather_code,wind_speed,wind_direction,unix_time
0,London,2023-04-25 00:00:00,0,5.3,0.0,78,2,11.2,6,1682373600000
1,London,2023-04-25 01:00:00,0,4.5,0.0,78,0,10.1,6,1682377200000
2,London,2023-04-25 02:00:00,0,3.8,0.0,80,0,9.0,2,1682380800000
3,London,2023-04-25 03:00:00,0,3.3,0.0,81,0,8.6,358,1682384400000
4,London,2023-04-25 04:00:00,0,2.9,0.0,82,0,8.4,353,1682388000000
...,...,...,...,...,...,...,...,...,...,...
187,Kyiv,2023-05-02 19:00:00,0,13.6,0.0,42,1,7.6,175,1683046800000
188,Kyiv,2023-05-02 20:00:00,0,12.6,0.0,45,1,7.9,180,1683050400000
189,Kyiv,2023-05-02 21:00:00,0,11.9,0.0,47,1,8.3,185,1683054000000
190,Kyiv,2023-05-02 22:00:00,0,11.4,0.0,48,3,8.5,192,1683057600000


In [64]:
# forecast_batch_df.to_csv("forecast_batch_df.csv", index=False)

### <span style="color:#ff5f27"> 🧙🏼‍♂️ Parsing weather forecasts for 7 next days.

In [65]:
forecast_df = pd.DataFrame()

for city_name in target_cities:
    weather_df_temp, metadata_temp = get_weather_data(city_name, target_cities[city_name],
                                                      start_date=tomorrow, end_date=day7next,
                                                      forecast=True)
    forecast_df = pd.concat([forecast_df, weather_df_temp])

In [66]:
forecast_df

Unnamed: 0,city_name,base_time,forecast_hr,temperature,precipitation,relative_humidity,weather_code,wind_speed,wind_direction,unix_time
0,London,2023-05-03 00:00:00,0,7.9,0.0,83,2,6.1,87,1683064800000
1,London,2023-05-03 01:00:00,1,7.5,0.0,84,2,5.8,97,1683068400000
2,London,2023-05-03 02:00:00,2,7.1,0.0,81,2,6.1,90,1683072000000
3,London,2023-05-03 03:00:00,3,6.8,0.0,84,1,5.5,79,1683075600000
4,London,2023-05-03 04:00:00,4,6.4,0.0,87,1,5.6,63,1683079200000
...,...,...,...,...,...,...,...,...,...,...
163,Kyiv,2023-05-09 19:00:00,163,9.7,0.0,52,3,3.0,76,1683651600000
164,Kyiv,2023-05-09 20:00:00,164,9.3,0.0,54,3,2.4,153,1683655200000
165,Kyiv,2023-05-09 21:00:00,165,8.9,0.0,55,3,4.8,193,1683658800000
166,Kyiv,2023-05-09 22:00:00,166,8.4,0.0,57,3,5.6,207,1683662400000


In [None]:
# forecast_df.to_csv("forecast_df.csv", index=False)

---
# <span style="color:#ff5f27"> ⬇️ Insert all data into Feature Store

In [44]:
import hopsworks

# project = hopsworks.login(project='weather')
project = hopsworks.login()

fs = project.get_feature_store() 

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/15520
Connected. Call `.close()` to terminate connection gracefully.


In [68]:
weather_fg = fs.get_or_create_feature_group(
    name='weather_data',
    description="Public Weather Data from 2000-01-01. Updates every day.",
    version=1,
    primary_key=["city_name", "unix_time", "forecast_hr"],
    # partition_key=["city_name"],
    event_time=["unix_time"],
    online_enabled=True
)

In [69]:
weather_fg.insert(observations_df, write_options={"wait_for_job": False})
weather_fg.insert(forecast_batch_df, write_options={"wait_for_job": False})
weather_fg.insert(forecast_df, write_options={"wait_for_job": True}) 
# we wait for the last one.

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/15520/fs/15440/fg/42348


Uploading Dataframe: 0.00% |          | Rows 0/27600 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/15520/jobs/named/weather_data_1_offline_fg_backfill/executions


Uploading Dataframe: 0.00% |          | Rows 0/1920 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/15520/jobs/named/weather_data_1_offline_fg_backfill/executions


Uploading Dataframe: 0.00% |          | Rows 0/1680 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/15520/jobs/named/weather_data_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x7f7e32b60a30>, None)

---
# <span style="color:#ff5f27"> 👨🏻‍🏫 Retrieve and check data consistency 

In [None]:
weather_retrieved = weather_fg.read()
weather_retrieved = weather_retrieved.sort_values("base_time")
weather_retrieved

In [None]:
# Create a datetime index object
dt_index = pd.date_range(
    start='2000-01-01',
    end=str(today + datetime.timedelta(8)), # to include last, "seventh" day.
    freq='H'
)

In [None]:
# Compare the length of the dataframe and datetime index
if len(dt_index) - 1 != int(len(weather_retrieved) / len(city_names)): # we should compare dt_index to one city daterows.
    print('Inconsistent dates in dataframe.')
else:
    print("Everything seems fine.")

In [None]:
# I substract 1 from len(dt_index) cause it takes 00:00 hour from 8th day.
dt_index

---