# Weather Forecast Data

This notebook extracts weather forecast data from the Open-Meteo API ([Link](https://open-meteo.com/en/docs)). First, we get the coordinates of the locations for which we want forecasts. This is done via a SQL query against an existing database on our Azure SQL Server. With the coordinates, we call the weather forecast API and get hourly data for one forecast day (`forecast_days: 1`) for each location, i.e. we receive 24 data entries per location.

The weather forecast consists of several data points such as temperature, precipitation, wind speed and cloud cover. In total, we retrieve 19 meteorological data points per location and hour. The received data is then ingested into a separate table in our SQL database. From there, it can be joined with other data sources for further analysis and evaluation.

In [63]:
# Import required libraries

import json
import math
import os
import time
from urllib.parse import quote

import openmeteo_requests
import pandas as pd
import pymssql
import requests_cache
from retry_requests import retry
from sqlalchemy import create_engine, text

In [3]:
# Print the current working directory; the relative paths used further down
# ('../config/db_config.json', '../data/processed/...') are resolved against it
print(os.getcwd())

c:\Users\twips\Documents\GitHub\Wanderwege\notebooks


In [5]:
# Load configuration from config/db_config.json.
# The encoding is passed explicitly: without it open() uses the platform's
# default encoding, which is not UTF-8 on Windows and would garble non-ASCII
# characters in the file (e.g. in the password).
with open('../config/db_config.json', 'r', encoding='utf-8') as f:
    db_config = json.load(f)

# Get database credentials (a missing key raises a KeyError naming that key)
server = db_config['server']
database = db_config['database']
db_user = db_config['db_user']
db_password = db_config['db_password']

In [6]:
# Read the complete dbo.overpass table into a Pandas DataFrame
query = "SELECT * FROM dbo.overpass"

# Establish connection
conn = pymssql.connect(server, db_user, db_password, database)
try:
    # Execute SQL query
    cursor = conn.cursor()
    cursor.execute(query)

    # Fetch all the rows from the executed query
    rows = cursor.fetchall()

    # Fetch the column names from the cursor description
    columns = [col[0] for col in cursor.description]
finally:
    # Close the connection even if the query or the fetch fails, so that no
    # connection is left open on the server
    conn.close()

# Store the results in a Pandas DataFrame
df = pd.DataFrame(rows, columns=columns)

In [16]:
# Show the shape of the dataframe as (number of rows, number of columns)
df.shape

(15303, 5)

In [10]:
# Preview the first 5 rows of the dataframe to check the loaded columns and values
df.head()

Unnamed: 0,timestamp_apicall,id,name,lat,lon
0,2024-09-23 13:57:01,22614,Nationalpark Wanderroute 15 (Munt la Schera),46.650143,10.2301992
1,2024-09-23 13:57:01,103607,Wanderwege SG,47.4309774,9.62017
2,2024-09-23 13:57:01,112830,Uetliberg - Uetliberg Uto Kulm,47.351168,8.4897796
3,2024-09-23 13:57:01,112831,Folenweid - Baldern,47.3291235,8.5007261
4,2024-09-23 13:57:01,112833,Felsenegg - Balderen,47.3152439,8.5050559


In [32]:
# Show the data type of each column to spot columns that need converting
# before they can be used numerically
df.dtypes

timestamp_apicall    object
id                    int64
name                 object
lat                  object
lon                  object
dtype: object

In [33]:
# Cast both coordinate columns to float so they can be used numerically
for coord_col in ("lat", "lon"):
    df[coord_col] = df[coord_col].astype(float)

In [34]:
# Summary statistics of the columns "lat" and "lon" (count, mean, std, min,
# quartiles, max) as a sanity check of the coordinate ranges
df[["lat", "lon"]].describe()

Unnamed: 0,lat,lon
count,15303.0,15303.0
mean,47.038487,8.271581
std,0.3683,0.748053
min,45.880236,6.094391
25%,46.861994,7.687364
50%,47.081839,8.327948
75%,47.297456,8.825083
max,47.807957,10.455874


In [35]:
# Restrict the data to the first 300 rows (positional slice)
df_subset = df.iloc[:300]

In [36]:
# Reduce the subset to the three columns used below: id, lat and lon
df_subset = df_subset.loc[:, ["id", "lat", "lon"]]

In [37]:
# Show the shape of the subset as (number of rows, number of columns)
df_subset.shape

(300, 3)

In [38]:
# Build the Open-Meteo API client on top of an HTTP session that caches
# responses (cache named ".cache", expire_after=3600) and retries failed
# requests (retries=5, backoff_factor=0.2)
cache_session = requests_cache.CachedSession(
    cache_name=".cache",
    expire_after=3600,
)
retry_session = retry(
    cache_session,
    retries=5,
    backoff_factor=0.2,
)
openmeteo = openmeteo_requests.Client(session=retry_session)

In [108]:
# Fetch the hourly forecast from the Open-Meteo API for every location in
# df_subset. The locations are sent in batches (one request per batch) and the
# result is collected as one dataframe per location in all_hourly_data.

# URL for the Open-Meteo API
url = "https://api.open-meteo.com/v1/forecast"

# Hourly weather variables to request. This list is the single source of truth:
# it is sent with the request AND used to label the returned variables, which
# are looked up by position via hourly.Variables(index) and therefore have to
# be read in exactly the order in which they were requested.
hourly_variables = [
    "temperature_2m", "relative_humidity_2m", "dew_point_2m",
    "apparent_temperature", "precipitation", "rain", "snowfall",
    "snow_depth", "weather_code", "pressure_msl", "surface_pressure",
    "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high",
    "wind_speed_10m", "wind_gusts_10m", "is_day", "sunshine_duration",
]

# Initialize a list to store data for all locations (one dataframe per location)
all_hourly_data = []

# Number of locations sent per API request
batch_size = 100

# Pause between two consecutive requests in seconds
# (presumably to stay within the API's rate limits - confirm)
pause_seconds = 2

# Time and date stamp of the API call, stored with every row
timestamp_apicall = pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")

# Loop through all batches and make a request for each batch
for batch_start in range(0, len(df_subset), batch_size):
    batch_df = df_subset.iloc[batch_start:batch_start + batch_size]
    latitude = batch_df["lat"].tolist()
    longitude = batch_df["lon"].tolist()

    item_start = batch_start
    item_end = batch_start + len(batch_df) - 1
    batch = batch_start // batch_size + 1

    print("-------------------------")
    print(f"Batch: {batch}")
    print(f"Items: {item_start}-{item_end}")
    print(f"Number of items: {len(latitude)}")

    params = {
        "latitude": latitude,
        "longitude": longitude,
        "hourly": hourly_variables,
        "forecast_days": 1,
    }

    responses = openmeteo.weather_api(url, params=params)

    print(f"Number of fetched items: {len(responses)}")

    # Responses are matched to locations purely by position, so a differing
    # count would silently attach forecasts to the wrong ids
    if len(responses) != len(batch_df):
        raise ValueError(
            f"Batch {batch}: requested {len(batch_df)} locations "
            f"but received {len(responses)} responses"
        )

    # Loop through all responses and extract data for each location and hourly forecast
    for offset, response in enumerate(responses):
        # Process hourly data for this location
        hourly = response.Hourly()

        # Column order: id, date, lat, lon, weather variables, timestamp_apicall.
        # The columns are read one by one from batch_df (instead of taking a
        # whole row) so that the integer id is not upcast to float.
        hourly_data = {
            "id": batch_df["id"].iloc[offset],
            "date": pd.date_range(
                start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                freq=pd.Timedelta(seconds=hourly.Interval()),
                inclusive="left"
            ),
            "lat": batch_df["lat"].iloc[offset],
            "lon": batch_df["lon"].iloc[offset],
        }
        for var_index, var_name in enumerate(hourly_variables):
            hourly_data[var_name] = hourly.Variables(var_index).ValuesAsNumpy()
        hourly_data["timestamp_apicall"] = timestamp_apicall

        # Convert to DataFrame and append to list
        all_hourly_data.append(pd.DataFrame(hourly_data))

    # Pause before the next request; no pause is needed after the last batch
    if batch_start + batch_size < len(df_subset):
        time.sleep(pause_seconds)


-------------------------
Batch: 1
Items: 0-99
Number of items: 100
Number of fetched items: 100
-------------------------
Batch: 2
Items: 100-199
Number of items: 100
Number of fetched items: 100
-------------------------
Batch: 3
Items: 200-299
Number of items: 100
Number of fetched items: 100


In [109]:
# Print the number of entries collected in all_hourly_data
# (the fetch loop appends one dataframe per API response, i.e. per location)
print(len(all_hourly_data))

300


In [111]:
# Concatenate all location data into a single DataFrame.
# The guard keeps this cell re-runnable: after the first run all_hourly_data
# already is a DataFrame, and passing a DataFrame (instead of a list of them)
# to pd.concat raises a TypeError.
if isinstance(all_hourly_data, list):
    all_hourly_data = pd.concat(all_hourly_data)
print(all_hourly_data.shape)

(7200, 24)


In [112]:
# Summary statistics for the first half of the columns, used to decide which
# data types to use
all_hourly_data.iloc[:, :len(all_hourly_data.columns) // 2].describe()

Unnamed: 0,id,lat,lon,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,rain,snowfall,snow_depth
count,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0
mean,166879.013333,47.118394,8.564355,11.409118,88.86042,9.570983,10.529437,0.208667,0.199042,0.0,0.005424
std,28470.452458,0.299599,0.757814,3.432407,8.471201,3.637586,4.451874,0.575994,0.557864,0.0,0.013364
min,22614.0,46.32609,6.413507,1.373,56.0,-3.291423,-4.104864,0.0,0.0,0.0,0.0
25%,145859.75,46.862828,8.194187,9.0555,83.0,7.040367,7.461066,0.0,0.0,0.0,0.0
50%,167929.0,47.233362,8.69855,12.00775,91.0,10.380244,11.366615,0.0,0.0,0.0,0.0
75%,188402.25,47.303267,8.999902,13.88175,95.0,12.759791,14.0642,0.1,0.1,0.0,0.0
max,223797.0,47.602057,10.230199,19.586,100.0,15.745424,19.848083,7.3,7.1,0.0,0.06


In [113]:
# Summary statistics for the second half of the columns, used to decide which
# data types to use
all_hourly_data.iloc[:, len(all_hourly_data.columns) // 2:].describe()

Unnamed: 0,weather_code,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,wind_speed_10m,wind_gusts_10m,is_day,sunshine_duration
count,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0
mean,31.468472,1012.858887,898.386536,94.000275,61.297222,84.799858,61.664444,6.035823,16.28265,0.5,244.639374
std,30.258692,1.246145,58.573189,14.809651,31.042561,23.672815,45.39185,3.630975,11.720735,0.500035,785.528076
min,0.0,1009.900024,765.802124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,1011.900024,857.312622,97.0,44.0,78.0,0.0,3.41526,8.28,0.0,0.0
50%,3.0,1012.799988,915.449402,100.0,69.0,97.0,100.0,5.411986,14.759999,0.5,0.0
75%,61.0,1013.599976,950.10701,100.0,85.0,100.0,100.0,7.698024,22.68,1.0,0.0
max,96.0,1018.900024,977.563232,100.0,100.0,100.0,100.0,27.811018,85.679993,1.0,3600.0


In [117]:
# Store data in csv
csv_path = "../data/processed/weather_forecast_1d_h.csv"

# Create the target directory if needed so the export does not fail on a
# fresh checkout
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

# index=False: the index only holds the row position within each location's
# dataframe (it repeats for every location after the concatenation) and would
# otherwise be written as an extra unnamed column; this also matches the
# database ingestion, which uses index=False as well
all_hourly_data.to_csv(csv_path, index=False)

In [121]:
# Create table if it doesn't exist.
# The table is created explicitly in the dbo schema: the existence check looks
# for dbo.<table_name>, so an unqualified CREATE TABLE would only match that
# check if the login's default schema happens to be dbo.
table_name = "OPNM_WeatherForecast_1d_H"
query = f"""
    IF OBJECT_ID(N'dbo.{table_name}', N'U') IS NULL
    BEGIN
        CREATE TABLE dbo.{table_name} (
            id                      INT         NOT NULL,
            date                    DATETIME    NOT NULL,
            lat                     FLOAT       NOT NULL,
            lon                     FLOAT       NOT NULL,
            temperature_2m          FLOAT       NULL,
            relative_humidity_2m    FLOAT       NULL,
            dew_point_2m            FLOAT       NULL,
            apparent_temperature    FLOAT       NULL,
            precipitation           FLOAT       NULL,
            rain                    FLOAT       NULL,
            snowfall                FLOAT       NULL,
            snow_depth              FLOAT       NULL,
            weather_code            FLOAT       NULL,
            pressure_msl            FLOAT       NULL,
            surface_pressure        FLOAT       NULL,
            cloud_cover             FLOAT       NULL,
            cloud_cover_low         FLOAT       NULL,
            cloud_cover_mid         FLOAT       NULL,
            cloud_cover_high        FLOAT       NULL,
            wind_speed_10m          FLOAT       NULL,
            wind_gusts_10m          FLOAT       NULL,
            is_day                  FLOAT       NULL,
            sunshine_duration       FLOAT       NULL,
            timestamp_apicall       DATETIME    NULL,

            PRIMARY KEY (id, date)
        );
    END
    """

conn = pymssql.connect(server, db_user, db_password, database)
try:
    cursor = conn.cursor()
    cursor.execute(query)

    # Persist the DDL statement
    conn.commit()
finally:
    # Close the connection even if the statement fails
    conn.close()

In [124]:
# Create connection string for SQLAlchemy.
# User name and password are percent-encoded (safe='' also encodes '/'),
# because characters such as '@', ':', '/' or '%' would otherwise be read as
# URL delimiters or escape sequences when the string is parsed.
connection_string = (
    f"mssql+pymssql://{quote(str(db_user), safe='')}:{quote(str(db_password), safe='')}"
    f"@{server}/{database}"
)
engine = create_engine(connection_string)

In [125]:
# Ingest the data into the database table.
# The schema is given explicitly so that the rows land in dbo.<table_name>,
# the table whose existence is checked in the table creation cell.
# Rows are appended; the table's primary key (id, date) rejects rows that
# were already ingested.
all_hourly_data.to_sql(
    table_name,
    con=engine,
    schema="dbo",
    if_exists='append',
    index=False,
)

66