In [52]:
import pymssql
import json
import os
import pandas as pd
import openmeteo_requests
import requests_cache
from retry_requests import retry
import time
from sqlalchemy import create_engine

In [3]:
# Get current working directory
current_dir = os.getcwd()
print(current_dir)

# c:\Users\etien\OneDrive\02_Progression\CAS_DataEngineering_ZHAW\03_Leistungsnachweis\Wanderwege\notebooks

c:\GitHub\Wanderwege\notebooks\sandbox


In [4]:
# Load configuration from config/db_config.json
with open('../../config/db_config.json', 'r') as f:
    db_config = json.load(f)

# Access db credentials
server = db_config['server']
database = db_config['database']
db_user = db_config['db_user']
db_password = db_config['db_password']

In [5]:
# Verbindungsaufbau
conn = pymssql.connect(server, db_user, db_password, database)

# Cursor erstellen und SQL-Abfragen ausführen
cursor = conn.cursor()
cursor.execute('SELECT * FROM wanderwege')

# Step 4: Fetch all the rows from the executed query
rows = cursor.fetchall()

# Step 5: Fetch the column names from the cursor description
columns = [col[0] for col in cursor.description]

# Step 6: Store the results in a Pandas DataFrame
df = pd.DataFrame(rows, columns=columns)

# Step 7: Close the connection
conn.close()

In [6]:
df

Unnamed: 0,timestamp_apicall,id,name,lat,lon
0,2024-09-20 10:42:31,22614,Nationalpark Wanderroute 15 (Munt la Schera),46.6501430,10.2301992
1,2024-09-20 10:42:31,103607,Wanderwege SG,47.4309774,9.6201700
2,2024-09-20 10:42:31,112830,Uetliberg - Uetliberg Uto Kulm,47.3511680,8.4897796
3,2024-09-20 10:42:31,112831,Folenweid - Baldern,47.3291235,8.5007261
4,2024-09-20 10:42:31,112833,Felsenegg - Balderen,47.3152439,8.5050559
...,...,...,...,...,...
15191,2024-09-20 10:42:31,18057730,Sobrio - Bodio,46.3914751,8.9049831
15192,2024-09-20 10:42:31,18057731,Bodio - Bodio Bahnhof,46.3811565,8.9070277
15193,2024-09-20 10:42:31,18057943,Anzonico - Monte Angone,46.4380334,8.8618023
15194,2024-09-20 10:42:31,18058034,Monte Angone - Cassine,46.4234746,8.8892182


In [7]:
df_test = df[:500]

In [8]:
df_test.describe()

Unnamed: 0,id
count,500.0
mean,210691.876
std,59205.728445
min,22614.0
25%,167394.75
50%,192952.5
75%,274490.25
max,300475.0


In [9]:
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

In [14]:
# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://api.open-meteo.com/v1/forecast"

# Initialize a list to store data for each location
all_hourly_data = []

# slice df_test into groups of 180 items and iterate over the groups
for i in range(0, len(df_test), 180):
    latitude = list(df_test[i:i+180]["lat"])
    longitude = list(df_test[i:i+180]["lon"])

    item_start = i
    item_end = i + len(latitude)
    batch = int((i + 180) / 180)

    print(f"Batch: {batch}")
    print(f"Items: {item_start}-{item_end}")
    print(f"Number of items: {len(latitude)}")

    params = {
        "latitude": latitude,
        "longitude": longitude,
        "hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "apparent_temperature", "precipitation", "rain", "snowfall", "snow_depth", "weather_code", "pressure_msl", "surface_pressure", "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high", "wind_speed_10m", "wind_gusts_10m", "is_day", "sunshine_duration"],
    }
    responses = openmeteo.weather_api(url, params=params)

    print(f"Number of items fetched: {len(responses)}")

    # Loop through all responses (for multiple locations)
    for i, response in enumerate(responses):
        # Process location metadata
        # print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
        # print(f"Elevation {response.Elevation()} m asl")
        # print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
        # print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

        # Process hourly data for this location
        hourly = response.Hourly()

        # Extract variables (same order as requested)
        hourly_data = {
            "date": pd.date_range(
                start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                freq=pd.Timedelta(seconds=hourly.Interval()),
                inclusive="left"
            ),
            "lat_resp": response.Latitude(),
            "lon_resp": response.Longitude(),
            "lat": df_test[i]["lat"],
            "lon": df_test[i]["lon"],
            "id": df_test[i]["id"],
            "temperature_2m": hourly.Variables(0).ValuesAsNumpy(),
            "relative_humidity_2m": hourly.Variables(1).ValuesAsNumpy(),
            "dew_point_2m": hourly.Variables(2).ValuesAsNumpy(),
            "apparent_temperature": hourly.Variables(3).ValuesAsNumpy(),
            "precipitation": hourly.Variables(4).ValuesAsNumpy(),
            "rain": hourly.Variables(5).ValuesAsNumpy(),
            "snowfall": hourly.Variables(6).ValuesAsNumpy(),
            "snow_depth": hourly.Variables(7).ValuesAsNumpy(),
            "weather_code": hourly.Variables(8).ValuesAsNumpy(),
            "pressure_msl": hourly.Variables(9).ValuesAsNumpy(),
            "surface_pressure": hourly.Variables(10).ValuesAsNumpy(),
            "cloud_cover": hourly.Variables(11).ValuesAsNumpy(),
            "cloud_cover_low": hourly.Variables(12).ValuesAsNumpy(),
            "cloud_cover_mid": hourly.Variables(13).ValuesAsNumpy(),
            "cloud_cover_high": hourly.Variables(14).ValuesAsNumpy(),
            "wind_speed_10m": hourly.Variables(15).ValuesAsNumpy(),
            "wind_gusts_10m": hourly.Variables(16).ValuesAsNumpy(),
            "is_day": hourly.Variables(17).ValuesAsNumpy(),
            "sunshine_duration": hourly.Variables(18).ValuesAsNumpy(),
        }

        # Convert to DataFrame and append to list
        all_hourly_data.append(pd.DataFrame(hourly_data))
        
    time.sleep(5)


Batch: 1
Items: 0-180
Number of items: 180
Number of items fetched: 180


KeyError: 0

In [38]:
all_hourly_data = []

for i, response in enumerate(responses):
    # Process location metadata
    # print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
    # print(f"Elevation {response.Elevation()} m asl")
    # print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
    # print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

    # Process hourly data for this location
    hourly = response.Hourly()
    
    # Extract variables (same order as requested)
    hourly_data = {
        "date": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left"
        ),
        "lat_resp": response.Latitude(),
        "lon_resp": response.Longitude(),
        "lat": df_test["lat"].iloc[i],
        "lon": df_test["lon"].iloc[i],
        "id": df_test["id"].iloc[i],
        "temperature_2m": hourly.Variables(0).ValuesAsNumpy(),
        "relative_humidity_2m": hourly.Variables(1).ValuesAsNumpy(),
        "dew_point_2m": hourly.Variables(2).ValuesAsNumpy(),
        "apparent_temperature": hourly.Variables(3).ValuesAsNumpy(),
        "precipitation": hourly.Variables(4).ValuesAsNumpy(),
        "rain": hourly.Variables(5).ValuesAsNumpy(),
        "snowfall": hourly.Variables(6).ValuesAsNumpy(),
        "snow_depth": hourly.Variables(7).ValuesAsNumpy(),
        "weather_code": hourly.Variables(8).ValuesAsNumpy(),
        "pressure_msl": hourly.Variables(9).ValuesAsNumpy(),
        "surface_pressure": hourly.Variables(10).ValuesAsNumpy(),
        "cloud_cover": hourly.Variables(11).ValuesAsNumpy(),
        "cloud_cover_low": hourly.Variables(12).ValuesAsNumpy(),
        "cloud_cover_mid": hourly.Variables(13).ValuesAsNumpy(),
        "cloud_cover_high": hourly.Variables(14).ValuesAsNumpy(),
        "wind_speed_10m": hourly.Variables(15).ValuesAsNumpy(),
        "wind_gusts_10m": hourly.Variables(16).ValuesAsNumpy(),
        "is_day": hourly.Variables(17).ValuesAsNumpy(),
        "sunshine_duration": hourly.Variables(18).ValuesAsNumpy(),
    }

    # Convert to DataFrame and append to list
    all_hourly_data.append(pd.DataFrame(hourly_data))


In [47]:
print(all_hourly_data[0])

                         date  lat_resp  lon_resp         lat         lon  \
0   2024-09-20 00:00:00+00:00     46.66     10.24  46.6501430  10.2301992   
1   2024-09-20 01:00:00+00:00     46.66     10.24  46.6501430  10.2301992   
2   2024-09-20 02:00:00+00:00     46.66     10.24  46.6501430  10.2301992   
3   2024-09-20 03:00:00+00:00     46.66     10.24  46.6501430  10.2301992   
4   2024-09-20 04:00:00+00:00     46.66     10.24  46.6501430  10.2301992   
..                        ...       ...       ...         ...         ...   
163 2024-09-26 19:00:00+00:00     46.66     10.24  46.6501430  10.2301992   
164 2024-09-26 20:00:00+00:00     46.66     10.24  46.6501430  10.2301992   
165 2024-09-26 21:00:00+00:00     46.66     10.24  46.6501430  10.2301992   
166 2024-09-26 22:00:00+00:00     46.66     10.24  46.6501430  10.2301992   
167 2024-09-26 23:00:00+00:00     46.66     10.24  46.6501430  10.2301992   

        id  temperature_2m  relative_humidity_2m  dew_point_2m  \
0    2261

In [46]:
for i in range(0, len(all_hourly_data)):
    print(f'Index: {i}, Lon_df_test: {df_test["lon"].iloc[i]}, Lat_df_test: {df_test["lat"].iloc[i]}, Id: {df_test["id"].iloc[i]}')
    print(f'Index: {i}, Lon_all_hou: {all_hourly_data[i]["lon"].iloc[0]}, Lat_all_hou: {all_hourly_data[i]["lat"].iloc[0]}, Id: {all_hourly_data[i]["id"].iloc[0]}')

Index: 0, Lon_df_test: 10.2301992, Lat_df_test: 46.6501430, Id: 22614
Index: 0, Lon_all_hou: 10.2301992, Lat_all_hou: 46.6501430, Id: 22614
Index: 1, Lon_df_test: 9.6201700, Lat_df_test: 47.4309774, Id: 103607
Index: 1, Lon_all_hou: 9.6201700, Lat_all_hou: 47.4309774, Id: 103607
Index: 2, Lon_df_test: 8.4897796, Lat_df_test: 47.3511680, Id: 112830
Index: 2, Lon_all_hou: 8.4897796, Lat_all_hou: 47.3511680, Id: 112830
Index: 3, Lon_df_test: 8.5007261, Lat_df_test: 47.3291235, Id: 112831
Index: 3, Lon_all_hou: 8.5007261, Lat_all_hou: 47.3291235, Id: 112831
Index: 4, Lon_df_test: 8.5050559, Lat_df_test: 47.3152439, Id: 112833
Index: 4, Lon_all_hou: 8.5050559, Lat_all_hou: 47.3152439, Id: 112833
Index: 5, Lon_df_test: 8.5186003, Lat_df_test: 47.3164826, Id: 112834
Index: 5, Lon_all_hou: 8.5186003, Lat_all_hou: 47.3164826, Id: 112834
Index: 6, Lon_df_test: 8.5941959, Lat_df_test: 46.6864945, Id: 117485
Index: 6, Lon_all_hou: 8.5941959, Lat_all_hou: 46.6864945, Id: 117485
Index: 7, Lon_df_tes

In [48]:
print(len(all_hourly_data))
all_hourly_data = pd.concat(all_hourly_data)
print(len(all_hourly_data))

180
30240


In [50]:
# Load database access configuration from config/db_config.json
with open('../../config/db_config.json', 'r') as f:
    db_config = json.load(f)

# Access db credentials
server = db_config['server']
database = db_config['database']
db_user = db_config['db_user']
db_password = db_config['db_password']

In [53]:
# Create connection string for SQLAlchemy
connection_string = f"mssql+pymssql://{db_user}:{db_password}@{server}/{database}"
engine = create_engine(connection_string)

In [55]:
# Write data to database
all_hourly_data.to_sql('weather_forecast_7d_h', con=engine, if_exists='append', index=False)

28

In [12]:
# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://api.open-meteo.com/v1/forecast"

# Initialize a list to store data for each location
all_hourly_data = []

print(list(df_test["lat"][:5]))
print(list(df_test[:5]["lat"]))
'''
# slice df_test into groups of 180 items and iterate over the groups
latitude = list(df_test["lat"])
longitude = list(df_test["lon"])

params = {
    "latitude": latitude,
    "longitude": longitude,
    "hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "apparent_temperature", "precipitation", "rain", "snowfall", "snow_depth", "weather_code", "pressure_msl", "surface_pressure", "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high", "wind_speed_10m", "wind_gusts_10m", "is_day", "sunshine_duration"],
}
responses = openmeteo.weather_api(url, params=params)

print(f"Number of items fetched: {len(responses)}")

# Loop through all responses (for multiple locations)
for i, response in enumerate(responses):
    # Process location metadata
    # print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
    # print(f"Elevation {response.Elevation()} m asl")
    # print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
    # print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

    # Process hourly data for this location
    hourly = response.Hourly()

    # Extract variables (same order as requested)
    hourly_data = {
        "date": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left"
        ),
        "lat_resp": response.Latitude(),
        "lon_resp": response.Longitude(),
        "lat": df_test[i]["lat"],
        "lon": df_test[i]["lon"],
        "id": df_test[i]["id"],
        "temperature_2m": hourly.Variables(0).ValuesAsNumpy(),
        "relative_humidity_2m": hourly.Variables(1).ValuesAsNumpy(),
        "dew_point_2m": hourly.Variables(2).ValuesAsNumpy(),
        "apparent_temperature": hourly.Variables(3).ValuesAsNumpy(),
        "precipitation": hourly.Variables(4).ValuesAsNumpy(),
        "rain": hourly.Variables(5).ValuesAsNumpy(),
        "snowfall": hourly.Variables(6).ValuesAsNumpy(),
        "snow_depth": hourly.Variables(7).ValuesAsNumpy(),
        "weather_code": hourly.Variables(8).ValuesAsNumpy(),
        "pressure_msl": hourly.Variables(9).ValuesAsNumpy(),
        "surface_pressure": hourly.Variables(10).ValuesAsNumpy(),
        "cloud_cover": hourly.Variables(11).ValuesAsNumpy(),
        "cloud_cover_low": hourly.Variables(12).ValuesAsNumpy(),
        "cloud_cover_mid": hourly.Variables(13).ValuesAsNumpy(),
        "cloud_cover_high": hourly.Variables(14).ValuesAsNumpy(),
        "wind_speed_10m": hourly.Variables(15).ValuesAsNumpy(),
        "wind_gusts_10m": hourly.Variables(16).ValuesAsNumpy(),
        "is_day": hourly.Variables(17).ValuesAsNumpy(),
        "sunshine_duration": hourly.Variables(18).ValuesAsNumpy(),
    }

    # Convert to DataFrame and append to list
    all_hourly_data.append(pd.DataFrame(hourly_data))
    '''

['46.6501430', '47.4309774', '47.3511680', '47.3291235', '47.3152439']
['46.6501430', '47.4309774', '47.3511680', '47.3291235', '47.3152439']


'\n# slice df_test into groups of 180 items and iterate over the groups\nlatitude = list(df_test["lat"])\nlongitude = list(df_test["lon"])\n\nparams = {\n    "latitude": latitude,\n    "longitude": longitude,\n    "hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "apparent_temperature", "precipitation", "rain", "snowfall", "snow_depth", "weather_code", "pressure_msl", "surface_pressure", "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high", "wind_speed_10m", "wind_gusts_10m", "is_day", "sunshine_duration"],\n}\nresponses = openmeteo.weather_api(url, params=params)\n\nprint(f"Number of items fetched: {len(responses)}")\n\n# Loop through all responses (for multiple locations)\nfor i, response in enumerate(responses):\n    # Process location metadata\n    # print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")\n    # print(f"Elevation {response.Elevation()} m asl")\n    # print(f"Timezone {response.Timezone()} {response.TimezoneAbbrev

In [1]:
all_hourly_data

NameError: name 'all_hourly_data' is not defined