##### **Collect data from KNMI using knmy package**

Update: the knmy-based download below was not used in the end; the data was instead downloaded manually from https://www.daggegevens.knmi.nl/klimatologie/uurgegevens

In [5]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from knmy import knmy

In [6]:
# knmy source code:   https://github.com/barthoekstra/knmy/tree/master
# knmy documentation: https://knmy.readthedocs.io/en/latest/

# Start and end (month, day) of the period requested within each year.
SM, SD = 7, 1    # starting month, starting day
EM, ED = 12, 30  # ending month, ending day

# Disabled variable-group selections, kept for reference:
# req_variables = ['WIND', 'TEMP', 'SUNR', 'PRCP', 'PRES', 'VICL', 'MSTR']
# req_variables = ['ALL']

# City label -> list of KNMI station ids requested for that city.
cities = {
    # 'Utrecht': [260],      # De Bilt
    # 'Rotterdam': [344],    # Rotterdam
    # NOTE(review): the original note on the next entry said "Lelystad", but
    # KNMI appears to list station 240 as Schiphol - confirm the intended station.
    'Amsterdam': [240],
    # Add new cities here: 'CityName': [station_id]
}

# Years to download: 2016 through 2023.
years = list(range(2016, 2024))


In [7]:
import time
from requests.exceptions import HTTPError

# Years to download: 2016 through 2023.
years = list(range(2016, 2024))

# KNMI variable groups of interest. Not passed to the download call at the
# moment: the `variables=` argument there is commented out.
req_variables = 'WIND TEMP SUNR PRCP PRES VICL MSTR'.split()

def fetch_with_retry(year, city, stations, max_retries=3):
    """Download hourly KNMI data for one year, retrying failed requests.

    The requested window runs from day ``SD`` of month ``SM`` at hour 0 up to
    day ``ED`` of month ``EM`` at hour 23 of ``year`` (notebook-level
    constants).

    Args:
        year: Calendar year to download.
        city: City label; only used in the failure message.
        stations: Station ids forwarded to ``knmy.get_knmi_data``.
        max_retries: Maximum number of download attempts; must be >= 1.

    Returns:
        The fourth element (``data``) of the tuple returned by
        ``knmy.get_knmi_data``; the calling code treats it as a DataFrame.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1 (previously the
            function silently returned ``None`` in that case).
        requests.exceptions.RequestException: Re-raised unchanged when the
            last attempt also fails (``HTTPError`` is a subclass).
    """
    # Imported locally: connection errors and timeouts are as transient as
    # HTTP errors, so the whole requests exception family is retried.
    from requests.exceptions import RequestException

    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    # The request window is identical for every attempt, so build it once.
    s_moment = datetime(year, SM, SD, 0)
    e_moment = datetime(year, EM, ED, 23)

    for attempt in range(1, max_retries + 1):
        try:
            # Only the data element is needed. The other elements get
            # underscore names so the `stations` argument is not overwritten.
            _disclaimer, _stations, _variables, data = knmy.get_knmi_data(
                type='hourly',
                stations=stations,
                start=s_moment,
                # variables=req_variables,
                end=e_moment,
                inseason=False,
                parse=True
            )
            return data

        except RequestException as e:
            if attempt == max_retries:  # Last attempt: report and give up
                print(f"Failed to fetch data for {city} {year} after {max_retries} attempts: {e}")
                raise
            print(f"Attempt {attempt} failed, retrying after delay...")
            time.sleep(5 * attempt)  # Linearly increasing delay: 5 s, 10 s, ...
            
# Download every configured (year, city) combination and store each result as
# its own semicolon-separated CSV file.

# Create the output folder up front; without it every to_csv call would fail
# only after the corresponding download had already completed.
output_dir = Path("../../data/data_raw")
output_dir.mkdir(parents=True, exist_ok=True)

for year in years:
    for city, stations in cities.items():
        print(f"Processing {city}...")
        try:
            data = fetch_with_retry(year, city, stations)

            # Keep only the first occurrence of each column name, then save
            data = data.loc[:, ~data.columns.duplicated()]
            data.to_csv(
                output_dir / f"{year}_meteo_{city}.csv",
                index=True,
                sep=';',
                decimal='.',
                encoding='utf-8'
            )
            print(f"Saved {city} data with shape {data.shape} of year {year}")

        except Exception as e:
            # Best effort: report the failure and carry on with the next
            # city/year instead of aborting the whole run.
            print(f"Error processing {city} for {year}: {e}")

        # Delay between cities; also applied after a failure so the next
        # request is still throttled.
        time.sleep(2)

    # Extra delay between years
    time.sleep(1)

Processing Amsterdam...
Saved Amsterdam data with shape (4416, 25) of year 2016
Processing Amsterdam...
Saved Amsterdam data with shape (4416, 25) of year 2017
Processing Amsterdam...
Saved Amsterdam data with shape (4416, 25) of year 2018
Processing Amsterdam...
Saved Amsterdam data with shape (4416, 25) of year 2019
Processing Amsterdam...
Saved Amsterdam data with shape (4416, 25) of year 2020
Processing Amsterdam...
Saved Amsterdam data with shape (4416, 25) of year 2021
Processing Amsterdam...
Saved Amsterdam data with shape (4416, 25) of year 2022
Processing Amsterdam...
Saved Amsterdam data with shape (4416, 25) of year 2023
