## Download Euskalmet station data

In [1]:
## Load required libraries

# To create a token to be sent along all communications with the Euskalmet API:
import jwt

# To communicate with the Euskalmet API:
import requests

# To handle output of requests:
import json

# Data handling:
import numpy as np
import pandas as pd
import os.path

Create my JWT (https://jwt.io/) based on instructions by Euskalmet (https://opendata.euskadi.eus/api-euskalmet/-/how-to-use-meteo-rest-services/)

In [2]:
# Load payload information (the claims that get signed into the token).
# The context manager closes the file as soon as it has been parsed:
fpath = "../api/payload.txt"
with open(fpath, 'r') as f:
    payload = json.load(f)

## payload has the following structure:
# payload = {
#             "aud": "met01.apikey", # -> fixed value
#             "iss": "NAME", # issuer description, whatever
#             "exp": 1906453456, # expiration timestamp, as epoch. You can get the epoch for a date here -> https://www.epochconverter.com/
#             "version": "1.0.0", # -> fixed value
#             "iat": 1685455334, # emission timestamp, as epoch. Must be exp > iat
#             "email": "YOUR_EMAIL@EMAIL.COM" # the email you used when asking for the API keys
#         }

# Read the private key in binary mode, again closing the file right away:
with open("../api/Apikey/privateKey.pem", "rb") as key_file:
    private_key = key_file.read()

# Create token based on payload and private key:
my_jwt = jwt.encode(payload, private_key, algorithm = "RS256")

# PyJWT < 2.0 returns the token as bytes, which an f-string would render as
# "b'...'" inside the Authorization header. Normalise to str so the token is
# usable with either PyJWT version.
if isinstance(my_jwt, bytes):
    my_jwt = my_jwt.decode("utf-8")


Based on this, create the header:

In [3]:
# Every request carries the JWT as a bearer token and asks for a JSON reply.
auth_value = f"Bearer {my_jwt}"
headers = {
    "Authorization": auth_value,
    "Accept": "application/json",
}

Before starting to interact with the API it might be useful to save the string of the API url (the "root", so to speak) and avoid typing all of it every time:

In [4]:
# Root of the API URL; the path of each request is appended to this string.
rurl = "https://api.euskadi.eus"

Try downloading station list:

In [5]:
# Request the station list. Without a timeout `requests` can wait forever on
# a stalled connection, so give up after 30 seconds instead.
r = requests.get(rurl + "/euskalmet/stations", headers = headers, timeout = 30)

# Fail here on an HTTP error (e.g. a rejected token) rather than carrying an
# error body forward as if it were the station list.
r.raise_for_status()

We can see the fields within the response:

In [6]:
#dir(r)

The actual response to our request is within the _text_ field. We can handle the output with `json`:

In [7]:
# Save the raw response body (parse it with json.loads(r.text) to inspect it).
# The encoding is set explicitly so the file is UTF-8 regardless of the
# platform's default encoding.
with open("../data/station_info_list.json", "w", encoding = "utf-8") as f:
    f.write(r.text)

This provides the list of stations. I've realised that there are several instances (snapshots) for each station. In each of them some sensor information might have changed, and that's very important when requesting the data, or you'll end up with 404 errors because you were asking for a sensor that was not installed on a certain date. (Hmm, but I can't retrieve information for all of the snapshots, which seems weird...)

Usually, we would be interested in one or a few stations. We can easily check the codes here: https://www.euskalmet.euskadi.eus/behaketa/estazioen-datuak/

To start working with any station we need to know which sensors are available so that we can retrieve their readings:

IMPORTANT: digging more into this I've seen that the (temperature) sensor change happened on 2021-04-13-00. 
Dates >= 2021-04-13-00 have the new sensor.
This does not match any of the dates I see in the station information?? I mean, the sensor information from 
the snapshots seems correct, but I have had to guess the date of the change...
Anyway, I will need to take it into account, but this will be a problem should I want to automate the process...

In [8]:
# Select station:
station_id = "C068"
snapshots = ["20150220", "20221003"] # YYYYMMDD format
sensors_info = []

for snp in snapshots:
    # Retrieve the station information for this snapshot. The timeout keeps a
    # stalled connection from blocking forever, and raise_for_status() reports
    # an HTTP error directly instead of failing later while parsing the body.
    addurl = f"/euskalmet/stations/{station_id}/{snp}"
    r_sensors = requests.get(rurl + addurl, headers = headers, timeout = 30)
    r_sensors.raise_for_status()
    info = json.loads(r_sensors.text)

    # Extract the sensor IDs (stored within the "sensors" field); the ID is
    # the last path component of each "sensorKey":
    sensor_ids = [x["sensorKey"].split("/")[-1] for x in info["sensors"]]

    # Iterate through sensors, obtaining the 'measureType' and 'measureId' of
    # each variable (they are stored within the "meteors" field):
    snapshot_sensors = {}
    for sensor_id in sensor_ids:
        r_meteors = requests.get(rurl + f"/euskalmet/sensors/{sensor_id}", headers = headers, timeout = 30)
        r_meteors.raise_for_status()
        snapshot_sensors[sensor_id] = json.loads(r_meteors.text)["meteors"]

    # One dict per snapshot, in the same order as `snapshots`:
    sensors_info.append(snapshot_sensors)

# Note that in 'sensors' the -> 'unit': 'CENTIMETERS' <- refers to the height at which the sensor is located in the station.


Now set the time window in which we want to download data.

We want to download the entire dataset of the station, but doing so every time we run the script is not efficient.
So, set a condition by which we check if the readings output file of this specific station already exists:

In [9]:
fpath = f"../data/readings_{station_id}.csv"

# If a readings file already exists, find its last non-blank row.
# (Reading line by line is probably not efficient for large files, so this
# might need to change at some point.)
last_row = None
if os.path.exists(fpath):
    with open(fpath, 'r') as file:
        for line in file:
            if line.strip():
                last_row = line

# Decode the hour of the last stored row from its first four fields
# (YYYY, MM, DD, hh). A file that only holds the header row cannot be parsed
# as numbers; in that case there is nothing to resume from.
last_hour = None
if last_row is not None:
    try:
        year, month, day, hour = [int(float(x)) for x in last_row.strip().split(",")[:4]]
        last_hour = pd.Timestamp(year = year, month = month, day = day, hour = hour, tz = "utc")
    except ValueError:
        last_hour = None

if last_hour is not None:
    # Resume immediately after the last stored hour. Adding a Timedelta
    # (rather than passing hour + 1 to the constructor) rolls over correctly
    # to the next day when the last stored hour is 23.
    print(f"File for station {station_id} already exists. Reading last date to continue download until today.")
    start_date = last_hour + pd.Timedelta(hours = 1)
else:
    # Nothing to resume from: create the date range from the beginning.
    # To get all the data available, set the start date to the installation
    # date of the station (available in the station information):
    if os.path.exists(fpath):
        print(f"File for station {station_id} exists but contains no readings. Setting dates to download full dataset.")
    else:
        print(f"No file exists for station {station_id}. Setting dates to download full dataset.")
    start_date = pd.to_datetime(info['installDate'], format = "%Y-%m-%dT%H:%M:%S", utc = True) # format according to ISO 8601

end_date = pd.Timestamp.now(tz = "utc")

# Create the desired date range to retrieve data, one entry per hour.
# A Timedelta is used for the frequency because the "H" string alias is
# deprecated in recent pandas versions.
dater = pd.date_range(start = start_date, end = end_date, freq = pd.Timedelta(hours = 1))

File for station C068 already exists. Reading last date to continue download until today.


Iterate through all the measurements and dates:

In [10]:
# Download the readings hour by hour and append them to the station's csv file.
#
# Get the variable names for our station so that we add them as column names.
# They are taken from the first snapshot, flattening the per-sensor lists:
sensors_vars = [d['measureId'] for values in sensors_info[0].values() for d in values]
colnames = ['YYYY', 'MM', 'DD', 'hh', 'mm'] + sensors_vars + ["api_status"]
tmp = pd.DataFrame(columns = colnames)

# Each hourly request is stored as six rows, one per 10-minute slot:
n_slots = 6
# Seconds to wait for the API before giving up on a request; without a
# timeout a stalled connection would block the download forever:
request_timeout = 30
# From this date on the sensors of the second snapshot are the ones in use
# (date found by trial and error, see the note further up):
sensor_change_date = pd.Timestamp(year = 2021, month = 4, day = 13, hour = 0, tz = "utc")

# Create csv file with header if not already created:
fpath = f"../data/readings_{station_id}.csv"
if not os.path.exists(fpath):
    tmp.to_csv(fpath, na_rep = "nan", header = True, index = False)

# To get all the readings, we will need to loop through dates, sensors and measurements:
for date_i in dater:
    # Start every hour from an all-NaN table. Reusing the previous hour's
    # table would silently repeat old values in any column that the sensors
    # active for this date do not refresh.
    tmp = pd.DataFrame(np.nan, index = range(n_slots), columns = colnames)
    tmp['YYYY'] = np.repeat(date_i.year, n_slots)
    tmp['MM'] = np.repeat(date_i.month, n_slots)
    tmp['DD'] = np.repeat(date_i.day, n_slots)
    tmp['hh'] = np.repeat(date_i.hour, n_slots)
    tmp['mm'] = np.arange(0, 60, 10) # arange() considers the start and stop of the interval as [start, stop) so to get 50 I need to set stop=60

    # Optional, to keep track of the download process for large batches:
    #with open('../data/output.txt', 'w') as f:
    #    f.write(f'Downloading: {date_i}')

    # As aforementioned, we need to consider the date we are in to get the correct sensor information:
    if date_i < sensor_change_date:
        si = sensors_info[0].items()
    else:
        si = sensors_info[1].items()
    # NOTE(review): a measureId that only exists in the second snapshot is not
    # in `colnames` and would be appended as an extra column beyond the csv
    # header - confirm both snapshots expose the same measureIds.

    for sensor_id, values in si:
        for measure_type in values:

            # Get the measure type and measure id of this iteration:
            measure_type_id = measure_type["measureType"]
            measure_id = measure_type["measureId"]

            # Create the url for the request based on these, plus the date:
            url = (rurl +
                   f"/euskalmet/readings/forStation/{station_id}/{sensor_id}/measures/{measure_type_id}/{measure_id}/"
                   f"at/{int(date_i.year):04}/{int(date_i.month):02}/{int(date_i.day):02}/{int(date_i.hour):02}")

            # Send request to get readings:
            rd = requests.get(url, headers = headers, timeout = request_timeout)

            # Check status_code of response (>=300 means trouble):
            if rd.status_code < 300:
                # If OK, store the values of this specific variable:
                tmp[f"{measure_id}"] = json.loads(rd.text)['values']
            else:
                # If not, fill with nan:
                tmp[f"{measure_id}"] = np.repeat(np.nan, n_slots)

            # The status column is overwritten by every request, so the value
            # written to file is the status of the last request of the hour:
            tmp["api_status"] = np.repeat(rd.status_code, n_slots)

    # Append readings from this iteration to the csv file.
    # There is a lot of data, and sending/receiving requests to the Euskalmet API looks like the bottleneck.
    # I've estimated that for around 10 years of data it will take ~72h. So it is better to append the results
    # of every iteration to the csv file. This way, as the last date of the file is always checked to resume
    # the download, it can be done in batches. It also helps when the internet connection fails and the script
    # stops, since otherwise the download would need to restart from scratch (it has happened before).
    tmp.to_csv(fpath, mode = 'a', na_rep = "nan", index = False, header = False)
