# Fetching air quality data from the FMI open data timeseries API

[API documentation](https://github.com/fmidev/smartmet-plugin-timeseries/blob/master/docs/Using-the-Timeseries-API.md),
[API examples](https://github.com/fmidev/smartmet-plugin-timeseries/blob/master/docs/Examples.md),
[JSON API example call](https://opendata.fmi.fi/timeseries?format=json&groupareas=0&producer=airquality_urban&area=Helsinki&param=time,fmisid,PM10_PT1H_avg,PM25_PT1H_avg,O3_PT1H_avg,CO_PT1H_avg,SO2_PT1H_avg,NO2_PT1H_avg,TRSC_PT1H_avg),
[CSV API call for the fmisid to name mapping](https://opendata.fmi.fi/timeseries?format=ascii&groupareas=0&separator=,&producer=airquality_urban&area=Finland&param=fmisid,name,latitude,longitude&starttime=2022-08-26T08:00:00%2B00:00&endtime=2022-08-26T08:00:00%2B00:00&tz=UTC).

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import pendulum
import requests

In [2]:
# Query window in UTC: from yesterday through tomorrow, so the most recent
# observations are always included.
start_time = pendulum.yesterday('UTC')
end_time = pendulum.tomorrow('UTC')

# Requested API parameters mapped to the dtype each resulting column is cast to.
# The keys double as the value of the `param` query argument below.
aq_fields = {
    'fmisid': np.int32,
    # An explicit unit is required: pandas 2.x raises a TypeError when asked to
    # cast to the unit-less `np.datetime64`.
    'time': 'datetime64[ns]',
    'AQINDEX_PT1H_avg': np.float64,
    'PM10_PT1H_avg': np.float64,
    'PM25_PT1H_avg': np.float64,
    'O3_PT1H_avg': np.float64,
    'CO_PT1H_avg': np.float64,
    'SO2_PT1H_avg': np.float64,
    'NO2_PT1H_avg': np.float64,
    'TRSC_PT1H_avg': np.float64,
}

url = 'https://opendata.fmi.fi/timeseries'

params = {
    'format': 'json',
    'precision': 'double',
    'groupareas': '0',
    'producer': 'airquality_urban',
    'area': 'Uusimaa',
    'param': ','.join(aq_fields),
    'starttime': start_time.isoformat(timespec="seconds"),
    'endtime': end_time.isoformat(timespec="seconds"),
    'tz': 'UTC',
}

# A timeout keeps the kernel from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON-decoding failure on an error page.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
data = response.json()

In [3]:
# Load the parsed JSON response into a DataFrame and cast every column to the
# dtype declared for it in aq_fields. The frame keeps its default integer index;
# indexing by ['fmisid', 'time'] is an option that is intentionally not applied.
df = (
    pd.DataFrame(data)
    .astype(aq_fields)
)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,fmisid,time,AQINDEX_PT1H_avg,PM10_PT1H_avg,PM25_PT1H_avg,O3_PT1H_avg,CO_PT1H_avg,SO2_PT1H_avg,NO2_PT1H_avg,TRSC_PT1H_avg
0,100662,2022-08-25 00:00:00,1.0,-4.1,1.4,42.6,,0.6,1.1,
1,100662,2022-08-25 01:00:00,1.0,-2.2,1.0,42.2,,0.6,0.9,
2,100662,2022-08-25 02:00:00,1.0,-1.3,1.1,43.2,,0.5,1.6,
3,100662,2022-08-25 03:00:00,1.0,-1.1,0.6,41.7,,0.5,3.5,
4,100662,2022-08-25 04:00:00,1.0,-2.5,1.0,47.1,,0.5,4.9,
5,100662,2022-08-25 05:00:00,1.0,0.4,0.7,43.6,,0.9,10.7,
6,100662,2022-08-25 06:00:00,1.0,4.2,3.9,49.7,,1.3,11.2,
7,100662,2022-08-25 07:00:00,1.0,5.9,4.8,54.0,,1.3,10.4,
8,100662,2022-08-25 08:00:00,1.0,7.0,3.8,58.4,,1.3,6.7,
9,100662,2022-08-25 09:00:00,2.0,8.4,4.1,61.9,,1.2,5.3,


In [4]:
# Persist the typed frame as zstd-compressed Parquet. The target directory is
# created first so the cell also succeeds when `data/` does not exist yet
# (e.g. on a fresh checkout); to_parquet does not create missing directories.
parquet_path = Path('data/airquality.parquet')
parquet_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(parquet_path, compression='zstd')

# DuckDB

In [5]:
import duckdb

# Open a connection to a transient in-memory DuckDB database.
con = duckdb.connect(':memory:')

In [6]:
# (Re)create the table from the pandas frame: the SQL refers to the Python
# variable `df` by name, so `df` must exist in this notebook's scope.
create_table_sql = 'CREATE OR REPLACE TABLE airquality_urban AS SELECT * FROM df'
con.execute(create_table_sql)

<duckdb.DuckDBPyConnection at 0x7fc2d82cf8b0>

In [7]:
# Read the whole table back into pandas to check the round trip through DuckDB.
select_all_sql = 'SELECT * FROM airquality_urban'
df2 = con.execute(select_all_sql).fetchdf()

# Show ten randomly chosen rows (unseeded, so the selection differs per run).
df2.sample(n=10)

Unnamed: 0,fmisid,time,AQINDEX_PT1H_avg,PM10_PT1H_avg,PM25_PT1H_avg,O3_PT1H_avg,CO_PT1H_avg,SO2_PT1H_avg,NO2_PT1H_avg,TRSC_PT1H_avg
302,103140,2022-08-25 15:00:00,1.0,,,,,0.2,,-0.1
39,100691,2022-08-25 03:00:00,1.0,5.6,1.5,,,,2.6,
178,100762,2022-08-26 10:00:00,2.0,28.6,8.9,29.1,,,17.1,
489,107147,2022-08-26 00:00:00,1.0,6.3,3.3,,,,8.1,
410,104074,2022-08-25 17:00:00,1.0,6.3,2.4,,,,4.3,
528,107399,2022-08-26 03:00:00,1.0,6.0,4.4,,,,2.7,
526,107399,2022-08-26 01:00:00,1.0,4.2,2.9,,,,1.9,
170,100762,2022-08-26 02:00:00,1.0,5.2,3.7,51.8,,,2.7,
193,100763,2022-08-25 13:00:00,1.0,7.3,2.8,,,,8.5,
135,100742,2022-08-26 03:00:00,1.0,7.7,4.3,,,,11.8,
