# Convert FMI air quality data from to JSON and Parquet

## ... from XML and WFS

In [2]:
from fmiopendata.wfs import download_stored_query

import pyarrow as pa
import pandas as pd
import numpy as np

import json
import gzip

Let's download some air quality data first.

In [3]:
obs = download_stored_query(
    'urban::observations::airquality::hourly::multipointcoverage',
    args=['timeseries=True'])

Then we'll pull out a complete list of all possible measurements.

In [4]:
cols = set([v for p in obs.data for v in obs.data[p]])
cols.remove('times')
cols

{'AQINDEX_PT1H_avg',
 'CO_PT1H_avg',
 'NO2_PT1H_avg',
 'NO_PT1H_avg',
 'O3_PT1H_avg',
 'PM10_PT1H_avg',
 'PM25_PT1H_avg',
 'QBCPM25_PT1H_AVG',
 'SO2_PT1H_avg',
 'TRSC_PT1H_avg'}

And turn them into DataFrames, which we'll then merge into a single one.

In [5]:
dfs = []
for name in obs.data:
    data = {k: obs.data[name][k]['values'] for k in cols}
    idx = pd.DatetimeIndex(name='hour', data=obs.data[name]['times'])
    idx0 = pd.CategoricalIndex(name='place', data=[name]*idx.size)
    df = pd.DataFrame(data=data, index=[idx0, idx], columns=cols)
    dfs.append(df)
df = pd.concat(dfs)
df.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,TRSC_PT1H_avg,AQINDEX_PT1H_avg,NO_PT1H_avg,QBCPM25_PT1H_AVG,PM10_PT1H_avg,O3_PT1H_avg,NO2_PT1H_avg,CO_PT1H_avg,PM25_PT1H_avg,SO2_PT1H_avg
place,hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Parainen,2022-07-31 23:00:00,,1.0,,,18.3,,,,,
Espoo Pohjois-Tapiola 2,2022-08-01 00:00:00,,1.0,0.4,,6.1,,7.7,,3.1,
Kuopio Haminalahti,2022-08-01 11:00:00,1.6,1.0,,,,,,,,
Rauma Sinisaari,2022-08-01 05:00:00,0.5,1.0,,,,,,,,0.7
Naantali keskusta Asematori,2022-08-01 03:00:00,,1.0,,,5.2,,8.2,,,-0.2


We'll attach the location metadata for the measurement points to the DataFrame.

In [6]:
df.attrs.update({'location_metadata': obs.location_metadata})

And finally, save it.

In [7]:
df.to_parquet('data/airquality.parquet', compression='brotli')

As Parquet, a single day's data will take about 20 kB, 
while a compressed JSON file will be around 14 kB.

In [9]:
with gzip.open('data/airquality.json.gz', 'wt', encoding='utf-8') as f:
    out = {'data': obs.data, 'location_metadata': obs.location_metadata}
    json.dump(out, f, default=str)