# Convert FMI air quality data to JSON and Parquet

## ... from XML and WFS

In [2]:
from fmiopendata.wfs import download_stored_query
import pandas as pd
import numpy as np
import datetime

Let's download some [open air quality data](https://en.ilmatieteenlaitos.fi/open-data-manual-fmi-wfs-services) first.

In [32]:
# Stored-query id passed to download_stored_query in the request cell.
query = 'urban::observations::airquality::hourly::multipointcoverage'
# Current UTC time as a *naive* datetime. The request cell appends a literal
# 'Z' to isoformat(), so the value must not carry a '+00:00' offset.
# datetime.utcnow() is deprecated since Python 3.12, hence now(timezone.utc)
# with the tzinfo stripped again.
end_time = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
# The request window is the seven days leading up to end_time.
start_time = end_time - datetime.timedelta(days=7)

The cell above requests the past seven days. When fetching a whole month, the query will take a few minutes.

In [37]:
# Time window of the request, formatted as ISO 8601 with a trailing 'Z'.
time_window = [
    f"starttime={start_time.isoformat(timespec='seconds')}Z",
    f"endtime={end_time.isoformat(timespec='seconds')}Z",
]
# 'timeseries=True' goes first, followed by the start and end of the window.
args = ['timeseries=True', *time_window]
obs = download_stored_query(query, args=args)

Then we'll pull out a complete list of all possible measurements.

In [40]:
# Union of the keys reported by any station, minus 'times' (the timestamp
# list, which becomes the index rather than a column).
# sorted() makes the column order reproducible between runs; the iteration
# order of a set of strings is not. The set difference also avoids a
# ValueError when no station carries a 'times' key.
cols = sorted({v for p in obs.data for v in obs.data[p]} - {'times'})

And turn them into DataFrames, which we'll then merge into a single one.

In [42]:
# Build one frame per station, indexed by (place, hour), then stack them.
dfs = []
for name in obs.data:
    series = obs.data[name]
    # `cols` is the union over all stations, so a given station may lack some
    # of them. Skip the absent keys here; passing `columns=cols` below makes
    # pandas add those columns filled with NaN instead of raising KeyError.
    data = {k: series[k]['values'] for k in cols if k in series}
    mi = pd.MultiIndex.from_product([[name], series['times']], names=['place', 'hour'])
    # Use a distinct name so the per-station frame does not shadow the final `df`.
    station_df = pd.DataFrame(data=data, index=mi, columns=cols, dtype='float64')
    dfs.append(station_df)
df = pd.concat(dfs)
df.sample(5)

PM10_PT1H_avg       float64
CO_PT1H_avg         float64
NO2_PT1H_avg        float64
AQINDEX_PT1H_avg    float64
QBCPM25_PT1H_AVG    float64
TRSC_PT1H_avg       float64
O3_PT1H_avg         float64
NO_PT1H_avg         float64
SO2_PT1H_avg        float64
PM25_PT1H_avg       float64
dtype: object

We'll attach the location metadata for the measurement points to the DataFrame.

In [23]:
# Keep the station metadata on the frame itself, under 'location_metadata'.
df.attrs['location_metadata'] = obs.location_metadata

And finally, save it.

In [24]:
# Write next to the other exports: the JSON and XML dumps and the directory
# listing all use data/, not ../data/.
df.to_parquet('data/airquality.parquet')

As Parquet, one month's worth of data takes about 200 kB,
while a gzipped JSON file is around 400 kB.

The original XML file was about 6.5 MB, gzipped 500 kB.

In [8]:
import json
import gzip

# Dump the raw observations and the station metadata as gzip-compressed JSON.
with gzip.open('data/airquality.json.gz', 'wt', encoding='utf-8') as json_file:
    out = {
        'data': obs.data,
        'location_metadata': obs.location_metadata,
    }
    # default=str stringifies any value the JSON encoder cannot handle natively.
    json.dump(out, json_file, default=str)

In [9]:
import defusedxml.ElementTree as ET

# Serialize the XML tree held by the query result and store it gzip-compressed.
with gzip.open('data/airquality.xml.gz', 'w') as xml_file:
    raw_xml = ET.tostring(obs._xml)
    xml_file.write(raw_xml)

In [10]:
# List the exported air quality files in long format to compare their sizes.
!ls -Fl data/air*

-rw-rw-r-- 1 mikael mikael  70539 elo     6 23:45 data/airquality.json.gz
-rw-rw-r-- 1 mikael mikael  66922 elo     6 23:45 data/airquality.parquet
-rw-rw-r-- 1 mikael mikael 130650 elo     6 23:45 data/airquality.xml.gz
