Source: https://www.senamhi.gob.pe/?p=calidad_del_aire-estadistica&e=112265

In [17]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h3

# Python stand-in for the JavaScript literal `null`: get_data() hands text scraped
# from the site's chart script to eval(), so a `null` entry there resolves to NaN.
null = np.nan

In [49]:
def parse_time(string):
    """Parse the raw, comma-separated timestamp list into a DatetimeIndex.

    Every field except the last is kept (the final comma-separated field is
    always discarded), single quotes are removed, and the remainder is parsed
    with the ``%d/%m/%Y%H:%M:`` format.
    """
    # split() always yields at least one field, so this unpacking never fails.
    *fields, _discarded = string.split(',')
    unquoted = []
    for field in fields:
        unquoted.append(field.replace("'", ''))
    return pd.to_datetime(unquoted, format="%d/%m/%Y%H:%M:")

In [50]:
def get_data(station_id, pollutant='N_NO2', start='01/04/2019', end='30/04/2019', timeout=30):
    """Download one station's hourly series from the SENAMHI chart endpoint.

    Parameters
    ----------
    station_id : int or str
        Value sent as the ``estacion`` query parameter.
    pollutant : str
        Value sent as the ``cont`` query parameter (default ``'N_NO2'``).
    start, end : str
        Values sent as ``f1`` / ``f2``; the defaults are written as dd/mm/yyyy.
    timeout : float
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    tuple
        ``(title, values, timestamps)``: the part of the chart title after the
        first ``': '``, a NumPy array built from the chart's ``data: [...]``
        list, and the ``categories: [...]`` list as parsed by ``parse_time``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    url = (
        'https://www.senamhi.gob.pe/site/sea/www/site/sea/graficas/dato_hora.php'
        f'?estacion={station_id}&cont={pollutant}&f1={start}&f2={end}'
    )
    resp = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    resp.raise_for_status()

    # The chart configuration lives in the first inline JavaScript block of <body>.
    script = BeautifulSoup(resp.text, 'html.parser').body.select('script[type="text/javascript"]')[0].string

    # Raw strings keep the regex escapes from being read as (invalid) string escapes.
    title = re.match(r"'(.+)'", script.split('text: ')[1])[1].split(': ')[1]
    vals = re.match(r"\[(.+)\]", script.split('data: ')[-1])[1]
    timestamps = re.match(r"\[(.+)\]", script.split('categories: ')[-1])[1]

    # NOTE(review): eval() on text fetched from a remote server is unsafe if the
    # response is ever tampered with. Builtins are stripped and only the
    # module-level `null` alias is exposed, but this is NOT a real sandbox;
    # prefer a proper parser if the payload format allows it.
    values = np.array(eval(vals, {"__builtins__": {}, "null": null}))
    return (title, values, parse_time(timestamps))

In [19]:
def convert(s):
    """Convert a degrees/minutes/seconds string to signed decimal degrees.

    The string must contain exactly four tokens: degrees, minutes, seconds and
    a hemisphere letter, e.g. ``'12°6′31.94″ S'``. South and West give negative
    values, North and East positive ones.

    Raises
    ------
    ValueError
        If the string does not split into four tokens or the hemisphere letter
        is not one of N, S, E, W (case-insensitive).
    """
    # The decimal alternative comes first so "31.94" is not split at the dot by \w+.
    tokens = [number or word for number, word in re.findall(r"(\d+\.\d+)|(\w+)", s)]
    if len(tokens) != 4:
        raise ValueError(f"expected degrees, minutes, seconds and hemisphere in {s!r}")
    deg, minutes, seconds, hemisphere = tokens

    signs = {'N': 1, 'E': 1, 'S': -1, 'W': -1}
    try:
        sign = signs[hemisphere.upper()]
    except KeyError:
        raise ValueError(f"unknown hemisphere {hemisphere!r} in {s!r}") from None

    return sign * (float(deg) + float(minutes) / 60 + float(seconds) / 60 ** 2)

In [20]:
# Station title -> (latitude, longitude). Positions are given either as signed
# decimal degrees or as degree/minute/second strings; the loop below replaces
# every pair with an H3 cell id, so afterwards `coords` maps title -> H3 id.
coords = {
    'CARABAYLLO (CRB) - Lima Norte': (-11.901921, -77.0357845),
    'ATE (ATE) - Lima Este': (-12.026121, -76.9192707),
    'SAN BORJA (SBJ) - Lima Centro': ('12°6′31.94″ S', '77°0′27.68″ W'),
    'CAMPO DE MARTE (CDM) - Lima Centro': ('12°4′13.96″ S', '77°2′35.57″ W'),
    'SANTA ANITA (STA) - Lima Este': ('12°2′34.88″ S', '76°58′17.2″ W'),
    'SAN MARTIN DE PORRES (SMP) - ': ('12°0′32″ S', '77°5′4.1″ W'),
    'HUACHIPA (HCH) - Lima Este': (-12.0173213, -76.9510424),
    'SAN JUAN DE LURIGANCHO (SJL) - Lima Este': ('11°58′53.89″ S', '76°59′57.29″ W'),
}

# Iterate over a snapshot of the keys; only existing entries are overwritten.
for station_name in list(coords):
    latitude, longitude = coords[station_name]
    # A string latitude means the whole pair is in degree/minute/second notation.
    if isinstance(latitude, str):
        latitude = convert(latitude)
        longitude = convert(longitude)
    # 9 is passed as the third argument of geo_to_h3 (the H3 resolution).
    coords[station_name] = h3.geo_to_h3(latitude, longitude, 9)

In [51]:
# Station ids to download; each one is passed to get_data() as its station_id.
stations = [
    111286, 111287, 112192, 112193, 112194,
    112208, 112233, 112265, 112266, 112267,
]
# One get_data() result per station, in the same order as `stations`.
data = list(map(get_data, stations))

In [67]:
# Flatten the per-station series into one row per reading. Stations whose title
# has no entry in `coords` are skipped; zip() stops at the shorter of the value
# and timestamp sequences.
rows = []
for title, no2_values, times in data:
    if title not in coords:
        continue
    cell_id = coords[title]
    rows.extend((cell_id, stamp, value) for value, stamp in zip(no2_values, times))

df = pd.DataFrame(rows, columns=['h3id', 'time', 'NO2'])

# Convert ug/m3 to ppb.
# The conversion assumes an ambient pressure of 1 atmosphere and a temperature of 25 degrees Celsius.
# https://www2.dmu.dk/atmosphericenvironment/expost/database/docs/ppm_conversion.pdf
df['NO2'] = df['NO2'].div(1.88)
df.to_csv('y_data.csv', index=False)