In [None]:
import os
import time
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

# Default parameters

In [None]:
# Pauses, in seconds, handed to time.sleep() by the crawl loop:
# CRAWL_DELAY after every month, RETRY_DELAY after a failed request.
CRAWL_DELAY, RETRY_DELAY = 3, 15

# Calendar page URL; get_data() appends the date string (MM/1/YYYY) to it.
# NOTE(review): stationid=1 is presumably the station behind the 'eug-or'
# output files -- confirm before reusing for another location.
BASE_URL = 'http://pollen.aaaai.org/nab/index.cfm?p=AllergenCalendar&stationid=1&qsFullDate='

# CSS class found on a calendar <span> -> ordinal severity (0..4).
# NOTE(review): 'l'/'m'/'h'/'v' presumably abbreviate low/moderate/high/
# very high -- confirm against the site's legend.
LEVELS = {
    css_class: rank
    for rank, css_class in enumerate(('absent', 'l', 'm', 'h', 'v'))
}

# Columns kept in the exported CSV files.
cols = [
    'trees',
    'weeds',
    'grass',
]

# Function for fetching data from aaaai.org

In [None]:
def get_data(year, month):
    """Download and parse one month of the allergen calendar.

    Requests ``BASE_URL`` with the date string ``MM/1/YYYY`` appended, then
    reads every element carrying the CSS class ``nabCalendarDate``. For each
    such cell the day number is taken from its ``<a>`` element, and each
    ``<span>`` whose lower-cased text names a category (trees, weeds, grass,
    molds) contributes a level looked up in ``LEVELS`` by the span's first
    CSS class.

    Rows are built one day at a time, so a day that lacks a span for some
    category gets NaN in that column instead of shifting later days' values
    or making the columns unequal in length. If a category appears more than
    once in a cell, the last occurrence wins.

    Parameters
    ----------
    year : int
        Four-digit year.
    month : int
        Month number (1-12).

    Returns
    -------
    pandas.DataFrame
        One row per calendar cell, with columns
        ``year, month, day, trees, weeds, grass, molds``.

    Raises
    ------
    urllib.error.HTTPError, urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    KeyError
        If a category span carries a CSS class that is not in ``LEVELS``.
    """
    date_str = '{:02d}/1/{:04d}'.format(month, year)
    url = BASE_URL + date_str
    with urlopen(url) as r:
        html = r.read()
    bs = BeautifulSoup(html, 'html.parser')

    categories = ['trees', 'weeds', 'grass', 'molds']
    records = []
    for cell in bs.find_all(attrs={'class': 'nabCalendarDate'}):
        # Start every category as missing so each row has the same keys.
        record = dict.fromkeys(categories, float('nan'))
        record['day'] = int(cell.find('a').text)
        for span in cell.find_all('span'):
            category = span.text.strip().lower()
            # Skip unrelated spans *before* touching LEVELS, so decorative
            # markup cannot trigger a KeyError.
            if category not in categories:
                continue
            record[category] = LEVELS[span['class'][0]]
        records.append(record)

    # Explicit columns keep the frame well-formed even when no cells matched.
    df = pd.DataFrame(records, columns=['day'] + categories)
    df['month'] = month
    df['year'] = year
    return df[['year', 'month', 'day'] + categories]

# Fetch data for a single month

In [None]:
# Month to download; edit these two values to fetch a different month.
month = 4
year = 2019

df = get_data(year, month)

# Index by calendar date, assembled directly from the year/month/day columns.
df.index = pd.to_datetime(df[['year', 'month', 'day']])

# Keep only the categories listed in `cols`.
df = df[cols]

# Create the output directory first: to_csv() does not create missing
# parent directories, so a fresh checkout without data/ would fail here.
out_path = 'data/eug-or-{year}-{month:02d}.csv'.format(year=year, month=month)
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df.to_csv(out_path)

# Fetch data for range of months/years

In [None]:
# Upper bound on attempts per month. Without it a permanent failure (for
# example a month the server always rejects) would retry forever and the
# notebook would never finish.
MAX_ATTEMPTS = 5

df_list = []         # one DataFrame per successfully fetched month
failed_months = []   # (year, month) pairs abandoned after MAX_ATTEMPTS tries

for year in range(2001, 2021):
    for month in range(1, 13):
        print(year, month)
        df_ = None
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                df_ = get_data(year, month)
            except (HTTPError, URLError) as err:
                if attempt == MAX_ATTEMPTS:
                    print("giving up after {} attempts: {}".format(MAX_ATTEMPTS, err))
                    break
                print("failed... retrying in {} seconds".format(RETRY_DELAY))
                time.sleep(RETRY_DELAY)
            else:
                break

        if df_ is None:
            failed_months.append((year, month))
        else:
            df_list.append(df_)

        # Pause between months so the crawl stays gentle on the server.
        time.sleep(CRAWL_DELAY)

if failed_months:
    print("months skipped after repeated failures:", failed_months)

In [None]:
# Stack the monthly frames into one table with a fresh 0..n-1 index.
df = pd.concat(df_list, axis=0).reset_index(drop=True)

# Index by calendar date, assembled directly from the year/month/day columns.
df.index = pd.to_datetime(df[['year', 'month', 'day']])

# Keep only the categories listed in `cols`.
df = df[cols]

# Create the output directory first: to_csv() does not create missing
# parent directories, so a fresh checkout without data/ would fail here.
out_path = 'data/eug-or.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df.to_csv(out_path)