In [1]:
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
import pandas as pd
import time

# Default parameters

In [2]:
# Pause lengths handed to time.sleep(): CRAWL_DELAY between successive month
# requests, RETRY_DELAY before re-trying a request that failed.
CRAWL_DELAY, RETRY_DELAY = 3, 15

# Calendar endpoint; get_data() appends the requested date string to it.
BASE_URL = 'http://pollen.aaaai.org/nab/index.cfm?p=AllergenCalendar&stationid=1&qsFullDate='

# Integer code for each span class name found in a calendar cell.
# NOTE(review): 'l'/'m'/'h'/'v' presumably stand for low/moderate/high/very
# high -- confirm against the site's legend.
LEVELS = dict(absent=0, l=1, m=2, h=3, v=4)

# Columns kept when the data is written to CSV ('molds' is not included).
cols = 'trees weeds grass'.split()

# Function for fetching data from aaaai.org

In [3]:
def get_data(year, month):
    """Fetch one month of the allergen calendar and return it as a DataFrame.

    Requests ``BASE_URL`` with the first day of the month appended
    (``MM/1/YYYY``), parses the response with BeautifulSoup and produces one
    row per element whose class is ``nabCalendarDate``.

    Parameters
    ----------
    year : int
        Four-digit year to request.
    month : int
        Month number to request.

    Returns
    -------
    pandas.DataFrame
        Columns ``year, month, day, trees, weeds, grass, molds`` in that
        order, one row per calendar cell. The category columns hold the
        integer codes from ``LEVELS``. A category that has no span in a cell
        is left as a missing value for that day, so the remaining values
        stay on the correct row (a column containing missing values is no
        longer integer-typed).

    Raises
    ------
    KeyError
        If a span for one of the four categories carries a first class name
        that is not a key of ``LEVELS``.

    Errors raised by ``urlopen`` are not handled here and propagate to the
    caller.
    """
    categories = ['trees', 'weeds', 'grass', 'molds']

    date_str = '{:02d}/1/{:04d}'.format(month, year)
    # Read the payload inside the context manager, parse after the
    # connection has been released.
    with urlopen(BASE_URL + date_str) as r:
        html = r.read()
    bs = BeautifulSoup(html, 'html.parser')

    # Build one record per calendar cell instead of independent per-column
    # lists: a cell that lacks a category can then no longer shift later
    # values onto the wrong day or leave the columns with unequal lengths.
    rows = []
    for date in bs.find_all(attrs={'class': 'nabCalendarDate'}):
        row = {'day': int(date.find('a').text)}
        row.update(dict.fromkeys(categories))  # every category starts missing
        for span in date.find_all('span'):
            text = span.text.lower()
            # Look the level up only for spans that name a category, so an
            # unrelated span in the cell cannot raise KeyError.
            if text in categories:
                row[text] = LEVELS[span['class'][0]]
        rows.append(row)

    # Passing the column list keeps the schema stable even when no calendar
    # cells were found.
    df = pd.DataFrame(rows, columns=['day'] + categories)
    df['month'] = month
    df['year'] = year
    return df[['year', 'month', 'day'] + categories]

# Fetch data for a single month

In [24]:
# Month to download; the same two values also name the output file.
year, month = 2019, 4

# Fetch the month, index it by calendar date (year/month/day packed into a
# YYYYMMDD integer and parsed), then keep only the columns listed in `cols`.
df = get_data(year, month)
df = df.set_index(
    pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, format='%Y%m%d')
)[cols]

out_name = 'data-{year}-{month:02d}.csv'.format(year=year, month=month)
df.to_csv(out_name)

# Fetch data for a range of months/years

In [15]:
# Maximum number of retries per month. Without a cap, a permanent failure
# (for example an HTTP error for a month the site does not serve) would keep
# this cell retrying forever.
MAX_RETRIES = 5

# One DataFrame per successfully fetched month, in chronological order.
df_list = []
for year in range(2001, 2020):
    for month in range(1, 13):
        print(year, month)
        for attempt in range(MAX_RETRIES + 1):
            try:
                df_ = get_data(year, month)
            except (HTTPError, URLError):
                if attempt == MAX_RETRIES:
                    # Out of retries: surface the error. The months already
                    # collected remain available in df_list.
                    raise
                print("failed... retrying in {} seconds".format(RETRY_DELAY))
                time.sleep(RETRY_DELAY)
            else:
                break
        df_list.append(df_)
        # Pause between requests so the server is not hit back-to-back.
        time.sleep(CRAWL_DELAY)

2001 1
2001 2
2001 3
2001 4
2001 5
2001 6
2001 7
2001 8
2001 9
2001 10
2001 11
2001 12
2002 1
2002 2
failed... retrying in 15 seconds
2002 3
failed... retrying in 15 seconds
2002 4
failed... retrying in 15 seconds
2002 5
2002 6
2002 7
2002 8
2002 9
2002 10
2002 11
failed... retrying in 15 seconds
2002 12
2003 1
failed... retrying in 15 seconds
2003 2
2003 3
2003 4
2003 5
2003 6
2003 7
2003 8
2003 9
2003 10
failed... retrying in 15 seconds
2003 11
2003 12
2004 1
2004 2
2004 3
2004 4
2004 5
2004 6
2004 7
2004 8
2004 9
2004 10
2004 11
2004 12
2005 1
2005 2
2005 3
2005 4
failed... retrying in 15 seconds
2005 5
2005 6
2005 7
2005 8
2005 9
2005 10
2005 11
2005 12
2006 1
2006 2
2006 3
2006 4
2006 5
2006 6
2006 7
2006 8
2006 9
2006 10
2006 11
failed... retrying in 15 seconds
2006 12
2007 1
2007 2
2007 3
2007 4
2007 5
2007 6
2007 7
2007 8
2007 9
2007 10
2007 11
2007 12
2008 1
2008 2
2008 3
2008 4
2008 5
2008 6
2008 7
2008 8
failed... retrying in 15 seconds
2008 9
2008 10
2008 11
2008 12
2009 1


In [16]:
# Stack the monthly frames into one table with a fresh 0..n-1 row index.
df = pd.concat(df_list, ignore_index=True)

# Index by calendar date (year/month/day packed into a YYYYMMDD integer and
# parsed), keep only the columns listed in `cols`, and write the result.
df = df.set_index(
    pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, format='%Y%m%d')
)[cols]
df.to_csv('eug-or.csv')