# Reading AERONET Data

### Extract Date Range

For a given station, scrape the AERONET website to determine the date range over which data are available.

### Read Data

For a given site, read the data by following the steps:

1- Set the parameters as provided in:
             https://aeronet.gsfc.nasa.gov/print_web_data_help_v3.html

2- Use Requests to access the page where the data are located.

3- Use BeautifulSoup to parse the webpage containing the data.

4- Dump the content of the data in a CSV file.

5- Use Pandas to read the CSV file into a Pandas DataFrame

#### Authors
Mei Han, Jules Kouatchou, Dana R. Louie, Xiaomei Lu, 
Brian E. Magill, Carlos Ordaz, Xiaohua Pan, Nazma I. Syeda, and Yujie Wang

In [1]:
import sys
import datetime
import pprint
import requests as reqs
import bs4
from bs4 import BeautifulSoup as bso
import pandas as pd

# Report the versions of the third-party packages this notebook relies on.
# Labels are left-justified to a common width so the version numbers line up.
print("\n".join(
    f"{name + ' version:':<22} {module.__version__}"
    for name, module in (("Requests", reqs),
                         ("BeautifulSoup", bs4),
                         ("Pandas", pd))
))

Requests version:      2.26.0
BeautifulSoup version: 4.10.0
Pandas version:        1.3.2


## <font color="red">Extract Date Range for Available Data </font>

In [2]:
def extract_form_fields(soup):
    """
      Parse the content of a BeautifulSoup object
      (containing a web page with a form) to
      create a dictionary with the default values of the form.

      Input Parameter:
        - soup: BeautifulSoup object

      Returned Value:
        - A dictionary mapping the name of each form control to its
          default value:
            * input (text, hidden, password, submit, image): the content
              of the "value" attribute, or '' when there is none.
            * input (checkbox, radio): the value of the checked control
              ('on' when it is checked but has no "value" attribute),
              or '' when no control with that name is checked.
            * textarea: its text, or ''.
            * select: the value of the selected option (the first option
              when none is marked as selected); a list of values when
              the select has the "multiple" attribute.
          Controls without a "name" attribute are skipped.

      Raised Exception:
        - AssertionError if an input has an unsupported type or if a
          single-choice select has more than one selected option.

      This function was adapted from the one shown in:

            https://gist.github.com/simonw/104413
    """
    fields = dict()

    single_value_types = ('text', 'hidden', 'password', 'submit', 'image')
    toggle_types = ('checkbox', 'radio')

    for field in soup.find_all('input'):
        # Attributes must be tested with has_attr(): the "in" operator
        # applied to a tag looks at its children, not at its attributes.
        if not field.has_attr('name'):
            # A control without a name cannot be stored under a key.
            continue
        name = field['name']

        # HTML treats a missing "type" attribute as a text input.
        field_type = field.get('type', 'text').lower()

        # Single element name/value fields
        if field_type in single_value_types:
            fields[name] = field['value'] if field.has_attr('value') else ''
            continue

        # Checkboxes and radios
        if field_type in toggle_types:
            value = ''
            if field.has_attr('checked'):
                value = field['value'] if field.has_attr('value') else 'on'

            # Several controls may share one name (a radio group):
            # record the checked value, and do not let an unchecked
            # control overwrite a value that was already recorded.
            if value or name not in fields:
                fields[name] = value
            continue

        # Raised explicitly (rather than with an assert statement) so
        # that the check is still performed when Python runs with -O.
        raise AssertionError(f"Input type {field_type} not supported")

    # textareas
    for textarea in soup.find_all('textarea'):
        if not textarea.has_attr('name'):
            continue
        fields[textarea['name']] = textarea.string or ''

    # select fields
    for select in soup.find_all('select'):
        if not select.has_attr('name'):
            continue

        options = select.find_all('option')
        selected_options = [
            option for option in options
            if option.has_attr('selected')
        ]

        # If no option is marked as selected, go with the first one
        if not selected_options and options:
            selected_options = options[:1]

        if select.has_attr('multiple'):
            value = [option['value'] for option in selected_options]
        else:
            if len(selected_options) > 1:
                raise AssertionError(
                    f"Select {select['name']} has several selected options"
                )
            value = selected_options[0]['value'] if selected_options else ''

        fields[select['name']] = value

    return fields

In [3]:
def get_station_http_address(station_name):
    """
       Given a station name, this function constructs
       the AERONET webpage address associated with the station.
       The content of the webpage is a form where parameter
       default values are set.

       Input Parameter:
          - station_name (str): name of the station

       Returned value:
          - The AERONET http address of the station webpage.

       Raised Exceptions:
          - ValueError if station_name is empty, if no link of the
            AERONET entry page contains station_name, or if the
            matching link has no query string.
          - requests.HTTPError if the entry page answers with an
            error status.

       Note: the station is identified with a substring test, so the
       first link (in page order) containing station_name is used.
    """
    if not station_name:
        # An empty string is contained in every link and would silently
        # select the first link of the page.
        raise ValueError("station_name must be a non-empty string")

    # Set the reference link for AERONET
    entry_link = "https://aeronet.gsfc.nasa.gov/cgi-bin/webtool_aod_v3"

    # Access the main webpage and extract its content.
    # Requests waits forever by default: bound the wait with a timeout.
    source = reqs.get(entry_link, timeout=60)
    source.raise_for_status()
    mysoup = bso(source.text, 'html.parser')

    # Search all the href HTML tags that have a visible text
    links_with_text = (
        a['href'] for a in mysoup.find_all('a', href=True) if a.text
    )

    # Identify the first href tag that has the location name
    station_link = next(
        (link for link in links_with_text if station_name in link), None
    )
    if station_link is None:
        raise ValueError(
            f"No AERONET link found for station '{station_name}'"
        )

    # Keep the query string only (from the first "?" to the end)
    query_start = station_link.find("?")
    if query_start < 0:
        raise ValueError(
            f"The link found for station '{station_name}' "
            f"has no query string: {station_link}"
        )
    query = station_link[query_start:]

    # Reconstruct part of the link: every non-ASCII character is turned
    # into "?" and the resulting "?ion" is restored to "&region".
    # NOTE(review): presumably the HTML parser reads the "&reg" of
    # "&region" as the registered-sign entity -- confirm.
    query = query.encode('ascii', 'replace').decode().replace("?ion", "&region")

    # Full http address of the station
    return ''.join([entry_link, query])

In [4]:
def get_date_range(station_name):
    """
       Determine the period over which data are available
       for a station, as advertised by the default values of
       the form found on the station's AERONET webpage.

       Input Parameter:
          - station_name (str): name of the site

       Returned Value:
          - Two lists of strings, [year, month, day], for the
            beginning date and the ending date, respectively.
            The year is reduced to its last two characters; the
            month and the day are zero-padded to two characters.
    """
    # Download the station page and collect the form default values
    station_page = reqs.get(get_station_http_address(station_name))
    form_defaults = extract_form_fields(bso(station_page.text, 'html.parser'))

    def short_year(key):
        # Last two characters of the year (e.g. "2002" -> "02")
        return form_defaults[key][-2:]

    def two_chars(key):
        # Left-pad with zeros (e.g. "1" -> "01")
        return form_defaults[key].zfill(2)

    begin = [short_year('year'), two_chars('month'), two_chars('day')]
    end = [short_year('year2'), two_chars('month2'), two_chars('day2')]

    return begin, end

#### Example:

In [5]:
# Look up the period for which the "Andenes" station has data.
station_name = "Andenes"
bdate, edate = get_date_range(station_name)

# Each date is a [year, month, day] list of strings.
print(f"Beg Date (year/month/day): {'/'.join(bdate)}")
print(f"End Date (year/month/day): {'/'.join(edate)}")

Beg Date (year/month/day): 02/01/01
End Date (year/month/day): 21/12/31


## <font color="red">Read Data </font>

In [6]:
def read_aeronet_station(station_name, beg_date, end_date):
    """
      Given a station name and a range of date, returns a Pandas
      DataFrame containing the AERONET data at the station
      within the provided date range.

      Input Parameters:
         - station_name: (str) name of the station
         - beg_date: (list) beginning date ([yyyy, mm, dd])
         - end_date: (list) end date ([yyyy, mm, dd])

      Returned Value:
         - Pandas DataFrame containing the data, indexed by a
           "datetime" index built from the date and time columns
           of the file (which are removed from the columns).
         - None (after printing a message) when the server
           answers with an HTML page instead of data.

      The meaning of the request parameters is described in:
             https://aeronet.gsfc.nasa.gov/print_web_data_help_v3.html
    """
    # Local import: only this function needs it.
    import io

    base_url = "https://aeronet.gsfc.nasa.gov/cgi-bin/print_web_data_v3"

    # Set parameters for the station
    YEAR_INDEX = 0
    MONTH_INDEX = 1
    DAY_INDEX = 2

    payload = {
        "site": station_name,
        "year": beg_date[YEAR_INDEX],
        "month": beg_date[MONTH_INDEX],
        "day": beg_date[DAY_INDEX],
        "year2": end_date[YEAR_INDEX],
        "month2": end_date[MONTH_INDEX],
        "day2": end_date[DAY_INDEX],
        # Per the AERONET help page: Level 2.0 aerosol optical depth,
        # daily averages, output without HTML formatting.
        "AOD20": 1,
        "AVG": 20,
        "if_no_html": 1
        }

    # Connect to the station webpage and check if it exists.
    # Requests waits forever by default: bound the wait with a timeout.
    response = reqs.get(base_url, params=payload, timeout=120)

    # With if_no_html=1, data do not come as HTML: an HTML answer
    # is treated as a failure.
    if "html" in response.headers.get('content-type', ''):
        print(" The url: ")
        print(f"    --> {response.url} ")
        print(" Is not reachable. Please check your settings.")
        return None

    # Use Pandas to read the data from the text already downloaded
    # (handing the url to Pandas would download everything again).
    df = pd.read_csv(io.StringIO(response.text), skiprows=5, na_values=-999)

    # Columns 1 and 2 hold the date (dd:mm:yyyy) and the time (hh:mm:ss).
    # They are combined explicitly because the read_csv options that do
    # this (date_parser, squeeze) are not available in recent Pandas.
    date_col, time_col = df.columns[1], df.columns[2]
    timestamps = pd.to_datetime(
        df[date_col].str.cat(df[time_col], sep=' '),
        format='%d:%m:%Y %H:%M:%S'
    )

    df = df.drop(columns=[date_col, time_col])
    df = df.set_index(timestamps.rename('datetime'))

    return df

#### Example:

In [7]:
# Read the GSFC station data for the calendar year 2010.
station_name = 'GSFC'
beg_date, end_date = [2010, 1, 1], [2010, 12, 31]

df = read_aeronet_station(station_name, beg_date, end_date)

# Last expression of the cell: displays the DataFrame in the notebook.
df

Unnamed: 0_level_0,AERONET_Site,Day_of_Year,AOD_1640nm,AOD_1020nm,AOD_870nm,AOD_865nm,AOD_779nm,AOD_675nm,AOD_667nm,AOD_620nm,...,N[440-675_Angstrom_Exponent],N[500-870_Angstrom_Exponent],N[340-440_Angstrom_Exponent],N[440-675_Angstrom_Exponent[Polar]],Data_Quality_Level,AERONET_Instrument_Number,AERONET_Site_Name,Site_Latitude(Degrees),Site_Longitude(Degrees),Site_Elevation(m)
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01 12:00:00,GSFC,1,0.009026,0.031369,0.044996,,,0.076400,,,...,1,1,1,0,lev20,451,GSFC,38.9925,-76.839833,87.0
2010-01-02 12:00:00,GSFC,2,0.014079,0.028317,0.037447,,,0.058369,,,...,1,1,1,0,lev20,451,GSFC,38.9925,-76.839833,87.0
2010-01-03 12:00:00,GSFC,3,0.009665,0.027027,0.036290,,,0.057466,,,...,33,33,33,0,lev20,451,GSFC,38.9925,-76.839833,87.0
2010-01-05 12:00:00,GSFC,5,0.046116,0.052239,0.059274,,,0.073843,,,...,5,5,5,0,lev20,451,GSFC,38.9925,-76.839833,87.0
2010-01-07 12:00:00,GSFC,7,0.007822,0.021023,0.028735,,,0.047633,,,...,30,30,30,0,lev20,451,GSFC,38.9925,-76.839833,87.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-12-24 12:00:00,GSFC,358,0.008217,0.013706,0.016895,,,0.023626,,,...,36,36,36,0,lev20,451,GSFC,38.9925,-76.839833,87.0
2010-12-27 12:00:00,GSFC,361,0.012578,0.017128,0.019418,,,0.026399,,,...,19,19,19,0,lev20,451,GSFC,38.9925,-76.839833,87.0
2010-12-28 12:00:00,GSFC,362,0.007933,0.014443,0.017737,,,0.024232,,,...,33,33,33,0,lev20,451,GSFC,38.9925,-76.839833,87.0
2010-12-29 12:00:00,GSFC,363,0.013678,0.031546,0.041804,,,0.064905,,,...,29,29,29,0,lev20,451,GSFC,38.9925,-76.839833,87.0
