# Requires a premium AccuWeather account (a 14-day trial is available)

In [1]:
# load modules
import itertools
import json
import os
import xml.etree.ElementTree as ET

import lxml.html as lh
import pandas as pd
import requests
from requests.auth import HTTPBasicAuth

In [2]:
# prompt for the account password without echoing it to the screen
import getpass

password = getpass.getpass(prompt='Password: ')

········


In [3]:
# set payload information to login
payload = {
    'username': 'narquette',
    'password': password,
}

# store login url
login_url = "https://wwwl.accuweather.com/authenticate.php"

# create a login session and keep it open: the historical-data requests
# further down reuse `session`, so it must not be closed by a `with` block
session = requests.Session()
post = session.post(login_url, data=payload)

# fail right away if the login request itself came back with an HTTP error
post.raise_for_status()

In [4]:
# specific city to use in the historical weather search
city = 'ILHA+DAS+CAIEIRAS'

# accumulator for every parsed and cleaned table row
data_clean_all = []

# dataframe column labels, grouped by topic (order matters: it has to match
# the order of the values in each cleaned row)
# NOTE: 'Rec Year' appears twice, once after 'Rec High' and once after 'Rec Low'
columns = (
    ['WeatherYear', 'WeatherMonth', 'WeatherDay']
    + ['Actual High', 'Actual Low', 'Actual Avg']
    + ['Normal High', 'Normal Low', 'Normal Avg']
    + ['Actual - Normal Avg Difference']
    + ['Rec High', 'Rec Year', 'Rec Low', 'Rec Year']
    + ['Precipitation Amount', 'Snow Amount', 'Snow Ground']
    + ['Heat Degree Day', 'Cool Degree Day']
)

# loop through years from 2016 to 2019
for year in range(2016, 2020):

    # loop through months 1 (Jan) to 12 (Dec)
    for month in range(1, 13):

        # build url for getting historical data
        requests_url = f"https://premiuma.accuweather.com/pro/past-months.asp?display=0&month_select={month}&year_select={year}&location={city}"

        # fetch the page with the login session; stop on an HTTP error
        # instead of silently parsing an error page into zero rows
        r = session.get(requests_url)
        r.raise_for_status()

        # parse the html document
        root = lh.fromstring(r.content)

        # loop through every element whose id is 'dataTable'
        for t in root.findall(".//*[@id='dataTable']"):

            # get the data for each row; the first child is skipped
            # (presumably the header row - TODO confirm against the page)
            for row in t[1:]:
                # take every cell except the last one, and strip the
                # surrounding whitespace / line breaks; a cell without any
                # text has .text == None, so fall back to an empty string
                data_clean = [(cell.text or '').strip() for cell in row[:-1]]

                # prefix the row with its year and month, then store it
                data_clean_all.append([year, month] + data_clean)

# build the dataframe from the accumulated rows (data_clean_all)
df = pd.DataFrame.from_records(data_clean_all, columns=columns)

# record-temperature and snow columns are not used afterwards: drop them
unused_columns = [
    'Rec High', 'Rec Year', 'Rec Low', 'Rec Year', 'Snow Amount', 'Snow Ground'
]
df = df.drop(columns=unused_columns)

#add columns to classify 12-hour rain amounts
#https://www.weather.gov/jetstream/mos_max
#0 - No accumulation
#1 - 0.01" to 0.09" (0.3 mm to 2 mm)
#2 - 0.10" to 0.24" (3 mm to 6 mm)
#3 - 0.25" to 0.49" (6 mm to 12 mm)
#4 - 0.50" to 0.99" (13 mm to 25 mm)
#5 - 1.00" to 1.99" (25 mm to 50 mm)
#6 - ≥2.00" (≥51 mm)


def rain_classify(rain_amount):
    """Return the rain class (0-6) for a precipitation amount in inches.

    Classes:
        0 - no accumulation (amount of zero or less)
        1 - up to 0.09"
        2 - up to 0.24"
        3 - up to 0.49"
        4 - up to 0.99"
        5 - up to 1.99"
        6 - anything above 1.99"

    Args:
        rain_amount: precipitation amount as a number.

    Returns:
        The integer class; each class upper bound is inclusive.
    """
    # class 0 has to be checked first, otherwise a dry day lands in class 1
    if rain_amount <= 0:
        return 0

    # inclusive upper bounds of classes 1 to 5, in ascending order
    upper_bounds = (.09, .24, .49, .99, 1.99)
    for rain_type, upper_bound in enumerate(upper_bounds, start=1):
        if rain_amount <= upper_bound:
            return rain_type

    # above every bound: heaviest class
    return 6


# add rain classification column to the dataframe
# (only one column is needed, so map over that column instead of applying a
# function to every full row; each value is converted to float first)
df['Rain Classification'] = df['Precipitation Amount'].map(
    lambda amount: rain_classify(float(amount)))

# write the dataframe to a pipe-delimited file, without the index column
# (sep and index are arguments of to_csv, not of os.path.join)
output_path = os.path.join('../',
                           '../',
                           '../',
                           'Data',
                           'Preprocessing',
                           'BrazilWeatherData_Vitario_2016_2019.csv')
df.to_csv(output_path, sep='|', index=False)