In [38]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import lxml.html as lh
import numpy as np

In [34]:
def get_month_table(harborurl,month,year):
    if month > 9: #month as a string to use for the URL def
        month_str = str(month)
    else: 
        month_str = '0'+str(month)
    #This code (next 3 lines) is adapted from https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059
    url=harborurl+'/?tide='+year+'-'+month_str+'#monthly-tide-chart'
    #Create a handle, page, to handle the contents of the website
    page = requests.get(url)
    #Store the contents of the website under doc
    doc = lh.fromstring(page.content)
    #Parse data that are stored between <tr>..</tr> of HTML
    tr_elements = doc.xpath('//tr')
    #save the raw html rows as "table"
    table = tr_elements
    return(table)

def find_first_row(table):
    for row in range(len(table)):
        if table[row][0].text_content() == '1':
            return(row)

def isInt(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

def scrape_from_url(harborurl,year):
    #Empty df to store the data:
    df = []

    for month in range(1,13):
        month_str_display = months[month-1]
        table = get_month_table(harborurl,month,year) #get the data for this month

        #find the first row
        row = find_first_row(table)

        #while the first entry of the row is less than the number of days in the month
        while isInt(table[row][0].text_content()):
            row_data = ['']*number_of_columns
            for col in range(number_of_columns):
                row_data[col] = table[row][col].text_content()
            row_data.insert(0,month_str_display)
            df.append(row_data)
            row+=1
    #Clean data
    column_names = ['Month','Date','Day','high_1','height_hi_1','high_2','height_hi_2','low_1','height_low_1','low_2','height_low_2','sunrise','sunset']
    data = pd.DataFrame(df,columns=column_names)
    high1 = pd.melt(data,id_vars=['Month','Date','Day','height_hi_1'],value_vars=['high_1'],var_name='event', value_name='time')
    high2 = pd.melt(data,id_vars=['Month','Date','Day','height_hi_2'],value_vars=['high_2'],var_name='event', value_name='time')
    low1 = pd.melt(data,id_vars=['Month','Date','Day','height_low_1'],value_vars=['low_1'],var_name='event', value_name='time')
    low2 = pd.melt(data,id_vars=['Month','Date','Day','height_low_2'],value_vars=['low_2'],var_name='event', value_name='time')
    sunrise = pd.melt(data,id_vars=['Month','Date','Day'],value_vars=['sunrise'],var_name='event', value_name='time')
    sunset = pd.melt(data,id_vars=['Month','Date','Day'],value_vars=['sunset'],var_name='event', value_name='time')
    high1 = high1.rename(columns={"height_hi_1": "height"})
    high2 = high2.rename(columns={"height_hi_2": "height"})
    low1 = low1.rename(columns={"height_low_1": "height"})
    low2 = low2.rename(columns={"height_low_2": "height"})
    sunrise['height'] = np.nan
    sunset['height'] = np.nan
    data = (pd.concat([high1, high2, low1, low2, sunrise, sunset])
         .sort_index()
         .reset_index())
    data = data[data['time']!='']
    
    #Put into long data format
    for row in range(len(data)):
        event = data['event'].iloc[row]
        time = data['time'].iloc[row]
        if (event == 'high_1' or event == 'low_1' or event == 'sunrise'): #morning events
            if not re.search('PM',str(time)):
                time = time + ' AM'
            time = pd.to_datetime(time).strftime('%H:%M %p')
        if (event == 'high_2' or event == 'low_2' or event == 'sunset'): #evening events
            if not re.search('AM',str(time)):
                time = time + ' PM'
            time = pd.to_datetime(time).strftime('%H:%M %p')
        data['time'].iloc[row] = time
    data['full_date'] = data['Month']+'-'+data['Date']+'-'+year+' '+data['time']
    data['full_date'] = pd.to_datetime(data['full_date'], format='%B-%d-%Y %H:%M %p')
    data['week_num'] = np.nan
    for row in range(len(data)):
        data['week_num'].iloc[row] = data['full_date'].iloc[row].isocalendar()[1]
        if data['Day'].iloc[row] == 'Sun':
            data['week_num'].iloc[row] += 1
    data = data.sort_values(by='full_date', ascending=True)
    return data

In [35]:
mainurl = 'https://www.usharbors.com/'
page = requests.get(mainurl)
soup = BeautifulSoup(page.content, 'html.parser')
stateurls = soup.find_all('option')
stateurls = [x.get('value') for x in stateurls]
stateurls = stateurls[1:-1]
stateurls

['https://www.usharbors.com/harbor/alabama/',
 'https://www.usharbors.com/harbor/alaska/',
 'https://www.usharbors.com/harbor/california/',
 'https://www.usharbors.com/harbor/connecticut/',
 'https://www.usharbors.com/harbor/delaware/',
 'https://www.usharbors.com/harbor/florida/',
 'https://www.usharbors.com/harbor/georgia/',
 'https://www.usharbors.com/harbor/hawaii/',
 'https://www.usharbors.com/harbor/illinois/',
 'https://www.usharbors.com/harbor/indiana/',
 'https://www.usharbors.com/harbor/lake-champlain/',
 'https://www.usharbors.com/harbor/louisiana/',
 'https://www.usharbors.com/harbor/maine/',
 'https://www.usharbors.com/harbor/maryland/',
 'https://www.usharbors.com/harbor/massachusetts/',
 'https://www.usharbors.com/harbor/michigan/',
 'https://www.usharbors.com/harbor/minnesota/',
 'https://www.usharbors.com/harbor/mississippi/',
 'https://www.usharbors.com/harbor/new-hampshire/',
 'https://www.usharbors.com/harbor/new-jersey/',
 'https://www.usharbors.com/harbor/new-york

In [36]:
def get_state_harbors(stateurl):
    page = requests.get(stateurl)
    soup = BeautifulSoup(page.content, 'html.parser')
    maybeharborurls = soup.find_all('option')
    maybeharborurls = [x.get('value') for x in maybeharborurls]
    harborurls = []
    for row in maybeharborurls:
        match = re.match(r""+stateurl+"(?P<harbor>.+)/tides/",str(row))
        if match:
            harborurls.append(str(row))
    return harborurls

In [39]:
months = ['January','February','March','April','May','June','July','August','September','October','November','December']
year = '2020'
number_of_columns = 12
harborurls = get_state_harbors(stateurls[0])
if len(harborurls)>0: #some of the states don't have harbors
    df = scrape_from_url(harborurls[0],year)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 1-01-01 13:00:00

NameError: name 'data' is not defined