In [1]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
import numpy as np
from dash.dependencies import Input, Output
import datetime
import requests
import lxml.html as lh
import datetime
import re
import plotly.offline as py
py.init_notebook_mode(connected=True)

In [2]:
def get_month_table(state,location,month,year):
    if month > 9: #month as a string to use for the URL def
        month_str = str(month)
    else: 
        month_str = '0'+str(month)
    #This code (next 3 lines) is adapted from https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059
    url='https://www.usharbors.com/harbor/'+state+'/'+location+'/tides/?tide='+year+'-'+month_str+'#monthly-tide-chart'
    #Create a handle, page, to handle the contents of the website
    page = requests.get(url)
    #Store the contents of the website under doc
    doc = lh.fromstring(page.content)
    #Parse data that are stored between <tr>..</tr> of HTML
    tr_elements = doc.xpath('//tr')
    #save the raw html rows as "table"
    table = tr_elements
    return(table)

def find_first_row(table):
    for row in range(len(table)):
        if table[row][0].text_content() == '1':
            return(row)

def isInt(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

def scrape_from_url(state,location):
    #Empty df to store the data:
    df = []

    for month in range(1,13):
        month_str_display = months[month-1]
        table = get_month_table(state,location,month,year) #get the data for this month

        #find the first row
        row = find_first_row(table)

        #while the first entry of the row is less than the number of days in the month
        while isInt(table[row][0].text_content()):
            row_data = ['']*number_of_columns
            for col in range(number_of_columns):
                row_data[col] = table[row][col].text_content()
            row_data.insert(0,month_str_display)
            df.append(row_data)
            row+=1
    #Clean data
    column_names = ['Month','Date','Day','high_1','height_hi_1','high_2','height_hi_2','low_1','height_low_1','low_2','height_low_2','sunrise','sunset']
    data = pd.DataFrame(df,columns=column_names)
    high1 = pd.melt(data,id_vars=['Month','Date','Day','height_hi_1'],value_vars=['high_1'],var_name='event', value_name='time')
    high2 = pd.melt(data,id_vars=['Month','Date','Day','height_hi_2'],value_vars=['high_2'],var_name='event', value_name='time')
    low1 = pd.melt(data,id_vars=['Month','Date','Day','height_low_1'],value_vars=['low_1'],var_name='event', value_name='time')
    low2 = pd.melt(data,id_vars=['Month','Date','Day','height_low_2'],value_vars=['low_2'],var_name='event', value_name='time')
    sunrise = pd.melt(data,id_vars=['Month','Date','Day'],value_vars=['sunrise'],var_name='event', value_name='time')
    sunset = pd.melt(data,id_vars=['Month','Date','Day'],value_vars=['sunset'],var_name='event', value_name='time')
    high1 = high1.rename(columns={"height_hi_1": "height"})
    high2 = high2.rename(columns={"height_hi_2": "height"})
    low1 = low1.rename(columns={"height_low_1": "height"})
    low2 = low2.rename(columns={"height_low_2": "height"})
    sunrise['height'] = np.nan
    sunset['height'] = np.nan
    data = (pd.concat([high1, high2, low1, low2, sunrise, sunset])
         .sort_index()
         .reset_index())
    data = data[data['time']!='']
    
    #Put into long data format
    for row in range(len(data)):
        event = data['event'].iloc[row]
        time = data['time'].iloc[row]
        if (event == 'high_1' or event == 'low_1' or event == 'sunrise'): #morning events
            if not re.search('PM',str(time)):
                time = time + ' AM'
            time = pd.to_datetime(time).strftime('%H:%M %p')
        if (event == 'high_2' or event == 'low_2' or event == 'sunset'): #evening events
            if not re.search('AM',str(time)):
                time = time + ' PM'
            time = pd.to_datetime(time).strftime('%H:%M %p')
        data['time'].iloc[row] = time
    data['full_date'] = data['Month']+'-'+data['Date']+'-'+year+' '+data['time']
    data['full_date'] = pd.to_datetime(data['full_date'], format='%B-%d-%Y %H:%M %p')
    data['week_num'] = np.nan
    for row in range(len(data)):
        data['week_num'].iloc[row] = data['full_date'].iloc[row].isocalendar()[1]
        if data['Day'].iloc[row] == 'Sun':
            data['week_num'].iloc[row] += 1
    data = data.sort_values(by='full_date', ascending=True)
    return data

In [3]:
months = ['January','February','March','April','May','June','July','August','September','October','November','December']
year = '2020'
number_of_columns = 12
state = 'massachusetts'
location = 'great-hill-ma'

df = scrape_from_url(state,location)
df = df[df['event']!='sunset']
df = df[df['event']!='sunrise']
df['full_date'] = pd.to_datetime(df['full_date'], format='%Y-%m-%d %H:%M')
df = df.sort_values(by='full_date', ascending=True)
df = df.reset_index()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [4]:
def subtract_to_sunday(array):
    subs = [0]*len(array)
    for day in range(len(array)):
        if array[day]=='Sun':
            sub = 0
        if array[day]=='Mon':
            sub = 1
        if array[day]=='Tue':
            sub = 2
        if array[day]=='Wed':
            sub = 3
        if array[day]=='Thu':
            sub = 4
        if array[day]=='Fri':
            sub = 5
        if array[day]=='Sat':
            sub = 6
        subs[day] = sub
    return subs

In [5]:
df['day_only'] = df['Month'] + df['Date'] + year
df['day_only'] = pd.to_datetime(df['day_only'], format='%B%d%Y')
df['days_from_sunday'] = subtract_to_sunday(df['Day'])

In [6]:
df

Unnamed: 0,level_0,index,Month,Date,Day,height,event,time,full_date,week_num,day_only,days_from_sunday
0,0,0,January,1,Wed,3.6,high_1,00:00 AM,2020-01-01 00:00:00,1.0,2020-01-01,3
1,5,0,January,1,Wed,0.7,low_1,04:57 AM,2020-01-01 04:57:00,1.0,2020-01-01,3
2,2,0,January,1,Wed,3.6,high_2,12:20 PM,2020-01-01 12:20:00,1.0,2020-01-01,3
3,1,0,January,1,Wed,0.6,low_2,17:21 PM,2020-01-01 17:21:00,1.0,2020-01-01,3
4,6,1,January,2,Thu,3.6,high_1,00:53 AM,2020-01-02 00:53:00,1.0,2020-01-02,4
...,...,...,...,...,...,...,...,...,...,...,...,...
1410,2184,364,December,30,Wed,3.6,high_2,20:20 PM,2020-12-30 20:20:00,53.0,2020-12-30,3
1411,2190,365,December,31,Thu,0.2,low_1,01:16 AM,2020-12-31 01:16:00,53.0,2020-12-31,4
1412,2193,365,December,31,Thu,4.5,high_1,08:36 AM,2020-12-31 08:36:00,53.0,2020-12-31,4
1413,2191,365,December,31,Thu,0.1,low_2,14:07 PM,2020-12-31 14:07:00,53.0,2020-12-31,4


In [7]:
for row in range(len(df)): # calculate the number of minutes until each time
    delta = df['full_date'].iloc[row] - df['day_only'].iloc[row]
    df['mins_since_sun'].iloc[row] = pd.Timedelta(delta).total_seconds()/60
    min_to_add = df['days_from_sunday'].iloc[row]*24*60
    df['mins_since_sun'].iloc[row] = df['mins_since_sun'].iloc[row] + min_to_add
#pd.Timedelta(df['mins_since_sun']).total_seconds()/60
#+ datetime.timedelta(minutes=subtract_to_sunday(df['Day'])*24*60)

KeyError: 'mins_since_sun'

In [None]:
df

In [None]:
month = "January"
temp = df[df['Month']==month]
fig = px.scatter(x=temp['mins_since_sun'], 
            y=temp['height'],
            facet_row=temp['week_num'],
            #line='spline'
            #hovertext=df['time']
                )
fig.show() #px
#py.iplot([fig],filename='test') #go