In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np

In [2]:
# Maps a column heading as it appears on the station page (the header text
# before any '(') to the column name used in the resulting DataFrame.
# Page columns whose heading is not a key here are dropped by
# build_df_from_data_ta.
columns_of_interest = {'ID': 'id', 'DATE': 'timestamp', 'YSIBGARFU': 'blue_green_algae',
                      'YSITEMP': 'water_temperature'}

In [3]:
def update_df_to_floats(data, exclusion_list):
    """Cast every column of ``data`` to float, except the excluded ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.
    exclusion_list : container of column names
        Columns named here are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in, after conversion.
    """
    # Decide which columns to convert in their original order, then cast each.
    columns_to_cast = [name for name in data.columns if name not in exclusion_list]
    for name in columns_to_cast:
        data[name] = data[name].astype(float)
    return data

In [4]:
def build_df_from_data_ta(raw_rows):
    """Build a DataFrame of the columns of interest from scraped table rows.

    ``raw_rows[1]`` is read as the header row: the text of each ``<th>`` is cut
    at the first ``'('``, stripped of surrounding whitespace and looked up in
    the module-level ``columns_of_interest`` mapping. Every row from
    ``raw_rows[2]`` onwards is treated as one observation; ``raw_rows[0]`` is
    never read.

    Parameters
    ----------
    raw_rows : sequence
        Table rows that expose ``find_all`` and whose cells expose ``.text``
        (the ``<tr>`` tags produced by BeautifulSoup in this notebook).

    Returns
    -------
    pandas.DataFrame
        One row per observation, with one column per matched heading, named
        by the corresponding value in ``columns_of_interest``. Cell values are
        kept as the scraped strings, except that ``'-'`` and ``'---'`` are
        replaced with NaN.

    Raises
    ------
    IndexError
        If ``raw_rows`` has fewer than two rows, i.e. there is no header row.
    """
    # Strip whitespace so a heading such as "YSITEMP (F)" still matches "YSITEMP".
    headings = [th.text.split('(')[0].strip() for th in raw_rows[1].find_all('th')]

    # Position in the row -> output column name, for the headings we care about.
    significant_columns = {
        position: columns_of_interest[heading]
        for position, heading in enumerate(headings)
        if heading in columns_of_interest
    }
    # Kept so the cell output still shows which page columns were matched.
    print(significant_columns)

    # A row must have at least this many cells to supply every selected column.
    required_cells = max(significant_columns, default=-1) + 1

    records = []
    for row_raw in raw_rows[2:]:
        cells = [td.text for td in row_raw.find_all('td')]
        if len(cells) < required_cells:
            # Spacer, repeated-header or otherwise short rows cannot provide
            # the selected columns; skip them instead of raising IndexError.
            continue
        records.append({name: cells[position]
                        for position, name in significant_columns.items()})

    observations = pd.DataFrame(records)
    # The page marks missing readings with '-' or '---' (exact matches only).
    observations = observations.replace(to_replace=['-', '---'], value=np.nan)
    # NOTE: values are intentionally left as strings here. Numeric conversion
    # (update_df_to_floats) and pd.to_datetime on 'timestamp' are not applied.
    return observations

In [8]:
def scrape_buoy_data(buoy_id, timeout=30):
    """Download a station page from greatlakesbuoys.org and parse its observations.

    Parameters
    ----------
    buoy_id : str or int
        Station identifier; its ``str()`` form is placed in the ``station``
        query parameter of the URL.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the kernel
        indefinitely. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``build_df_from_data_ta`` for the rows of the
        table inside the element with id ``prevNObs``, plus a ``buoy_id``
        column holding the ``buoy_id`` argument.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds ``timeout``.
    AttributeError
        If the page has no element with id ``prevNObs`` or that element
        contains no table.
    """
    url = 'http://greatlakesbuoys.org/station_page.php?station=%s&unit=E&tz=GMT' % str(buoy_id)
    result = requests.get(url, timeout=timeout)
    if result.status_code != 200:
        # NOTE: a non-200 response is only reported; parsing still proceeds
        # and fails below if the page lacks the observations table.
        print("error on status code")
    soup = BeautifulSoup(result.content, "html5lib")
    raw_rows = soup.find(id="prevNObs").find('table').find_all('tr')
    data = build_df_from_data_ta(raw_rows)
    # Tag every observation with the station it came from.
    data['buoy_id'] = buoy_id
    return data

In [9]:
# Scrape the 'crib' station and preview the first rows of the result.
result = scrape_buoy_data('crib')
result.head()

{0: 'id', 1: 'timestamp', 14: 'blue_green_algae'}


Unnamed: 0,blue_green_algae,id,timestamp,buoy_id
0,0.36,55613,11/10/2016 18:30:00,crib
1,0.36,55612,11/10/2016 18:20:00,crib
2,0.36,55611,11/10/2016 18:10:00,crib
3,0.36,55610,11/10/2016 18:00:00,crib
4,0.36,55609,11/10/2016 17:50:00,crib


In [10]:
# Scrape station 45176 and preview the first rows.
# NOTE: this rebinds `result`, replacing the frame from the previous cell.
result = scrape_buoy_data(45176)
result.head()

{0: 'id', 1: 'timestamp', 22: 'water_temperature', 28: 'blue_green_algae'}


Unnamed: 0,blue_green_algae,id,timestamp,water_temperature,buoy_id
0,,28733,05/18/2017 20:40:00,32,45176
1,,28732,05/18/2017 20:30:00,32,45176
2,,28731,05/18/2017 20:20:00,32,45176
3,,28730,05/18/2017 20:10:00,32,45176
4,,28729,05/18/2017 20:00:00,32,45176
