In [1]:
import requests, re
import pickle as pkl
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
def get_soup(url):
    response = requests.get(url)
    page = response.text
    if response.status_code != 200:
        print("REQUESTS error")
        print(response.status_code)
    return BeautifulSoup(page, "lxml")

def str2floatflag(s):
    s = s.strip()
    letters = ''
    try:
        num = float(s)
    except ValueError:
        # return tuple contain int and str elements
        if '-' not in s:
            rs = r"[0-9|.]"
            num = "".join(re.findall(rs,s))
            letters = s.strip(num)
        else:
            num = np.NaN
    return (float(num), letters)


In [18]:
from datetime import timedelta
from datetime import date

start_date, end_date = pd.to_datetime('20050101'), pd.to_datetime('20170802')
sensor_urls = []
sensor_url_root = "https://cdec.water.ca.gov/cgi-progs/snow/PAGE6."
for d in pd.date_range(start_date, end_date):
    sensor_urls.append(sensor_url_root + date.strftime(d, "%Y%m%d"))

In [19]:
from time import sleep
import lxml

snow_df = pd.DataFrame()

for url in sensor_urls:
    # get page
    sensor_soup = get_soup(url)

    # get each table row
    data_table = sensor_soup.find_all("table")[0]
    rows = [row for row in data_table.find_all('tr')]

    # get page info
    page_date = pd.to_datetime(sensor_soup.find("h2").next_element)
    print(page_date)

    # get all data for a given day, create a dataframe
    df_rows = [] # list of dicts
    df_row = {}
    for r in rows:
        if r.find_all('font'): # it's a heading
            df_row['area'] = r.find_all('font')[0].text.lower()
        elif r.find_all('td')[0].text.strip() != 'Station': # it's not, and it's not a station heading
            r_data = r.find_all('td')
            df_row['st_name'], df_row['st_code'] = r_data[0].text.strip().lower(), r_data[1].text.strip().lower()
            df_row['agency'] = r_data[2].text.strip().lower()
            df_row['elev_ft'] = int(r_data[3].text.replace(',',''))
            df_row['apr1avg_in'], df_row['apr1avg_in_code'] = str2floatflag(r_data[4].text)
            df_row['today_in'], df_row['today_in_code'] = str2floatflag(r_data[5].text)
            df_row['apr1_pct'], df_row['apr1_pct_code'] = str2floatflag(r_data[6].text.strip('%'))
            df_row['yesday_in'], df_row['yesday_in_code'] = str2floatflag(r_data[7].text)
            df_row['date'], df_row['url'] = page_date, url
            df_rows.append(df_row.copy())
    page_df = pd.DataFrame.from_dict(df_rows)
    snow_df = snow_df.append(page_df)
    print("sleep 1.5 sec")
    sleep(1.5)

AttributeError: 'NoneType' object has no attribute 'next_element'

In [11]:
# these are empty for now - should maybe make this conditional
del snow_df['apr1_pct_code']
del snow_df['apr1avg_in_code']

In [12]:
snow_df.head()


Unnamed: 0,area,st_name,st_code,agency,elev_ft,apr1avg_in,today_in,today_in_code,apr1_pct,yesday_in,yesday_in_code,date,url
0,trinity river,peterson flat,pet,dwr,7150,29.2,7.8,r,26.0,7.8,r,2017-01-01,https://cdec.water.ca.gov/cgi-progs/snow/PAGE6...
1,trinity river,red rock mountain,rrm,dwr,6700,39.6,17.1,r,43.0,17.2,r,2017-01-01,https://cdec.water.ca.gov/cgi-progs/snow/PAGE6...
2,trinity river,bonanza king,bnk,dwr,6450,40.5,,,,,,2017-01-01,https://cdec.water.ca.gov/cgi-progs/snow/PAGE6...
3,trinity river,shimmy lake,shm,dwr,6400,40.3,11.1,r,27.0,11.3,r,2017-01-01,https://cdec.water.ca.gov/cgi-progs/snow/PAGE6...
4,trinity river,middle boulder 3,mb3,dwr,6200,28.3,8.0,r,28.0,7.8,r,2017-01-01,https://cdec.water.ca.gov/cgi-progs/snow/PAGE6...


In [13]:
soup = get_soup('http://cdec.water.ca.gov/misc/SnowSensors.html')

data_table = soup.find_all("table")[0]
rows = [row for row in data_table.find_all('tr')]

stations_list = []
station = {}
for r in rows:
    row_array = tuple(r.text.split('\n'))
    if len(row_array) == 7:
        station['st_code'] = row_array[0].strip().lower()
        station['st_lat'], station['st_lon'] = float(row_array[3].split()[0]), float(row_array[3].split()[1])
        stations_list.append(station.copy())


In [14]:
station_coords = pd.DataFrame.from_dict(stations_list)
snow_coords_df = snow_df.merge(station_coords, on='st_code', how='left')


In [15]:
pkl.dump(snow_coords_df, open( "snow_coords_df_long.pkl", "wb" ) )