In [1]:
import urllib3
import pickle as pkl
import time
from warnings import warn

from bs4 import BeautifulSoup
import pandas as pd

from trail import Trail

In [2]:
# scrape homepage
#
# If `scrape == True` then all of the web pages are re-downloaded and the
# on-disk cache is refreshed.
# Otherwise, a cached copy is loaded from disk instead.
scrape = False
home_page_pkl_name = 'homepage.pkl'

# download homepage
homepage_url = 'http://www.bayarearides.com'
if scrape:
    # `http` is reused by the trail-page loop further down when scraping.
    http = urllib3.PoolManager()
    r = http.request('GET', homepage_url)
    if r.status == 200:
        with open(home_page_pkl_name, 'wb') as f:
            pkl.dump(r.data, f)
    else:
        # urllib3 hands back 4xx/5xx responses instead of raising, so without
        # this guard an error page would silently overwrite the good cache.
        warn('GET {url} returned HTTP {status}; keeping existing cache file {path!r}'.format(
            url=homepage_url, status=r.status, path=home_page_pkl_name))

# load homepage html from disk
# NOTE: unpickling runs arbitrary code embedded in the file; only load cache
# files that this notebook wrote itself.
with open(home_page_pkl_name, 'rb') as f:
    homepage_response_data = pkl.load(f)

In [3]:
# extract snippets from homepage

# Parse the homepage payload loaded from the cache with Python's built-in
# HTML parser, then hand the parsed document to Trail.extract_trails_from_soup
# to build `trails`.
homepage_soup = BeautifulSoup(markup=homepage_response_data, features='html.parser')
trails = Trail.extract_trails_from_soup(homepage_soup)

In [4]:
# get trail page html
#
# Builds `trail_pages_html`, a dict mapping trail name -> cached page payload.
# When `scrape` is True each page is downloaded first and its cache file is
# refreshed; trails with no cache file on disk are reported and left out.

# Seconds to pause after every request (presumably a politeness delay towards
# the site -- confirm before lowering).
scrape_delay_seconds = 10

trail_pages_html = {}
for trail in trails:

    # trail info
    trail_name = trail.name
    print(trail_name)
    trail_url = '/'.join([homepage_url, trail.relative_url])
    trail_pkl_name = '{trail_name}.pkl'.format(trail_name=trail_name)

    # download trail html
    if scrape:
        print("\tScraping")
        r = http.request('GET', trail_url)
        if r.status == 200:
            with open(trail_pkl_name, 'wb') as f:
                pkl.dump(r.data, f)
        else:
            # urllib3 returns 4xx/5xx responses rather than raising; do not
            # let an error page replace a previously cached good copy.
            warn('GET {url} returned HTTP {status}; not updating {path!r}'.format(
                url=trail_url, status=r.status, path=trail_pkl_name))
        time.sleep(scrape_delay_seconds)

    # load trail html from disk
    # NOTE: unpickling runs arbitrary code embedded in the file; only load
    # cache files that this notebook wrote itself.
    try:
        with open(trail_pkl_name, 'rb') as f:
            trail_pages_html[trail_name] = pkl.load(f)
    except FileNotFoundError:
        print('\tNo data')

Alameda Creek Trail
Almaden Quicksilver Park
Alpine Road to Windy Hill
Angel Island
Annadel (Suggestion 1)
Annadel (Suggestion 2)
Annadel (Suggestion 3)
Arastradero
Berry Creek Falls
Big Basin
Big Basin - Boulder Creek Loop
Big Rock Ridge
Black Diamond Mines
Bolinas Ridge (long loop)
Bolinas Ridge (short loop)
Briones (Suggestion 1)
Briones (Suggestion 2)
Brushy Peak
Butano State Park
Calaveras Road
Camp Tamarancho
Cañada Road
Carquinez Scenic Drive
Cheese Factory Loop
China Camp
Coyote Creek Trail
Coyote Hills
Crockett Hills
DeLaveaga Park
Del Valle
Demo Forest (Braille Trail)
Demo Forest (Corral Trail)
Demo Forest (Flow Trail)
Demo Forest (Sawpit Trail)
East Bay Canyons
El Sereno
Empire Grade Loop
Fairfax to Mount Tam Loop
Foothill Expressway
Fifield-Cahill Ridge
Fort Ord (2007 Sea Otter course)
Fort Ord (2012 Sea Otter course)
Fort Ord (eastern trails)
Fremont Older
Garin and Dry Creek
Golden Gate to Sausalito
Grant Ranch
Grizzly Flat and Charcoal Grade
Half Moon Bay Coastside Trail

In [5]:
# update trail with trail page info

for trail in trails:
    # The loading loop prints "No data" and stores nothing when a trail's
    # cache file is missing, so a plain `trail_pages_html[trail.name]` would
    # raise KeyError here. Skip those trails and say so instead.
    html = trail_pages_html.get(trail.name)
    if html is None:
        # NOTE(review): such a trail keeps whatever description state
        # Trail gave it at construction -- confirm to_dict() copes with that.
        warn('No cached page for trail {name!r}; description not set'.format(name=trail.name))
        continue
    trail_soup = BeautifulSoup(html, 'html.parser')
    trail.set_description(trail_soup)

In [6]:
# store parsed data: pickle the list of trail objects to disk
with open('all_trails.pkl', mode='wb') as trails_file:
    pkl.dump(trails, trails_file)

In [7]:
# convert to pandas DataFrame: one row per trail, built from each trail's to_dict()
df = pd.DataFrame(data=[trail.to_dict() for trail in trails])

In [8]:
# composition feature - zero-fill the NaNs in every column whose name starts with 'comp'
composition_mask = df.columns.str.startswith('comp')
composition_columns = df.columns[composition_mask]
df[composition_columns] = df[composition_columns].fillna(value=0)

In [9]:
# pickle the dataframe so later notebooks can reload it without re-parsing
df.to_pickle(path='dataframe.pkl')