In [1]:
from bs4 import BeautifulSoup
import urllib3
import pickle as pkl



In [58]:
class Snippet:
    """One ride listing ("snippet") scraped from the homepage.

    Instances simply store the values handed to the constructor. When built
    through `extract_all_from_soup`, `name`, `relative_url` and `description`
    are strings and the four ratings are ints.

    All helpers are static methods: they never touch instance state, and
    declaring them as such lets them be called from the class *or* from an
    instance.
    """

    def __init__(self,
                 name,
                 relative_url,
                 description,
                 fun_rating,
                 scenic_rating,
                 aerobic_rating,
                 technical_rating):
        self.name = name
        self.relative_url = relative_url
        self.description = description
        self.fun_rating = fun_rating
        self.scenic_rating = scenic_rating
        self.aerobic_rating = aerobic_rating
        self.technical_rating = technical_rating

    @staticmethod
    def extract_all_from_soup(soup):
        """Extracts Snippet objects from a bs4.BeautifulSoup of the homepage

        Returns a list with one Snippet per matching table row, in document
        order; the list is empty when no row matches.
        """
        return [Snippet._snippet_from_html(snippet_html)
                for snippet_html in Snippet._get_snippets_html(soup)]

    @staticmethod
    def _get_snippets_html(soup):
        """Finds all the <tr class="even"> and <tr class="odd">

        These <tr ...> correspond to snippets
        """
        # A list passed as `class_` matches a tag carrying any of the classes.
        return soup.find_all('tr', class_=['even', 'odd'])

    @staticmethod
    def _snippet_from_html(snippet_html):
        """Creates a Snippet from the snippet_html extracted with `_get_snippets_html(soup)`"""
        name, relative_url, description = Snippet._get_name_relurl_and_desc(snippet_html)
        fun_rating = Snippet._get_rating(snippet_html, 'fun')
        scenic_rating = Snippet._get_rating(snippet_html, 'scenic')
        aerobic_rating = Snippet._get_rating(snippet_html, 'aerobic')
        technical_rating = Snippet._get_rating(snippet_html, 'technical')
        return Snippet(name,
                       relative_url,
                       description,
                       fun_rating,
                       scenic_rating,
                       aerobic_rating,
                       technical_rating)

    @staticmethod
    def _get_rating(snippet_html, category):
        """Gets the numeric rating from a <div>

        The rating <div> looks like this:
            <div class="rating-container-fun">
            <div style="WIDTH: 20%;">2</div>
            </div>

        `category` is the suffix of the container class ('fun', 'scenic',
        'aerobic' or 'technical'). Returns the rating as an int.

        Raises AttributeError if the ratings cell, the category container or
        its inner <div> is missing, and ValueError if the text is not an
        integer.
        """
        attr = 'rating-container-{category}'.format(category=category)
        ratings_table_html = snippet_html.find('td', class_='listingratings')
        category_html = ratings_table_html.find('div', class_=attr)
        # Look the inner <div> up by tag instead of by child position, so the
        # result does not depend on whitespace text nodes between the tags.
        value_html = category_html.find('div')
        # int() tolerates surrounding whitespace in the text.
        return int(value_html.get_text())

    @staticmethod
    def _get_name_relurl_and_desc(snippet_html):
        """Extracts name, relative url, and description from <p class="snippet">

        The first (name and url) <p class="snippet"> looks like this:
            <p class="snippet">
             <span class="snippettitle">
              <b>
               <a href="rides/alamedacreek">
                Alameda Creek Trail
               </a>
              </b>
             </span>
            </p>

        The second (description) <p class="snippet"> looks like this:
            <p class="snippet">
             A casual ride on an easy and flat recreational trail that is not too special other than being somewhat more scenic than usual and giving you a choice of paved or dirt riding.
            </p>

        Returns a `(name, relative_url, description)` tuple. Name and
        description have leading/trailing whitespace removed, so indentation
        and newlines in the markup do not leak into the values.

        Raises ValueError if the row does not contain exactly two
        <p class="snippet"> elements.
        """
        name_and_url_part, desc_part = snippet_html.find_all('p', class_='snippet')
        # Strip only the outer whitespace; inner spacing of the text is kept.
        name = name_and_url_part.get_text().strip()
        relative_url = name_and_url_part.find('a').get('href')
        desc = desc_part.get_text().strip()
        return name, relative_url, desc
    


In [59]:
# scrape homepage

homepage_url = 'http://www.bayarearides.com'
homepage_cache_path = 'homepage.pkl'

# Prefer the cached (pickled) copy of the homepage html so that re-running
# the notebook does not request the page again. When the cache file does not
# exist yet, retrieve a new copy and write the cache, so the notebook also
# runs from a clean checkout without uncommenting anything.
#
# NOTE: pickle.load can execute arbitrary code from the file it reads; only
# load a cache file that this notebook wrote itself.
try:
    with open(homepage_cache_path, 'rb') as f:
        homepage_response_data = pkl.load(f)
except FileNotFoundError:
    http = urllib3.PoolManager()
    r = http.request('GET', homepage_url)
    # Do not cache an error page as if it were the homepage.
    if r.status != 200:
        raise RuntimeError(
            'GET {url} returned HTTP status {status}'.format(
                url=homepage_url, status=r.status))
    homepage_response_data = r.data
    with open(homepage_cache_path, 'wb') as f:
        pkl.dump(homepage_response_data, f)

In [60]:
# Parse the homepage html with the built-in 'html.parser' backend, then
# collect one Snippet per listing row found in the parsed document.
homepage_soup = BeautifulSoup(
    homepage_response_data,
    features='html.parser',
)
snippets = Snippet.extract_all_from_soup(soup=homepage_soup)