In [1]:
from typing import Any, Mapping

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.webdriver import WebDriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.firefox import GeckoDriverManager


In [2]:
story_title_selector = "h1.entry-title"
story_author_selector = "span.post-author"
story_section_selector = "a.post-category"

""" Extracts metadata from the story at `url`
Args:
    driver: The Selenium Firefox WebDriver instance
    url: A url that hopefully points to a story
Returns:
    A dictionary containing metadata on the story. You'll probably want to capture more metadata than I did. 
    I imagine we might want the full text of the story, etc, etc.
"""

def scrape_story(driver: WebDriver, url: str) -> Mapping[str, Any]:
    print("scraping story", url)
    open_url_in_new_tab(driver, url)  # Open the story in a new tab
    
    story_title = driver.find_element(By.CSS_SELECTOR, story_title_selector).text  # Since we want raw metadata, we're accessing the elements' text attributes
    story_author = driver.find_element(By.CSS_SELECTOR, story_author_selector).text
    story_section = driver.find_element(By.CSS_SELECTOR, story_section_selector).text
    story_text = "" #...
    
    # ...
    # capture additional metadata
    # ...
    story_metadata = {
        "title": story_title,
        "author": story_author,
        "section": story_section,
        "url": url,
        #... additional metadata
    }
    close_current_tab(driver)
    return story_metadata, driver

In [3]:
""" Closes the current selenium driver tab and switches context to the first open tab
Args:
    driver: The Selenium Firefox WebDriver instance
Returns:
    None
"""
def close_current_tab(driver: WebDriver) -> None:
    driver.close()
    main_tab_handle = driver.window_handles[0]
    driver.switch_to.window(main_tab_handle)

In [4]:
""" Opens a new selenium driver  tab and navigates to the url provided
Args:
    driver: The Selenium Firefox WebDriver instance
    url: The url to navigate to
Returns:
    None
"""
def open_url_in_new_tab(driver: WebDriver, url: str) -> None:
    driver.switch_to.new_window() # opens a new tab
    new_tab_handle = driver.window_handles[-1]  # grab the last browser handle (which we just opened)
    driver.switch_to.window(new_tab_handle)
    driver.get(url)

In [5]:
##### Load landing page for individual Courier Site

driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))  # Ensure we have the latest GeckoDriver (used to run Firefox)
landing_page_url = "https://cardinalpine.com/"  # The landing page (hopefully) contains a dynamic list of all the stories we need to scrape from that outlet 
driver.get(landing_page_url)  # Navigate to the landing page  

[WDM] - Downloading: 16.9kB [00:00, 1.08MB/s]                                                                                                                                        


In [6]:
#### Find the "recent stories" feed

feed_container_class = "recent-posts"  # This is the HTML class of the div containing the story feed 
feed_container_element = driver.find_element(By.CLASS_NAME, feed_container_class)  # We'll capture that div in a python variable for easy access

In [7]:
#### Now it's time to begin actual scraping. If you press the "LOAD MORE STORIES" link at the bottom of the feed
#### You'll find that the page will load 8 additional stories
#### Given that behavior, we should set up a loop that will iteratively scrape stories, load more stories, scrape stories, load more stories... and so on


# We'll use "number_of_stories_seen" to form our loop condition.  
# If we press "load more stories" but the number of stories in the feed doesn't increase beyond "number_of_stories_seen", we're presumably done.
# I say presumably, because I'm not sure how long we can keep loading stories without breaking the page
# And even if we can keep loading for a while, I'm not sure doing so will capture every story Cardinal & Pine has published
# You'll want to do some sanity checks to ensure this approach really can capture all the stories. 

# If it happens to work, great! If not, we'll have to devise something new :)
# Feel free to reach out if you'd like to talk strategy. For now, take this script as more of a scraping tutorial than a perfect methodology for logging all stories
# Some of the "fun" of scraping comes from figuring out how to get the content you want

number_of_stories_seen = 0  # And without further ado, the variable in question 

# This is a CSS selector. # See https://saucelabs.com/resources/blog/selenium-tips-css-selectors for more info
# Each story element in the feed is contained in a div with the class "item"
# We can select it with this CSS selector syntax
story_container_css_selector = "div.item" 

more_button_css_selector = "load-more-button"

story_url_css_selector = "a.item-title" # Another CSS selector that we'll use to find story URLs 

all_stories_metadata = []

# I dislike the "while True" syntax, but it's standard in Python
# See https://peps.python.org/pep-0315/ and https://twitter.com/raymondh/status/1528772337306419200
# We'll break the loop when we reach our exit condition
while True:
    
    # Capture all of the story container divs in our feed
    # Note that we're using "find_elements" here, not "find_element" like we used above. 
    # find_elements will return all of the elements matching our selection criterion. find_element only returns the first
    story_container_elements = feed_container_element.find_elements(By.CSS_SELECTOR, story_container_css_selector)  
    
    # If we've seen all the stories in the feed, we're done. Break the loop
    number_of_stories_in_feed = len(story_container_elements)
    if number_of_stories_in_feed == number_of_stories_seen:
        print(f"No more stories in feed. Processed {number_of_stories_seen} stories total")
        break 
    
    print(f"{number_of_stories_in_feed} stories in feed")
    # Stories load 8 at a time. We should always scrape the most recently loaded 8 stories. 
    for story_container_element in story_container_elements[-8:]:
        # Find the "a" tag containing the story link
        # Then extract its "href" attribute, AKA the story's URL
        story_url = story_container_element.find_element(By.CSS_SELECTOR, story_url_css_selector).get_attribute("href")
        
        # Do the actual scraping
        story_metadata = scrape_story(driver, story_url)  
        
        # Store the current story's metadata in our larger data structure
        all_stories_metadata.append(story_metadata) 
        
        # increment our stories seen counter
        number_of_stories_seen += 1
    
    # After each set of 8 stories, we'll need to load more. 
    # Remove the break below and find a way to click on the "load more stories" link
    # You may need to make sure the link is visible before you click it (scroll to it)
    more_button_element = driver.find_element(By.CLASS_NAME, more_button_css_selector)
    more_button_element.click()
    
    

8 stories in feed
scraping story https://cardinalpine.com/story/were-running-out-of-time-to-adequately-address-climate-change/
scraping story https://cardinalpine.com/story/spring-is-here-our-guide-to-ncs-most-pleasant-and-pollinated-season/
scraping story https://cardinalpine.com/story/democrats-advocates-want-north-carolina-corporate-tax-retained/
scraping story https://cardinalpine.com/story/yes-there-are-elections-this-year-a-new-state-tool-tells-you-all-you-need-to-know/
scraping story https://cardinalpine.com/story/celebrate-pi-day-in-north-carolina-with-discount-pizza-and-philosophy/
scraping story https://cardinalpine.com/story/nc-supreme-court-will-hear-voting-rights-cases-it-already-decided/
scraping story https://cardinalpine.com/story/women-workers-here-are-tools-to-get-the-pay-you-deserve-copy/
scraping story https://cardinalpine.com/story/survey-give-us-your-oscar-picks/
16 stories in feed
scraping story https://cardinalpine.com/story/biden-tax-billionaires-corporations-s

KeyboardInterrupt: 

In [None]:
### eventually you'll want to write that all_stories_metadata array to a database
### when you're designing a schema for the database, you might consider creating one table for outlets, one for stories, and another for authors
### although if you have a solution you like better, feel free to go that way.  
### And again, email or drop a message if you have questions. 

In [None]:
# Shut down the WebDriver session once scraping is finished
driver.quit()