# RDS@GSU - Web Scraping Continued (Workshop # 2)

#### Copyright + References

In [None]:
# The content in this notebook was developed by Jeremy Walker.
# All sample code and notes are provided under a Creative Commons
# ShareAlike license.

# Official Copyright Rules / Restrictions / Privileges
# Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
# https://creativecommons.org/licenses/by-sa/4.0/

# Part 1 - Refresher and Recap

In [None]:
# Import modules
import requests
from bs4 import BeautifulSoup

In [None]:
# Specify a webpage
state_dept = 'https://www.state.gov/press-releases/'
state_dept

In [None]:
# Get the webpage
webpage = requests.get(url = state_dept)
webpage.text

In [None]:
# Save the webpage
# The "with" statement closes the file automatically when the indented
# block ends, even if the write fails part-way through.
with open(file = "web_documents/press_release_directory.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(webpage.text)

In [None]:
# Create the soup
# The parser is named explicitly ("html.parser" ships with Python) so that
# BeautifulSoup does not have to guess one; guessing produces a warning and
# can give different results on different computers.
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")
soup

In [None]:
# Create a list of results by finding all links "a" within the "collection-results" element.
results = soup.find(class_="collection-results").find_all(name = "a")
results

In [None]:
# Create an object (first_result) from the first item in the "results" list
first_result = results[0]
first_result

In [None]:
# Get the webpage for the first_result
first_webpage = requests.get(url=first_result['href'])
first_webpage.text

In [None]:
# Save the new webpage to a new file
# The "with" statement closes the file automatically when the indented
# block ends, even if the write fails part-way through.
with open(file = "web_documents/first_webpage.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(first_webpage.text)

### Altogether...

In [None]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
state_dept = 'https://www.state.gov/press-releases/'

# Get the webpage
webpage = requests.get(url = state_dept)

# Save the webpage ("with" closes the file automatically, even if the write fails)
with open(file = "web_documents/press_release_directory.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(webpage.text)

# Create the soup (the parser is named explicitly so BeautifulSoup does not have to guess one)
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

# Create a list of results by finding all links "a" within the "collection-results" element.
results = soup.find(class_="collection-results").find_all(name = "a")

# Create an object (first_result) from the first item in the "results" list
first_result = results[0]

# Get the webpage for the first_result
first_webpage = requests.get(url = first_result['href'])

# Save the new webpage to a new file
with open(file = "web_documents/first_webpage.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(first_webpage.text)

# Part 2 - Iterating through content 

### What we're building towards...
The block of code below represents the entirety of what Part 2 and Part 3 are building towards.

In [None]:
# import requests
# from bs4 import BeautifulSoup
# def scraper( starting_url , count):
#     website = starting_url
#     page_counter = count
#     webpage = requests.get(url = website)
#     soup = BeautifulSoup(markup = webpage.text)
#     results = soup.find(class_="collection-results").find_all(name = "a")
#     for page in results:
#         page_url = page['href']
#         webpage = requests.get(page_url)
#         filename = "web_documents/{}.html".format(page_counter)
#         html_file= open(file = filename, mode = "w", encoding="utf-8")
#         html_file.write(webpage.text)
#         html_file.close()
#         page_counter = page_counter + 1
#     if soup.find(class_="next page-numbers"):
#         next_page = soup.find(class_="next page-numbers")
#         next_url = next_page['href']
#         if page_counter < 30:
#             scraper(starting_url = next_url, count = page_counter)
# scraper(starting_url="https://www.state.gov/press-releases/", count=0)

In [None]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
state_dept = 'https://www.state.gov/press-releases/'

# Get the webpage
webpage = requests.get(url = state_dept)

# Save the webpage ("with" closes the file automatically, even if the write fails)
with open(file = "web_documents/press_release_directory.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(webpage.text)

# Create the soup (the parser is named explicitly so BeautifulSoup does not have to guess one)
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

# Create a list of results by finding all links "a" within the "collection-results" element.
results = soup.find(class_="collection-results").find_all(name = "a")

In [None]:
# The results object is a list of individual "a" tag objects.
results

In [None]:
# In order to collect each individual webpage in the results list,
# we will use a for-loop to iterate through each item in the list.
# For-loops can be a bit tricky to understand if you are new to 
# programming and scripting.  However, there are many excellent
# guides and tutorials on the web for this:

# For Loops Tutorials : https://www.w3schools.com/python/python_for_loops.asp

# The following cells show a variety of examples of the basic syntax
# of for-loops.

# This example can be read as "For every item ('i') in a list 
# ('list_of_items'), display ('print()') the item ('i').

list_of_items = ["a","b","c","d"]

for i in list_of_items:
    print(i)

In [None]:
# While 'i' is a common placeholder for items when iterating
# through a list, it is arbitrary.  Any other placeholder will
# function exactly the same.

list_of_items = ["a","b","c","d"]

for x in list_of_items:
    print(x)

In [None]:
for banana in list_of_items:
    print(banana)

In [None]:
# While for-loops are usually used to perform
# some function or operation on a different item,
# that is not strictly required.  Python will do
# an arbitrary task for each iteration.

for banana in list_of_items:
    print("oranges")

In [None]:
# Returning to the results object, the following
# code iterates through each result and displays
# the tag.

for page in results:
    print("PAGE ITEM")
    print(page)

In [None]:
# Within the for-loop, you can define a new object
# or perform a function / operation and display the results.

# In the following example, the "page" placeholder represents
# the individual tag object from the results list.  Consequently,
# the "page" placeholder has the same characteristics as the
# item it represents.  In this case, that means we can get the
# "href" attribute.

for page in results:
    page_url = page['href']
    print(page_url)

In [None]:
# For loops can also modify existing objects. For example,
# we can create an object that simply increases by 1 for
# every iteration in the for loop.

page_counter = 0

for page in results:    
    
    print(page_counter) 
    
    page_counter = page_counter + 1

In [None]:
# Expand the for-loop to include both the counter and
# the hyperlink ("href").

page_counter = 0

for page in results:
    page_url = page['href']
    
    print(page_counter)
    print(page_url)
    
    page_counter = page_counter + 1

In [None]:
# Altogether, we can create a for-loop
# that iterates through the results list, collects each
# individual webpage and saves them using an enumerated
# filename.

page_counter = 0

for page in results:

    # Specify a webpage
    page_url = page['href']

    # Get the webpage
    webpage = requests.get(page_url)

    # Create filename
    # This is a little complicated, but in essence, the {}
    # inside a string allows us to dynamically change the
    # text of a string.  In this case, .format(page_counter)
    # places the current counter value where the {} is, so the
    # saved documents are named 0.html, 1.html, 2.html, ...
    filename = "web_documents/{}.html".format(page_counter)

    # Save the webpage
    # The "with" statement closes the file automatically at the end of
    # the indented block, even if the write fails part-way through.
    with open(file = filename, mode = "w", encoding = "utf-8") as html_file:
        html_file.write(webpage.text)

    # Sanity-check: print out the URL and current page_counter
    print(page_counter)
    print(page_url)

    # Update the page_counter
    page_counter = page_counter + 1

### Part 2 Recap

In [None]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
state_dept = 'https://www.state.gov/press-releases/'

# Get the webpage
webpage = requests.get(url = state_dept)

# Save the webpage ("with" closes the file automatically, even if the write fails)
with open(file = "web_documents/press_release_directory.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(webpage.text)

# Create the soup (the parser is named explicitly so BeautifulSoup does not have to guess one)
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

# Create a list of results by finding all links "a" within the "collection-results" element.
results = soup.find(class_="collection-results").find_all(name = "a")

# Initiate counter
page_counter = 0

# For Loop
for page in results:

    # Specify a webpage
    page_url = page['href']

    # Get the webpage
    webpage = requests.get(page_url)

    # Create filename
    filename = "web_documents/{}.html".format(page_counter)

    # Save the webpage
    with open(file = filename, mode = "w", encoding = "utf-8") as html_file:
        html_file.write(webpage.text)

    # Update the page_counter
    page_counter = page_counter + 1

# Part 3 - Creating a Function to Wrap Everything Together

In [None]:
# To simplify how the code above is displayed and
# used, we will encapsulate the script within a
# "function".  There is a lot to learn about exactly
# how functions work, but that is beyond the scope
# of this workshop.  For this section, we'll focus
# on the simple and straightforward mechanics of
# functions.

# Functions Tutorials - https://www.w3schools.com/python/python_functions.asp

# For example, the following function ("FunctionName")
# takes two parameters.  In truth, you can define a
# function to take any number of parameters or none
# at all.  The function then does whatever operations
# you instruct.

def FunctionName(parameter1 , parameter2):
    # Add 500 to the first parameter, then add the second
    # parameter, and display the total.
    total = (parameter1 + 500) + parameter2
    print(total)

In [None]:
# Once the function has been defined, we can
# call or use that function just like any of
# the other functions we've used.

FunctionName(parameter1=500, parameter2=1000)

In [None]:
FunctionName(parameter1=141, parameter2=1024)

In [None]:
# Building from there, we can create a function
# that will encapsulate everything we've done so 
# far with web-scraping.

def scraper (starting_url , count):
    # Placeholder body: this version only shows the shape of the function
    # (its name and its two parameters).  The print() below just displays
    # an empty line so that the function has something to do.
    # insert appropriate lines of code here
    print()

In [None]:
# Putting it all together, the function below will
# use the starting_url we provide and the count we 
# provide to go to a website, collect all of the URLs
# for individual Press Release documents, then download
# and save individual documents.

# Import modules
import requests
from bs4 import BeautifulSoup

def scraper( starting_url , count):
    """Download every press release linked from one directory page.

    starting_url : address of the directory page to read.
    count        : number used for the first saved file; it goes up by
                   one for every document that is saved.

    Each document is written to "web_documents/<number>.html" (open(...)
    in "w" mode does not create folders, so "web_documents" must already
    exist).  The function does not return anything.
    """

    # Initiate counter
    page_counter = count

    # Get the directory webpage
    webpage = requests.get(url = starting_url)

    # Create the soup (the parser is named explicitly so BeautifulSoup
    # does not have to guess one)
    soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

    # Create a list of results by finding all links "a" within the
    # "collection-results" element.  href = True skips any "a" tag that
    # has no link, which would otherwise stop the loop with a KeyError.
    results = soup.find(class_="collection-results").find_all(name = "a", href = True)

    # For Loop
    for page in results:

        # Specify a webpage
        page_url = page['href']

        # Get the webpage
        webpage = requests.get(page_url)

        # Create filename
        filename = "web_documents/{}.html".format(page_counter)

        # Save the webpage ("with" closes the file automatically,
        # even if the write fails)
        with open(file = filename, mode = "w", encoding = "utf-8") as html_file:
            html_file.write(webpage.text)

        # Update the page_counter
        page_counter = page_counter + 1

In [None]:
# Lastly, call or execute the script!
scraper(starting_url="https://www.state.gov/press-releases/", count=0)

In [None]:
# But wait, there's more!!

# In addition to writing a function to execute a single
# set of commands, we can also set up the function in such
# a way that it repeats itself multiple times.  This is
# useful because now we can expand the function in a way
# that allows us to go to the "next page" on the Press
# Releases webpage and collect more and more documents.


# The following comment is an oversimplified example of
# a function executing some code and then calling itself
# again to repeat the same code.

# def scraper (parameters...):
#     code code code
#     code code code
#     scraper(parameters...)

# The following function has a fully fleshed out example 
# of this addition.


# Import modules
import requests
from bs4 import BeautifulSoup

def scraper( starting_url , count):
    """Download press releases page by page, following the "next" button.

    starting_url : address of the directory page to read.
    count        : number used for the first saved file; it goes up by
                   one for every document that is saved.

    Each document is written to "web_documents/<number>.html" (open(...)
    in "w" mode does not create folders, so "web_documents" must already
    exist).  The function does not return anything.
    """

    # Initiate counter
    page_counter = count

    # Get the directory webpage
    webpage = requests.get(url = starting_url)

    # Create the soup (the parser is named explicitly so BeautifulSoup
    # does not have to guess one)
    soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

    # Create a list of results by finding all links "a" within the
    # "collection-results" element.  href = True skips any "a" tag that
    # has no link, which would otherwise stop the loop with a KeyError.
    results = soup.find(class_="collection-results").find_all(name = "a", href = True)

    # For Loop
    for page in results:

        # Specify a webpage
        page_url = page['href']

        # Get the webpage
        webpage = requests.get(page_url)

        # Create filename
        filename = "web_documents/{}.html".format(page_counter)

        # Save the webpage ("with" closes the file automatically,
        # even if the write fails)
        with open(file = filename, mode = "w", encoding = "utf-8") as html_file:
            html_file.write(webpage.text)

        # Update the page_counter
        page_counter = page_counter + 1


#     The following block of code adds recursion to the function.
#     First -  It uses the soup to look for a "next page-numbers" tag (i.e. "next button").
#              soup.find(...) gives back None when there is no such tag.
#     Second - It checks the current value of page_counter.  If a next button exists and
#              the value is less than 30, it extracts the URL for the next page of Press
#              Releases results, calls the "scraper()" function again with that URL and
#              page_counter, and the WHOLE scraper function repeats itself.
#     Third -  If there is no next button, or page_counter is equal to or greater
#              than 30, the scraper() function stops.
#     Note:    The check happens only after a whole directory page has been saved.

    next_page = soup.find(class_="next page-numbers")

    if next_page is not None and page_counter < 30:
        scraper(starting_url = next_page['href'], count = page_counter)

In [None]:
# Same code as above, but with fewer comments.

# Import modules
import requests
from bs4 import BeautifulSoup

def scraper( starting_url , count):
    """Download press releases page by page, following the "next" button.

    starting_url : address of the directory page to read.
    count        : number used for the first saved file; it goes up by
                   one for every document that is saved.

    Documents are saved as "web_documents/<number>.html"; the folder must
    already exist.  After each directory page, the function calls itself
    on the next page while fewer than 30 documents have been saved.
    The function does not return anything.
    """

    # Initiate counter
    page_counter = count

    # Get the directory webpage
    webpage = requests.get(url = starting_url)

    # Create the soup (explicit parser, so BeautifulSoup does not guess)
    soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

    # Find all links "a" (that actually carry an href) within the
    # "collection-results" element.
    results = soup.find(class_="collection-results").find_all(name = "a", href = True)

    # For Loop
    for page in results:

        # Specify a webpage
        page_url = page['href']

        # Get the webpage
        webpage = requests.get(page_url)

        # Create filename
        filename = "web_documents/{}.html".format(page_counter)

        # Save the webpage (the file is closed automatically)
        with open(file = filename, mode = "w", encoding = "utf-8") as html_file:
            html_file.write(webpage.text)

        # Update the page_counter
        page_counter = page_counter + 1

    # Advance to the next page and repeat the process.
    # soup.find(...) gives back None when there is no "next" button.
    next_page = soup.find(class_="next page-numbers")

    if next_page is not None and page_counter < 30:
        scraper(starting_url = next_page['href'], count = page_counter)

In [None]:
# Execute the final function one last time.
scraper(starting_url="https://www.state.gov/press-releases/", count=0)

### Part 3 - Recap
The following block of code builds upon Workshop-1 and everything covered so far in this workshop's code.  Putting it all together, the following dense block of code should (at the time of writing) systematically collect the 30 most recent webpages from the US State Department's Press Releases webpage.

In [None]:
# import requests
# from bs4 import BeautifulSoup
# def scraper( starting_url , count):
#     website = starting_url
#     page_counter = count
#     webpage = requests.get(url = website)
#     soup = BeautifulSoup(markup = webpage.text)
#     results = soup.find(class_="collection-results").find_all(name = "a")
#     for page in results:
#         page_url = page['href']
#         webpage = requests.get(page_url)
#         filename = "web_documents/{}.html".format(page_counter)
#         html_file= open(file = filename, mode = "w", encoding="utf-8")
#         html_file.write(webpage.text)
#         html_file.close()
#         page_counter = page_counter + 1
#     if soup.find(class_="next page-numbers"):
#         next_page = soup.find(class_="next page-numbers")
#         next_url = next_page['href']
#         if page_counter < 30:
#             scraper(starting_url = next_url, count = page_counter)
# scraper(starting_url="https://www.state.gov/press-releases/", count=0)

# Part 4 - URL Parameters

When collecting data using web-scraping, another key feature of a website to pay attention to is how URLs are used to represent information.  In some cases, URLs are static representations of webpages.  However, increasingly on many websites, URLs contain various parameters that are dynamic.  Usually indicated by a "?", these parameters will often contain information like the keywords you used in a search, metadata about where you are being referred to or from, etc.  Some examples may help...

When using the search-box on the NYTimes website, you are directed to the following URL:

https://www.nytimes.com/search?query=covid

Following the word "...search" in the URL is one parameter-value pair: "query=covid".  This is, in effect, dynamically telling the NYTimes web server that you want the search results page for the keyword "covid".

On the search page itself, if you change some of the search filters, you will notice that you are directed to a new and evolved URL with even more parameters:

https://www.nytimes.com/search?dropmab=true&endDate=20200923&query=covid&sort=newest&startDate=20200916&types=article

Some of the new URL Parameters are probably easy to interpret.  Others, like "dropmab" might require a little exploration and experimentation to figure out what they mean.
- dropmab = true
- endDate = 20200923
- query = covid
- sort = newest
- startDate = 20200916
- types = article

Ultimately, being able to inspect and understand what these parameters are may give you another method for collecting information from websites...LET'S PRACTICE!

#### Start by visiting : https://www.nytimes.com/search

In [None]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
nytimes = 'https://www.nytimes.com/search'

# Get the webpage
webpage = requests.get(url = nytimes)

# Create the soup (the parser is named explicitly so BeautifulSoup does not have to guess one)
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

In [None]:
# First, identify an appropriate method (e.g. find_all(...) ) for identifying data on the page.
# In this case, I will find the HTML "class" that identifies the article-title for each search-result.
soup.find_all(class_="css-2fgx4k")

#### Use the NYTimes search to start a simple keyword search and inspect the URL
#### https://www.nytimes.com/search?query=covid

In [None]:
# Get the webpage, but this time include a params=... argument in the requests.get(...) function.
# The params=... argument requires a "dict" or "dictionary" form of information.

# Dict example:  { key : value }
# Dict example:  { "name" : "jeremy" }

# Dict example:  { key1 : value1 , key2 : value2 }
# Dict example:  { "first_name" : "jeremy" , "last_name" : "walker" }

# Specify a webpage
nytimes = 'https://www.nytimes.com/search'

# Get the webpage, with params
webpage = requests.get(
    url = nytimes,
    params = { "query" : "covid" } 
)

In [None]:
# Inspect the URL
webpage.url

In [None]:
# Create the soup (the parser is named explicitly so BeautifulSoup does not have to guess one)
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

# Find and display the targeted results...
soup.find_all(class_="css-2fgx4k")

#### Now, using the NYTimes search page, change the "date" and "type" options on the search-page
#### https://www.nytimes.com/search?dropmab=true&endDate=20200910&query=covid&sort=best&startDate=20200901&types=article

In [None]:
# Specify a webpage
nytimes = 'https://www.nytimes.com/search'


# Get the webpage, with params
webpage = requests.get(
    url = nytimes,
    params = { 
        "dropmab" : "true" ,
        "endDate" : "20200910" ,
        "query" : "covid" ,
        "sort" : "best" ,
        "startDate" : "20200901" ,
        "types" : "article" ,
    } 
)

In [None]:
# Inspect the URL
webpage.url

In [None]:
# Create the soup (the parser is named explicitly so BeautifulSoup does not have to guess one)
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

# Find and display the targeted results...
soup.find_all(class_="css-2fgx4k")

In [None]:
# EVERY WEBSITE IS DIFFERENT
# Some websites are flexible with required and optional parameters, formatting, etc...
# The example below is based on the prior code, but with some URL parameters missing

# Specify a webpage
nytimes = 'https://www.nytimes.com/search'



# Get the webpage, with params (some commented out)
webpage = requests.get(
    url = nytimes,
    params = {
#         "dropmab" : "true" ,
        "endDate" : "20200910" ,
        "query" : "covid" ,
#         "sort" : "best" ,
        "startDate" : "20200901" ,
#         "types" : "article" ,
    }
)

# Inspect the URL
print(webpage.url)
print()

# Create the soup (the parser is named explicitly so BeautifulSoup does not have to guess one)
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

# Find and display the targeted results...
soup.find_all(class_="css-2fgx4k")

#### Iterating through content (again!)

In [None]:
# Define list of keyword search terms
keywords_list = ["covid","coronavirus","pandemic","CDC"]

for keyword in keywords_list:
    print(keyword)

In [None]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
nytimes = 'https://www.nytimes.com/search'

# Define list of keywords
keywords_list = ["covid","coronavirus","pandemic","CDC"]

# Create empty list of results to add items to
results = []

# Iterate through the list of keywords and use requests.get(...) to collect article-items
# from each respective search results page.
for keyword in keywords_list:
    webpage = requests.get(
        url = nytimes,
        params = {
        "query" : keyword ,
        "types" : "article" ,
        "startDate" : "20200901" ,
        "endDate" : "20200910" ,
        }
    )

    print("Searching for keyword: {}".format(keyword))
    print(webpage.url)

    # Create the soup (the parser is named explicitly so BeautifulSoup does not have to guess one)
    soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

    # Find the targeted results and add them to the end of the results list.
    # (.extend(...) adds to the existing list instead of building a new list every time.)
    results.extend(soup.find_all(class_="css-2fgx4k"))

In [None]:
# Inspect the newly updated results object
results

### Part 4 - Practice

In [None]:
# Using the outline below, conduct your own URL parameter-based search using the following prompts

# keywords_list : define your own list of keywords to search

# params
# "types" : "article"
# "startDate" : "YYYYMMDD" (choose your own start date)
# "endDate" : "YYYYMMDD" (choose your own end date)

# soup - Using find_all(...) identify the HTML class_ that identifies the byline of each search result (i.e. the descriptive sentences that are directly under the article-titles)



# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
nytimes = 'https://www.nytimes.com/search'

# Define list of keywords
keywords_list = ["???","???","???","???"]

# Create empty list of results to add items to
results = []

# Iterate through the list of keywords and use requests.get(...) to collect article-items 
# from each respective search results page.
for keyword in keywords_list:
    webpage = requests.get(
        url = nytimes,
        ??? = { 
        "query" : keyword ,
        "???" : "???" ,
        "???" : "???" ,
        "???" : "???" ,
        } 
    )
    
    print("Searching for keyword: {}".format(keyword))
    print(webpage.url)
    
    # Create the soup
    soup = BeautifulSoup(markup = webpage.text)
    
    # Find and display the targetted results...
    results = results + soup.find_all(class_="css-2fgx4k")

In [None]:
# Inspect the results
results

# Part 5 - Selenium WebDriver
Not all websites contain "static" content.  In many cases, websites will dynamically generate content using Javascript and a variety of adjacent tools.  If you think about websites like Facebook or LinkedIn that allow you to scroll "infinitely" down, those are examples of dynamic content.  This type of content CAN NOT be captured by simply using the default Requests or BeautifulSoup packages.  We have to use an automated-browser of some form in order to scrape dynamic content.

Note: this section introduces multiple complex additions to web-scraping processes.  It may be challenging to get everything to work on your computer.  It may be challenging to understand the tools and methods overall.  It may be challenging.  Stick with it though and you'll get there!

Core Python Selenium Module:
- https://selenium-python.readthedocs.io/
- __(Installation Guide)__ https://selenium-python.readthedocs.io/installation.html#installation

Two Unofficial Guides (many more exist though)
- https://towardsdatascience.com/web-scraping-using-selenium-python-8a60f4cf40ab
- https://www.scrapingbee.com/blog/selenium-python/

### Import Selenium
You may receive an error if you do not have the Python Selenium module installed (it is not included with Anaconda by default).  To install it, you will need to launch the "Anaconda Prompt" on your computer and type the following into the command line:

_conda install selenium_

For further info on this, please see the documentation: https://docs.anaconda.com/anaconda/user-guide/tasks/install-packages/

In [None]:
# Import BeautifulSoup4
from bs4 import BeautifulSoup

# From the selenium package, we need to import the webdriver and Keys functions.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

### Create "driver" object and start driving the Chromedriver using Python commands

In [None]:
# Create a driver object using webdriver.Chrome(...).  This is the object that
# will be used to steer and command a standalone version of Chrome.

# NOTE(review): the executable_path=... argument belongs to older Selenium
# releases; newer Selenium 4 releases are understood to expect a Service
# object (or no path at all) instead -- confirm against the installed version.

# This will launch an independent Chrome browser
driver = webdriver.Chrome( executable_path="chromedriver.exe")
# Wait up to 5 seconds for an element to appear before a lookup gives up.
driver.implicitly_wait(5)
driver.set_window_size(1200, 1000)  # forces the Chrome window to a specific resolution

In [None]:
# Tell Chrome to go to a specific webpage
driver.get(url="https://www.nytimes.com")

In [None]:
# Inspect current URL
driver.current_url

In [None]:
# Begin isolating and targeting elements of the webpage.
# In this case, find the "Search" icon.
# find_element(By.CLASS_NAME, ...) is used because the older
# find_element_by_class_name(...) shortcut was removed in Selenium 4.3;
# this form works in both older and newer versions of Selenium.
from selenium.webdriver.common.by import By

search_button = driver.find_element(By.CLASS_NAME, "css-fnhm75")

In [None]:
# Once identified, driver can "click" on the targetted element on screen
search_button.click()

In [None]:
# Identify and select the text-field for the search box.
# find_element(By.CLASS_NAME, ...) replaces the older
# find_element_by_class_name(...) shortcut, which was removed in Selenium 4.3.
from selenium.webdriver.common.by import By

search_field = driver.find_element(By.CLASS_NAME, "css-1axrnfw")

In [None]:
# Use .send_keys(...) to type letters into the search_field
search_field.send_keys("covid")

In [None]:
# Once text is entered in the search_field, send an "ENTER" command
search_field.send_keys(Keys.ENTER)

In [None]:
# Inspect current URL
driver.current_url

In [None]:
# Create soup and extract results
# (the parser is named explicitly so BeautifulSoup does not have to guess one)
soup = BeautifulSoup( markup = driver.page_source, features = "html.parser" )
results = soup.find_all(class_="css-2fgx4k")
results

In [None]:
# Close the browser
# driver.quit() closes every browser window AND shuts down the chromedriver
# program running in the background; driver.close() only closes the
# current window.
driver.quit()

### Part 5 - ONE GIANT EXAMPLE
The following block of code represents a sequence that uses Selenium WebDriver to systematically collect the first 30-40 results from the State Department's Press Releases webpages.  It's a lot to take in all at once, but nearly all of the key functional components are drawn from other material covered in this workshop series.  Take your time when trying to read through it and understand it.

Note: This does not necessarily represent the most efficient way to write and execute the code for the task at hand.  It is structured so as to be as explanatory and as instructional as possible.  As your skills develop, you will likely find many ways to make this code more succinct and efficient.

In [None]:
# Import BeautifulSoup4
from bs4 import BeautifulSoup

# Import Selenium webdriver
from selenium import webdriver

# Import a few utilities
import time
import pandas as pd

# Create an empty dataframe to store results in
data = pd.DataFrame(columns=["date","title","url","content"])

# Define starting base_url
base_url = "https://www.state.gov/press-releases/"

# Initiate a webdriver using chromedriver
driver = webdriver.Chrome( executable_path="chromedriver.exe")
driver.implicitly_wait(1)
driver.set_window_size(1000, 1200)

# Go to starting press release directory page ()
driver.get( url = base_url )

In [None]:
# Display press release directory URL
print("URL Directory Page: {}".format(driver.current_url))
print()

In [None]:
# Use Beautiful Soup to find every "collection-result" element on the
# directory page; each one contains the link to an individual press release.
# (The parser is named explicitly so BeautifulSoup does not have to guess one.)
soup = BeautifulSoup(markup = driver.page_source, features = "html.parser")
collection_results = soup.find_all(class_="collection-result")
collection_results

In [None]:
# Build the list of links in one step: for every item in collection_results,
# take the first "a" tag inside it and keep its "href" (the link address).
results = [entry.find("a")["href"] for entry in collection_results]

results

In [None]:
# Go to the first result
driver.get(results[0])

In [None]:
# Iterate through the results and use the driver to go to each link and harvest data
for i in results:
    # go to press release document
    driver.get(i)

    # create soup from the document page
    # (the parser is named explicitly so BeautifulSoup does not have to guess one)
    soup = BeautifulSoup(markup = driver.page_source, features = "html.parser")

    # create objects for each piece of information to be stored in the data object
    url = i
    title = soup.find(class_="featured-content__headline").text
    date = soup.find(class_="article-meta__publish-date").text
    content = soup.find(class_="entry-content").text

    # Add the harvested information to the data object as a new row.
    # data.loc[len(data)] points at the first unused row number; the values
    # are listed in the same order as the columns ("date","title","url","content").
    # (The older data.append(...) method was removed in pandas 2.0.)
    data.loc[len(data)] = [date, title, url, content]

    # display the trimmed text of the document-title
    print("Doc Title: {}".format( title.strip()) )

In [None]:
# Send the driver back to the base_url, representing the press-release directory
driver.get( url = base_url )

In [None]:
# Inspect the data
data.head()

In [None]:
# Do a little data cleanup
data["title"] = data["title"].str.strip()
data["content"] = data["content"].str.strip()
data["date"] = pd.to_datetime(data["date"])

In [None]:
# Inspect the data
data.head()

In [None]:
# Check to see if there is a "next button" in the directory
# find_elements(...) (plural) gives back a list: empty when there is no
# "next button", so the check does not stop with an error on the last page.
# (The older find_element_by_css_selector(...) shortcut was removed in Selenium 4.3.)
from selenium.webdriver.common.by import By

next_buttons = driver.find_elements(By.CSS_SELECTOR, "a.next.page-numbers")

# If a next button exists, its link becomes the new base_url;
# otherwise base_url is left unchanged.
if next_buttons:
    base_url = next_buttons[0].get_attribute("href")
base_url

In [None]:
# Send the driver to the newly updated base_url
driver.get( url = base_url )

In [None]:
# Close the driver
# driver.quit() closes every browser window AND shuts down the chromedriver
# program running in the background; driver.close() only closes the
# current window.
driver.quit()

### Part 5 - ONE GIANT EXAMPLE (ALTOGETHER)

In [None]:
# Import BeautifulSoup4
from bs4 import BeautifulSoup

# Import Selenium webdriver, the locator-strategy constants (By), and the
# exception Selenium raises when an element cannot be found
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# Import a few utilities
import time
import pandas as pd

# Create an empty dataframe to store results in
data = pd.DataFrame(columns=["date","title","url","content"])

# Define starting base_url
base_url = "https://www.state.gov/press-releases/"

# Initiate a webdriver using chromedriver
# NOTE: Selenium 4.10+ no longer accepts executable_path; on those versions pass
# service=Service("chromedriver.exe") (from selenium.webdriver.chrome.service) instead.
driver = webdriver.Chrome( executable_path="chromedriver.exe")
driver.implicitly_wait(1)
driver.set_window_size(1000, 1200)

# Establish counter that will keep track of how many results have been collected
counter = 0

# try/finally guarantees the browser is shut down even if a page fails to load
# or an expected element is missing part-way through the scrape
try:
    # Create a while loop (https://www.tutorialspoint.com/python/python_while_loop.htm)
    # So long as counter is less than 30, the process will repeat. Each pass
    # harvests a whole directory page, so the final total can end up above 30.
    while counter < 30:
        # Go to the current press release directory page
        driver.get( url = base_url )

        # Display press release directory URL
        print("URL Directory Page: {}".format(driver.current_url))
        print()

        # Use Beautiful soup to find all of the links to individual press releases
        # and store the links in a "results" object. The parser is named explicitly
        # so the parse is the same on every machine and bs4 does not warn.
        soup = BeautifulSoup(markup=driver.page_source, features="html.parser")
        collection_results = soup.find_all(class_="collection-result")

        results = [item.find("a")["href"] for item in collection_results]
        print("List of links to individual press release documents:")
        print(results)
        print()

        # Iterate through the results and use the driver to go to each link and harvest data
        for link in results:
            # go to press release document
            driver.get(link)

            # create soup from the document page
            soup = BeautifulSoup(markup=driver.page_source, features="html.parser")

            # create objects for each piece of information to be stored in the data object
            url = link
            title = soup.find(class_="featured-content__headline").text
            date = soup.find(class_="article-meta__publish-date").text
            content = soup.find(class_="entry-content").text

            # Append harvested information to the data object.
            # DataFrame.append() was removed in pandas 2.0, so the new record is
            # wrapped in a one-row DataFrame and attached with pd.concat() instead.
            new_row = pd.DataFrame([{"date":date,"title":title,"url":url,"content":content}])
            data = pd.concat([data, new_row], ignore_index=True)

            # display the trimmed text of the document-title
            print("Doc Title: {}".format( title.strip()) )

        # Update the counter to reflect the current number of rows in the data object
        counter = data.shape[0]
        print()
        print("Total documents collected so far...{}".format(counter))
        print()

        # Send the driver back to the base_url, representing the press-release directory
        driver.get( url = base_url )

        # Try to find a "next button" and extract the URL for the next page in the directory.
        # Only the "element not found" case ends the loop; any other error is
        # allowed to surface instead of being silently swallowed.
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, "a.next.page-numbers")
            next_page = next_button.get_attribute("href")
            base_url = next_page
        except NoSuchElementException:
            break
finally:
    # Shut the driver down; quit() closes every window and ends the WebDriver session
    driver.quit()

# Do a little data cleanup
data["title"] = data["title"].str.strip()
data["content"] = data["content"].str.strip()
data["date"] = pd.to_datetime(data["date"])

In [None]:
# Inspect the first few rows of your collected data
data.head()

In [None]:
# Save your work! Writes the dataframe to "state_dept.csv" without the row-index column
data.to_csv("state_dept.csv", index=False)

### Part 6 - Practice!
This practice exercise is much simpler than the example above. Overall, you have a few specific objectives:
- First, make sure you can successfully import the Selenium webdriver, create a driver object to launch a browser, then go to the NYTimes search page
- Second, use Python/Selenium to type search terms into the search field
- Third, change the "Date Range" filter to "Yesterday"
- Fourth, use BeautifulSoup to create a soup of the search-results page
- Fifth, identify and extract the headline tags for each article on screen

In [None]:
# Import BeautifulSoup4
from bs4 import BeautifulSoup

# Import Selenium webdriver
from selenium import webdriver

# Import the helpers the practice cells rely on:
#   By   - locator strategies for driver.find_element(By.CLASS_NAME, ...) etc.
#   Keys - special keys (CONTROL, DELETE, ENTER, ...) for .send_keys(...)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Define starting base_url
base_url = "https://www.nytimes.com/search"

# Initiate a webdriver using chromedriver
# NOTE: Selenium 4.10+ no longer accepts executable_path; on those versions pass
# service=Service("chromedriver.exe") (from selenium.webdriver.chrome.service) instead.
driver = webdriver.Chrome( executable_path="chromedriver.exe")
driver.implicitly_wait(1)
# driver.set_window_size(1000, 1200) # uncomment this line and edit if needed

# Send the driver to the NYT search page
driver.get(url = base_url)

#### Type text into the search field

In [None]:
# Find the search text field by its class name
# (find_element_by_class_name was removed in Selenium 4.3; pass a By strategy
# to find_element instead -- this form also works on Selenium 3)
from selenium.webdriver.common.by import By

search_field = driver.find_element(By.CLASS_NAME, "???")

In [None]:
# Type a throwaway keyword such as "orange" into the search box
# (replace "????" with the keyword of your choice)
search_field.send_keys("????")

In [None]:
# Send CTRL+A (i.e. "select all") to the search box to highlight the text typed so far
search_field.send_keys(Keys.CONTROL + "a")

In [None]:
# Use the Keys.DELETE command to delete the selected text
# (replace ??? with the name of the Keys constant)
search_field.send_keys(Keys.???)

In [None]:
# Use .send_keys(...) again, this time to type the keyword "covid"
# (fill in both the method name and the keyword)
search_field.???("???")

In [None]:
# Use .send_keys(...) and Keys.ENTER to submit the search term
# (fill in the method name and the Keys constant)
search_field.???(Keys.???)

#### Select the "Date Range" menu button and click on "Yesterday"

In [None]:
# Find the "Date Range" element.

# For the following practice exercises, there are many ways you may be able to
# identify the element: by class name, css selector, or xpath.
# (The older find_element_by_* helpers were removed in Selenium 4.3; pass a
# By strategy to find_element instead -- this form also works on Selenium 3.)
from selenium.webdriver.common.by import By

# driver.find_element(By.CLASS_NAME, ...)
# driver.find_element(By.CSS_SELECTOR, ...)
# driver.find_element(By.XPATH, ...)

# Hint: For this button, there is a <div> tag that has a role="form" as its parent.
# You can use the class="..." from that <div> to identify the button.
date = driver.find_element(By.CLASS_NAME, "???")

In [None]:
# Use the .click() method on the "Date Range" element found above
# (replace ??? with the method name)
date.???()

In [None]:
# Find the element for the "Yesterday" option in the "Date Range" menu
yesterday = driver.???("???")

In [None]:
# Click on the "Yesterday" element found above (replace ??? with the method name)
yesterday.???()

#### Create a soup and extract a list of article-titles

In [None]:
# Create a soup object using the driver.page_source for the HTML markup
# (replace ???.??? with the object and attribute that hold the page's HTML)
soup = BeautifulSoup(markup=???.???)

In [None]:
# In the soup, use .find_all(...) to identify the list of headline tags.
# It is sufficient to simply capture the <li> or <h4> tags that contain
# the headline text and info.
# Fill in the tag name and the class value you find by inspecting the page.

titles = soup.find_all(name = "???", class_="???")

In [None]:
# Display the list of headline tags captured in "titles"
titles

#### Close the driver!

In [None]:
# Finally, close the driver (replace ??? with the method that shuts the browser down).
driver.???()