# RDS@GSU - Getting Started with Web Scraping

#### Copyright + References

In [1]:
# The content in this notebook was developed by Jeremy Walker.
# All sample code and notes are provided under a Creative Commons
# ShareAlike license.

# Official Copyright Rules / Restrictions / Privileges
# Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
# https://creativecommons.org/licenses/by-sa/4.0/

### Content Notes

In [2]:
# All of this content was updated in September 2020.  Depending
# on when this content is accessed, and due to the nature of how 
# individual websites change their code, the code in this notebook
# may not function properly in the future.

# Part 0 - Ensure Required Modules Load

In [3]:
# Requests Module
# https://requests.readthedocs.io/en/master/user/quickstart/
import requests

# BeautifulSoup Module
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4 import BeautifulSoup

# Part 1 - Getting a webpage and saving it locally

In [5]:
# Specify a webpage
state_dept = 'https://www.state.gov/press-releases/'

In [6]:
state_dept

'https://www.state.gov/press-releases/'

In [7]:
# Using the Requests module, the "get" function
# allows us to use Python to fetch the webpage.
# The timeout (in seconds) stops the cell from waiting
# forever if the server never responds.
webpage = requests.get(url = state_dept, timeout = 30)

In [8]:
# The webpage object has various attributes
# and methods.  The ".text" will directly show
# the unformatted HTML code for the webpage
webpage.text

'<!doctype html>\n<html lang="en-US">\n<head>\n\t<meta charset="UTF-8">\n\t<meta name="viewport" content="width=device-width, initial-scale=1">\n\t<meta name="msapplication-tap-highlight" content="no" />\n\t<link rel="profile" href="http://gmpg.org/xfn/11">\n\t\n<!-- Google Tag Manager for WordPress by gtm4wp.com -->\n<script data-cfasync="false" data-pagespeed-no-defer type="text/javascript">//<![CDATA[\n\tvar gtm4wp_datalayer_name = "dataLayer";\n\tvar dataLayer = dataLayer || [];\n//]]>\n</script>\n<!-- End Google Tag Manager for WordPress by gtm4wp.com -->\n\t<!-- This site is optimized with the Yoast SEO plugin v14.8.1 - https://yoast.com/wordpress/plugins/seo/ -->\n\t<title>Press Releases - United States Department of State</title>\n\t<meta name="robots" content="index, follow" />\n\t<meta name="googlebot" content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" />\n\t<meta name="bingbot" content="index, follow, max-snippet:-1, max-image-preview:larg

In [10]:
# 1) Open the Python connection to the file
# 2) Write contents to the file
# 3) Close the Python connection to the file
#    (the "with" statement does this step automatically,
#    even if the write in step 2 fails)

# with open(file = "REPLACE THIS PART", mode = "w", encoding = "utf-8") as html_file:
# with open(file = "directory_for_file_to_save.filetype", mode = "w", encoding = "utf-8") as html_file:

with open(file = "web_documents/press_release_directory.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(webpage.text)

### Part 1 - Recap

In [9]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
state_dept = 'https://www.state.gov/press-releases/'

# Get the webpage (give up after 30 seconds instead of waiting forever)
webpage = requests.get(url = state_dept, timeout = 30)

# Save the webpage; "with" closes the file even if the write fails
with open(file = "web_documents/press_release_directory.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(webpage.text)

### Part 1 - Practice
The code below uses the same code as above, but with ??? replacing many key areas. You can practice by simply replicating what is above.  Or you can also try using a different URL, saving the file with a different name, or even changing the names of individual objects/variables.

In [None]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
state_dept = '???'

# Get the webpage and assign it to a "webpage" object
??? = requests.get(url = ???)

# Save the webpage - specify the filename
html_file = open(file = "web_documents/??????????.html", mode = "w", encoding = "utf-8")

# Save the webpage - write the webpage.text and then close the file
html_file.write(????.???)
html_file.close()

# Part 2 - Navigating HTML code
### Objective - Extracting the list of links for each individual press release.

In [None]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
state_dept = 'https://www.state.gov/press-releases/'

# Get the webpage (give up after 30 seconds instead of waiting forever)
webpage = requests.get(url = state_dept, timeout = 30)

# Save the webpage; "with" closes the file even if the write fails
with open(file = "web_documents/press_release_directory.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(webpage.text)

In [11]:
# Using BeautifulSoup, we can transform "webpage.text"
# into an object that is organized in a way that is
# easy to parse, navigate, and dissect. The resulting
# object is commonly referred to as "soup" and that
# is also the most common placeholder name.

# Naming the parser explicitly ("html.parser" ships with Python)
# avoids BeautifulSoup's "no parser was explicitly specified"
# warning and keeps the result from depending on which
# optional parsers happen to be installed.
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

In [12]:
# Although the soup looks like HTML, it actually
# represents a bag of "Tag" objects.  These contain
# all of the data and metadata from the webpage.

soup

<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="no" name="msapplication-tap-highlight"/>
<link href="http://gmpg.org/xfn/11" rel="profile"/>
<!-- Google Tag Manager for WordPress by gtm4wp.com -->
<script data-cfasync="false" data-pagespeed-no-defer="" type="text/javascript">//<![CDATA[
	var gtm4wp_datalayer_name = "dataLayer";
	var dataLayer = dataLayer || [];
//]]>
</script>
<!-- End Google Tag Manager for WordPress by gtm4wp.com -->
<!-- This site is optimized with the Yoast SEO plugin v14.8.1 - https://yoast.com/wordpress/plugins/seo/ -->
<title>Press Releases - United States Department of State</title>
<meta content="index, follow" name="robots"/>
<meta content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" name="googlebot"/>
<meta content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" name="bingbot"/>
<link href="h

In [13]:
# Within the soup, the ".find()" method returns
# the first tag matching the specified name or parameters.

# Example: finding the first "meta" tag in the soup
soup.find(name = "meta")

<meta charset="utf-8"/>

In [14]:
# ".find_all()" is similar, but returns a
# list of all the tags with matching criteria.

soup.find_all(name = "meta")

[<meta charset="utf-8"/>,
 <meta content="width=device-width, initial-scale=1" name="viewport"/>,
 <meta content="no" name="msapplication-tap-highlight"/>,
 <meta content="index, follow" name="robots"/>,
 <meta content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" name="googlebot"/>,
 <meta content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" name="bingbot"/>,
 <meta content="en_US" property="og:locale"/>,
 <meta content="article" property="og:type"/>,
 <meta content="Press Releases - United States Department of State" property="og:title"/>,
 <meta content="https://www.state.gov/press-releases/" property="og:url"/>,
 <meta content="United States Department of State" property="og:site_name"/>,
 <meta content="2020-04-06T19:10:28+00:00" property="article:modified_time"/>,
 <meta content="https://www.state.gov/wp-content/uploads/2019/12/dos_seal.png" property="og:image"/>,
 <meta content="1200" property="og:image:width"/>,

In [15]:
# Find all "p" tags in the soup.
soup.find_all(name="p")

[<p class="nav__policies-header">Policy <span class="nav__policies-header-decor">Issues</span></p>,
 <p class="nav__countries-header">Countries &amp; <span class="nav__countries-header-decor">Areas</span></p>,
 <p class="nav__about-header"><span class="nav__about-header-decor">About</span></p>,
 <p class="nav__about-list-header">Mission</p>,
 <p class="nav__about-list-header">Leadership</p>,
 <p class="nav__about-list-header">Administrative</p>,
 <p class="nav__about-list-header">History</p>,
 <p class="nav__about-list-header">Directories</p>,
 <p class="nav__about-list-header">Visit</p>,
 <p class="nav__bureaus-offices-header js-panel-header">Bureaus &amp; <span class="nav__bureaus-offices-header-decor">Offices</span></p>,
 <p class="collection-header__body">
 
 						The Office of the Spokesperson releases statements, media notes, notices to the press and fact sheets on a daily basis. These are posted to our website as they are released throughout the day.
 					</p>,
 <p class="coll

In [16]:
# Find all "li" tags in the soup.
soup.find_all(name="li")

[<li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-7457" id="menu-item-7457"><a href="https://www.state.gov/press/">Press</a></li>,
 <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-7458" id="menu-item-7458"><a href="https://www.state.gov/business/">Business</a></li>,
 <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-7459" id="menu-item-7459"><a href="https://www.state.gov/employees/">Employees</a></li>,
 <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-7460" id="menu-item-7460"><a href="https://www.state.gov/job-seekers/">Job Seekers</a></li>,
 <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-7461" id="menu-item-7461"><a href="https://www.state.gov/students/">Students</a></li>,
 <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-7462" id="menu-item-7462"><a href="https://www.state.gov/travelers/">Travelers</a></li>,


In [17]:
# In addition to the tag names, you can find
# elements or tags according to their attributes
# such as "class", "id", "href", etc...

# Find the first tag in the soup with the class "collection-header"
# (".find()" returns a single tag; use ".find_all()" for every match)
soup.find(class_="collection-header")

<header class="collection-header">
<div class="frame">
<!-- Eyebrow -->
<!-- Title -->
<h1 class="stars-above collection-header__title">

					Press Releases
				</h1>
<!-- Description -->
<p class="collection-header__body">

						The Office of the Spokesperson releases statements, media notes, notices to the press and fact sheets on a daily basis. These are posted to our website as they are released throughout the day.
					</p>
<div class="page-header__actions">
<a class="button button--outline-white button--arrow button--inline-block" href="http://service.govdelivery.com/service/subscribe.html?code=USSTATEBPA_8" target="_blank">
<div class="button__content">
		Subscribe<span class="icon-svg">
<svg role="presentation" version="1.1" viewbox="0 0 985 512" xmlns="http://www.w3.org/2000/svg">
<defs>
<style>
		.cls-1{fill:#c1a783}
	</style>
</defs>
<path class="cls-1" d="M688.939 48.406l36.644-36.644 244.047 244.047-36.644 36.643-244.047-244.047z"></path>
<path class="cls-1" d="M688.858 

In [18]:
# "find", "find_all", and the other methods of BeautifulSoup4
# objects can be chained one after another.

# Step 1: locate the first tag with the class "collection-header".
# Step 2: within that tag, collect all "p" tags.
(
    soup
    .find(class_="collection-header")
    .find_all(name="p")
)

[<p class="collection-header__body">
 
 						The Office of the Spokesperson releases statements, media notes, notices to the press and fact sheets on a daily basis. These are posted to our website as they are released throughout the day.
 					</p>]

### Hands on...

In [30]:
# Attempt to identify and retrieve the list of individual press
# release documents that are listed on the Press Releases webpage.

# One attempt: every tag that has the class "collection-result__link".
soup.find_all(class_="collection-result__link")

[<a class="collection-result__link" href="https://www.state.gov/chiles-independence-day-2/">
 
 		Chile’s Independence Day
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/united-states-republic-of-lithuania-memorandum-of-understanding-on-5g-security/">
 
 		United States – Republic of Lithuania Memorandum of Understanding on 5G Security
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-designates-hizballah-companies-and-official/">
 
 		The United States Designates Hizballah Companies and Official
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-sanctions-cyber-actors-backed-by-iranian-intelligence-ministry/">
 
 		The United States Sanctions Cyber Actors Backed by Iranian Intelligence Ministry
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-announces-an-additional-5-million-for-education-cannot-wait/">
 
 		The United States Announces an Ad

### One possible solution

In [31]:
# Step 1: locate the first tag with the class "collection-results".
# Step 2: within that tag, collect all "li" tags.
(
    soup
    .find(class_="collection-results")
    .find_all(name="li")
)

[<li class="collection-result">
 <!-- add label if a report custom post in inculded in collection-->
 <p class="collection-result__date">Press Statement</p>
 <a class="collection-result__link" href="https://www.state.gov/chiles-independence-day-2/">
 
 		Chile’s Independence Day
 	</a>
 <div class="collection-result-meta">
 <span>Michael R. Pompeo</span>
 <span>September 17, 2020</span>
 </div>
 </li>,
 <li class="collection-result">
 <!-- add label if a report custom post in inculded in collection-->
 <p class="collection-result__date">Media Note</p>
 <a class="collection-result__link" href="https://www.state.gov/united-states-republic-of-lithuania-memorandum-of-understanding-on-5g-security/">
 
 		United States – Republic of Lithuania Memorandum of Understanding on 5G Security
 	</a>
 <div class="collection-result-meta">
 <span>September 17, 2020</span>
 </div>
 </li>,
 <li class="collection-result">
 <!-- add label if a report custom post in inculded in collection-->
 <p class="coll

In [32]:
# Step 1: locate the first tag with the class "collection-results".
# Step 2: within that tag, collect all "a" tags.
(
    soup
    .find(class_="collection-results")
    .find_all(name="a")
)

[<a class="collection-result__link" href="https://www.state.gov/chiles-independence-day-2/">
 
 		Chile’s Independence Day
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/united-states-republic-of-lithuania-memorandum-of-understanding-on-5g-security/">
 
 		United States – Republic of Lithuania Memorandum of Understanding on 5G Security
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-designates-hizballah-companies-and-official/">
 
 		The United States Designates Hizballah Companies and Official
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-sanctions-cyber-actors-backed-by-iranian-intelligence-ministry/">
 
 		The United States Sanctions Cyber Actors Backed by Iranian Intelligence Ministry
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-announces-an-additional-5-million-for-education-cannot-wait/">
 
 		The United States Announces an Ad

In [33]:
# This last search returns all of the links to the
# individual press releases, so keep the result set
# in its own object: "results"

results = (
    soup
    .find(class_="collection-results")
    .find_all(name="a")
)

In [34]:
results

[<a class="collection-result__link" href="https://www.state.gov/chiles-independence-day-2/">
 
 		Chile’s Independence Day
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/united-states-republic-of-lithuania-memorandum-of-understanding-on-5g-security/">
 
 		United States – Republic of Lithuania Memorandum of Understanding on 5G Security
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-designates-hizballah-companies-and-official/">
 
 		The United States Designates Hizballah Companies and Official
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-sanctions-cyber-actors-backed-by-iranian-intelligence-ministry/">
 
 		The United States Sanctions Cyber Actors Backed by Iranian Intelligence Ministry
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-announces-an-additional-5-million-for-education-cannot-wait/">
 
 		The United States Announces an Ad

#### Objective - Recap

In [None]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
state_dept = 'https://www.state.gov/press-releases/'

# Get the webpage (give up after 30 seconds instead of waiting forever)
webpage = requests.get(url = state_dept, timeout = 30)

# Save the webpage; "with" closes the file even if the write fails
with open(file = "web_documents/press_release_directory.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(webpage.text)

# Create the soup, naming the parser explicitly so the result does
# not depend on which optional parsers are installed
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

# Create a list of results by finding all links "a" within the "collection-results" element.
results = soup.find(class_="collection-results").find_all(name = "a")

#### Part 2 - Practice

In [None]:
# define an object that will go to a specific URL and then get that webpage
??? = 'https://www.state.gov/press-releases/'
webpage = requests.get( url = ???)

In [None]:
# display the webpage's text
webpage.???

In [None]:
# create a 'soup' object from the webpage's text
soup = BeautifulSoup(markup = ???.text)

In [None]:
# Identify an appropriate way to find the hyperlinks to the individual documents,
# returned as a list of "a" tags (each one holds its URL in the "href" attribute)

soup.find(???).find_all(???)

In [None]:
# Create the results object by finding links from within the soup
results = soup.find(???).find_all(???)

In [None]:
# Display the results
results

# Part 3 - Extracting Targeted Information
### Objective - Extracting the data and metadata, specifically hyperlinks, from individual tag items

In [35]:
# Run the following steps to generate a results object containing a list of
# all documents linked on the Press Releases webpage.
state_dept = 'https://www.state.gov/press-releases/'
# The timeout (in seconds) stops the request from waiting forever.
webpage = requests.get(url = state_dept, timeout = 30)
# Naming the parser explicitly keeps the result from depending on
# which optional parsers are installed.
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")
results = soup.find(class_="collection-results").find_all(name = "a")

# Display the results
results

[<a class="collection-result__link" href="https://www.state.gov/chiles-independence-day-2/">
 
 		Chile’s Independence Day
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/united-states-republic-of-lithuania-memorandum-of-understanding-on-5g-security/">
 
 		United States – Republic of Lithuania Memorandum of Understanding on 5G Security
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-designates-hizballah-companies-and-official/">
 
 		The United States Designates Hizballah Companies and Official
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-sanctions-cyber-actors-backed-by-iranian-intelligence-ministry/">
 
 		The United States Sanctions Cyber Actors Backed by Iranian Intelligence Ministry
 	</a>,
 <a class="collection-result__link" href="https://www.state.gov/the-united-states-announces-an-additional-5-million-for-education-cannot-wait/">
 
 		The United States Announces an Ad

In [36]:
# Create a new object from the first item
# in the list of results.
first_result = results[0]

In [37]:
# Display the tag object to make sure you 
# know what type of info you are looking at.
first_result

<a class="collection-result__link" href="https://www.state.gov/chiles-independence-day-2/">

		Chile’s Independence Day
	</a>

In [38]:
# Inspect the first_result data type.
type(first_result)

bs4.element.Tag

In [39]:
# Display the text associated with the
# first_result tag.
first_result.text

'\n\n\t\tChile’s Independence Day\n\t'

In [40]:
# Display the metadata or attributes.  This is information
# about the tag that is not visibly displayed on the webpage.
first_result.attrs

{'href': 'https://www.state.gov/chiles-independence-day-2/',
 'class': ['collection-result__link']}

In [41]:
# Using the information above, you can extract the value
# of an individual attribute from a tag.  For example
# first_result contains an html/css "class" attribute.
# Note: "class" comes back as a list of strings rather than
# a single string, because a tag can have several classes.
first_result['class']

['collection-result__link']

In [42]:
# The same process works for getting the hyperlink ("href")
first_result['href']

'https://www.state.gov/chiles-independence-day-2/'

In [43]:
# Now that we have isolated the URL for a specific page
# you can repeat the process of using Requests to "get"
# the webpage.  In the example below, the "first_webpage"
# object is created by using Requests to get the webpage
# using the "href" attribute from the first_result tag object.
# The timeout (in seconds) stops the request from waiting forever.
first_webpage = requests.get(url=first_result['href'], timeout=30)

In [44]:
# Finally, just like before, we can save the first_webpage
# to the computer for use later on.

# 1) Open the Python connection to the file
# 2) Write contents to the file
# 3) Close the Python connection to the file
#    (the "with" statement does this step automatically,
#    even if the write in step 2 fails)

with open(file = "web_documents/first_webpage.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(first_webpage.text)

#### Objective - Recap

In [None]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
state_dept = 'https://www.state.gov/press-releases/'

# Get the webpage (give up after 30 seconds instead of waiting forever)
webpage = requests.get(url = state_dept, timeout = 30)

# Save the webpage; "with" closes the file even if the write fails
with open(file = "web_documents/press_release_directory.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(webpage.text)

# Create the soup, naming the parser explicitly so the result does
# not depend on which optional parsers are installed
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

# Create a list of results by finding all links "a" within the "collection-results" element.
results = soup.find(class_="collection-results").find_all(name = "a")

# Create an object (first_result) from the first item in the "results" list
first_result = results[0]

# Get the webpage for the first_result
first_webpage = requests.get(url=first_result['href'], timeout = 30)

# Save the new webpage to a new file
with open(file = "web_documents/first_webpage.html", mode = "w", encoding = "utf-8") as html_file:
    html_file.write(first_webpage.text)

### Part 3 - Practice
The code below uses the same code as above, but with ??? replacing many key areas. You can practice by simply replicating what is above.  Or you can also try using a different URL, saving the file with a different name, or even changing the names of individual objects/variables.

The main goal is to (A) get a webpage that has a list ("li") of items, (B) extract the URLs ("a") as a list of results from said webpage, and then (C) get and save a new webpage using one of the links from the list of results.

In [None]:
# Run the following steps to generate a results object containing a list of
# all documents linked on the Press Releases webpage.
state_dept = 'https://www.state.gov/press-releases/'
# The timeout (in seconds) stops the request from waiting forever.
webpage = requests.get(url = state_dept, timeout = 30)
# Naming the parser explicitly keeps the result from depending on
# which optional parsers are installed.
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")
results = soup.find(class_="collection-results").find_all(name = "a")

# Display the results
results

In [None]:
# Define an object that is a specific item (1, 2, 3, or 4, etc...) in results.
next_result = results[???]

In [None]:
# Display the next_result
next_result

In [None]:
# Display the datatype, type(), of next_result
type(next_result)

In [None]:
# Display the contents of the next_result object
next_result.???

In [None]:
# Display the text of the next_result object
???.???

In [None]:
# Display the attributes (attrs) of the next_result object
next_result.???

In [None]:
# Display the hyperlink (href) from the next_result object
next_result['???']

In [None]:
# Using next_result['href'], get the webpage from the URL
next_webpage = requests.get(url = ???)

In [None]:
# Display the next_webpage's html text
???.text

In [None]:
# 1) Open the Python connection to the file
# 2) Write contents to the file
# 3) Close the Python connection to the file

html_file = open(file = "web_documents/next_webpage.html", mode = "w", encoding = "utf-8")
html_file.write(???.text)
html_file.close()

# Part 4 - Further Practice with the NEW YORK TIMES (www.nytimes.com)

In [46]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
nyt = 'https://www.nytimes.com/section/todayspaper?redirect_uri=https%3A%2F%2Fwww.nytimes.com%2F'

# Get the webpage (give up after 30 seconds instead of waiting forever)
webpage = requests.get(url = nyt, timeout = 30)

# Create the soup, naming the parser explicitly so the result does
# not depend on which optional parsers are installed
soup = BeautifulSoup(markup = webpage.text, features = "html.parser")

In [47]:
soup

<!DOCTYPE html>
<html lang="en" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<title data-rh="true">The New York Times in Print for Thursday, Sep. 17, 2020 - The New York Times</title>
<meta content="en-US" data-rh="true" itemprop="inLanguage"/><meta content="collection" data-rh="true" id="applicationName" name="applicationName"/><meta content="todays-new-york-times" data-rh="true" name="nyt-collection:identifier"/><meta content="todays-new-york-times" data-rh="true" name="CN"/><meta content="issuecollection" data-rh="true" name="nyt-collection:type"/><meta content="column" data-rh="true" name="CT"/><meta content="Today’s Paper" data-rh="true" name="nyt-collection:display-name"/><meta content="" data-rh="true" name="nyt-collection:tagline"/><meta content="" data-rh="true" name="nyt-collection:promotional-image"/><meta content="collection" data-rh="true" name="PT"/><meta content="100000007345096" data-rh="true" name="asset_id"/><meta content="todays-new-york-times" data-rh="tr

### Your challenge: can you figure out how to identify and find the HTML tags on the New York Times "Today's Paper" page that contain headline text?

In [None]:
# Here are some examples of different methods and tools and ideas that may help give you a starting point
# soup.find(...)
# soup.find_all(...)
# soup.find(...).find(...).find_all(...)
# soup.find(...).parent.find_all(...)

soup.........

In [57]:
# Passing a list to class_ matches a tag that has any one of
# the listed classes; ".find()" returns only the first match.
soup.find(class_= ["css-byk1jx","css-ds6ff4"])

<h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/16/us/hurricane-sally-landfall.html">Hurricane Sally Slams the Florida Panhandle With Deluge of Rain</a></h2>

In [56]:
# Find all "h2" tags in the soup.
soup.find_all(name = "h2")

[<h2 class="css-q1brm6"> <!-- -->The Front Page</h2>,
 <h2 class="css-1dv1kvn">Highlights</h2>,
 <h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/16/us/hurricane-sally-landfall.html">Hurricane Sally Slams the Florida Panhandle With Deluge of Rain</a></h2>,
 <h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/16/us/politics/trump-cdc-covid-vaccine.html">Trump Scorns His Own Scientists Over Virus Data</a></h2>,
 <h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/17/us/fires-oregon-detroit.html">A Desperate Bid for Survival as Fire Closed In on an Oregon Mountain Town</a></h2>,
 <h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/16/business/boeing-737-max-house-report.html">House Report Condemns Boeing and F.A.A. in 737 Max Disasters</a></h2>,
 <h2 class="css-ds6ff4 e1f68otr0">Her Husband Abused Her. But Getting a Divorce Was an Ordeal.</h2>,
 <h2 class="css-ds6ff4 e1f68otr0">War Crime Risk Grows for U.S. Over Saudi Strikes in Yemen</h2>,
 

In [58]:
# Passing a list to class_ matches tags that have any one of
# the listed classes; ".find_all()" returns every match.
soup.find_all(class_= ["css-byk1jx","css-ds6ff4"])

[<h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/16/us/hurricane-sally-landfall.html">Hurricane Sally Slams the Florida Panhandle With Deluge of Rain</a></h2>,
 <h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/16/us/politics/trump-cdc-covid-vaccine.html">Trump Scorns His Own Scientists Over Virus Data</a></h2>,
 <h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/17/us/fires-oregon-detroit.html">A Desperate Bid for Survival as Fire Closed In on an Oregon Mountain Town</a></h2>,
 <h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/16/business/boeing-737-max-house-report.html">House Report Condemns Boeing and F.A.A. in 737 Max Disasters</a></h2>,
 <h2 class="css-ds6ff4 e1f68otr0">Her Husband Abused Her. But Getting a Divorce Was an Ordeal.</h2>,
 <h2 class="css-ds6ff4 e1f68otr0">War Crime Risk Grows for U.S. Over Saudi Strikes in Yemen</h2>,
 <h2 class="css-ds6ff4 e1f68otr0">Japan’s New Leader Picks His Team: Familiar Men, and Fewer Wome

In [60]:
# Find the first tag with the class "css-11jjg" and
# within that tag, find all "h2" tags.
soup.find(class_ = "css-11jjg").find_all(name = "h2")

[<h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/16/us/hurricane-sally-landfall.html">Hurricane Sally Slams the Florida Panhandle With Deluge of Rain</a></h2>,
 <h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/16/us/politics/trump-cdc-covid-vaccine.html">Trump Scorns His Own Scientists Over Virus Data</a></h2>,
 <h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/17/us/fires-oregon-detroit.html">A Desperate Bid for Survival as Fire Closed In on an Oregon Mountain Town</a></h2>,
 <h2 class="css-byk1jx e4e4i5l1"><a data-rref="" href="/2020/09/16/business/boeing-737-max-house-report.html">House Report Condemns Boeing and F.A.A. in 737 Max Disasters</a></h2>]

In [75]:
# Collect each distinct "class" attribute value found on the
# "a" (link) tags in the soup, in order of first appearance.
classlist = []

for link in soup.find_all(name="a"):
    # ".get()" returns None when a tag has no "class" attribute,
    # so those tags are skipped without a catch-all try/except
    # that would also hide unrelated errors.
    link_classes = link.get("class")
    if link_classes is not None and link_classes not in classlist:
        classlist.append(link_classes)

classlist

[['css-1f8er69'],
 ['css-nuvmzp'],
 ['css-t66y1h', 'e1huz5gh1'],
 ['css-1k0lris'],
 ['css-2bwtzy'],
 ['css-nm3jss'],
 ['css-jq1cx6']]

In [82]:
# Remove duplicates from classlist while keeping the
# original order.  A plain loop is used because a list
# comprehension is meant for building a list, not for
# calling ".append()" as a side effect.
unique_class = []
for class_names in classlist:
    if class_names not in unique_class:
        unique_class.append(class_names)
unique_class

[['css-1f8er69'],
 ['css-nuvmzp'],
 ['css-t66y1h', 'e1huz5gh1'],
 ['css-1k0lris'],
 ['css-2bwtzy'],
 ['css-nm3jss'],
 ['css-jq1cx6']]

In [72]:
# Quick demonstration: a set keeps only one copy of each value.
set([1,2,1,3,1])

{1, 2, 3}

In [86]:
# Find all tags in the soup with the class "css-tskdi9".
soup.find_all(class_="css-tskdi9")

[<p class="css-tskdi9 e4e4i5l4">The sluggish storm veered east and intensified before making landfall near the Alabama and Florida state line. Residents and officials said they were not anticipating a direct hit.</p>,
 <p class="css-tskdi9 e4e4i5l4">A public scolding of the C.D.C. chief was only the latest but perhaps the starkest instance when the president has rejected not just the policy advice of his public health officials but the facts and information that they provided.</p>,
 <p class="css-tskdi9 e4e4i5l4">After wildfires left them trapped on the shores of a reservoir near Detroit, Ore., dozens of people and nine firefighters mounted a last stand, hoping for a miracle.</p>,
 <p class="css-tskdi9 e4e4i5l4">A committee’s Democrats say two fatal crashes were a “horrific culmination” of engineering flaws, mismanagement and oversight lapses.</p>]

## Atlanta Journal Constitution Project

In [87]:
# Import modules
import requests
from bs4 import BeautifulSoup

# Specify a webpage
ajc = 'https://www.ajc.com/senior-care-quality-report/homes/'

# Get the webpage (give up after 30 seconds instead of waiting forever)
webpage_ajc = requests.get(url = ajc, timeout = 30)

# Create the soup, naming the parser explicitly so the result does
# not depend on which optional parsers are installed
soup = BeautifulSoup(markup = webpage_ajc.text, features = "html.parser")

In [88]:
soup

<!DOCTYPE html>
<html>
<head>
<title>Unprotected: An AJC investigation of Georgia senior care facilities</title><meta content="width=device-width,initial-scale=1" data-n-head="1" name="viewport"/><meta content="An AJC investigation of assisted living facilities and personal care homes in Georgia for seniors. Included are news stories and information for patients about each facility." data-hid="description" data-n-head="1" name="description"/><meta charset="utf-8" data-n-head="1"/><meta content="text/html; charset=utf-8" data-n-head="1" http-equiv="content-type"/><meta content="IE=Edge,chrome=1" data-n-head="1" http-equiv="X-UA-Compatible"/><meta content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no" data-n-head="1" name="viewport"/><meta content="black" data-n-head="1" name="apple-mobile-web-app-status-bar-style"/><meta content="@ajc" data-hid="twitter:site" data-n-head="1" name="twitter:site"/><meta content="https://ajc.com/senior-care-quality-report/img/social-

In [116]:
soup.find(name = "title")

<title>Unprotected: An AJC investigation of Georgia senior care facilities</title>

In [122]:
soup.find(name = "body")

<body>
<div id="__nuxt"><style>#nuxt-loading{visibility:hidden;opacity:0;position:absolute;left:0;right:0;top:0;bottom:0;display:flex;justify-content:center;align-items:center;flex-direction:column;animation:nuxtLoadingIn 10s ease;-webkit-animation:nuxtLoadingIn 10s ease;animation-fill-mode:forwards;overflow:hidden}@keyframes nuxtLoadingIn{0%{visibility:hidden;opacity:0}20%{visibility:visible;opacity:0}100%{visibility:visible;opacity:1}}@-webkit-keyframes nuxtLoadingIn{0%{visibility:hidden;opacity:0}20%{visibility:visible;opacity:0}100%{visibility:visible;opacity:1}}#nuxt-loading>div,#nuxt-loading>div:after{border-radius:50%;width:5rem;height:5rem}#nuxt-loading>div{font-size:10px;position:relative;text-indent:-9999em;border:.5rem solid #f5f5f5;border-left:.5rem solid #fff;-webkit-transform:translateZ(0);-ms-transform:translateZ(0);transform:translateZ(0);-webkit-animation:nuxtLoading 1.1s infinite linear;animation:nuxtLoading 1.1s infinite linear}#nuxt-loading.error>div{border-left:.5r

In [150]:
# Starting at "body", step down into the first "div" three times in a row.
# NOTE(review): the result is only a "Loading..." placeholder - presumably the
# real page content is filled in by JavaScript in the browser, which Requests
# does not run; confirm by comparing with the page in a browser.
soup.find(name = "body").find(name = "div").find("div").find("div")

<div>Loading...</div>

In [151]:
# Find all tags in the soup with the class "section".
soup.find_all(class_ = "section")

[]

In [100]:
# ".find_all()" is similar, but returns a list of all the tags with matching criteria.

soup.find_all(name = "meta")

[<meta content="width=device-width,initial-scale=1" data-n-head="1" name="viewport"/>,
 <meta content="An AJC investigation of assisted living facilities and personal care homes in Georgia for seniors. Included are news stories and information for patients about each facility." data-hid="description" data-n-head="1" name="description"/>,
 <meta charset="utf-8" data-n-head="1"/>,
 <meta content="text/html; charset=utf-8" data-n-head="1" http-equiv="content-type"/>,
 <meta content="IE=Edge,chrome=1" data-n-head="1" http-equiv="X-UA-Compatible"/>,
 <meta content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no" data-n-head="1" name="viewport"/>,
 <meta content="black" data-n-head="1" name="apple-mobile-web-app-status-bar-style"/>,
 <meta content="@ajc" data-hid="twitter:site" data-n-head="1" name="twitter:site"/>,
 <meta content="https://ajc.com/senior-care-quality-report/img/social-web2.jpg" data-hid="og:image" data-n-head="1" property="og:image"/>,
 <meta content="58

In [108]:
soup.find_all(name="div")

[<div id="__nuxt"><style>#nuxt-loading{visibility:hidden;opacity:0;position:absolute;left:0;right:0;top:0;bottom:0;display:flex;justify-content:center;align-items:center;flex-direction:column;animation:nuxtLoadingIn 10s ease;-webkit-animation:nuxtLoadingIn 10s ease;animation-fill-mode:forwards;overflow:hidden}@keyframes nuxtLoadingIn{0%{visibility:hidden;opacity:0}20%{visibility:visible;opacity:0}100%{visibility:visible;opacity:1}}@-webkit-keyframes nuxtLoadingIn{0%{visibility:hidden;opacity:0}20%{visibility:visible;opacity:0}100%{visibility:visible;opacity:1}}#nuxt-loading>div,#nuxt-loading>div:after{border-radius:50%;width:5rem;height:5rem}#nuxt-loading>div{font-size:10px;position:relative;text-indent:-9999em;border:.5rem solid #f5f5f5;border-left:.5rem solid #fff;-webkit-transform:translateZ(0);-ms-transform:translateZ(0);transform:translateZ(0);-webkit-animation:nuxtLoading 1.1s infinite linear;animation:nuxtLoading 1.1s infinite linear}#nuxt-loading.error>div{border-left:.5rem sol

In [112]:
# Collect each distinct "class" attribute value found on any
# tag in the soup, in order of first appearance.
classlist = []

# find_all(True) matches every tag.  Passing the built-in
# function "all" as the name would instead be treated as a
# per-tag filter function, which is not what is wanted here.
for tag in soup.find_all(True):
    # ".get()" returns None when a tag has no "class" attribute,
    # so those tags are skipped without a catch-all try/except.
    tag_classes = tag.get("class")
    if tag_classes is not None and tag_classes not in classlist:
        classlist.append(tag_classes)

classlist

[]