## Web Scraping


In [8]:
from bs4 import BeautifulSoup
import requests

In [9]:
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
content = response.content
print(content)

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'


In [11]:
# Initialize the parser, and pass in the content we grabbed earlier.
parser = BeautifulSoup(content, 'html.parser')

# Get the body tag from the document.
# Since we passed in the top level of the doctument to the parser, we need to pick a branch off of the root.
# With BeautifulSoup, we can access branches by using tag types as attributes.
body = parser.body

# Get the p tag from the body.
p = body.p

# Print the text inside the p tag.
# Text is a property that gets the inside text of a tag.

head = parser.head
title_text = head.title.text

print(p.text)
print(title_text)

Here is some simple content for this page.
A simple example page


In [31]:
# using find all to scrape all tags

body = parser.find_all('body')
p = body[0].find_all('p')
print(p[0].text)
head = parser.find_all('head')
title  = head[0].find_all('title')
title_text = title[0].text
print(title_text)



Here is some simple content for this page.
A simple example page


In [43]:
# Get the page content and set up a new parser.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple_ids.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Pass in the ID attribute to only get the element with that specific ID.
first_paragraph = parser.find_all("p", id="first")[0]
second_paragraph = parser.find_all("p", id="second")[0]
print(first_paragraph.text)
second_paragraph_text = second_paragraph.text
print(second_paragraph_text)


first = parser.find_all("p",id="first")
first.text




                First paragraph.
            


                Second paragraph.
            



AttributeError: 'ResultSet' object has no attribute 'text'

In [46]:
# Get the website that contains classes.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple_classes.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Get the first inner paragraph
# Find all the paragraph tags with the class inner-text.
# Then, take the first element in that list.
first_inner_paragraph = parser.find_all("p", class_="inner-text")[0]
print(first_inner_paragraph.text)


                First paragraph.
            


In [52]:
#second_inner_paragraph = parser.find_all('p',class_ = 'inner-text')[1]
second_inner_paragraph_text = parser.find_all('p',class_ = 'inner-text')[1].text

#first_outer_paragraph = parser.find_all('p',class_ = 'outer-text')[0]
first_outer_paragraph_text = parser.find_all('p',class_ = 'outer-text')[0].text


'\n\n                First outer paragraph.\n            \n'

In [53]:
# Get the website that contains classes and IDs.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Select all of the elements that have the first-item class.
first_items = parser.select(".first-item")

# Print the text of the first paragraph (the first element with the first-item class).
print(first_items[0].text)


                First paragraph.
            


In [57]:
parser.select(".inner-text")[1].text

'\n                Second paragraph.\n            '

In [59]:
parser.select(".outer-text")[0].text

'\n\n                First outer paragraph.\n            \n'

In [61]:
first_outer_text = parser.select(".outer-text")[0].text
second_text = parser.select("#second")[0].text

In [62]:
# Get the Superbowl box score data.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/2014_super_bowl.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Find the number of turnovers the Seahawks committed.
turnovers = parser.select("#turnovers")[0]
seahawks_turnovers = turnovers.select("td")[1]
seahawks_turnovers_count = seahawks_turnovers.text
print(seahawks_turnovers_count)

1


In [77]:
# Total Plays for the New England Patriots
patriots_total_plays_count = parser.select("#total-plays")[0].select("td")[2].text

# Total Yards for the Seahawks
seahawks_total_yards_count = parser.select("#total-yards")[0].select("td")[1].text
seahawks_total_yards_count

'396'