In [5]:
from bs4 import BeautifulSoup
import requests

# Download the O'Reilly home page. The timeout keeps the cell from hanging
# indefinitely, and raise_for_status() stops us from silently parsing an
# HTTP error page as if it were the real content.
response = requests.get("https://www.oreilly.com/", timeout=30)
response.raise_for_status()

# When the Content-Type header carries no charset, requests decodes text/*
# bodies as ISO-8859-1, which turns UTF-8 punctuation (e.g. the apostrophe in
# "O'Reilly") into mojibake. In that case use the encoding detected from the
# body itself instead.
if "charset" not in response.headers.get("Content-Type", "").lower():
    response.encoding = response.apparent_encoding

html = response.text
soup = BeautifulSoup(html, 'html5lib')

In [6]:
# find() returns the first matching tag (soup.p is shorthand for the same lookup)
first_paragraph = soup.find('p')
print(first_paragraph)

<p class="mobileHidden">Build skills. <span class="nowrap">Solve problems.</span></p>


In [8]:
# Work with the text content of the first <p> tag.
first_paragraph_text = soup.p.text
print(first_paragraph_text)           # the text as a single string
print(first_paragraph_text.split())   # the same text as a list of whitespace-separated words

Build skills. Solve problems.
['Build', 'skills.', 'Solve', 'problems.']


In [14]:
# A tag's attributes can be read with dict-style access.
first_paragraph = soup.p
print(first_paragraph['class'])       # raises KeyError if the attribute is missing
print(first_paragraph.get('class'))   # returns None if the attribute is missing

['mobileHidden']
['mobileHidden']


In [18]:
# Get every matching tag at once; calling the soup directly, soup('p'),
# is shorthand for soup.find_all('p').
all_paragraphs = soup.find_all('p')
print(all_paragraphs)
print("\n\n")

# Keep only the tags that carry a given attribute (a non-empty `class` here).
# NOTE: the name says "ids" but the filter is on `class`; the name is kept
# because a later cell refers to it.
paragraphs_with_ids = [tag for tag in all_paragraphs if tag.get('class')]
print(paragraphs_with_ids)

[<p class="mobileHidden">Build skills. <span class="nowrap">Solve problems.</span></p>, <p>New expert playlists: Collections of handpicked content from <span class="nowrap">industry leaders</span></p>, <p>OâReilly online learning gives your team the knowledge they need to stay ahead with on-demand access to the latest OâReilly books, videos, and live training. Build skills with learning paths, live online courses, and collections of content selected by expertsâor solve a problem quickly through books and videos. Follow your organizationâs progress and go in-depth with reporting and insights tools. OâReilly members: Explore all our <a class="nowrap" href="https://learning.oreilly.com/playlists/discover/">expert playlists here.</a></p>, <p>Give your team the knowledge they need to stay ahead with on-demand access to the latest OâReilly books, videos, and live training courses through OâReilly online learning.</p>, <p>Build skills with learning paths and live training course

In [27]:
# Two equivalent ways to select <p> tags that have a specific CSS class.
presenter_class = 'conferences-detail-presenter'
print(soup.find_all('p', {'class': presenter_class}))   # explicit attribute dict
print(soup.find_all('p', presenter_class))              # shorthand: a bare string filters on class

[<p class="conferences-detail-presenter">Maggie Carroll, senior engineer, MAG Aerospace</p>, <p class="conferences-detail-presenter">Sam Newman, Technologist</p>]
[<p class="conferences-detail-presenter">Maggie Carroll, senior engineer, MAG Aerospace</p>, <p class="conferences-detail-presenter">Sam Newman, Technologist</p>]


In [26]:
# Show the class list of every paragraph collected in paragraphs_with_ids.
[paragraph.get('class') for paragraph in paragraphs_with_ids]

[['mobileHidden'],
 ['conferences-detail-tagline'],
 ['conferences-detail-location'],
 ['conferences-detail-dates'],
 ['conferences-detail-presenter'],
 ['conferences-detail-tagline'],
 ['conferences-detail-location'],
 ['conferences-detail-dates'],
 ['conferences-detail-presenter']]

In [30]:
"""
Whenever you want to scrape data from a website you should first check to see
if it has some sort of access policy looking at terms and conditions

We should also check for robots.txt file that tells webcrawlers how to behave
"""

# For example, try http://shop.oreilly.com/robots.txt, which lists (among other things):
#    Crawl-delay: 30
#    Request-rate: 1/30

# The first tells us that we should wait 30 seconds between requests.
# The second says that we should request at most one page every 30 seconds.
#    So they are basically two different ways of saying the same thing.

'\nWhenever you want to scrape data from a website you should first check to see\nif it has some sort of access policy looking at terms and conditions\n\nWe should also check for robots.txt file that tells webcrawlers how to behave\n'