In [1]:
import requests

page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
page

<Response [200]>

In [2]:
page.status_code # a status code of 200 means that the page downloaded successfully.
                 # 4 or 5 status code indicates an error

200

In [3]:
page.content # print out the HTML content of the page using the content property

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [7]:
# use BeautifulSoup library to parse this document, and extract text from the p tag.
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [9]:
# now print out the HTML content of the page, formatted nicely.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [14]:
# tags are nested, we can move thru the structure one level at a time.
list(soup.children)

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [15]:
# let's see what the type of each element in the list is
[type(item) for item in list(soup.children)]
# Tag object allows us to navigate through an HTML document, and extract other tags and text.

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [16]:
# select the html tag and its children by taking the third item in the list
html = list(soup.children)[2]

In [17]:
# Each item in the list returned by the children property is also a BeautifulSoup object, 
# so we can also call the children method on html.
# Now, we can find the children inside the html tag:
list(html.children)

['\n', <head>
 <title>A simple example page</title>
 </head>, '\n', <body>
 <p>Here is some simple content for this page.</p>
 </body>, '\n']

In [20]:
# There are two tags head and body. Extract the text inside the p tag
body = list(html.children)[3]

In [21]:
# Now we can get the p tag by finding the children of the body tag.
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [22]:
# Isolate p tag
p = list(body.children)[1]

In [23]:
print(p)

<p>Here is some simple content for this page.</p>


In [24]:
# Now use get_text method to extract all of the text inside tag.
p.get_text()

'Here is some simple content for this page.'

## Finding all instances of a tag at once

What we did above was useful for figuring out how to navigate a page, but it took a lot of commands to do something fairly simple. If we want to extract a single tag, we can instead use the find_all method, which will find all the instances of a tag on a page.

In [25]:
soup = BeautifulSoup(page.content, 'html.parser')
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [26]:
# it returns a list so we have to loop thru or use list indexing, it to extract text
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [27]:
# If you instead only want to find the first instance of a tag, you can use the find method,
# which will return a single BeautifulSoup object:
soup.find('p')

<p>Here is some simple content for this page.</p>

## Searching for tags by class and id

We introduced classes and ids earlier, but it probably wasn't clear why they were useful. Classes and ids are used by CSS to determine which HTML elements to apply certain styles to. We can also use them when scraping to specify specific elements we want to scrape. To illustrate this principle, we'll work with the following page:

In [30]:
# dowload a page and create a BeautifulSoup object:
page = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup = BeautifulSoup(page.content, 'html.parser')
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [32]:
# use find_all method to search for items by class or by id.
# will search for any p tag that has the class outer-text
soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [33]:
# look for any tag that has the class outer-text
soup.find_all(class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [34]:
# or we can search for elements by id
soup.find_all(id="first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

## Using CSS Selectors

You can also search for items using CSS selectors. These selectors are how the CSS language allows developers to specify HTML tags to style. Here are some examples:

    p a — finds all a tags inside of a p tag.
    body p a — finds all a tags inside of a p tag inside of a body tag.
    html body — finds all body tags inside of an html tag.
    p.outer-text — finds all p tags with a class of outer-text.
    p#first — finds all p tags with an id of first.
    body p.outer-text — finds any p tags with a class of outer-text inside of a body tag.


In [35]:
# BeautifulSoup objects support searching a page via CSS selectors using the select method. 
# We can use CSS selectors to find all the p tags in our page that are inside of a div like 
# this:
soup.select("div p")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>]

## Downloading weather data

We now know enough to proceed with extracting information about the local weather from the National Weather Service website. The first step is to find the page we want to scrape. We'll extract weather information about downtown San Francisco from this page.
https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168

In [44]:
# Download the web page containing the forecast.
# Create a BeautifulSoup class to parse the page.
# Find the div with id seven-day-forecast, and assign to seven_day
# Inside seven_day, find each individual forecast item.
# Extract and print the first forecast item.

page = requests.get("https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
soup = BeautifulSoup(page.content, 'html.parser')
seven_day = soup.find(id="seven-day-forecast")
forecast_items = seven_day.find_all(class_="tombstone-container")
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Tonight
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Tonight: Mostly clear, with a low around 57. West southwest wind 11 to 16 mph, with gusts as high as 21 mph. " class="forecast-icon" src="newimages/medium/nfew.png" title="Tonight: Mostly clear, with a low around 57. West southwest wind 11 to 16 mph, with gusts as high as 21 mph. "/>
 </p>
 <p class="short-desc">
  Mostly Clear
 </p>
 <p class="temp temp-low">
  Low: 57 °F
 </p>
</div>


##### Extracting information from the page

As you can see, inside the forecast item tonight is all the information we want. There are 4 pieces of information we can extract:

    The name of the forecast item — in this case, Tonight.
    The description of the conditions — this is stored in the title property of img.
    A short description of the conditions — in this case, Mostly Clear.
    The temperature low — in this case, 57 degrees.

We'll extract the name of the forecast item, the short description, and the temperature first, since they're all similar:

In [45]:
period = tonight.find(class_="period-name").get_text()
short_desc = tonight.find(class_="short-desc").get_text()
temp = tonight.find(class_="temp").get_text()

print(period)
print(short_desc)
print(temp)

Tonight
Mostly Clear
Low: 57 °F


In [51]:
# Now, we can extract the title attribute from the img tag. To do this, 
# we just treat the BeautifulSoup object like a dictionary,
# and pass in the attribute we want as a key:
img = tonight.find("img")
desc = img['title']

print(desc)

Tonight: Mostly clear, with a low around 57. West southwest wind 11 to 16 mph, with gusts as high as 21 mph. 


## Extracting all the information from the page

Now that we know how to extract each individual piece of information, we can combine our knowledge with css selectors and list comprehensions to extract everything at once.

In the below code, we:

    Select all items with the class period-name inside an item with the class tombstone-container in seven_day.
    Use a list comprehension to call the get_text method on each BeautifulSoup object.


In [52]:
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
periods

['Tonight',
 'Saturday',
 'SaturdayNight',
 'Sunday',
 'SundayNight',
 'Monday',
 'MondayNight',
 'Tuesday',
 'TuesdayNight']

In [55]:
# As you can see above, our technique gets us each of the period names, in order.
# We can apply the same technique to get the other 3 fields:
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]

print(short_descs)
print(temps)
print(descs)

['Mostly Clear', 'Sunny', 'Mostly Clear', 'Sunny andBreezy', 'Mostly Clearand Breezythen MostlyClear', 'Sunny andBreezy', 'Mostly Clearand Breezythen MostlyClear', 'Sunny', 'Mostly Clear']
['Low: 57 °F', 'High: 72 °F', 'Low: 55 °F', 'High: 70 °F', 'Low: 54 °F', 'High: 69 °F', 'Low: 54 °F', 'High: 68 °F', 'Low: 54 °F']
['Tonight: Mostly clear, with a low around 57. West southwest wind 11 to 16 mph, with gusts as high as 21 mph. ', 'Saturday: Sunny, with a high near 72. West wind 11 to 21 mph, with gusts as high as 28 mph. ', 'Saturday Night: Mostly clear, with a low around 55. West wind 15 to 21 mph, with gusts as high as 26 mph. ', 'Sunday: Sunny, with a high near 70. Breezy, with a west wind 14 to 23 mph, with gusts as high as 29 mph. ', 'Sunday Night: Mostly clear, with a low around 54. Breezy, with a west wind 15 to 23 mph, with gusts as high as 29 mph. ', 'Monday: Sunny, with a high near 69. Breezy. ', 'Monday Night: Mostly clear, with a low around 54. Breezy. ', 'Tuesday: Sunny, w

## Combining our data into a Pandas Dataframe

We can now combine the data into a Pandas DataFrame and analyze it. A DataFrame is an object that can store tabular data, making data analysis easy. If you want to learn more about Pandas, check out our free to start course here.

In order to do this, we'll call the DataFrame class, and pass in each list of items that we have. We pass them in as part of a dictionary. Each dictionary key will become a column in the DataFrame, and each list will become the values in the column:

In [56]:
import pandas as pd
weather = pd.DataFrame({
        "period": periods,
        "short_desc": short_descs,
        "temp": temps,
        "desc": descs
})
weather

Unnamed: 0,desc,period,short_desc,temp
0,"Tonight: Mostly clear, with a low around 57. W...",Tonight,Mostly Clear,Low: 57 °F
1,"Saturday: Sunny, with a high near 72. West win...",Saturday,Sunny,High: 72 °F
2,"Saturday Night: Mostly clear, with a low aroun...",SaturdayNight,Mostly Clear,Low: 55 °F
3,"Sunday: Sunny, with a high near 70. Breezy, wi...",Sunday,Sunny andBreezy,High: 70 °F
4,"Sunday Night: Mostly clear, with a low around ...",SundayNight,Mostly Clearand Breezythen MostlyClear,Low: 54 °F
5,"Monday: Sunny, with a high near 69. Breezy.",Monday,Sunny andBreezy,High: 69 °F
6,"Monday Night: Mostly clear, with a low around ...",MondayNight,Mostly Clearand Breezythen MostlyClear,Low: 54 °F
7,"Tuesday: Sunny, with a high near 68.",Tuesday,Sunny,High: 68 °F
8,"Tuesday Night: Mostly clear, with a low around...",TuesdayNight,Mostly Clear,Low: 54 °F


In [61]:
# We can now do some analysis on the data. For example, we can use a regular expression and 
# the Series.str.extract method to pull out the numeric temperature values:
temp_nums = weather["temp"].str.extract("(?P<temp_num>\d+)", expand=False)
weather["temp_num"] = temp_nums.astype('int')
temp_nums

0    57
1    72
2    55
3    70
4    54
5    69
6    54
7    68
8    54
Name: temp_num, dtype: object

In [62]:
# Find the mean of all the high and low temperatures:
weather["temp_num"].mean()

61.44444444444444

In [65]:
# Only select rows that happen at night:
is_night = weather["temp"].str.contains("Low") # does row contain "Low"
weather["is_night"] = is_night # create new column and populate with is_night
is_night

0     True
1    False
2     True
3    False
4     True
5    False
6     True
7    False
8     True
Name: temp, dtype: bool

In [64]:
weather[is_night]

Unnamed: 0,desc,period,short_desc,temp,temp_num,is_night
0,"Tonight: Mostly clear, with a low around 57. W...",Tonight,Mostly Clear,Low: 57 °F,57,True
2,"Saturday Night: Mostly clear, with a low aroun...",SaturdayNight,Mostly Clear,Low: 55 °F,55,True
4,"Sunday Night: Mostly clear, with a low around ...",SundayNight,Mostly Clearand Breezythen MostlyClear,Low: 54 °F,54,True
6,"Monday Night: Mostly clear, with a low around ...",MondayNight,Mostly Clearand Breezythen MostlyClear,Low: 54 °F,54,True
8,"Tuesday Night: Mostly clear, with a low around...",TuesdayNight,Mostly Clear,Low: 54 °F,54,True
