In [132]:
from __future__ import print_function

from bs4 import BeautifulSoup

import requests

# Beautiful Soup on test data

Documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

Below, we create a simple HTML page that include some frequently used tags. 
Note, however, that we have also left one paragraph tag unclosed. 

In [133]:
source = """
<!DOCTYPE html> 
<html>  
  <head>
    <title>Scraping</title>
  </head>
  <body class="col-sm-12">
    <h1>section1</h1>
    <p>paragraph1</p>
    <p>paragraph2</p>
    <div class="col-sm-2">
      <h2>section2</h2>
      <p>paragraph3</p>
      <p>unclosed
    </div>
  </body>
</html>  
"""

soup = BeautifulSoup(source, "html.parser")

Once the soup object has been created successfully, we can execute a number of queries on the DOM. 
First we request all data from the `head` tag. 
Note that while it looks like a list of strings was returned, actually, a `bs4.element.Tag` type is returned. 
These examples explore how to extract tags, the text from tags, how to filter queries based on 
attributes, how to retreive attributes from a returned query, and how the BeautifulSoup engine 
is tolerant of unclosed tags. 
Notice in the actual HTML source, the last paragraph is not closed. 

In [134]:
print(soup.prettify())
# BeautifulSoup engine corrects the HTML source by including </p> to the unclosed paragraph

<!DOCTYPE html>
<html>
 <head>
  <title>
   Scraping
  </title>
 </head>
 <body class="col-sm-12">
  <h1>
   section1
  </h1>
  <p>
   paragraph1
  </p>
  <p>
   paragraph2
  </p>
  <div class="col-sm-2">
   <h2>
    section2
   </h2>
   <p>
    paragraph3
   </p>
   <p>
    unclosed
   </p>
  </div>
 </body>
</html>



In [135]:
print('Head:')
print('', soup.find_all("head"))
# [<head>\n<title>Scraping</title>\n</head>]

Head:
 [<head>
<title>Scraping</title>
</head>]


In [136]:
print('\nType of head:')
print('', map(type, soup.find_all("head")))
# [<class 'bs4.element.Tag'>]


Type of head:
 <map object at 0x7f818c809a30>


In [137]:
print('\nTitle tag:')
print('', soup.find("title"))
# <title>Scraping</title>


Title tag:
 <title>Scraping</title>


In [138]:
print('\nTitle text:')
print('', soup.find("title").text)
# Scraping


Title text:
 Scraping


In [139]:
divs = soup.find_all("div", attrs={"class": "col-sm-2"})
print('\nDiv with class=col-sm-2:')
print('', divs)
# [<div class="col-sm-2">....</div>]


Div with class=col-sm-2:
 [<div class="col-sm-2">
<h2>section2</h2>
<p>paragraph3</p>
<p>unclosed
    </p></div>]


In [140]:
print('\nClass of first div:')
print('', divs[0].attrs['class'])
# [u'col-sm-2']


Class of first div:
 ['col-sm-2']


In [141]:
print('\nAll paragraphs:')
print('', soup.find_all("p"))
# [<p>paragraph1</p>, 
#  <p>paragraph2</p>, 
#  <p>paragraph3</p>, 
#  <p>unclosed\n    </p>]


All paragraphs:
 [<p>paragraph1</p>, <p>paragraph2</p>, <p>paragraph3</p>, <p>unclosed
    </p>]


# Beautilful soup on real data 

In this example I will show how you can use BeautifulSoup to retreive information from live web pages. 
We make use of The Guardian newspaper, and retreive the HTML from an arbitrary article. 
We then create the BeautifulSoup object, and query the links that were discovered in the DOM. 
Since a large number are returned, we then apply attribute filters that let us reduce significantly 
the number of returned links. 
I selected the filters selected for this example in order to focus on the names in the paper. 
The parameterisation of the attributes was discovered by using the `inspect` functionality available in a modern web broser. Web browsers including Google Chrome or Microsoft Firefox support this functionality. You could right click on a part of a web page, and click `inspect` to view source of that part of a web page. 

In [142]:
url = 'https://www.theguardian.com/books/2022/jan/24/the-big-idea-should-animals-have-the-same-rights-as-humans'
req = requests.get(url)
source = req.text
soup = BeautifulSoup(source, 'html.parser')

In [143]:
print(source)
# To view the complete HTML source of a web page in a web browser, right click on the page, and click "View Page Source."

<!doctype html>
        <html lang="en">
            <head>
			    
<!--

We are hiring, ever thought about joining us?
https://workforus.theguardian.com/careers/product-engineering/


                                    GGGGGGGGG
                           GGGGGGGGGGGGGGGGGGGGGGGGGG
                       GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
                    GGGGGGGGGGGGGGGGG      GG   GGGGGGGGGGGGG
                  GGGGGGGGGGGG        GGGGGGGGG      GGGGGGGGGG
                GGGGGGGGGGG         GGGGGGGGGGGGG       GGGGGGGGG
              GGGGGGGGGG          GGGGGGGGGGGGGGGGG     GGGGGGGGGGG
             GGGGGGGGG           GGGGGGGGGGGGGGGGGGG    GGGGGGGGGGGG
            GGGGGGGGG           GGGGGGGGGGGGGGGGGGGGGG  GGGGGGGGGGGGG
           GGGGGGGGG            GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
           GGGGGGGG             GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
          GGGGGGGG              GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
          GGGGGGGG              GGGGGGGGGGG

In [144]:
links = soup.find_all('a') # Find all <a> tags 
links

[<a class="dcr-1y2qbjm" href="#maincontent">Skip to main content</a>,
 <a class="dcr-1y2qbjm" href="#navigation">Skip to navigation</a>,
 <a class="dcr-1ark7u0" data-link-name="nav2 : topbar : edition-picker: UK" href="/preference/edition/uk">UK edition</a>,
 <a class="dcr-1gagq2d" data-link-name="nav2 : topbar : edition-picker: US" href="/preference/edition/us">US edition</a>,
 <a class="dcr-1gagq2d" data-link-name="nav2 : topbar : edition-picker: AU" href="/preference/edition/au">Australian edition</a>,
 <a class="dcr-1gagq2d" data-link-name="nav2 : topbar : edition-picker: INT" href="/preference/edition/int">International edition</a>,
 <a class="dcr-17bh4jj" data-link-name="nav2 : logo" href="/"><span class="dcr-92oqeo">The Guardian - Back to home</span><svg class="dcr-3f5rmr" fill="none" viewbox="0 0 236 92" xmlns="http://www.w3.org/2000/svg"><path d="M53.627 41.257l4.038-2.096V7.513h-3.054l-7.46 9.867h-.842l.476-10.999h32.341l.468 10.999h-.885L71.4 7.513h-3.128v31.581l4.063 2.138v

In [145]:
# Open the URL: https://www.theguardian.com/world/2021/jan/21/johnson-raises-fears-of-covid-lockdown-in-england-continuing-into-summertime 
# in a web browser, right click on the link text "Boris Johnson", "England", and "Priti Patel" formatted in red colour and click Inspect Element.
# Compare the attributes of these three links with other links of the web page. 
# You will notice, these three links have their "data-component" set as "auto-linked-tag".


# Find all <a></a> whose value of attribute 'data-component' is 'auto-linked-tag'

links = soup.find_all('a', attrs={
    'data-component': 'auto-linked-tag'
})

for link in links: 
    print(link['href'], link.text)

## Task 1: Extracting Linked News Stories

URL: https://www.theguardian.com/books/2022/jan/24/the-big-idea-should-animals-have-the-same-rights-as-humans

Open the above URL in your browser. You will notice that parts of the main news story are hyperlinked to other news stories published previously. For instance, the first paragraph is linked to a new story on animal welfare (sentience) bill. Your first task is to extract the links to these other news stories in the main news.  

Hint 1: 05_web_scraping_beautiful_soup.ipynb has a code block to extract topics hyperlinked within a new story. 

Hint 2: These can be identified with attribute 'data-link-name':'in body link' in \<a> \</a>

In [146]:
# Solution to Task 1
# This task is similar to the the code block above where we try to extract some clicks. 
# Open the web page in a web browser and inspect the hyper links of other linked news stories. 
# Notice each of these links have attribute "data-link-name" set as "in body link"

# Let us extract these:

links = soup.find_all('a', attrs={
    'data-link-name':'in body link'
})

for link in links: 
    print(link['href'], link.text)

https://www.theguardian.com/world/2021/may/12/animals-to-be-formally-recognised-as-sentient-beings-in-uk-law animal welfare (sentience) bill
https://www.theguardian.com/world/2021/nov/19/boiling-of-live-lobsters-could-be-banned-in-uk-under-proposed-legislation decapod crustaceans (such as crabs and lobsters)
https://www.theguardian.com/commentisfree/2021/may/16/animals-feel-humans-evidence-sentient Jonathan Birch
https://www.theguardian.com/books/2013/sep/13/my-hero-rene-descartes-kelman René Descartes
https://www.scientificamerican.com/article/the-mind-of-an-octopus/ Octopuses are “probably the closest we will come to meeting an intelligent alien”
http://www.greatapeproject.uk/ The Great Ape Project
https://guardianbookshop.com/are-we-smart-enough-to-know-how-smart-animals-are-9781783783069 Are We Smart Enough 
https://guardianbookshop.com/are-we-smart-enough-to-know-how-smart-animals-are-9781783783069 to Know How Smart Animals Are?
https://guardianbookshop.com/other-minds-97800082262

## Task 2: Extracting Topics or Categories 

URL: https://www.theguardian.com/books/2022/jan/24/the-big-idea-should-animals-have-the-same-rights-as-humans 

~~Guardian's website tags the news story with a list of topics. Your second task is to find these topics. 

~~Hint: These can be identified with attribute 'class':'submeta__link' in \<a> \</a> 

It was supposed to be an easy task but we learn that the hint does not help! Guardian seems to have updated their website only recently and some of the class names in the CSS are auto-generated, making it difficult to scrape data. So here is a messy solution to Task 2. 

In [147]:
# Solution to Task 2

# In the source we see topics are in a div with 'data-print-layout':'hide'"

url = 'https://www.theguardian.com/books/2022/jan/24/the-big-idea-should-animals-have-the-same-rights-as-humans'
req = requests.get(url)
source = req.text
soup = BeautifulSoup(source, 'html.parser')


relevant_divs = soup.find_all('div', attrs={
    'data-print-layout':'hide'
})

print("\n\nDivs with 'data-print-layout':'hide' = ", len(relevant_divs))

# One of these divs has Topics. 
# We notice that text "Topics" is within a span. We need to find the parent div containing this span. 
# Within this parent div, we see that topics are listed in the second child div.

for div in relevant_divs:
    soup_div = BeautifulSoup(str(div), 'html.parser')
    spans = soup_div.find_all('span')

    for span in spans:
        if span.text == 'Topics':
            print("\n\nSpan with 'Topics' found: ", span)
            
            topics_div = soup_div.find_all('div')
            # Topics_div is a list. Print prettify each of the div. We notice that the topics are in second div.
            
            print(topics_div[1].prettify())
            
            topics = topics_div[1].find_all('a')
            print("\n\nTopics found: ", len(topics))
            for topic in topics:
                print(topic.text[:20])



Divs with 'data-print-layout':'hide' =  8


Span with 'Topics' found:  <span class="dcr-12ae8vg">Topics</span>
<div class="dcr-lwa3gj">
 <ul class="dcr-1r2wmvc">
  <li class="dcr-v3nkdw">
   <a class="dcr-12mkso4" href="/books/books">
    Books
   </a>
  </li>
  <li class="dcr-1231g41">
   <a class="dcr-12mkso4" href="/books/series/big-idea">
    The big idea
   </a>
  </li>
 </ul>
 <ul class="dcr-1r2wmvc">
  <li class="dcr-bd6g3a">
   <a class="dcr-12mkso4" href="/tone/features">
    features
   </a>
  </li>
 </ul>
</div>



Topics found:  3
Books
The big idea
features


# Chaining queries

Now, let us conisder a more general query that might be done on a website such as this. 
We will query the base technology page, and attempt to list all articles that pertain to this main page

In [148]:
url = 'https://www.theguardian.com/uk/technology'
req = requests.get(url)
source = req.text
soup = BeautifulSoup(source, 'html.parser')

After inspecting the DOM (via the `inspect` tool in my browser), I see that the attributes that define 
a `technology` article are: 
    
    class = "js-headline-text"

In [149]:
articles = soup.find_all('a', attrs={
    'class': 'js-headline-text'
})

print(len(articles))

for article in articles: 
#     print(article['href'])
    print(article['href'][:50], article.text[:50])

46
https://www.theguardian.com/technology/2022/jan/26 Spotify to remove Neil Young music in feud over Jo
https://www.theguardian.com/technology/2022/jan/27 Anti-vaxxers making ‘at least $2.5m’ a year from p
https://www.theguardian.com/technology/2022/jan/26 Company sees record profit as electric car deliver
https://www.theguardian.com/technology/2022/jan/26 Robot successfully performs keyhole surgery on pig
https://www.theguardian.com/technology/2022/jan/25 Software company beats expectations with $18.8bn p
https://www.theguardian.com/technology/2022/jan/26 Everything you need to know about DAOs
https://www.theguardian.com/commentisfree/2022/jan Return to the office or turn digital nomad? If onl
https://www.theguardian.com/commentisfree/2022/jan Book readers have realised that you can’t replace 
https://www.theguardian.com/business/2022/jan/10/r Rolls-Royce: Covid has spurred record sales of our
https://www.theguardian.com/uk-news/2021/dec/27/co Covid lockdowns may have increased UK te

With this set of articles, it is now possible to chain further querying, for example with code 
similar to the following 

```python
for article in articles: 
    req = requests.get(article['href'])
    source = req.text 
    soup = BeautifulSoup(source, 'html.parser') 
    
    ... and so on...
```

However, I won't go into much detail about this now. For scraping like this tools, such as `scrapy` are more 
appropriate than `BeautifulSoup` since they are designed for multithreadded web crawling. 
Once again, however, I urge caution and hope that before any crawling is initiated you determine whether 
crawling is within the terms of use of the website. 
If in doubt contact the website administrators. 

https://scrapy.org/

## Task 3. Listing All News Stories in a Section  

URL: https://www.theguardian.com/uk/technology 

In the chaining code block above (in 05_web_scraping_beautiful_soup.ipynb), we tried listing all news stories on the technology page of the Guardian's website. We notice that "js-headline-text" class fetches non-technology stories as well.  

Your task is to filter only technology related stories that are listed under the Technology division of the Guardian's webpage  

Hint 1: This requires extracting the source of the div that lists the Technology new stories. 

Hint 2: Filter for div with attribute 'data-title': 'Technology'

In [150]:
# Solution to Task 3
# Open the technology page in a web browser.
# When we compare the links to technology stories, we notice these links do not have any differentiating attribute
# However on closer inspection, we notice the Technology stories (six of them) reside in a division with "data-title" attribute set as "Technology".
# To extract these stories, we will: 
# (1) fetch the source of div with data-title as Technology
# (2) pass it to BeautifulSoup 
# (3) fetch all links with class js-headline-text

tech_div = soup.find_all('div', attrs={
    'data-title': 'Technology'
})

soup_tech = BeautifulSoup(str(tech_div), 'html.parser')

soup_tech_articles = soup_tech.find_all('a', attrs={
    'class': 'js-headline-text'
})

print('Total links:', len(soup_tech_articles))

for article in soup_tech_articles: 
    print(article['href'][:], article.text[:20])
    


Total links: 6
https://www.theguardian.com/technology/2022/jan/26/spotify-neil-young-joe-rogan-covid-misinformation Spotify to remove Ne
https://www.theguardian.com/technology/2022/jan/27/anti-vaxxers-making-at-least-25m-a-year-from-publishing-on-substack Anti-vaxxers making 
https://www.theguardian.com/technology/2022/jan/26/tesla-elon-musk-profit-electric-car-deliveries-soar Company sees record 
https://www.theguardian.com/technology/2022/jan/26/robot-successfully-performs-keyhole-surgery-on-pigs-without-human-help Robot successfully p
https://www.theguardian.com/technology/2022/jan/25/microsoft-quarterly-earnings-revenue-tech Software company bea
https://www.theguardian.com/technology/2022/jan/26/techscape-daos-nfts Everything you need 


## Task 4. List 50 Most Recent Technology-Related News Stories  

URL: https://www.theguardian.com/uk/technology 

On Guardian's technology home page, you will notice a link to “All Stories.” If you cannot locate it visually, use the browser’s find tool (Ctrl + F) and search for “All Stories.” Click the link to “All Stories” and observe the structure of the web page listing all stories. Here is the direct link to that web page: https://www.theguardian.com/technology/all 

Your task is to extract 50 most recent technology stories published by Guardian.  

Hint 1: This will require you to loop through multiple pages and maintain a counter of stories. 

Hint 2: Click on page 2. Observe the URL string.  

Hint 3: URL "../technology/all" maps to "/technology?page=1". 

In [151]:
# Solution to Task 4
# This task is similar to the other tasks. 
# The only difference is we need to loop through multiple pages of Guardian's website while maintaining a counter of the links fetched 

base_url = 'https://www.theguardian.com/technology?page='
article_count = 1
page_num = 1

while article_count <= 50:
    req = requests.get(base_url + str(page_num))
    source = req.text
    soup = BeautifulSoup(source, 'html.parser')
    
    articles = soup.find_all('a', attrs={
    'class': 'js-headline-text'})
    print('\n\nPage #', page_num)
    for article in articles: 
        print(article_count, article['href'][:], article.text[:20])
        article_count += 1
        if article_count > 50:
            break
    page_num += 1



Page # 1
1 https://www.theguardian.com/technology/2022/jan/26/spotify-neil-young-joe-rogan-covid-misinformation Spotify to remove Ne
2 https://www.theguardian.com/business/2022/jan/27/indian-ride-hailing-app-ola-to-open-100m-electric-car-facility-in-coventry Indian ride-hailing 
3 https://www.theguardian.com/technology/2022/jan/27/anti-vaxxers-making-at-least-25m-a-year-from-publishing-on-substack Anti-vaxxers making 
4 https://www.theguardian.com/music/2022/jan/27/uk-watchdog-to-study-music-streaming-market-amid-claims-of-raw-deal-for-artists-and-fans UK watchdog to study
5 https://www.theguardian.com/technology/2022/jan/26/tesla-elon-musk-profit-electric-car-deliveries-soar Tesla sees record pr
6 https://www.theguardian.com/technology/2022/jan/26/robot-successfully-performs-keyhole-surgery-on-pigs-without-human-help Robot successfully p
7 https://www.theguardian.com/business/2022/jan/26/ocado-develops-robots-to-enable-faster-cheaper-deliveries Ocado develops robot
8 https://www.the