In [1]:
# Web scraping practice: scrape quotes website
import requests
import bs4


In [2]:
# Send HTTP request
res = requests.get("http://quotes.toscrape.com/")

In [3]:
res

<Response [200]>

In [4]:
# View the website src code in response
res.text

'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div class="col-md-4">\n                <p>\n                \n                    <a href="/login">Login</a>\n                \n                </p>\n            </div>\n        </div>\n    \n\n<div class="row">\n    <div class="col-md-8">\n\n    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">\n        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>\n        <sp

In [5]:
# Parse the source code with Beautiful Soup
soup = bs4.BeautifulSoup(res.text,"lxml")

In [6]:
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Quotes to Scrape</title>
<link href="/static/bootstrap.min.css" rel="stylesheet"/>
<link href="/static/main.css" rel="stylesheet"/>
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<div class="row">
<div class="col-md-8">
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="t

In [7]:
# Find the authors in current page
authors=soup.find_all("small",{"class":"author"})

In [8]:
for content in authors:
    print(content.contents)

['Albert Einstein']
['J.K. Rowling']
['Albert Einstein']
['Jane Austen']
['Marilyn Monroe']
['Albert Einstein']
['André Gide']
['Thomas A. Edison']
['Eleanor Roosevelt']
['Steve Martin']


In [9]:
# Find all quotes in current page
quotes=soup.find_all("span",{"class":"text"})
for content in quotes:
    print(content.contents)

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”']
['“It is our choices, Harry, that show what we truly are, far more than our abilities.”']
['“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”']
['“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”']
["“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”"]
['“Try not to become a man of success. Rather become a man of value.”']
['“It is better to be hated for what you are than to be loved for what you are not.”']
["“I have not failed. I've just found 10,000 ways that won't work.”"]
["“A woman is like a tea bag; you never know how strong it is until it's in hot water.”"]
['“A day without sunshine is like, you know, night.”']


In [10]:
# Find the top 10 tags
# Specify to find the tags in the tags-box class
tags=soup.find("div",{"class":"tags-box"}).find_all("a")
for tag in tags:
    print(tag.contents)

['love']
['inspirational']
['life']
['humor']
['books']
['reading']
['friendship']
['friends']
['truth']
['simile']


In [11]:
# Find all unique authors in all pages

In [12]:
# Static solution (loop to the known number of pages)
# TODO: loop through pages without knowledge of number of pages in the website
page=1
authors=set()
while(page<=10):
    url = f"http://quotes.toscrape.com/page/{page}"
    res = requests.get(url)
    soup=bs4.BeautifulSoup(res.text,"lxml")
    authors_tags=soup.find_all("small",{"class":"author"})
    for author_tag in authors_tags:
        authors.update(author_tag.contents)
    page=page+1
print(list(authors))

['Terry Pratchett', 'Charles M. Schulz', 'Dr. Seuss', 'Mark Twain', 'Charles Bukowski', 'Haruki Murakami', 'Khaled Hosseini', 'W.C. Fields', 'George Eliot', 'Albert Einstein', 'Douglas Adams', 'Thomas A. Edison', 'Martin Luther King Jr.', 'William Nicholson', 'Jane Austen', 'Jorge Luis Borges', 'Mother Teresa', 'Alfred Tennyson', 'George Bernard Shaw', 'Garrison Keillor', 'Eleanor Roosevelt', 'J.M. Barrie', 'Ernest Hemingway', 'Marilyn Monroe', 'Helen Keller', 'J.D. Salinger', 'Pablo Neruda', 'E.E. Cummings', 'J.K. Rowling', 'Bob Marley', 'Steve Martin', 'André Gide', 'Elie Wiesel', 'Stephenie Meyer', 'George Carlin', 'Harper Lee', 'C.S. Lewis', 'Suzanne Collins', "Madeleine L'Engle", 'John Lennon', 'Alexandre Dumas fils', 'Allen Saunders', 'Ayn Rand', 'James Baldwin', 'Ralph Waldo Emerson', 'Jimi Hendrix', 'George R.R. Martin', 'J.R.R. Tolkien', 'Jim Henson', 'Friedrich Nietzsche']


In [13]:
# Loop through page regardless of number of pages
page=1
authors=set()
while True:
    url = f"http://quotes.toscrape.com/page/{page}"
    res = requests.get(url)
    soup=bs4.BeautifulSoup(res.text,"lxml")
    authors_tags=soup.find_all("small",{"class":"author"})
    if authors_tags == []: # When reach past max page number, no quotes and authors will be displayed to scrape.
        break;  # Break the loop to print the list
    for author_tag in authors_tags:
        
        authors.update(author_tag.contents)
    page=page+1
print(list(authors))

['Terry Pratchett', 'Charles M. Schulz', 'Dr. Seuss', 'Mark Twain', 'Charles Bukowski', 'Haruki Murakami', 'Khaled Hosseini', 'W.C. Fields', 'George Eliot', 'Albert Einstein', 'Douglas Adams', 'Thomas A. Edison', 'Martin Luther King Jr.', 'William Nicholson', 'Jane Austen', 'Jorge Luis Borges', 'Mother Teresa', 'Alfred Tennyson', 'George Bernard Shaw', 'Garrison Keillor', 'Eleanor Roosevelt', 'J.M. Barrie', 'Ernest Hemingway', 'Marilyn Monroe', 'Helen Keller', 'J.D. Salinger', 'Pablo Neruda', 'E.E. Cummings', 'J.K. Rowling', 'Bob Marley', 'Steve Martin', 'André Gide', 'Elie Wiesel', 'Stephenie Meyer', 'George Carlin', 'Harper Lee', 'C.S. Lewis', 'Suzanne Collins', "Madeleine L'Engle", 'John Lennon', 'Alexandre Dumas fils', 'Allen Saunders', 'Ayn Rand', 'James Baldwin', 'Ralph Waldo Emerson', 'Jimi Hendrix', 'George R.R. Martin', 'J.R.R. Tolkien', 'Jim Henson', 'Friedrich Nietzsche']
