## Step 1 fundamentals

In [1]:
import requests

In [2]:
url = "https://quotes.toscrape.com"
response = requests.get(url)

In [3]:
response.status_code

200

In [4]:
response.text[:500]

'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n    \n    \n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div cla'

## BeautifulSoup

In [5]:
from bs4 import BeautifulSoup

In [6]:
soup = BeautifulSoup(response.text, "html.parser")
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Quotes to Scrape</title>
<link href="/static/bootstrap.min.css" rel="stylesheet"/>
<link href="/static/main.css" rel="stylesheet"/>
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<div class="row">
<div class="col-md-8">
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="

### Scraping the first quote

In [7]:
first_quote = soup.select_one('.quote')
first_quote.prettify()

'<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">\n <span class="text" itemprop="text">\n  “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”\n </span>\n <span>\n  by\n  <small class="author" itemprop="author">\n   Albert Einstein\n  </small>\n  <a href="/author/Albert-Einstein">\n   (about)\n  </a>\n </span>\n <div class="tags">\n  Tags:\n  <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>\n  <a class="tag" href="/tag/change/page/1/">\n   change\n  </a>\n  <a class="tag" href="/tag/deep-thoughts/page/1/">\n   deep-thoughts\n  </a>\n  <a class="tag" href="/tag/thinking/page/1/">\n   thinking\n  </a>\n  <a class="tag" href="/tag/world/page/1/">\n   world\n  </a>\n </div>\n</div>\n'

### Extracting a single quote

In [8]:
text = first_quote.select_one('.text').get_text()
author = first_quote.select_one('.author').getText()
tags = [tag.get_text() for tag in first_quote.select('.tag')]

print(f"Quote: {text}\nAuthor: {author}\nTags: {tags}")

Quote: “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
Author: Albert Einstein
Tags: ['change', 'deep-thoughts', 'thinking', 'world']


### Extracting all quotes

In [9]:
quotes = []
for quote in soup.select('.quote'):
    text = quote.select_one('.text').get_text()
    author = quote.select_one('.author').get_text()
    tags = [tag.get_text() for tag in quote.select('.tag')]
    quotes.append({
        'text': text,
        'author': author,
        'tags': tags
    })
print("Total quotes found:", len(quotes))
for q in quotes:
    print(q)

Total quotes found: 10
{'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'author': 'Albert Einstein', 'tags': ['change', 'deep-thoughts', 'thinking', 'world']}
{'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'author': 'J.K. Rowling', 'tags': ['abilities', 'choices']}
{'text': '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”', 'author': 'Albert Einstein', 'tags': ['inspirational', 'life', 'live', 'miracle', 'miracles']}
{'text': '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”', 'author': 'Jane Austen', 'tags': ['aliteracy', 'books', 'classic', 'humor']}
{'text': "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”", 'author': 'Marilyn Monroe', 'tags': ['be-yourse

### Handling Pagination

In [14]:
import pprint


base_url = "https://quotes.toscrape.com"
url = base_url

while True:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    for q in soup.select('.quote'):
        text = q.select_one('.text').get_text()
        author = q.select_one('.author').get_text()
        tags = [tag.get_text() for tag in q.select('.tag')]
        quotes.append({"text": text, "author": author, "tags": tags})
        
    next_button = soup.select_one(".next > a")
    if next_button:
        url = base_url + next_button["href"]
    else:
        break
print("Total quotes scraped:", len(quotes))
print("First 3 quotes:")
for q in quotes[:3]:
    print(q)

Total quotes scraped: 510
First 3 quotes:
{'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'author': 'Albert Einstein', 'tags': ['change', 'deep-thoughts', 'thinking', 'world']}
{'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'author': 'J.K. Rowling', 'tags': ['abilities', 'choices']}
{'text': '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”', 'author': 'Albert Einstein', 'tags': ['inspirational', 'life', 'live', 'miracle', 'miracles']}
