In [3]:

# Web Scraping

# Steps:
# Download a webpage using requests
# Parse HTML using BeautifulSoup
# Extract data (quotes, authors, tags)
# Store results in a pandas DataFrame
# Save results to a CSV file
# (Optional) Scrape multiple pages


# Step 0: Install libraries (Colab-friendly)
# - requests: to fetch web pages
# - beautifulsoup4: to parse HTML
# - lxml: fast HTML parser (recommended)

!pip -q install requests beautifulsoup4 lxml


In [4]:
# Step 1: Import libraries
import time                     # used to pause between requests (polite scraping)
import requests                 # downloads the HTML page
from bs4 import BeautifulSoup   # parses HTML
import pandas as pd             # stores results in tables (DataFrames)


In [5]:
# Step 2: Choose the page to scrape
# quotes.toscrape.com is a practice website for scraping.
BASE_URL = "https://quotes.toscrape.com/"
page_url = BASE_URL  # start with the first page, which is the base URL itself

page_url


'https://quotes.toscrape.com/'

In [6]:
# Step 3 — Download the page HTML using requests
# requests.get(url) returns a Response object:
# - response.status_code tells us if it worked (200 means OK)
# - response.text contains the HTML as a string
# We also add a User-Agent header (good practice).

headers = {
    # A User-Agent tells the website "who" is requesting the page.
    # Sending one is a common and polite practice in web scraping.
    "User-Agent": "Mozilla/5.0 (compatible; BeginnerScraper/1.0)"
}

# timeout=20: give up instead of waiting indefinitely if the server
# does not respond within 20 seconds.
response = requests.get(page_url, headers=headers, timeout=20)

# Fail loudly on HTTP errors (4xx/5xx) so that the following cells never
# parse an error page as if it were the quotes page.
response.raise_for_status()

# Show the status code to confirm the request worked (200 means OK)
response.status_code


200

In [7]:
# response.text is the page's HTML as a string; keep it in `html` for the parsing step.
# Show only the first 500 characters, just to confirm we got content.
html = response.text
html[:500]


'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n    \n    \n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div cla'

In [8]:
# Step 4 — Parse HTML using BeautifulSoup
# BeautifulSoup turns the raw HTML string into a structured object we can search.
# The parser is chosen explicitly: "lxml" is fast and usually robust.

soup = BeautifulSoup(html, features="lxml")

# Sanity check: the text of the page's <title> should be the site name
soup.title.get_text()


'Quotes to Scrape'

In [9]:
# Step 5 — Inspect the page structure (what to scrape)
# Each quote on this website lives in its own <div class="quote"> ... </div>.
#
# Inside each quote block, we can usually find:
# - the quote text (class="text")
# - the author name (class="author")
# - the tags (class="tag")
# These are the fields we will extract into a table.


# Collect every quote block on the page
quote_blocks = soup.find_all("div", attrs={"class": "quote"})

# Number of quotes found on this page
len(quote_blocks)


10

In [10]:
# Let's inspect the first quote block (raw HTML snippet).
# It shows the elements the extraction step looks for:
# <span class="text"> (quote), <small class="author"> (author) and <a class="tag"> (tags).

quote_blocks[0]


<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
</div>
</div>

In [11]:
# Step 6 — Extract quote text, author, and tags
# We loop over the quote blocks and pull out:
#
# - quote_text
# - author
# - tags (a list joined into a comma-separated string)
#
# Each record is stored as a dict (one row); the list of dicts becomes a DataFrame.

# HTML examples to extract from
# <span class="text">This is a quote</span>
# <small class="author">Albert Einstein</small>

rows = []

for qb in quote_blocks:
    # Extract the quote text. strip=True only trims surrounding whitespace;
    # the curly quotation marks (“ ”) around the quote stay part of the text.
    quote_text = qb.find("span", class_="text").get_text(strip=True)

    # Extract the author name
    author = qb.find("small", class_="author").get_text(strip=True)

    # Extract tags (there can be multiple tags per quote)
    tag_elements = qb.find_all("a", class_="tag")
    tags = [t.get_text(strip=True) for t in tag_elements]

    # Store results
    rows.append({
        "quote": quote_text,
        "author": author,
        "tags": ", ".join(tags)  # join list into a single string
    })

# Convert results into a DataFrame. Naming the columns explicitly keeps the
# same three columns (and CSV header) even if no quote blocks were found.
df = pd.DataFrame(rows, columns=["quote", "author", "tags"])

df.head()


Unnamed: 0,quote,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"change, deep-thoughts, thinking, world"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"abilities, choices"
2,“There are only two ways to live your life. On...,Albert Einstein,"inspirational, life, live, miracle, miracles"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"aliteracy, books, classic, humor"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"be-yourself, inspirational"


In [None]:
# Write the scraped quotes to a CSV file (index=False leaves out the row index column)

output_file = "quotes_page1.csv"
df.to_csv(path_or_buf=output_file, index=False)

# Display the name of the file that was written
output_file
