### Web Scraping
- Check for access to a site - 200 status code
- get data in html
- get text data as a whole
- filter data to display specific data e.g quotes - title, author values
- access data with html different selectors and tags
- save data to csv file
- get data from all pages
- save data from all pages to csv
- scrape data using selenium automation tool

In [12]:
import requests
import re
from bs4 import BeautifulSoup

r = requests.get('https://quotes.toscrape.com/')
print(r.status_code)
# print(r.text)

if "blocked" in r.text:
    print("We have been blocked")
elif r.status_code == 200:
   print("success")

print(r.headers.get("content-type","unknown"))

texts = re.findall(r'\$[0-9,.]+', r.text)
print(texts)


200
success
text/html; charset=utf-8
[]


In [27]:
soup = BeautifulSoup(r.text, "html.parser")
# print(soup)

links = soup.find_all("a")
# print(links)

quotes = soup.find_all("div", "quote")
# print(quotes)

for quote in quotes:
    text = quote.find("span", class_= "text").get_text()
    author = quote.find("small", class_="author").get_text()
    print(f"Quote: {text}\nAuthor: {author}")
    print("-----------------------------------------------------------")



Quote: “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
 Author: Albert Einstein
-----------------------------------------------------------
Quote: “It is our choices, Harry, that show what we truly are, far more than our abilities.”
 Author: J.K. Rowling
-----------------------------------------------------------
Quote: “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
 Author: Albert Einstein
-----------------------------------------------------------
Quote: “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
 Author: Jane Austen
-----------------------------------------------------------
Quote: “Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
 Author: Marilyn Monroe
---------------------------------------------------------

In [101]:
import csv 
import requests
from bs4 import BeautifulSoup

base_url = 'https://quotes.toscrape.com/page/{}/'

csv_file= "quotes.csv"
with open(csv_file, mode='w', newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Quote", "Author"])

    page = 1
    while True:
        url = base_url.format(page)
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Error to retrieve page {page}. status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        quotes= soup.select(".quote")

        if not quotes:
            break

        for quote in quotes:
            text = quote.select_one(".text").get_text()
            author = quote.select_one(".author").get_text()
            writer.writerow([text, author])
        page += 1

print(f"All Quotes saved to {csv_file}")


All Quotes saved to quotes.csv


### toscrape Books

In [60]:
response = requests.get('https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html')

if response.status_code == 200:
    print("Success")
    
    # Test if you can get the text
    # print(response.text)

soup = BeautifulSoup(response.text, "html.parser")
product_details = soup.select(".product_page")
# print(product_details)

image_url = soup.find("div", class_="item active").find("img").get('src')
# print(image_url)

product_details = soup.find("div", class_="product_main")

product_name = product_details.find("h1").get_text()
# print(product_name)

product_price = product_details.select_one(".price_color").get_text()
# print(product_price)

product_in_stock = product_details.find("p", class_="instock availability").get_text().strip()
# print(product_in_stock)

product_description = product_details.find_all("p")
# print(product_description)

print(f'''
    Book Name: {product_name.title()}
    price: {product_price}
    inStock: {product_in_stock}
    image_url: {image_url}
      ''')

Success

    Book Name: Tipping The Velvet
    price: Â£53.74
    inStock: In stock (20 available)
    image_url: ../../media/cache/08/e9/08e94f3731d7d6b760dfbfbc02ca5c62.jpg
      


### get all books

In [108]:
response = requests.get('https://books.toscrape.com/')

if response.status_code == 200:
    print("Success")
    
    # Test if you can get the text
    # print(response.text)


soup = BeautifulSoup(response.text, "html.parser")
all_books = soup.find('ol', class_="row").find_all("li", class_="col-xs-6 col-sm-4 col-md-3 col-lg-3" )
# print(all_books)

for book in all_books:
    book_name = book.find("h3").find("a").get("title")
    print(f"Name: {book_name}")
    book_price = book.find("div", "product_price").find("p", "price_color").get_text()
    print(f"price: {book_price}")
    book_in_stock = book.find("div", "product_price").find("p", class_="instock availability").get_text().strip()
    print(f"in Stock: {book_in_stock}")
    print("-----------------------------------------------------------\n")


    




Success
Name: A Light in the Attic
price: Â£51.77
in Stock: In stock
-----------------------------------------------------------
Name: Tipping the Velvet
price: Â£53.74
in Stock: In stock
-----------------------------------------------------------
Name: Soumission
price: Â£50.10
in Stock: In stock
-----------------------------------------------------------
Name: Sharp Objects
price: Â£47.82
in Stock: In stock
-----------------------------------------------------------
Name: Sapiens: A Brief History of Humankind
price: Â£54.23
in Stock: In stock
-----------------------------------------------------------
Name: The Requiem Red
price: Â£22.65
in Stock: In stock
-----------------------------------------------------------
Name: The Dirty Little Secrets of Getting Your Dream Job
price: Â£33.34
in Stock: In stock
-----------------------------------------------------------
Name: The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
price: Â£17.93
in Stock: In 

In [None]:
import requests
import re

page = 1
while True:
    r = requests.get(f'https://quotes.toscrape.com/page/{page}/')
    print(r.status_code)
    
    quotes = soup.select(".quote")
    # print(quotes)
    if not quotes:
        break


    for quote in quotes:
        text = quote.select_one(".text").get_text()
        author = quote.select_one(".author").get_text()
        print(f"Quote: {text}\nAuthor: {author}")
        print("-----------------------------------------------------------")
    
    page += 1


200


In [112]:
import csv 
import requests
from bs4 import BeautifulSoup

books_base_url = 'https://books.toscrape.com/catalogue/page-{}.html'
csv_file = "books.csv"

with open(csv_file, mode='w', newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Name", "Price", "inStock"])

    page = 1
    while True:
        url = books_base_url.format(page)
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Error to retrieve page {page}. status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        books= soup.find('ol', class_="row").find_all("li", class_="col-xs-6 col-sm-4 col-md-3 col-lg-3" )

        if not books:
            break

        for book in all_books:
            book_name = book.find("h3").find("a").get("title")
            book_price = book.find("div", "product_price").find("p", "price_color").get_text()
            book_in_stock = book.find("div", "product_price").find("p", class_="instock availability").get_text().strip()
            writer.writerow([book_name, book_price, book_in_stock])
        
        page += 1
print(f"All Quotes saved to {csv_file}")


Error to retrieve page 51. status code: 404
All Quotes saved to books.csv
