In [1]:
# We are scraping a website to get the highest rated books
# url = https://books.toscrape.com/index.html

# STEPS-
# 1. Find the website to scrape.
# 2. Decide the information that you want to scrape. In this case i want to scrape book_name, price, rating, UPC, Genre.
# 3. Make a CSV file that has the scraped information.
# 4. We can find the highest rated books from each genre or just overall highest rated books.
#

In [2]:
pip install requests --upgrade --quiet

In [3]:
import requests

In [4]:
url = 'http://books.toscrape.com/'

response = requests.get(url)

In [5]:
print("Status Code:", response.status_code)
# if this value is between 200 and 299, it means that the site has been scraped successfully

Status Code: 200


In [6]:
page_content = response.text
print("Content length:", len(page_content))
print("First 500 characters:\n", page_content[:500])

Content length: 51294
First 500 characters:
 <!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html lang="en-us" class="no-js"> <!--<![endif]-->
    <head>
        <title>
    All products | Books to Scrape - Sandbox
</title>

        <meta http-equiv="content-type" content="text/html; charset=UTF-8" /


In [8]:
with open('books.html', 'w') as f:
    f.write(page_content)
# books.html is copy of the url

In [9]:
pip install beautifulsoup4 --upgrade --quiet

In [11]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
# we parsed the website

In [12]:
# As the genre name is in an <a> tag, let's see the number of <a> tags
a_tags = soup.find_all('a')
print("Number of <a> tags:", len(a_tags))

# there are many <a> tags, so we can't get the genre links directly

Number of <a> tags: 94


In [13]:
#We are getting the genre link

from urllib.parse import urljoin

# Select all genre links (excluding the top "Books" link)
genre_tags = soup.select('ul.nav.nav-list > li > ul > li > a')

genre_links = []

for tag in genre_tags:
    genre_name = tag.text.strip()
    genre_url = urljoin(url, tag['href'])  # build full URL
    genre_links.append((genre_name, genre_url))

print(f"Found {len(genre_links)} genres")
for genre in genre_links[:5]:
    print(genre)


Found 50 genres
('Travel', 'http://books.toscrape.com/catalogue/category/books/travel_2/index.html')
('Mystery', 'http://books.toscrape.com/catalogue/category/books/mystery_3/index.html')
('Historical Fiction', 'http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html')
('Sequential Art', 'http://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html')
('Classics', 'http://books.toscrape.com/catalogue/category/books/classics_6/index.html')


In [17]:
# We are getting all the book links
all_book_urls = []
book_genres = []

for genre_name, genre_url in genre_links:
    #print(f"Collecting book URLs in genre: {genre_name}")

    current_url = genre_url
    while True:
        response = requests.get(current_url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(response.text, 'html.parser')

        books = soup.select('h3 > a')
        for book in books:
            book_url = urljoin(current_url, book['href'])
            if 'catalogue/' not in book_url:
                book_url = book_url.replace('books/', 'catalogue/books/')
            all_book_urls.append(book_url)
            book_genres.append(genre_name)

        # Go to next page if it exists
        next_button = soup.select_one('li.next > a')
        if next_button:
            current_url = urljoin(current_url, next_button['href'])
        else:
            break

print(f"Total books found: {len(all_book_urls)}")

Total books found: 1000


In [18]:
for book_url in all_book_urls[:5]:
    print(book_url)

http://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html
http://books.toscrape.com/catalogue/full-moon-over-noahs-ark-an-odyssey-to-mount-ararat-and-beyond_811/index.html
http://books.toscrape.com/catalogue/see-america-a-celebration-of-our-national-parks-treasured-sites_732/index.html
http://books.toscrape.com/catalogue/vagabonding-an-uncommon-guide-to-the-art-of-long-term-world-travel_552/index.html
http://books.toscrape.com/catalogue/under-the-tuscan-sun_504/index.html


In [30]:
# We are collecting the data now
book_data = []

for idx, book_url in enumerate(all_book_urls):
    try:
        response = requests.get(book_url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(response.text, 'html.parser')

        book_name = soup.select_one('h1').text.strip()
        price = soup.select_one('p.price_color').text.strip().replace('£', '')


        rating_tag = soup.select_one('p.star-rating')
        rating = rating_tag['class'][1] if rating_tag else 'Unknown'

        upc = ''
        table = soup.select('table.table.table-striped')
        if table:
            rows = table[0].select('tr')
            for row in rows:
                if 'UPC' in row.text:
                    upc = row.select_one('td').text.strip()
                    break

        genre_name = book_genres[idx]

        book_data.append({
            'book_name': book_name,
            'price': price,
            'ratings': rating,
            'upc_code': upc,
            'genre': genre_name
        })

        if idx % 100 == 0:
            print(f"Scraped {idx} / {len(all_book_urls)} books")

    except Exception as e:
        print(f"Error scraping {book_url}: {e}")


Scraped 0 / 1000 books
Scraped 100 / 1000 books
Scraped 200 / 1000 books
Scraped 300 / 1000 books
Scraped 400 / 1000 books
Scraped 500 / 1000 books
Scraped 600 / 1000 books
Scraped 700 / 1000 books
Scraped 800 / 1000 books
Scraped 900 / 1000 books


In [31]:
# Making the CSV file
import pandas as pd

df = pd.DataFrame(book_data)
df.to_csv('books.csv', index=False)
print("Saved to books.csv")


Saved to books.csv


In [37]:
df = pd.read_csv('books.csv')
df.head()

Unnamed: 0,book_name,price,ratings,upc_code,genre
0,It's Only the Himalayas,Â45.17,Two,a22124811bfa8350,Travel
1,Full Moon over Noahâs Ark: An Odyssey to Mou...,Â49.43,Four,ce60436f52c5ee68,Travel
2,See America: A Celebration of Our National Par...,Â48.87,Three,f9705c362f070608,Travel
3,Vagabonding: An Uncommon Guide to the Art of L...,Â36.94,Two,1809259a5a5f1d8d,Travel
4,Under the Tuscan Sun,Â37.33,Three,a94350ee74deaa07,Travel


In [33]:
len(df)

1000

In [38]:
df['ratings'].unique()



array(['Two', 'Four', 'Three', 'One', 'Five'], dtype=object)

In [39]:
rating_map = {
    'One': 1,
    'Two': 2,
    'Three': 3,
    'Four': 4,
    'Five': 5
}

df['ratings'] = df['ratings'].map(rating_map)


In [40]:
df['ratings'].unique()

array([2, 4, 3, 1, 5])

In [41]:
df.head()

Unnamed: 0,book_name,price,ratings,upc_code,genre
0,It's Only the Himalayas,Â45.17,2,a22124811bfa8350,Travel
1,Full Moon over Noahâs Ark: An Odyssey to Mou...,Â49.43,4,ce60436f52c5ee68,Travel
2,See America: A Celebration of Our National Par...,Â48.87,3,f9705c362f070608,Travel
3,Vagabonding: An Uncommon Guide to the Art of L...,Â36.94,2,1809259a5a5f1d8d,Travel
4,Under the Tuscan Sun,Â37.33,3,a94350ee74deaa07,Travel


In [44]:
top_rated_books = df.sort_values('ratings', ascending=False).groupby('genre').first().reset_index()

In [45]:
top_rated_books.to_csv('top_books_by_genre.csv', index=False)


In [49]:
output = pd.read_csv('top_books_by_genre.csv')
len(output)


50

In [50]:
output.head()

Unnamed: 0,genre,book_name,price,ratings,upc_code
0,Academic,Logan Kade (Fallen Crest High #5.5),Â13.12,2,7093cf549cd2e7de
1,Add a comment,Modern Romance,Â28.26,5,caf4fe9311f1dc59
2,Adult Fiction,Fifty Shades Freed (Fifty Shades #3),Â15.36,5,ed813a848580ba50
3,Art,Ways of Seeing,Â39.51,5,66a4e422b212726a
4,Autobiography,Life Without a Recipe,Â59.04,5,c53d9fefcda371e9
