In [1]:
# Install required libraries directly in Jupyter Notebook
!pip install requests beautifulsoup4 pandas



In [2]:
# Import libraries
import pandas as pd      # for data handling
import requests       # to make HTTP requests
from bs4 import BeautifulSoup   # to parse HTML content
import time     # to delay requests
from urllib.parse import quote_plus # to format URLs correctly
import re  # For regular expressions

In [3]:
# Load dataset into a pandas DataFrame
df_books = pd.read_csv('book_covers_visual_dataset.csv')

# Display the first few rows to verify it's loaded correctly
df_books.head()

Unnamed: 0,Title,Author,Publisher,Genre,Price,Promoted,Dominant_color,Cover_type,Visual_style,Title_word_count,Author_name_prominent,Award_recognition,First_published_date,Page_count,Estimated_reading_time,Rating
0,The Subtle Art of Not Giving a Fuck,Mark Manson,,,19.99,1,red,hardcover,typographic,8,0,1,,,,
1,Start with Why,Simon Sinek,,,12.99,0,white,paperback,typographic,3,0,0,,,,
2,The Housemaid,Freida McFadden,,,11.35,1,blue,paperback,illustration,2,0,1,,,,
3,The Housemaid´s Secret,Freida McFadden,,,9.59,1,red,paperback,illustration,3,0,1,,,,
4,Atomic Habits,James Clear,,,19.15,1,white,paperback,typographic,2,0,1,,,,


In [4]:
# Clean text by removing extra whitespace
def clean_string(string):
    cleaned = re.sub(r'\s+', ' ', string).strip()
    return cleaned

In [5]:
# Define headers that make the request look like it comes from a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) "
                  "Gecko/20100101 Firefox/124.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1"
}

In [6]:
# This function searches Goodreads for a book and returns the URL of the first result
def get_goodreads_url(title):
    # Convert the title into a URL-friendly format (replace spaces with +, etc.)
    query = quote_plus(title)
    
    # Build the search URL for Goodreads
    search_url = f"https://www.goodreads.com/search?q={query}"

    # Send the request using our realistic browser headers
    response = requests.get(search_url, headers=headers)

    # If the response fails, print error and return None
    if response.status_code != 200:
        print(f"Error fetching '{title}' from Goodreads")
        return None

    # Parse the HTML content of the search results page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look for the first book result in the search results
    first_result = soup.select_one('a.bookTitle')

    # If a result is found, build and return the full book URL
    if first_result:
        return "https://www.goodreads.com" + first_result['href']
    else:
        # If no book was found, show a message and return None
        print(f"No results found for '{title}'")
        return None

In [7]:
# This function scrapes the missing fields from a Goodreads book page
def scrape_book_details(url):
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) "
                  "Gecko/20100101 Firefox/124.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1"
}

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Error accessing {url}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # --- Page count ---
    try:
        pages = soup.select_one(".FeaturedDetails p[data-testid='pagesFormat']").text.strip().split()[0]
    except:
        pages = None

    # --- Rating ---
    try:
        rating = soup.find(class_="RatingStatistics__rating").text.strip()
    except:
        rating = None

    # --- Genre ---
    try:
        genre_links = soup.select("div[data-testid='genresList'] a")
        genre_list = [g.text.strip() for g in genre_links[:2]]
        genre = ", ".join(genre_list)
    except:
        genre = None

    # --- First published date ---
    try:
        pub_info = soup.select_one("p[data-testid='publicationInfo']").text.strip()
        pub_date = re.search(r"First published (.+)", pub_info)
        pub_date = pub_date.group(1) if pub_date else None
    except:
        pub_date = None

    # --- Publisher from embedded JSON ---
    try:
        script = soup.find("script", string=lambda s: s and '"publisher"' in s)
        json_text = script.string
        publisher = re.search(r'"publisher":"(.*?)"', json_text).group(1)
    except:
        publisher = None

    return {
        "Page_count": pages,
        "Rating": rating,
        "Genre": genre,
        "First_published": pub_date,
        "Publisher": publisher
    }

In [8]:
# Test one book to check all 5 fields from Goodreads
test_title = "The Subtle Art of Not Giving a Fuck"

# Step 1: Get Goodreads URL from the title
url = get_goodreads_url(test_title)
print("Goodreads URL:", url)

# Step 2: Extract book details (all 5 fields)
if url:
    book_details = scrape_book_details(url)
    print("Book details extracted:")
    for key, value in book_details.items():
        print(f"{key}: {value}")
else:
    print("Could not find a Goodreads page for this title.")


Goodreads URL: https://www.goodreads.com/book/show/49682914-the-subtle-art-of-not-giving-a-fuck?from_search=true&from_srp=true&qid=yReuOXfqPy&rank=1
Book details extracted:
Page_count: 206
Rating: 3.87
Genre: Nonfiction, Self Help
First_published: January 1, 2016
Publisher: HarperCollins


In [9]:
# Create empty lists for each field
new_pages = []
new_ratings = []
new_genres = []
new_dates = []
new_publishers = []

# Loop over each title
for title in df_books['Title']:
    print(f"Searching Goodreads for: {title}")
    
    url = get_goodreads_url(title)
    if url:
        details = scrape_book_details(url)
        if details:
            # Append the extracted values or None if missing
            new_pages.append(details.get("Page_count"))
            new_ratings.append(details.get("Rating"))
            new_genres.append(details.get("Genre"))
            
            # --- Format the date ---
            raw_date = details.get("First_published")
            try:
                date_obj = pd.to_datetime(raw_date, errors='coerce')
                formatted_date = date_obj.strftime('%Y-%m-%d') if not pd.isna(date_obj) else None
            except:
                formatted_date = None
            new_dates.append(formatted_date)

            new_publishers.append(details.get("Publisher"))
        else:
            new_pages.append(None)
            new_ratings.append(None)
            new_genres.append(None)
            new_dates.append(None)
            new_publishers.append(None)
    else:
        new_pages.append(None)
        new_ratings.append(None)
        new_genres.append(None)
        new_dates.append(None)
        new_publishers.append(None)
    
    # Wait to avoid overloading the server
    time.sleep(5)

Searching Goodreads for: The Subtle Art of Not Giving a Fuck
Searching Goodreads for: Start with Why
Searching Goodreads for: The Housemaid
Searching Goodreads for: The Housemaid´s Secret
Searching Goodreads for: Atomic Habits
Searching Goodreads for: Onyx Storm
Searching Goodreads for: The Vegetarian 
Searching Goodreads for: Elphie
Searching Goodreads for: Nothing Like The Movies
Searching Goodreads for: 1984
Searching Goodreads for: The Seven Husbands of Evelyn Hugo
Searching Goodreads for: The Striker
Searching Goodreads for: We All Live Here
Searching Goodreads for: Story of My Life
Searching Goodreads for: Little Women
Searching Goodreads for: The Secret History
Searching Goodreads for: The Fourth Consort
Searching Goodreads for: Caraval
Searching Goodreads for: Normal People
Searching Goodreads for: Orbital
Searching Goodreads for: The Pumpkin Spice Café
Searching Goodreads for: Wild Side
Searching Goodreads for: Cleopatra and Frankenstein
Searching Goodreads for: Taming 7
Searc

In [10]:
# Check how many entries were successfully scraped for each field
print("Page_count filled:", sum(x is not None for x in new_pages))
print("Rating filled:", sum(x is not None for x in new_ratings))
print("Genre filled:", sum(x is not None for x in new_genres))
print("First_published filled:", sum(x is not None for x in new_dates))
print("Publisher filled:", sum(x is not None for x in new_publishers))

Page_count filled: 80
Rating filled: 80
Genre filled: 80
First_published filled: 79
Publisher filled: 80


In [11]:
# Add the new columns to df_books
df_books["Page_count"] = new_pages
df_books["Rating"] = new_ratings
df_books["Genre"] = new_genres
df_books["First_published"] = new_dates
df_books["Publisher"] = new_publishers

# Check a sample
df_books[["Title", "Page_count", "Rating", "Genre", "First_published", "Publisher"]].head()

Unnamed: 0,Title,Page_count,Rating,Genre,First_published,Publisher
0,The Subtle Art of Not Giving a Fuck,206,3.87,"Nonfiction, Self Help",2016-01-01,HarperCollins
1,Start with Why,256,4.1,"Business, Leadership",2009-10-29,Portfolio
2,The Housemaid,329,4.31,"Thriller, Mystery",2022-04-26,Bookouture
3,The Housemaid´s Secret,Audible,4.21,"Thriller, Mystery",2023-02-15,Editora Arqueiro
4,Atomic Habits,319,4.34,"Nonfiction, Self Help",2018-10-18,Avery


In [12]:
# Save the full dataset with all columns
df_books.to_csv("book_covers_dataset_with_goodreads.csv", index=False)
print("File saved as 'book_covers_dataset_with_goodreads.csv'")

File saved as 'book_covers_dataset_goodreads.csv'
