# In [None]:
import csv
import json
import pickle
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Start WebDriver session
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Load Goodreads
driver.get("https://www.goodreads.com")

# Load saved cookies for authentication
try:
    with open("cookies.pkl", "rb") as file:
        cookies = pickle.load(file)
        for cookie in cookies:
            driver.add_cookie(cookie)
    print("✅ Cookies loaded successfully!")
    driver.refresh()
    time.sleep(3)
except FileNotFoundError:
    print("❌ Cookie file not found. Login required.")

# Extract book URLs from Goodreads list
goodreads_list_url = "https://www.goodreads.com/list/show/312.Best_Humorous_Books"
driver.get(goodreads_list_url)
time.sleep(5)

soup = BeautifulSoup(driver.page_source, "html.parser")
book_urls = []
seen_urls = set()

for book in soup.select("a.bookTitle"):
    book_url = "https://www.goodreads.com" + book["href"]
    if book_url not in seen_urls:
        book_urls.append(book_url)
        seen_urls.add(book_url)
    if len(book_urls) >= 10:  # Stop after collecting 10 unique books
        break

print(f"🔍 Extracted {len(book_urls)} book URLs from the list.")

# Open CSV file for saving data
with open("Best_Humorous_Books.csv", "w", newline="", encoding="utf-8-sig") as file:
    writer = csv.writer(file)
    writer.writerow([
        "Book Name", "Image", "Book Format", "Published Date", 
        "Number of Pages", "Language", "Authors", "Rating", 
        "Rating Count", "Review Count", "Genres", "ASIN", "Publisher", "ISBN-13", 
        "Amazon Affiliate Link", "Google Books Preview Link"
    ])

    # Loop through Goodreads book URLs
    for full_url in book_urls:
        driver.get(full_url)
        time.sleep(5)

        # **Hover over the section to reveal the button**
        try:
            book_details_section = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "BookDetails"))
            )
            actions = ActionChains(driver)
            actions.move_to_element(book_details_section).perform()
            time.sleep(2)
        except:
            print("⚠️ Could not find book details section.")

        # **Click the 'Book details & editions' button**
        try:
            button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(@aria-label, 'Book details and editions')]"))
            )
            button.click()
            time.sleep(2)
        except:
            print("⚠️ Could not find or click the 'Book details & editions' button.")

        # Extract book details
        try:
            published_date = driver.find_element(By.CSS_SELECTOR, "p[data-testid='publicationInfo']").text.strip()
        except:
            published_date = "N/A"

        # **Extract Data from JSON-LD (Metadata in Page Source)**
        book_name, image, book_format, number_of_pages, in_language_json = "N/A", "N/A", "N/A", "N/A", "N/A"
        person_names, rating_value, rating_count, review_count, genres = "N/A", "N/A", "N/A", "N/A", "N/A"

        try:
            soup = BeautifulSoup(driver.page_source, "html.parser")
            script_tag = soup.find("script", type="application/ld+json")

            if script_tag:
                data = json.loads(script_tag.string)
                book_name = data.get("name", "N/A")
                image = data.get("image", "N/A")
                book_format = data.get("bookFormat", "N/A")
                number_of_pages = data.get("numberOfPages", "N/A")
                in_language_json = data.get("inLanguage", "N/A")

                authors = data.get("author", [])
                person_names = ", ".join([person.get("name", "N/A") for person in authors]) if isinstance(authors, list) else authors.get("name", "N/A")

                aggregate_rating = data.get("aggregateRating", {})
                rating_value = aggregate_rating.get("ratingValue", "N/A")
                rating_count = aggregate_rating.get("ratingCount", "N/A")
                review_count = aggregate_rating.get("reviewCount", "N/A")

            # Extract genres
            genre_tags = soup.select("a[href*='/genres/']")
            genres = ", ".join([genre.text.strip() for genre in genre_tags]) if genre_tags else "N/A"

        except Exception as e:
            print(f"⚠️ Error extracting JSON-LD data: {e}")

        # **Extract ASIN, Publisher, and ISBN-13**
        asin, publisher, isbn13 = "N/A", "N/A", "N/A"

        try:
            script_tags = soup.find_all("script", type="application/json")

            for script in script_tags:
                json_data = json.loads(script.string)
                
                if isinstance(json_data, dict):
                    book_details = json_data.get("props", {}).get("pageProps", {}).get("apolloState", {})
                    
                    for key in book_details:
                        if key.startswith("Book:"):
                            book = book_details[key]
                            asin = book.get("details", {}).get("asin", "N/A")
                            publisher = book.get("details", {}).get("publisher", "N/A")
                            isbn13 = book.get("details", {}).get("isbn13", "N/A")
                            break  # Exit loop once book data is found

        except Exception as e:
            print(f"⚠️ Error extracting ASIN, Publisher, and ISBN-13: {e}")

        # **Generate Amazon Affiliate Link**
        if asin != "N/A":
            amazon_affiliate_link = f"https://www.amazon.in/dp/{asin}/ref=nosim?tag=thopdev-21"
        else:
            amazon_affiliate_link = "N/A"

        # **Find Google Books Preview Link**
        google_books_preview_link = "N/A"
        if isbn13 != "N/A":
            google_books_api_url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn13}"
            try:
                response = requests.get(google_books_api_url)
                data = response.json()
                if "items" in data and len(data["items"]) > 0:
                    volume_id = data["items"][0]["id"]
                    google_books_preview_link = f"https://www.google.co.in/books/edition/_/{volume_id}?hl=en&gbpv=1"
            except Exception as e:
                print(f"⚠️ Error fetching Google Books preview link: {e}")

        # Save data to CSV (Ensure ASIN & ISBN-13 are stored as text to keep leading zeros)
        writer.writerow([
            book_name, image, book_format, published_date, 
            number_of_pages, in_language_json, person_names, 
            rating_value, rating_count, review_count, genres, 
            f"'{asin}", publisher, f"'{isbn13}", amazon_affiliate_link, google_books_preview_link
        ])
        print(f"✅ Scraped: {book_name} | ASIN: {asin} | ISBN-13: {isbn13} | Amazon Link: {amazon_affiliate_link} | Google Books: {google_books_preview_link}")

# Close WebDriver
driver.quit()
print("🚀 Task completed! All book details saved in 'Best_Humorous_Books.csv'.")
