# Web Scraping Goodreads Quotes

In [25]:
import csv
import requests
from bs4 import BeautifulSoup
import re

def clean_text(text):
    """Return ``text`` without non-ASCII characters and without outer whitespace.

    Every run of characters outside the 7-bit ASCII range is deleted (not
    replaced), then leading and trailing whitespace is trimmed.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    ascii_only = non_ascii_run.sub('', text)
    return ascii_only.strip()

def _element_text(element, default='N/A'):
    """Return the stripped text of a parsed element, or ``default`` if it is None."""
    return element.text.strip() if element is not None else default


def _parse_quote(quote):
    """Build one CSV row (a dict keyed by column name) from a quote element.

    Any piece that cannot be found in the markup is reported as 'N/A'
    instead of raising, so one malformed quote does not abort the run.
    """
    author = _element_text(quote.find('span', class_='authorOrTitle'))

    # The quote body is taken from the first child of the quoteText div.
    # Only a plain text node is usable; a missing div, an empty div, or a
    # leading tag all fall back to 'N/A'.
    quote_text = 'N/A'
    quote_text_elem = quote.find('div', class_='quoteText')
    if quote_text_elem is not None and quote_text_elem.contents:
        first_node = quote_text_elem.contents[0]
        if isinstance(first_node, str):
            quote_text = clean_text(first_node.strip())

    # The source is the span whose id starts with 'quote_book_link_'.
    source = _element_text(
        quote.find('span', id=lambda x: x and x.startswith('quote_book_link_'))
    )

    tags_elem = quote.find('div', class_='greyText smallText left')
    if tags_elem is not None:
        tags = ', '.join(tag.text.strip() for tag in tags_elem.find_all('a'))
    else:
        tags = 'N/A'

    # Drop the " likes" suffix; the 'N/A' placeholder is unaffected by this.
    likes = _element_text(quote.find('div', class_='right')).replace(' likes', '')

    return {'Author': author, 'Quote': quote_text, 'Source': source, 'Tags': tags, 'Likes': likes}


def scrape_goodreads_quotes(num_pages=10, *, output_path='goodreads_quotes.csv', timeout=10):
    """Scrape Goodreads quote listing pages into a CSV file.

    Pages 1 through ``num_pages`` of https://www.goodreads.com/quotes are
    fetched in order, and every ``div.quoteDetails`` element becomes one row
    with the columns Author, Quote, Source, Tags and Likes.

    Args:
        num_pages: Number of listing pages to fetch, starting at page 1.
        output_path: Path of the CSV file to (over)write.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Progress is reported with ``print``.

    Request failures (``requests.exceptions.RequestException``, including
    timeouts and non-2xx responses) are caught and reported; the header and
    any rows written before the failure remain in the file.
    """
    base_url = "https://www.goodreads.com/quotes?page={}"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    fieldnames = ['Author', 'Quote', 'Source', 'Tags', 'Likes']

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        try:
            for page_num in range(1, num_pages + 1):
                # Without a timeout a stalled connection would block forever.
                response = requests.get(
                    base_url.format(page_num), headers=headers, timeout=timeout
                )
                response.raise_for_status()  # Turn HTTP error statuses into exceptions

                soup = BeautifulSoup(response.text, 'html.parser')
                for quote in soup.find_all('div', class_='quoteDetails'):
                    writer.writerow(_parse_quote(quote))
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            print("Scraping failed.")
        else:
            print(f"Scraping successful. Data saved in the '{output_path}' file.")

if __name__ == "__main__":
    # How many listing pages to fetch when run as a script; change as needed.
    num_pages_to_scrape = 5
    scrape_goodreads_quotes(num_pages_to_scrape)


Scraping successful. Data saved in the 'goodreads_quotes.csv' file.
