# 1. Libraries


In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install selenium
!pip install webdriver_manager

Collecting selenium
  Downloading selenium-4.18.1-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.24.0-py3-none-any.whl (460 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.2/460.2 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?

In [3]:
import os
import re
import requests
import time
import random
import pandas as pd
import unicodedata

from io import BytesIO
from PIL import Image
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse

# 2. Start crawling data




In [4]:
# Step 1: Initialize Google Chrome browser
chrome_options = webdriver.ChromeOptions()

chrome_options.add_argument("start-maximized")
chrome_options.add_argument('--headless=new')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=chrome_options)

In [5]:
# Step 2: Create empty folder for storing data
root_dir = './goodreads_choiceawards'
os.makedirs(root_dir, exist_ok=True)

img_root_dir = './goodreads_choiceawards_imgs'
os.makedirs(img_root_dir, exist_ok=True)

start_year = 2011
end_year = 2024
genre_id = 1
existing_genres = set()

In [6]:
# Step 3: Get list of books for each genre of each year
for year in range(2022, 2023):
    # Access to table page
    main_url = f'https://www.goodreads.com/choiceawards/best-books-{year}'
    driver.get(main_url)
    time.sleep(1)

    # Get list of articles (list of URLs)
    books_lst_xpath = '//div[@class="categoryContainer"]/div/a'
    books_tags = driver.find_elements(
        By.XPATH,
        books_lst_xpath
    )
    books_year_urls = [
        books_tag.get_attribute('href') \
        for books_tag in books_tags
    ]

    print(books_year_urls) # Just to check errors

    # Step 4: Get list of nominees of each genre of each year
    for books_year_url in books_year_urls:
        # Access to book content page
        driver.get(books_year_url)
        time.sleep(1)

        # Get list of articles (list of URLs)
        books_nominees_xpath = '//a[@class="pollAnswer__bookLink"]'
        nominees_tags = driver.find_elements(
            By.XPATH,
            books_nominees_xpath
        )
        # Assuming you already have the nominees_tags list
        books_nominees_urls = [nominee.get_attribute('href')
                              for nominee in nominees_tags]

        # Print the URLs (for debugging)
        print(books_nominees_urls)

        # Step 5: Access to book brief content

        for nominee in books_nominees_urls:
            # Access to book content page
            driver.get(nominee)
            time.sleep(1)

        # Try to get main content tag
        main_content_xpath = '//div[@class="BookPage__mainContent"]'
        main_content_tag = driver.find_element(
            By.XPATH,
            main_content_xpath
        )

        # Get title of the book
        title = main_content_tag.find_element(
                By.TAG_NAME,
                'h1'
            ).text.strip()

        # Get author
        author_xpath = '//span[@class="ContributorLink__name"]'
        author = main_content_tag.find_element(
            By.XPATH,
            author_xpath
        )

        # Get price
        price_xpath = '//*[@id="__next"]/div[2]/main/div[1]/div[1]/div/div[2]/div[2]/div/div[1]/button/span[1]'
        price = main_content_tag.find_element(
            By.XPATH,
            price_xpath)

        # Get genre
        genre_xpath = '//*[@id="__next"]/div[2]/main/div[1]/div[2]/div[2]/div[2]/div[6]/ul/span[1]/span[2]/a/span'
        try:
          genre = main_content_tag.find_element(
              By.XPATH,
              genre_xpath)
        except:
          genre = 'Unknown'
          continue

        # Author information
        author_info_xpath = '//*[@id="__next"]/div[2]/main/div[1]/div[2]/div[2]/div[2]/div[9]/div[3]/div[1]/div/div/span'
        author_info = main_content_tag.find_element(
            By.XPATH,
            author_info_xpath)

        # Publication date
        date_xpath = '//*[@id="__next"]/div[2]/main/div[1]/div[2]/div[2]/div[2]/div[7]/div/span[1]/span/div/p[2]'
        date_text = main_content_tag.find_element(By.XPATH, date_xpath).text
        date_parts = date_text.split(" ")
        date = " ".join(date_parts[2:])  # Exclude the "First published" part

        # Get description
        description_xpath = '//*[@id="__next"]/div[2]/main/div[1]/div[2]/div[2]/div[2]/div[5]/div/div[1]/div/div/span'
        description = main_content_tag.find_element(
            By.XPATH,
            description_xpath
        )

        # Get rating
        rating_xpath = '//*[@id="__next"]/div[2]/main/div[1]/div[2]/div[2]/div[2]/div[2]/a/div[1]/div'
        rating = main_content_tag.find_element(
            By.XPATH,
            rating_xpath
        )

        # Get list of images (list of tags)
        imgs_lst_xpath = '//*[@id="__next"]/div[2]/main/div[1]/div[1]/div/div[1]/div/div/div/div/div/div/img'
        imgs_tags = driver.find_elements(
            By.XPATH,
            imgs_lst_xpath
        )

        # Step 6: Combine all the text data of the book
        # Assuming everything is WebElement objects
        author_text = author.text
        try:
          price_text = f"{price.text.split('$')[1]}"
        except:
          price_text = '0.00'
          continue
        genre_text = genre.text
        description_text = description.text.strip()
        rating_text = rating.text

        # Combine the extracted text content
        final_content_lst = [title, author_text, genre_text, price_text, author_info.text, date, description_text, rating_text]
        final_content = '\n\n'.join(final_content_lst)

        # Step 7: Prepare data for making queries

        # Write the name for the file (will save later)
        parsed_url = urlparse(nominee)
        path = parsed_url.path
        parts = path.split("/")[1:]
        my_title = parts[-1].split("-")
        file_name = "-".join(my_title[1:])  # Exclude the "book-show" part

        # ID: Extract Work ID of the book
        id = re.findall(r"\d+", nominee)
        id_text = id[0]

        # Quantity: Generate a random number between 0 and 100
        if price_text == '0.00':
          quantity = 0
        else:
          quantity = random.randint(0, 100)

        # Escape single quotes in author_info.text and description_text
        escaped_author_info = author_info.text.replace("'", "''")
        escaped_description = description_text.replace("'", "''")

        # Write img path for query
        img_path = f'images/{file_name}.jpg'

        # Step 8: Generate SQL Queries for Database
        # Generate SQL Queries for Genres table
        with open('genres.txt', "a+") as file:
            for line in file:
              line = line.strip()
              # print(line)
              if line.startswith("INSERT INTO"):
                  genre = line.split("'")[1]
                  existing_genres.add(genre)

        if genre_text not in existing_genres:
            # Normalize like this: genre-name
            slug = re.sub(r'[^a-zA-Z0-9]+', '-', unicodedata.normalize('NFKD', genre_text).encode('ascii', 'ignore').decode('utf-8')).strip('-').lower()


            # Append the new genre with the incremented ID to the file
            with open('genres.txt', "a") as append_file:
                append_file.write(f"INSERT INTO Genres VALUES('{slug}', '{genre_text}')\n")

            existing_genres.add(genre_text)
            print(f"Added genre: {genre_text} with ID: {slug} and number: {genre_id}")
            genre_id += 1


        # Generate the SQL query with escaped values
        sql_query_books = "INSERT INTO products VALUES ('{}', '{}', '{}', '{}', {}, {}, '{}', '{}', '{}', '{}', {})".format(
            id_text, title, author_text, genre_text, price_text, quantity, escaped_author_info, date, escaped_description, img_path, rating_text
        )

        # Open the file in write mode and write the SQL query of the product to a new line
        with open('products.txt', 'a') as file:
            file.write(sql_query_books + '\n')

        # Book savepath
        book_filename = f'{file_name}.txt'
        book_savepath = os.path.join(
            root_dir,
            book_filename
        )
        with open(book_savepath, 'w') as f:
          f.write(final_content)

        # Step 6: get imgs
        # Get list of image urls
        img_urls = [
            imgs_tag.get_attribute('src') \
                for imgs_tag in imgs_tags
        ]

        for img_url in img_urls:
            # Get image information
            img_url_resp = requests.get(img_url)
            try:
                # Read image
                img = Image.open(
                    BytesIO(img_url_resp.content)
                )
            except:
                continue

            # Convert image to RGB format if needed
            if img.mode == 'P':
                img = img.convert('RGB')

            # Save crawled image
            img_name = f'{file_name}.jpg'
            img_save_path = os.path.join(img_root_dir, img_name)
            img.save(img_save_path)

        # Move back to previous page
        driver.back()

['https://www.goodreads.com/choiceawards/best-fiction-books-2022', 'https://www.goodreads.com/choiceawards/best-mystery-thriller-books-2022', 'https://www.goodreads.com/choiceawards/best-historical-fiction-books-2022', 'https://www.goodreads.com/choiceawards/best-fantasy-books-2022', 'https://www.goodreads.com/choiceawards/best-romance-books-2022', 'https://www.goodreads.com/choiceawards/best-science-fiction-books-2022', 'https://www.goodreads.com/choiceawards/best-horror-books-2022', 'https://www.goodreads.com/choiceawards/best-humor-books-2022', 'https://www.goodreads.com/choiceawards/best-nonfiction-books-2022', 'https://www.goodreads.com/choiceawards/best-memoir-autobiography-books-2022', 'https://www.goodreads.com/choiceawards/best-history-biography-books-2022', 'https://www.goodreads.com/choiceawards/best-graphic-novels-comics-2022', 'https://www.goodreads.com/choiceawards/best-poetry-books-2022', 'https://www.goodreads.com/choiceawards/best-debut-novel-2022', 'https://www.goodre

In [7]:
!cp -r '/content/goodreads_choiceawards' '/content/gdrive/MyDrive/AI/My mini and cute projects/1. Crawl/goodreads-choiceawards-books-crawl/2022'
!cp -r '/content/goodreads_choiceawards_imgs' '/content/gdrive/MyDrive/AI/My mini and cute projects/1. Crawl/goodreads-choiceawards-books-crawl/2022'
!cp '/content/products.txt' '/content/gdrive/MyDrive/AI/My mini and cute projects/1. Crawl/goodreads-choiceawards-books-crawl/2022'
!cp '/content/genres.txt' '/content/gdrive/MyDrive/AI/My mini and cute projects/1. Crawl/goodreads-choiceawards-books-crawl/2022'