In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Creates a new Chrome webdriver. The resulting module-level `driver` is the
# browser session that `get_book_info` uses to load and query each page.
options = webdriver.ChromeOptions()
# 'headless' is handed to Chrome as a launch argument
# (presumably so no browser window is opened while scraping — confirm).
options.add_argument('headless')
driver = webdriver.Chrome(options=options)

# Trying Selenium

In [3]:
def _label_xpath(label):
    """Return the XPath for the <span> siblings that follow a <span> whose text contains ``label``."""
    return f'//span[contains(text(), "{label}")]/following-sibling::span'


# One entry per scraped field, in the order the keys are added to the result:
# (result key, name of the `By` strategy attribute, selector).
# The strategy is stored by attribute name and resolved at call time so this
# table is plain data.
_BOOK_FIELDS = (
    ('title', 'CSS_SELECTOR', 'div.zNLTKd'),
    ('sub_title', 'CSS_SELECTOR', 'div.Cxh5Uc'),
    ('isbn', 'XPATH', '//span[contains(@class, "isbn") or contains(text(), "ISBN")]/following-sibling::span'),
    ('page_count', 'XPATH', _label_xpath('Page count')),
    ('publishing_date', 'XPATH', _label_xpath('Published')),
    ('form', 'XPATH', _label_xpath('Form')),
    ('publisher', 'XPATH', _label_xpath('Publisher')),
    ('language', 'XPATH', _label_xpath('Language')),
    ('author', 'XPATH', _label_xpath('Author')),
    ('illustrator', 'XPATH', _label_xpath('Illustrator')),
    ('originally_published', 'XPATH', _label_xpath('Originally published')),
    ('genres', 'XPATH', _label_xpath('Genres')),
    ('subject', 'XPATH', _label_xpath('Subject')),
    ('awards', 'XPATH', _label_xpath('Awards')),
    ('nominations', 'XPATH', _label_xpath('Nominations')),
    ('characters', 'XPATH', _label_xpath('Characters')),
    ('description', 'CSS_SELECTOR', 'div.Y0Qrof'),
)


def _find_text(strategy, selector):
    """Return the stripped text of the first matching element, or None if the lookup fails."""
    try:
        return driver.find_element(getattr(By, strategy), selector).text.strip()
    except Exception:
        # Best effort: a field that is absent from the page is simply skipped.
        # `Exception` (not a bare `except:`) lets KeyboardInterrupt/SystemExit
        # propagate so a long scraping run can still be stopped.
        return None


def get_book_info(book_id, more_button_timeout=10):
    """Scrape the Google Books edition page of ``book_id`` into a dictionary.

    The page is loaded with the module-level ``driver``. Every field is looked
    up independently and on a best-effort basis: a field whose element cannot
    be found (or read) is left out of the result rather than raising.

    Parameters
    ----------
    book_id : str
        Identifier inserted into the ``/books/edition/_/<id>`` URL.
    more_button_timeout : float, optional
        Maximum number of seconds to wait for the "More" button to become
        clickable. Defaults to 10, the value that was previously hard-coded.
        Pages without that button cost the full timeout.

    Returns
    -------
    dict
        Always contains ``'id'``. It may additionally contain, in this order:
        ``title``, ``sub_title``, ``isbn``, ``page_count``,
        ``publishing_date``, ``form``, ``publisher``, ``language``,
        ``author``, ``illustrator``, ``originally_published``, ``genres``,
        ``subject``, ``awards``, ``nominations``, ``characters`` and
        ``description``. Values are the stripped element texts.

    Raises
    ------
    Exception
        Whatever ``driver.get`` raises; page-load failures are not swallowed.
    """
    url = f'https://www.google.com/books/edition/_/{book_id}?hl=en'

    # Pulls data
    driver.get(url)

    # Clicks on the "More" button to expand the subject field. Not every page
    # has one, so a timeout or a failed click is deliberately ignored.
    try:
        more_button = WebDriverWait(driver, more_button_timeout).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'span[data-t="kno-fv-exp"]'))
        )
        more_button.click()
    except Exception:
        # See _find_text: keep the best-effort behaviour, but do not trap
        # KeyboardInterrupt/SystemExit the way a bare `except:` would.
        pass

    # Extracts the book information
    book_info = {'id': book_id}
    for key, strategy, selector in _BOOK_FIELDS:
        text = _find_text(strategy, selector)
        # An empty string is a successful lookup and is kept; only failed
        # lookups (None) are omitted.
        if text is not None:
            book_info[key] = text

    # Returns dictionary
    return book_info


# Pulling the whole dataset

In [4]:
# Load the ids of the books to scrape from CSV. `testing` and `book_ids` are
# two names for the same DataFrame (no copy is made).
testing = book_ids = pd.read_csv('../Data/final_books_ids.csv')
testing

Unnamed: 0,isbn13
0,h2Y-PgAACAAJ
1,FBXRzgEACAAJ
2,DAAAAAAACAAJ
3,LH5C9q83T6wC
4,62CEzQEACAAJ
...,...
2685,tcWMPAAACAAJ
2686,O2JfAAAAMAAJ
2687,y4kgSgAACAAJ
2688,TaQZzgEACAAJ


In [5]:
# Accumulates one book_info dictionary per scraped book
book_info_list = []

# Scrape every id found in the 'isbn13' column, one page at a time
for book_id in testing['isbn13']:
    book_info = get_book_info(book_id)
    # Nothing to keep when no dictionary came back for this id
    if book_info is None:
        continue
    book_info_list.append(book_info)

In [6]:
# Create a dataframe from the list of dictionaries: one row per scraped book.
# The dictionaries do not all carry the same keys, so a book that lacks a
# field gets a missing value in that column.
testing_df = pd.DataFrame(book_info_list)

# Display the dataframe (a bare last expression is rendered as the cell output)
testing_df

Unnamed: 0,id,title,isbn,page_count,publishing_date,form,publisher,language,author,illustrator,originally_published,genres,subject,awards,nominations,characters,description,sub_title
0,h2Y-PgAACAAJ,Harry Potter and the Chamber of Secrets,"9780439554893, 0439554896",341,1999,Hardcover,Scholastic Press,English,J. K. Rowling,Mary GrandPré,"July 2, 1998","Novel, Fantasy Fiction, Bildungsroman, High fa...",,Nestlé Smarties Book Prize for 9 to 11 years,Guardian Children's Fiction Prize,"Harry Potter, Hermione Granger, Lord Voldemort...",When the Chamber of Secrets is opened again at...,
1,FBXRzgEACAAJ,Harry Potter and the Prisoner of Azkaban,"9780439655484, 043965548X",560,May 2004,Trade paperback,Arthur A. Levine Books,English,J. K. Rowling,Mary GrandPré,"July 8, 1999","Novel, Fantasy Fiction",,"Locus Award for Best Fantasy Novel, Nestlé Sma...","Carnegie Medal for Writing, Hugo Award for Bes...","Harry Potter, Hermione Granger, Sirius Black, ...",The third book in J.K. Rowling's bestselling s...,
2,DAAAAAAACAAJ,Harry Potter,"9780439682589, 0439682584",2000,October 2004,Trade paperback,Scholastic (Us),English,J. K. Rowling,,,,,,,,,"5 Years of Magic, Adventure, and Mystery at Ho..."
3,LH5C9q83T6wC,7,"9780976540601, 0976540606",152,2005,Paperback,Nimble Books LLC,English,W. Frederick Zimmerman,,December 2005,,"Children's stories, English, Fantasy fiction, ...",,,,Through the magic of print-on-demand technolog...,"Unauthorized Harry Potter Book Seven News ; ""H..."
4,62CEzQEACAAJ,Harry Potter and the Prisoner of Azkaban,,435,1999,,Arthur A. Levine Books,English,J. K. Rowling,Mary GrandPré,"July 8, 1999","Novel, Fantasy Fiction",,"Locus Award for Best Fantasy Novel, Nestlé Sma...","Carnegie Medal for Writing, Hugo Award for Bes...","Harry Potter, Hermione Granger, Sirius Black, ...","""During his third year at Hogwarts School for ...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2685,tcWMPAAACAAJ,Bella Y Oscura / Beautiful And Dark (Novela (B...,"9788432217289, 843221728X",204,2006,Paperback,Seix Barral,Spanish,Rosa Montero,,1993,"Allegory, Domestic Fiction","Domestic fiction, Children -- Fiction -- Spain...",,,,"La historia de la infancia, vivida y soñada, d...",
2686,O2JfAAAAMAAJ,La tía Julia y el escribidor,"9788432203237, 8432203238",447,1977,Hardcover,Seix Barral,Spanish,Mario Vargas Llosa,,1977,"Novel, Humorous Fiction","Aunts -- Fiction, Courtship -- Fiction, Love s...",,,,La tía Julia y el escribidor es una novela sem...,
2687,y4kgSgAACAAJ,O cavalo e o seu rapaz,"9789722330558, 9722330551",,,,Editorial Presença,Portuguese,"C. S. Lewis, Pauline Baynes",Pauline Baynes,"September 6, 1954","Novel, Fantasy, Fantasy Fiction, Children's li...","Children -- Juvenile fiction, Fantasy fiction,...",,,"Shasta, Aravis, Bree, Rabadash, Hwin, Aslan, S...",The Horse and His Boy is a novel for children ...,
2688,TaQZzgEACAAJ,As crónicas de Nárnia,"9789722329989, 9722329987",,,,Editorial Presença,Portuguese,"C. S. Lewis, A. Gonçalves Rodrigues, Ana Falcã...",Pauline Baynes,1956,,"Children -- Juvenile fiction, Fantasy fiction,...",,,,,


In [7]:
# Quit the driver now that scraping is finished; this ends the WebDriver
# session created in the setup cell, so `driver` cannot be used afterwards.
driver.quit()

In [None]:
# Export testing_df to CSV so the scraped data isn't accidentally lost.
# index=False keeps the DataFrame's row index out of the file.
testing_df.to_csv('All_books.csv', index=False)