In [1]:
## Add in the Selenium framework to automate Chrome:

import json
import selenium
import selenium.webdriver as webdriver


In [2]:
## Create and add in our custom options:

custom_options = webdriver.ChromeOptions()
custom_options.add_argument('headless')

In [3]:
## Edgecase:
## Needed for Mybinder.org Google Cloud deployment when saving browser data to cache and forcing /tmp/

custom_options.add_argument('--disable-dev-shm-usage')

In [4]:
## Create the browser:

browser = webdriver.Chrome(options=custom_options)

In [5]:
## Edgecase:
## - NewYorkTimes Fiction Years are consistent beyond 1940.
## - Non-Fiction starts around the year 2000.

year_slider = widgets.IntSlider(
    min=1941,
    max=2019,
    step=1,
    value=2010,
    description='Year:',
    disabled=False,
    orientation='horizontal',
)

year_textbox = widgets.IntText(
    min=1941,
    max=2019,
    value=year_slider.value,
    description='1941-2019:',
    orientation='horizontal',
    disabled=False
)


widgets.jslink((year_textbox, 'value'), (year_slider, 'value'))


display(year_slider, year_textbox)


IntSlider(value=2010, description='Year:', max=2019, min=1941)

IntText(value=2010, description='1941-2019:')

In [6]:
## Create the New York Times Best Seller Historical List:

nytimes_url = 'https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_{}'.format(year_slider.value)

In [7]:
print( nytimes_url )

https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_2010


In [8]:
browser.get(nytimes_url)

In [9]:
## Import the selection tools from Selenium:

from selenium.webdriver.common.by import By 

In [10]:
## Table selection of row data via CSS_SELECTOR and child selection via nth-child()

element_titles = browser.find_elements(By.CSS_SELECTOR, 'td:nth-last-child(3) ~ td:nth-child(2)' )
element_authors = browser.find_elements(By.CSS_SELECTOR, 'td:nth-last-child(3) ~ td:nth-child(3)' )



In [11]:
## Create JSON structure via a Python Dictionary

unsorted_books = [{
    
    'title': title.get_attribute('innerText'),
    'author': author.get_attribute('innerText')}
    
    for title, author in zip( element_titles,element_authors )
]


print( unsorted_books[0] )

{'title': 'The Lost Symbol', 'author': 'Dan Brown'}


In [12]:
## Debug:
## confirm that all results are the same length, and scraped properly

print('json:' , len(unsorted_books) )
print('tites:', len(element_titles) )
print('authors:', len(element_authors) )

json: 38
tites: 38
authors: 38


In [13]:
## Notes:

## Map the json.dumps,
## to encode the dictionaries as json objects which are immutable.

## Set can then be used to produce an iterable of unique immutables.

## Finally, we convert back to our dictionary representation using json.loads
## Note that initially, one must sort by keys to arrange the dictionaries in a unique form.

books = sorted(list(
    map(
        json.loads, set( map(json.dumps, unsorted_books) )
    )
),key=lambda key_value: key_value['title']) 

## Debug:
## confirm that books are now sorted by alphanum via title, 
## print( books )

In [14]:
## UI widget for book selection

ui_list = []

for item in books:
    ui_list.append('{} : {}'.format(item['title'],item['author'])) 

ui_list.sort()

In [15]:
ui_index = widgets.IntText(
    min=0,
    max=(len(ui_list)-1),
    value=0,
    step=1,
    description='index:',
    disabled=False
)


ui_dropdown = widgets.Dropdown(
    options=ui_list,
    description='books:',
    disabled=False,
)


widgets.jslink( (ui_index, 'value'), (ui_dropdown, 'index'))

display(ui_dropdown)

## Debug:
## confirm that book selection and ui widget are the correct output
# print(books[ui_dropdown.index]['title'])
# print(books[ui_dropdown.index]['author'])

Dropdown(description='books:', options=('61 Hours : Lee Child', 'American Assassin : Vince Flynn', 'Caught : H…

In [16]:
ui_html = widgets.HTML
ui_book_selection = widgets.SelectMultiple(
    options=ui_list,
    rows=5,
    description='Book Selection:',
    disabled=False
)

display( ui_book_selection )

SelectMultiple(description='Book Selection:', options=('61 Hours : Lee Child', 'American Assassin : Vince Flyn…

In [24]:
book_covers_to_fetch = []

for item in range(len(ui_book_selection.index)):
    selected = ui_book_selection.index[item]
    book_covers_to_fetch.append(
        str(
            '{} {}'.format(books[selected]['title'],books[selected]['author'])
        )
    )


print(book_covers_to_fetch)

["The Girl Who Kicked the Hornets' Nest Stieg Larsson"]


In [25]:
## Documentation:
## Complex css selector, to avoid using loops and itterators with javascript or python
## Example:
## browser.find_element(By.CSS_SELECTOR, 'a[title*="{}"]'.format( books[ui_dropdown.index]['title']) )

In [26]:
## We will need to URL Encode and parse search_query for ListenNotes.com 

import urllib.parse

In [27]:
## Edgecase:
## Wikipedia is unreliable for finding consitent book covers!
## Use GoodReads search urls to select the first result
## Example: 
## https://www.goodreads.com/search?utf8=%E2%9C%93&q={TITLE}+{AUTHOR}&search_type=books

selected_title = books[ui_dropdown.index]['title']
selected_author = books[ui_dropdown.index]['author']

book_query =  urllib.parse.quote( selected_title + ' ' + selected_author )
goodreads_query = 'https://www.goodreads.com/search?utf8=%E2%9C%93&q={}&search_type=books'.format(book_query)


print('query: ', goodreads_query)
print('title:', selected_title )
print('author:', selected_author )

query:  https://www.goodreads.com/search?utf8=%E2%9C%93&q=61%20Hours%20Lee%20Child&search_type=books
title: 61 Hours
author: Lee Child


In [28]:
book_queries = []

for item in book_covers_to_fetch:
    book_queries.append(
        'https://www.goodreads.com/search?utf8=%E2%9C%93&q={}&search_type=books'.format(urllib.parse.quote(item))
    )

print( book_queries )

['https://www.goodreads.com/search?utf8=%E2%9C%93&q=The%20Girl%20Who%20Kicked%20the%20Hornets%27%20Nest%20Stieg%20Larsson&search_type=books']


In [22]:
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [39]:
# book_cover_url = browser.find_element(By.CSS_SELECTOR, 'a[title*="{}"]'.format( selected_title ) ).get_attribute('href')




book_covers = []

for item in range(len(ui_book_selection.index)):
    selected_item = ui_book_selection.index[item]
    selected_title = books[selected_item]['title']
    browser.get( book_queries[item] )
    browser.implicitly_wait(1)
    ## Edgecase:
    ## Some titles are truncated and shortend when selecting the image text
    ## Use the span selection text for more reliable results.
    possible_urls = browser.find_elements(By.CSS_SELECTOR, '.bookTitle > span')
    for result in range(len(possible_urls)):
        print( possible_urls[result].get_attribute('innerText') )
        print( result )


The Girl Who Kicked the Hornet's Nest by Stieg Larsson | Summary & Study Guide
0
The Girl Who Kicked the Hornet's Nest (Millennium, #3)
1
Stieg Larsson Set: Girl with the Dragon Tattoo, the Girl Who Played with Fire, the Girl Who Kicked the Hornets' Nest
2
The Girl Who Kicked the Hornet's Nest (Millennium: The Graphic Novels, #3)
3
The Girl Who Kicked the Hornet's Nest
4
The Unauthorized Guide to the Real World Stories Behind Stieg Larsson's the Girl Who Kicked the Hornet's Nest
5
Stieg Larsson Millennium Trilogy Collection 4 Books Set (The Girl with the Dragon Tattoo, The Girl Who Kicked the Hornets' Nest, The Girl Who Played with Fire, The Girl in the Spider's Web: Continuing Stieg Larsson's Millennium Series)
6
The Girl Who Kicked the Hornet's Nest, vol 3
7


In [None]:
    
    
#     book_cover_url = browser.find_element(By.CSS_SELECTOR, 'a[title*="{}"]'.format(selected_title)).get_attribute('href')
    # book_cover_url = browser.find_element(By.CSS_SELECTOR, 'a[title*="{}"]'.format(selected_title)).get_attribute('href')
#     browser.get( book_cover_url )
#     timeout_delay = 3
#     try:
#         element_present = EC.presence_of_element_located((By.CLASS_NAME, 'enlargeCover'))
#         WebDriverWait(browser, timeout_delay).until(element_present)
#         cover_image = browser.find_element(By.CSS_SELECTOR, '.editionCover > img').get_attribute('src')
#         book_covers.append( cover_image )
#     except TimeoutException:
#         print('Error: Timed Out Waiting For Element Presence')


    
    # Debug:
    # book_covers.append( browser.get_screenshot_as_png() )
    ## Edgecase:
    ## Fails on using CSS_SELECTOR in a loop, works fine outside the loop when selecting an individual selector
#     cover_image = browser.execute_script('''
#         document.querySelector('.editionCover').getAttribute('src')
#     ''')
    # cover_image = browser.find_element(By.CSS_SELECTOR, '.editionCover > img').get_attribute('src')
#     book_covers.append( browser.find_element(By.CSS_SELECTOR, '.editionCover > img').get_attribute('src'))


In [None]:
print( book_covers )

In [None]:
## Debug:
# image( book_covers[2] )

In [None]:
# browser.get(goodreads_query)

In [None]:
## Debug:
## confirm that no CAPTCHA were loaded
## image( browser.get_screenshot_as_png() )

In [None]:
goodreads_url = browser.find_element(By.CSS_SELECTOR, 'a[title*="{}"]'.format( selected_title ) ).get_attribute('href')

In [None]:
browser.get( goodreads_url )

In [None]:
## Debug:
# image( browser.get_screenshot_as_png() )

In [None]:
goodreads_book_cover = browser.find_element(By.CSS_SELECTOR, '.editionCover > img').get_attribute('src')

In [None]:
image( goodreads_book_cover )

In [None]:
# element_urls = []

# for item in books:
#     css_regex = 'a[title*="{}"]'.format(item['title'])
#     element_urls.append(browser.find_element(By.CSS_SELECTOR, css_regex))

# print( len(element_urls) )

In [None]:
# documentation: https://seleniumhq.github.io/selenium/docs/api/py/webdriver/selenium.webdriver.common.action_chains.html

# from selenium.webdriver.common.action_chains import ActionChains


In [None]:

# actions = ActionChains(browser)

# hover_over_element = actions.move_to_element(element_url)
# browser.implicitly_wait(1)
# hover_over_element.perform()





---

In [None]:

# widgets.SelectMultiple(
#     options=['Apples', 'Oranges', 'Pears'],
#     value=['Oranges'],
#     #rows=10,
#     description='Fruits',
#     disabled=False
# )
# import html

# html_options = []

# for item in books:
#     title = html.escape(item['title'])
#     html_options.append('<option value={}>{}</option>'.format(title, title) )


# html_select = '<select id="books" name="books" multiple>{}</select>'.format(str(html_options))
# html_source = '<label for="books">Choose 3 Books</label><br>'
# html_button = '''
# <br> <button name="order" id="order">Book Selection</button>
# <p id="output">
# </p>
# '''


# ui_selected_books = widgets.HTML(
#     value=ui_html_source,
#     placeholder='Some HTML',
#     description='Some HTML',
# )


# display( ui_selected_books  )

In [None]:
vars( ui_selected_books )