In [1]:
from ipywidgets import Layout, Button, Box
from ipywidgets import IntProgress
from IPython.display import display
import time

In [2]:
## Add in the Selenium framework to automate Chrome:

import json
import selenium
import selenium.webdriver as webdriver


In [3]:
## Create and add in our custom options:

custom_options = webdriver.ChromeOptions()
custom_options.add_argument('headless')

In [4]:
## Edgecase:
## Needed for Mybinder.org Google Cloud deployment when saving browser data to cache and forcing /tmp/

custom_options.add_argument('--disable-dev-shm-usage')

In [5]:
## Create the browser:

browser = webdriver.Chrome(options=custom_options)

In [6]:
## Edgecase:
## - NewYorkTimes Fiction Years are consistent beyond 1940.
## - Non-Fiction starts around the year 2000.

year_slider = widgets.IntSlider(
    min=1941,
    max=2019,
    step=1,
    value=2010,
    description='Year:',
    disabled=False,
    orientation='horizontal',
)

year_textbox = widgets.IntText(
    min=1941,
    max=2019,
    value=year_slider.value,
    description='1941-2019:',
    orientation='horizontal',
    disabled=False
)


widgets.jslink((year_textbox, 'value'), (year_slider, 'value'))

def on_change(value_to_watch):
    year_slider.value = value_to_watch['new']
    nytimes_url = 'https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_{}'.format(value_to_watch['new'])

year_slider.observe(on_change, names='value')

display(year_slider, year_textbox)


IntSlider(value=2010, description='Year:', max=2019, min=1941)

IntText(value=2010, description='1941-2019:')

In [7]:
## Create the New York Times Best Seller Historical List:


nytimes_url = 'https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_{}'.format(year_slider.value)
print( nytimes_url )


https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_1986


In [8]:
browser.get(nytimes_url)

In [9]:
## Import the selection tools from Selenium:

from selenium.webdriver.common.by import By 

In [10]:
## Table selection of row data via CSS_SELECTOR and child selection via nth-child()

element_titles = browser.find_elements(By.CSS_SELECTOR, 'td:nth-last-child(3) ~ td:nth-child(2)' )
element_authors = browser.find_elements(By.CSS_SELECTOR, 'td:nth-last-child(3) ~ td:nth-child(3)' )



In [11]:
## Create JSON structure via a Python Dictionary

unsorted_books = [{
    
    'title': title.get_attribute('innerText'),
    'author': author.get_attribute('innerText')}
    
    for title, author in zip( element_titles,element_authors )
]

## Debug:
# print( unsorted_books[0] )

In [12]:
## Debug:
## confirm that all results are the same length, and scraped properly

# print('json:' , len(unsorted_books) )
# print('tites:', len(element_titles) )
# print('authors:', len(element_authors) )

In [13]:
## Notes:

## Map the json.dumps,
## to encode the dictionaries as json objects which are immutable.

## Set can then be used to produce an iterable of unique immutables.

## Finally, we convert back to our dictionary representation using json.loads
## Note that initially, one must sort by keys to arrange the dictionaries in a unique form.

books = sorted(list(
    map(
        json.loads, set( map(json.dumps, unsorted_books) )
    )
),key=lambda key_value: key_value['title']) 

## Debug:
## confirm that books are now sorted by alphanum via title, 
## print( books )

In [14]:
## UI widget for book selection

ui_list = []

for item in books:
    ui_list.append('{} : {}'.format(item['title'],item['author'])) 

ui_list.sort()

In [15]:
## Note:
## Deprecated in favor of multi selection

# ui_index = widgets.IntText(
#     min=0,
#     max=(len(ui_list)-1),
#     value=0,
#     step=1,
#     description='index:',
#     disabled=False
# )


# ui_dropdown = widgets.Dropdown(
#     options=ui_list,
#     description='books:',
#     disabled=False,
# )


# widgets.jslink( (ui_index, 'value'), (ui_dropdown, 'index'))

# display(ui_dropdown)

## Debug:
## confirm that book selection and ui widget are the correct output
# print(books[ui_dropdown.index]['title'])
# print(books[ui_dropdown.index]['author'])

In [16]:
# ui_html = widgets.HTML

ui_book_selection = widgets.SelectMultiple(
    options=ui_list,
    rows=5,
    layout={'width': '80%'},
    description='Book Selection:',
    disabled=False
)

display( ui_book_selection )

SelectMultiple(description='Book Selection:', layout=Layout(width='80%'), options=('A Perfect Spy : John le Ca…

In [18]:
book_covers_to_fetch = []

for item in range(len(ui_book_selection.index)):
    selected = ui_book_selection.index[item]
    book_covers_to_fetch.append(books[selected]['title'])
#     book_covers_to_fetch.append(
#         str(
#             '{} {}'.format(books[selected]['title'],books[selected]['author'])
#         )
#     )

## Debug:
## - Confirm that user has not deselected any items
print(book_covers_to_fetch)

['A Perfect Spy', 'It', 'Last of the Breed', 'Red Storm Rising', 'The Bourne Supremacy', 'Wanderlust']


In [19]:
## Documentation:
## Complex css selector, to avoid using loops and itterators with javascript or python
## Example:
## browser.find_element(By.CSS_SELECTOR, 'a[title*="{}"]'.format( books[ui_dropdown.index]['title']) )

In [20]:
## We will need to URL Encode the following search queries:
## GoodReads.com
## ListenNotes.com 
 
import urllib.parse

In [21]:
## Edgecase:
## Wikipedia is unreliable for finding consitent book covers!
## Use GoodReads search urls to select the first result
## Example: 
## https://www.goodreads.com/search?utf8=%E2%9C%93&q={TITLE}+{AUTHOR}&search_type=books

book_queries = []

for item in range(len(book_covers_to_fetch)):
    selected = ui_book_selection.index[item]
    
    ## Edgecase:
    ## Searching for book titles that are two letters, or common words
    ## Must include the author name to generate results
    
    book_search_query = [
        books[selected]['author'],
        ' ',
        books[selected]['title']
    ]
    
    goodreads_url = [
        'https://www.goodreads.com/search?utf8=%E2%9C%93&q=',
        urllib.parse.quote(''.join(book_search_query)),
        '&search_type=books'
    ]
    
    book_queries.append(''.join(goodreads_url))

        
#     listnotes_url = 'https://www.goodreads.com/search?utf8=%E2%9C%93&q=',        
#     book_queries.append(
#         'https://www.goodreads.com/search?utf8=%E2%9C%93&q={}&search_type=books'.format(urllib.parse.quote(item))
#     )

for query in book_queries:
    print( query )

https://www.goodreads.com/search?utf8=%E2%9C%93&q=John%20le%20Carr%C3%A9%20A%20Perfect%20Spy&search_type=books
https://www.goodreads.com/search?utf8=%E2%9C%93&q=Stephen%20King%20It&search_type=books
https://www.goodreads.com/search?utf8=%E2%9C%93&q=Louis%20L%27Amour%20Last%20of%20the%20Breed&search_type=books
https://www.goodreads.com/search?utf8=%E2%9C%93&q=Tom%20Clancy%20Red%20Storm%20Rising&search_type=books
https://www.goodreads.com/search?utf8=%E2%9C%93&q=Robert%20Ludlum%20The%20Bourne%20Supremacy&search_type=books
https://www.goodreads.com/search?utf8=%E2%9C%93&q=Danielle%20Steel%20Wanderlust&search_type=books


In [22]:
book_images = []
book_covers = []
book_urls = []
url_match = None



In [23]:
## UIUX:
## Progress bar for loading the book queries
## TODO: could be a dynamic function, to call the len(list_to_messure)
## NOTE:
## increment with the following:
## ui_progress_bar.value += 1
## ui_progress_count += 1


ui_progress_count = 0
ui_progress_bar = widgets.IntProgress(
    min=0,
    max=len(book_queries),
    layout={'width': '100%'},
)

display(ui_progress_bar)
    
## Loop throught each Book Query:
for item in range(len(book_queries)):
    ui_progress_bar.value += 1
    
    
    selected_item = ui_book_selection.index[item]
    selected_title = book_covers_to_fetch[item]
    
    
    
    # Debug:
    print('now searching: ', book_queries[item] )
    browser.get( book_queries[item] )
    
    ## Edgecase:
    ## Popup may appear when loading multiple URLS for an email signup
    try:
        popup_elements = browser.find_elements(By.CSS_SELECTOR, 'img[alt^="Dismiss"')
        for icon in popup_elements:
            icon.click()
    except:
        print('goodreads: no popup')
    
    ## Edgecase:
    ## Some titles are truncated and shortend when selecting the image text
    ## Use the span selection text for more reliable results.
    result_elements = browser.find_elements(By.TAG_NAME, 'tr')
    
    ## Edgecase:
    ## Search the book results from the query for the EXACT match of the selected title
    ## Select the correct result and scrape the URL
    ## Test currect selected title against found table row results from seach query
    for result in range(len(result_elements)):
        title_result = result_elements[result].get_attribute('innerText').splitlines()[0].strip()
        ## Debug:
        # print(title_result)
        if len(title_result) is len(selected_title):
            ## Debug:
            # print('match: ', result)
            url_match = result_elements[result].find_element(By.TAG_NAME, 'a').get_attribute('href')
        else:
        ## Edgecase:
        ## There are books that truncate or add in extra sub titles to the match
        ## In that event, use a CSS_SELECTOR regex for nearest match.
            url_match = browser.find_element(By.CSS_SELECTOR, 'a[title*="{}"]'.format( selected_title ) ).get_attribute('href')

    ## Debug:
    # print('url_match: ', url_match )
    browser.get(url_match)
    cover_image = browser.find_element(By.CSS_SELECTOR, '.editionCover > img').get_attribute('src')
    book_covers.append( cover_image )
    ui_progress_count += 1

print('complete')

IntProgress(value=0, layout=Layout(width='100%'), max=6)

now searching:  https://www.goodreads.com/search?utf8=%E2%9C%93&q=John%20le%20Carr%C3%A9%20A%20Perfect%20Spy&search_type=books
goodreads: no popup
now searching:  https://www.goodreads.com/search?utf8=%E2%9C%93&q=Stephen%20King%20It&search_type=books
goodreads: no popup
now searching:  https://www.goodreads.com/search?utf8=%E2%9C%93&q=Louis%20L%27Amour%20Last%20of%20the%20Breed&search_type=books
goodreads: no popup
now searching:  https://www.goodreads.com/search?utf8=%E2%9C%93&q=Tom%20Clancy%20Red%20Storm%20Rising&search_type=books
goodreads: no popup
now searching:  https://www.goodreads.com/search?utf8=%E2%9C%93&q=Robert%20Ludlum%20The%20Bourne%20Supremacy&search_type=books
goodreads: no popup
now searching:  https://www.goodreads.com/search?utf8=%E2%9C%93&q=Danielle%20Steel%20Wanderlust&search_type=books
goodreads: no popup
complete


In [24]:
html_data = []
for image in range(len(book_covers)):
     html_data.append('<img sytle="display: inline-block" width="150" src="{}">'
        .format( book_covers[image] )
    )


ui_html = widgets.HTML(
    layout={'width': '100%'},
    value = str(html_data)
)

display( ui_html )

HTML(value='[\'<img sytle="display: inline-block" width="150" src="https://i.gr-assets.com/images/S/compressed…

In [25]:


# for item in range(len(ui_book_selection.index)):
#     selected = ui_book_selection.index[item]
#     print(books[selected]['title'], ' : ' ,books[selected]['author'])

# for item in book_covers_to_fetch:
#     book_queries.append(
#         'https://www.goodreads.com/search?utf8=%E2%9C%93&q={}&search_type=books'.format(urllib.parse.quote(item))
#     )

# print( book_queries )

In [26]:
stitcher_queries = []
listennotes_queries = []

## Run the code to create the podcast search query URLS:

for item in range(len(ui_book_selection.index)):
    selected = ui_book_selection.index[item]
    selected_book = [ books[selected]['title'],' ',books[selected]['author'] ]
    
    stitcher_query_url = [
        'https://www.stitcher.com/search?q=',
        urllib.parse.quote(''.join(selected_book)),
        '#episodes'
    ]
    
    listennotes_url = [
        'https://www.listennotes.com/search/?q=',
        urllib.parse.quote(''.join(selected_book)),
        '&sort_by_date=0&scope=episode&offset=0&language=Any%20language&len_min=0'
    ]

    stitcher_queries.append(''.join(stitcher_query_url))
    listennotes_queries.append(''.join(listennotes_url))
    
#     selected_book = [ str(books[selected]['title']), ' ', str(books[selected]['author'])]
#     selected_query = urllib.parse.quote(str(selected_book))
#     book_query_url = ['https://www.listennotes.com/search/?q=', selected_query, '&sort_by_date=0&scope=episode&offset=0&language=Any%20language&len_min=0']
#     podcast_queries.append(str(book_query_url))

In [27]:
## Edgecase:
## There are times when there will be ZERO results
## Example:
## https://www.stitcher.com/search?q=Random%20Harvest%20James%20Hilton#episodes
## Possible Solution:
## More work, pulling in both Sticher and ListenNotes results, take the one that has an option

for result in range(len(stitcher_queries)):
    print(
        str(result+1)+':',
        '\n',
        'stitcher: ',
        '\n ‣ ', stitcher_queries[result],
        '\n',
        'listennotes: ',
        '\n ‣', listennotes_queries[result]
    )



# for query in sticher_queries:
#     print('sticher:', query )
    
# for query in listennotes_queries:
#     print('listennotes:', query)

1: 
 stitcher:  
 ‣  https://www.stitcher.com/search?q=A%20Perfect%20Spy%20John%20le%20Carr%C3%A9#episodes 
 listennotes:  
 ‣ https://www.listennotes.com/search/?q=A%20Perfect%20Spy%20John%20le%20Carr%C3%A9&sort_by_date=0&scope=episode&offset=0&language=Any%20language&len_min=0
2: 
 stitcher:  
 ‣  https://www.stitcher.com/search?q=It%20Stephen%20King#episodes 
 listennotes:  
 ‣ https://www.listennotes.com/search/?q=It%20Stephen%20King&sort_by_date=0&scope=episode&offset=0&language=Any%20language&len_min=0
3: 
 stitcher:  
 ‣  https://www.stitcher.com/search?q=Last%20of%20the%20Breed%20Louis%20L%27Amour#episodes 
 listennotes:  
 ‣ https://www.listennotes.com/search/?q=Last%20of%20the%20Breed%20Louis%20L%27Amour&sort_by_date=0&scope=episode&offset=0&language=Any%20language&len_min=0
4: 
 stitcher:  
 ‣  https://www.stitcher.com/search?q=Red%20Storm%20Rising%20Tom%20Clancy#episodes 
 listennotes:  
 ‣ https://www.listennotes.com/search/?q=Red%20Storm%20Rising%20Tom%20Clancy&sort_by_da

In [28]:
## Load tools for waiting and testing for elements inside DOM

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [29]:
## Note:
## Get Stitcher podcast results for each query:
## TODO: could refactor this into an object to call properties

stitcher_results = []
podcast_covers = []
podcast_descriptions = []
podcast_urls = []
podcast_episodes = []
podcast_publishers = []

## UIUX:
## Progress bar for loading the podcast queries
## TODO: could be a dynamic function, to call the len(list_to_messure)

ui_progress_count = 0
ui_progress_bar = widgets.IntProgress(
    min=0,
    max=len(book_queries),
    layout={'width': '100%'},
)

display(ui_progress_bar)    


## stitcher_query_url

for query in stitcher_queries:
    print( query )
    browser.get( query )
    browser.implicitly_wait(1)
    timeout_delay = 5
    try:
        ui_progress_bar.value += 1
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'play'))
        WebDriverWait(browser, timeout_delay).until(element_present)

        pod_cover = browser.find_element(By.CSS_SELECTOR, 'ul[id^="episodeResultsList"] > li:first-child > a > img')
        pod_description = browser.find_element(By.CSS_SELECTOR, 'ul[id^="episodeResultsList"] > li:first-child > a > p')
        pod_episode = browser.find_element(By.CSS_SELECTOR, 'ul[id^="episodeResultsList"] > li:first-child > a > h4')
        pod_producer = browser.find_element(By.CSS_SELECTOR, 'ul[id^="episodeResultsList"] > li:first-child > a > div[class^="info"]')

        
        podcast_episodes.append(pod_episode.get_attribute('innerText'))
        podcast_covers.append(pod_cover.get_attribute('src'))
        podcast_descriptions.append(pod_description.get_attribute('innerText'))
        podcast_publishers.append(pod_producer.get_attribute('innerText'))
        
        
        pod_result = browser.find_element(By.CSS_SELECTOR, 'ul[id^="episodeResultsList"] > li:first-child > a')
        pod_result.click()
        
        pod_audio = browser.find_element(By.TAG_NAME, 'audio').get_attribute('src')
        
        stitcher_results.append( pod_audio )
        ui_progress_count += 1
    except TimeoutException:
        ui_progress_bar.value += 1
        print('Error: Timed Out Waiting For Element Presence')
        stitcher_results.append(None)
        podcast_publishers.append(None)
        podcast_episodes.append(None)
        podcast_covers.append(None)
        podcast_descriptions.append(None)
        ui_progress_count += 1

print('\n', 'complete')

IntProgress(value=0, layout=Layout(width='100%'), max=6)

https://www.stitcher.com/search?q=A%20Perfect%20Spy%20John%20le%20Carr%C3%A9#episodes
https://www.stitcher.com/search?q=It%20Stephen%20King#episodes
https://www.stitcher.com/search?q=Last%20of%20the%20Breed%20Louis%20L%27Amour#episodes
https://www.stitcher.com/search?q=Red%20Storm%20Rising%20Tom%20Clancy#episodes
https://www.stitcher.com/search?q=The%20Bourne%20Supremacy%20Robert%20Ludlum#episodes
https://www.stitcher.com/search?q=Wanderlust%20Danielle%20Steel#episodes

 complete


In [30]:
for match in stitcher_results:
    print( match )

https://media.acast.com/intelligencesquared/ianflemingvsjohnlecarre/media.mp3
http://traffic.libsyn.com/jackandnickhorrorshow/It__Stephen_King.m4a?dest-id=313398
http://traffic.libsyn.com/effortlessenglish/True_Education__Louis_Lamour.mp3?dest-id=17067
http://media.blubrry.com/gameorama/gameorama.com.br/wp-content/uploads/podcasts/gameorama/gameorama_21_tomclancy.mp3
https://mcdn.podbean.com/mf/web/pnz4kf/NPPBOURNE03.mp3
https://www.podtrac.com/pts/redirect.mp3/traffic.megaphone.fm/HSW1069448799.mp3


In [31]:
items_layout = Layout(flex='', width='')


flex_layout = Layout(
    display='flex',
    flex_direction='column',
    flex_wrap='wrap',
    justify_content='center',
    align_items='center',
    align_content='center',
    border='solid',
    height='100%',
    width='auto'
)

# words = ['correct', 'horse', 'battery', 'staple']
# items = [Button(description=w, layout=items_layout, button_style='danger') for w in words]

# display( Box(children=book_covers[item], layout=box_layout) )


css_row = 'display: flex; flex-direction: row; flex-wrap: wrap; width: 100%;'
css_column = 'display: flex; flex-direction: column; flex-basis: 100%; flex: 1;'

# display: flex; flex: 1; flex-basis: 100%;
html_data = []
for item in range(len(book_covers)):
    html_data.append('''
    <div class="container" style="display: flex; flex-direction: row; flex-wrap: wrap; width: 100%;">
        
        <img style="height: 150px;" src="{}">
        <img style="width: 90px;" src="{}">
        
        <div style="flex-direction: column; flex-basis: 100%; flex: 1; justify-content: center;">
            <p style="flex-basis: 50%;">
                <b id="publisher">{}</b>
                <h5 id="episode">Episode: {}</h5>
                <h5 style="font-style:italic;" id="description">Description: {}</h5>
            </p>
            <audio style="flex-basis: 50%;" controls src="{}"></audio>
        </div>
    </div>
    '''.format(
        book_covers[item],
        podcast_covers[item],
        
        podcast_publishers[item],
        podcast_episodes[item],
        podcast_descriptions[item],
        
        stitcher_results[item]
    )
)


display( widgets.HTML( ''.join(html_data) , layout=flex_layout) )

# for book in book_covers:
#     print( book )

HTML(value='\n    <div class="container" style="display: flex; flex-direction: row; flex-wrap: wrap; width: 10…

In [None]:
# html_data = []
# for item in range(len(book_covers)):
#      html_data.append('''
     
#      <p>
#          <img sytle="display: inline-block" width="150" src="{}">
#          <li sytle="display: inline-block">
#             <img sytle="display: inline-block" width="90" src="{}">
#             <p sytle="display: inline-block">{}</p>
#             <audio controls src="{}"></audio>
#          </li>
#      </p>
#      '''
#         .format(
#             book_covers[item],
#             podcast_covers[item],
#             podcast_descriptions[item],
#             stitcher_results[item]
#             )
#     )


# ui_html = widgets.HTML(
#     layout={'width': '100%'},
#     value = str(html_data)
# )

# display( ui_html )

In [None]:
## Notes:


# First List Element:  ul > li > a > div[class^="play"].click()

# https://www.stitcher.com/podcast/the-joe-rogan-experience/e/46639950?autoplay=false
# document.getElementsByTagName('audio')[0].getAttribute('src')




In [None]:
# for item in range(len(book_queries)):
#     progress_bar.value += 1
#     # selected_item = ui_book_selection.index[item]
#     selected_title = book_covers_to_fetch[item]
#     browser.get( book_queries[item] )
#     ## Edgecase:
#     ## Some titles are truncated and shortend when selecting the image text
#     ## Use the span selection text for more reliable results.
#     result_elements = browser.find_elements(By.TAG_NAME, 'tr')
    
#     ## Edgecase:
#     ## Search the book results from the query for the EXACT match of the selected title
#     ## Select the correct result and scrape the URL
#     ## Test currect selected title against found table row results from seach query
#     for result in range(len(result_elements)):
#         title_result = result_elements[result].get_attribute('innerText').splitlines()[0].strip()
#         ## Debug:
#         # print(title_result)
#         if len(title_result) is len(selected_title):
#             ## Debug:
#             # print('match: ', result)
#             url_match = result_elements[result].find_element(By.TAG_NAME, 'a').get_attribute('href')
#         else:
#         ## Edgecase:
#         ## There are books that truncate or add in extra sub titles to the match
#         ## In that event, use a CSS_SELECTOR regex for nearest match.
#             url_match = browser.find_element(By.CSS_SELECTOR, 'a[title*="{}"]'.format( selected_title ) ).get_attribute('href')

#     ## Debug:
#     # print('url_match: ', url_match )
#     browser.get(url_match)
#     cover_image = browser.find_element(By.CSS_SELECTOR, '.editionCover > img').get_attribute('src')
#     book_covers.append( cover_image )
#     progress_count += 1

# print('complete')