# How to Scrape Data With Chrome Using Python:

## Import the necessary frameworks into Python:

1. `import selenium`
1. `import selenium.webdriver as webdriver`

In [None]:
## Add in the selenium framework to automate chrome:

import selenium
import selenium.webdriver as webdriver

## Allow Chrome to run in `headless` mode by creating custom options:

1. `custom_options = webdriver.ChromeOptions()`
1. `custom_options.add_argument('--headless')`

In [None]:
## Build the Chrome options object used to launch the browser:

custom_options = webdriver.ChromeOptions()

## '--headless' lets Chrome run in headless mode.
## '--disable-dev-shm-usage' is an edge case: it is needed for the Mybinder.org
## Google Cloud deployment when saving browser data to cache and forcing /tmp/
for chrome_flag in ('--headless', '--disable-dev-shm-usage'):
    custom_options.add_argument(chrome_flag)

## Declare an instance of our headless Chrome browser

1. `browser = webdriver.Chrome(options=custom_options)`

In [None]:
browser = webdriver.Chrome(options=custom_options)

## Let's get a webpage loaded!

1. `browser.get('https://www.amazon.com/')`

In [None]:
## Enter a webpage to load

browser.get('https://www.amazon.com/')

## A few ways to interact with the loaded web page:

### Interact with the `html` of the web page:
  - `html( browser.page_source )`
  
### View an `image` of the web page:  
  - `image( browser.get_screenshot_as_png() )`
  
### Browse the source code of the `html` web page:  
  - `display( browser.page_source )`  

In [None]:
## Interact with the loaded webpage:
## Take a PNG screenshot of the current page and pass it to `image` to view it.
## NOTE(review): `image` is not imported or defined in the cells shown here --
## presumably an alias for IPython.display.Image; confirm it exists before running.

image( browser.get_screenshot_as_png() )


## Let's select a specific element from the web page:


## Import the `Selenium` power tools for selection

 - `from selenium.webdriver.common.by import By`


In [None]:
from selenium.webdriver.common.by import By


In [None]:
# browser.find_element(By.CSS_SELECTOR, 'a .nav-brand')
## Look up an element on the loaded page by its class name.
## NOTE(review): the variable is called `logo`, but 'a-carousel-row-inner' looks
## like a carousel row rather than a logo -- confirm which element is intended.
logo = browser.find_element(By.CLASS_NAME, 'a-carousel-row-inner')

In [None]:
image( logo.screenshot_as_png )

In [None]:
# signup_button = browser.find_element(By.CSS_SELECTOR, 'a[href^="/signup"]')
# signup_button = browser.find_element(By.TAG_NAME, 'footer')


In [None]:
image( browser.find_element(By.ID, 'gw-card-layout').screenshot_as_png )

## That's cool, but can we remove the scrollbars?


### Absolutely! Let's add a custom option to our browser to hide the scrollbars:

 - `custom_options.add_argument('--hide-scrollbars')`

In [None]:
## Add in your custom option to hide scrollbars:
## This only updates the options object; the browser has to be re-created
## with these options before the new flag is used.

custom_options.add_argument('--hide-scrollbars')

In [None]:
## We have to shut down and re-create the browser, because we are adding in new options.
## Run the code to refresh the options and browser:

## quit() ends the whole WebDriver session (every window plus the background
## chromedriver process). close() would only close the current window and
## leave the old driver process running next to the new one.
browser.quit()
browser = webdriver.Chrome(options=custom_options)
browser.get('https://www.amazon.com/')

In [None]:
## image( browser.find_element(By.CLASS_NAME, 'a-cardui-header').screenshot_as_png )

image( browser.find_element(By.CLASS_NAME, 'a-image-container').screenshot_as_png )

## Cool, now let's do something practical!

### Let's load the best selling books from Amazon:

 - `https://www.amazon.com/gp/bestsellers/2019/books/`

In [None]:
## Load the url into the browser

browser.get('https://www.amazon.com/gp/bestsellers/2019/books/')

In [None]:
## We can view or interact with the new loaded page:

image( browser.get_screenshot_as_png() )

In [None]:
## The html is local, served in an iframe:
## which means we can inspect with the Chrome Devtools:
## NOTE(review): `html` is not imported or defined in the cells shown here --
## presumably an alias for IPython.display.HTML; confirm it exists before running.

html( browser.page_source )

## We can do everything that we would normally do in the browser

## Let's run some `Javascript` inside the DOM of the loaded browser page

 - `browser.execute_script( 'return Date.now().toString()' )`

In [None]:
browser.execute_script( 'return Date.now().toString()' )

In [None]:
## JavaScript to run inside the loaded page: collect the trimmed innerText of
## every '.aok-inline-block.zg-item' element, then return the second line of
## the first element's text.
## Why '\\n': this is a regular (non-raw) Python string, so '\\n' reaches the
## browser as the two characters \n, which JavaScript reads as a newline.
## NOTE(review): if no element matches, book_data[0] is undefined and the
## script throws -- confirm the selector still matches the live page.
multiline_code = '''
var book_data =  Array.from(
    document.querySelectorAll('.aok-inline-block.zg-item'))
    .map((element) => element.innerText.trim());

return book_data[0].split('\\n')[1]
'''

browser.execute_script(multiline_code)

In [None]:
## JavaScript to run inside the loaded page: for every
## '.aok-inline-block.zg-item' element, split its trimmed innerText into lines
## and build one object per element with id (its position in the list),
## title, author, reviews and price.
## '\\n' is written with a double backslash so that JavaScript receives \n.
## NOTE(review): price reads book[4] and book[3] is skipped -- presumably that
## line is not needed; confirm against the page layout.
multiline_code = '''
var book_data = 
    Array.from(document.querySelectorAll('.aok-inline-block.zg-item'))
    .map((element, index) => {
     var book = element.innerText.trim().split('\\n')
        return {
            id: index,
            title: book[0],
            author: book[1],
            reviews: book[2],
            price: book[4],
        }
    });
     
return book_data;
'''

## Keep the value returned by the script; later cells read it as
## result[0]['author'] and result[0]['title'].
result = browser.execute_script(multiline_code)

In [None]:
print(result)

In [None]:
import urllib.parse

In [None]:
## URL-encode "<author> <title>" of the first scraped book so it can be placed in a query string.
search_query = urllib.parse.quote(' '.join((result[0]['author'], result[0]['title'])))

In [None]:
print( search_query )

In [None]:
## Assemble the Listen Notes search URL: base address, the encoded query, then the fixed filter parameters.
search_url = ''.join((
    'https://www.listennotes.com/search/?q=',
    search_query,
    '&sort_by_date=0&scope=episode&offset=0&language=Any%20language&len_min=0',
))

In [None]:
print(search_url)

In [None]:
browser.get(search_url)

In [None]:
image( browser.get_screenshot_as_png() )