## 1 Scraping
Scott McEachern
April 18, 2019

### 1.1 Dependences
Load dependencies and initialize the browser used to collect the data.

In [2]:
#-- Dependences
import requests
from splinter import Browser
from bs4 import BeautifulSoup

import time
import urllib.parse


#-- Initialize Browser
# Splinter drives Chrome through the chromedriver binary at the path below.
# headless=False keeps the browser window visible while the notebook runs.
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', headless=False, **executable_path)


print("Completed initialization of browser")

Completed initialization of browser


### 1.2 NASA Mars News
Scrape the latest news title and teaser paragraph from the NASA Mars News site. The article content is loaded asynchronously after the initial page load, so the Splinter library is used to drive a real browser and make sure all of the content is available before parsing.

In [3]:
#-- Load Browser with Site
# The URL is written as adjacent string pieces (joined by Python at compile
# time) so that each query parameter can be read on its own line.
nasaMarsNewsUrl = (
    'https://mars.nasa.gov/news/'
    '?page=0&per_page=40'
    '&order=publish_date+desc%2Ccreated_at+desc'
    '&search='
    '&category=19%2C165%2C184%2C204'
    '&blank_scope=Latest'
)

browser.visit(nasaMarsNewsUrl)
print("Completed load of site")


#-- Delay
# On the first run of the script the asynchronous content was found not to be
# downloaded yet; pause so it can arrive before the page is searched.
time.sleep(5)
print("Delay for download is completed")


#-- Parse Page
# Parse the HTML currently held by the browser (after the async content load).
nasaMarsNewsSoup = BeautifulSoup(browser.html, 'html.parser')


#-- Get Content
#- Get List items
# Each news article is rendered as an <li class="slide"> element.
newsList = nasaMarsNewsSoup.find_all('li', class_='slide')


#- Get First News
# Default to empty strings so the display below still works when nothing is found.
news_title = ''
news_p = ''

# One article is enough: index 0 exists whenever the list is non-empty.
if newsList:
    latestNews = newsList[0]

    titleTag = latestNews.find('div', class_='content_title')
    teaserTag = latestNews.find('div', class_='article_teaser_body')

    # find() returns None when the div is missing; keep the default in that case
    # instead of failing on `.text`.
    if titleTag is not None:
        news_title = titleTag.text

    if teaserTag is not None:
        news_p = teaserTag.text


#- Display Information
print(f"Title: {news_title}")
print(f"Paragraph: {news_p}")


Completed load of site
Delay for download is completed
Title: Things Are Stacking up for NASA's Mars 2020 Spacecraft
Paragraph: As the July 2020 launch date inches closer, the next spacecraft headed to the Red Planet is assembled for more testing.


### 1.3 JPL Mars Space Images
Get the URL to the currently featured Mars image.

In [4]:
#-- Navigate to Feature Image Metadata
#- Browse to Site
marsSpaceImagesUrl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(marsSpaceImagesUrl)


#- Navigate to Featured Image
# Follow the link whose text contains 'FULL IMAGE'.
browser.click_link_by_partial_text('FULL IMAGE')
print("Completed navigation to featured image")


#- Wait
# Pause for 2 seconds so that the next navigation step succeeds.
time.sleep(2)


#- Navigate to Metadata
# Follow the link whose text contains 'more info' to reach the metadata page.
browser.click_link_by_partial_text('more info')
print("Completed navigation to metadata page")


#-- Parse Page
# Parse the HTML of the page the browser is currently on (the metadata page).
spaceImageSoup = BeautifulSoup(browser.html, 'html.parser')


#-- Get URL to Feature Image
#- Get List of Image Details
imageMetadataList = spaceImageSoup.find_all('div', class_='download_tiff')


#- Get JPG
# Stays empty when no usable 'Full-Res JPG' link is found.
featured_image_url = ''

# Resolve hrefs against the page they were found on. This handles
# scheme-relative ('//host/path'), root-relative ('/path') and absolute hrefs;
# a bare "https:" base only works for the scheme-relative form.
baseUrl = browser.url

for imageMetadata in imageMetadataList:

    if ('Full-Res JPG' not in imageMetadata.text):
        continue

    # Skip entries that mention the JPG but carry no usable link.
    imageLink = imageMetadata.find('a')
    if imageLink is None or not imageLink.get('href'):
        continue

    #- Create URL
    featured_image_url = urllib.parse.urljoin(baseUrl, imageLink['href'])

    break


#- Display Information
print(f"URL: {featured_image_url}")


Completed navigation to featured image
Completed navigation to metadata page
URL: https://photojournal.jpl.nasa.gov/jpeg/PIA16021.jpg
