## 1 Scraping
Scott McEachern  
April 18, 2019

### 1.1 Dependencies
Load the dependencies and initialize the browser used to collect the data.

In [1]:
#-- Dependencies
import requests
from splinter import Browser
from bs4 import BeautifulSoup

import time
import urllib.parse
import pandas as pd


#-- Initialize Browser
# Splinter drives a visible (non-headless) Chrome window through the
# chromedriver binary located at the path below.
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)


print("Completed initialization of browser")

Completed initialization of browser


### 1.2 NASA Mars News
Scrape the latest news title and paragraph from the NASA Mars News site. The news article content is loaded asynchronously after the initial page load, so the Splinter library is used to control a browser and make all of the content available before it is parsed.

In [3]:
#-- Load Browser with Site
nasaMarsNewsUrl = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

browser.visit(nasaMarsNewsUrl)

print("Completed load of site")


#-- Delay 
# It was found that the first time the script is run, the async content is not downloaded; the delay
# gives the content time to download before the page is searched.
time.sleep(5)

print("Delay for download is completed")


#-- Parse Page
nasaMarsNewsSoup = BeautifulSoup(browser.html, 'html.parser')


#-- Get Content
#- Get List items
# Each news article is rendered as an <li class="slide"> element
newsList = nasaMarsNewsSoup.find_all('li', class_='slide')


#- Get First News
# Default to empty strings so the display below still works when no article is found
news_title = ''
news_p = ''

# Any non-empty list has a latest article, including a list with exactly one item
if newsList:
    latestNews = newsList[0]

    news_title = latestNews.find('div', class_='content_title').text

    news_p = latestNews.find('div', class_='article_teaser_body').text


#- Display Information
print(f"Title: {news_title}")
print(f"Paragraph: {news_p}")


Completed load of site
Delay for download is completed
Title: Things Are Stacking up for NASA's Mars 2020 Spacecraft
Paragraph: As the July 2020 launch date inches closer, the next spacecraft headed to the Red Planet is assembled for more testing.


### 1.3 JPL Mars Space Images
Get the URL of the currently featured Mars image by navigating through the site to the featured image's metadata page. The code sleeps between page navigations to ensure each page has loaded; without the pause, navigation between pages was found to fail.

In [4]:
#-- Navigate to Feature Image Metadata
#- Browse to Site
marsSpaceImagesUrl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(marsSpaceImagesUrl)

#- Navigate to Featured Image
browser.click_link_by_partial_text('FULL IMAGE')
print("Completed navigation to featured image")

#- Wait
# Pause for 2 seconds so that the navigation to the metadata page succeeds
time.sleep(2)

#- Navigate to Metadata
browser.click_link_by_partial_text('more info')
print("Completed navigation to metadata page")


#-- Parse Page
spaceImageSoup = BeautifulSoup(browser.html, 'html.parser')


#-- Get URL to Feature Image
#- Get List of Image Details
imageMetadataList = spaceImageSoup.find_all('div', class_='download_tiff')

#- Get JPG
# Pick the first download entry whose text mentions the full-resolution JPG
fullResJpgDiv = next(
    (entry for entry in imageMetadataList if 'Full-Res JPG' in entry.text),
    None,
)

if fullResJpgDiv is None:
    # No matching entry: leave the URL empty
    featured_image_url = ''
else:
    #- Create URL
    # The link is joined onto a bare "https:" scheme to form the final URL
    baseUrl = "https:"
    featured_image_url = urllib.parse.urljoin(baseUrl, fullResJpgDiv.find('a')['href'])


#- Display Information
print(f"URL: {featured_image_url}")


Completed navigation to featured image
Completed navigation to metadata page
URL: https://photojournal.jpl.nasa.gov/jpeg/PIA16021.jpg


### 1.4 Mars Weather
Get the latest Mars weather from the Twitter feed. The paragraph that contains the latest weather also includes an image of a weather graphic, so the text pulled from the paragraph contains that image's name as well; the code removes the image name.

In [27]:
#-- Get Page
#- Navigate to Page
marsWeatherUrl = 'https://twitter.com/marswxreport?lang=en'

browser.visit(marsWeatherUrl)


#- Parse Page
weatherSoup = BeautifulSoup(browser.html, 'html.parser')


#-- Get Latest Weather
# Default to an empty string so the cell still completes when the expected markup is missing
mars_weather = ''


#- Get Div with List of weather reports
weatherStreamDivs = weatherSoup.find_all('div', class_='stream')


#- Get List of Weather Reports
# Guard the index: an empty result would otherwise raise IndexError
weatherReportsList = weatherStreamDivs[0].find_all('li') if weatherStreamDivs else []


#- Get Latest Weather
latestWeatherDiv = None

if weatherReportsList:
    latestWeatherDiv = weatherReportsList[0].find('div', class_='js-tweet-text-container')


#- Get Weather
weatherParagraph = latestWeatherDiv.find('p') if latestWeatherDiv is not None else None

if weatherParagraph is not None:
    weatherAll = weatherParagraph.text

    # Paragraph contains twitter image link text, remove that.
    # A tweet without a link has nothing to remove, so the paragraph text is used as is.
    weatherLink = latestWeatherDiv.find('a')
    weatherRemove = weatherLink.text if weatherLink is not None else ''

    if weatherRemove:
        mars_weather = weatherAll.replace(weatherRemove, '')
    else:
        mars_weather = weatherAll


print(f'Mars latest weather: {mars_weather}')


Mars latest weather: InSight sol 141 (2019-04-20) low -98.3ºC (-144.9ºF) high -19.7ºC (-3.5ºF)
winds from the SW at 4.7 m/s (10.6 mph) gusting to 12.9 m/s (28.8 mph)
pressure at 7.40 hPa


### 1.5 Mars Facts
Using Pandas, get the table of Mars facts from the website. When exporting to HTML, the index is not included and additional line breaks are removed.

In [49]:
#- Get Mars Fact DataFrame
marsFactsUrl = 'https://space-facts.com/mars/'
marsFactsTables = pd.read_html(marsFactsUrl)

# The first table parsed from the page is the one that is used
marsFacts_df = marsFactsTables[0]


#- Rename Columns
marsFacts_df.columns = ['Fact', 'Info']


#- Create HTML Table and Cleanup HTML
# Export without the index column, then strip the line breaks from the markup
marsFactsHtml = marsFacts_df.to_html(index=False).replace('\n', '')


#- Display HTML
print(marsFactsHtml)


<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th>Fact</th>      <th>Info</th>    </tr>  </thead>  <tbody>    <tr>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <td>Mass:</td>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <td>Moons:</td>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.52 AU)</td>    </tr>    <tr>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <td>Surface Temperature:</td>      <td>-153 to 20 °C</td>    </tr>    <tr>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <td>Recorded By:</td>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>


### 1.6 Mars Hemispheres
Create a list containing one dictionary per hemisphere, each holding the hemisphere's title and the URL of its image from the USGS Astrogeology site. The names of the links are determined dynamically from the page so that they can be used for navigation.

In [26]:
#-- Navigate to Astrogeology Page
marsHemispheresUrl = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(marsHemispheresUrl)


#-- Get Names of buttons
#- Parse Page
hemisphereSoup = BeautifulSoup(browser.html, 'html.parser')

#- Div of Hemispheres
divHemispheres = hemisphereSoup.find_all('div', class_='description')

#- Collect the heading text of each div; it is used below as the link text to click
hyperlinkTitles = [divHemisphere.find('h3').text for divHemisphere in divHemispheres]


#-- Get Hemisphere Metadata
hemisphere_image_urls = []

for hyperlinkTitle in hyperlinkTitles:

    #- Navigate to page
    browser.click_link_by_partial_text(hyperlinkTitle)
    print(f'Completed navigation to details page: {hyperlinkTitle}')

    #- Parse Page
    hemisphereDetailSoup = BeautifulSoup(browser.html, 'html.parser')

    #- Get Image URL from the first link inside the downloads div
    downloadDiv = hemisphereDetailSoup.find('div', class_='downloads')
    hemisphereImageUrl = downloadDiv.find('a')['href']

    #- Build dictionary (title without the ' Enhanced' suffix) and add it to the list
    hemisphereEntry = {
        'title': hyperlinkTitle.replace(' Enhanced', ''),
        'img_url': hemisphereImageUrl,
    }
    hemisphere_image_urls.append(hemisphereEntry)

    print(f'Success getting image URL: {hemisphereImageUrl}')

    #- Return to source page so the next title can be clicked
    browser.visit(marsHemispheresUrl)


print(hemisphere_image_urls)



Completed navigation to details page: Cerberus Hemisphere Enhanced
Success getting image URL: http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
Completed navigation to details page: Schiaparelli Hemisphere Enhanced
Success getting image URL: http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg
Completed navigation to details page: Syrtis Major Hemisphere Enhanced
Success getting image URL: http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg
Completed navigation to details page: Valles Marineris Hemisphere Enhanced
Success getting image URL: http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg
[{'title': 'Cerberus Hemisphere', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'title': 'Schiaparelli Hemisphere', 'img_url': 'http://astropedia.astrogeology.usg