In [1]:
# imports for web browsing and parsing
from bs4 import BeautifulSoup
from splinter import Browser
import pandas as pd

In [2]:
# Point splinter at the local chromedriver binary and open a visible
# (non-headless) Chrome session that every scraping cell below shares.
executable_path = dict(executable_path="/usr/local/bin/chromedriver")
browser = Browser("chrome", headless=False, **executable_path)

In [3]:
# Visit first scraping target [NASA Mars News] and set up parser
news_url = "https://mars.nasa.gov/news/"
browser.visit(news_url)

# The article list may not be in the DOM the instant visit() returns
# (presumably it is filled in after page load -- TODO confirm). Give it a bounded
# grace period; if it never shows up, the lookup below still raises as before.
browser.is_element_present_by_css(".item_list a", wait_time=10)

# Open the first (latest) article in the list
browser.find_by_css(".item_list").first.find_by_tag("a").click()

# Wait for the article headline (the element the next cell parses) before
# taking the HTML snapshot, so we do not capture a half-loaded page.
browser.is_element_present_by_css("h1.article_title", wait_time=10)

news_html = browser.html
soup = BeautifulSoup(news_html, 'html.parser')

In [4]:
# Pull the headline and the first body paragraph out of the parsed article page.
# Both values are kept in variables so later cells can reference them.
title_tag = soup.find("h1", class_="article_title")
news_title = title_tag.get_text().strip()

article_body = soup.find("div", class_="wysiwyg_content")
news_paragraph = article_body.find("p").get_text()

# Test results
print(news_title)
print(news_paragraph)

Things Are Stacking up for NASA's Mars 2020 Spacecraft
For the past few months, the clean room floor in High Bay 1 at NASA's Jet Propulsion Laboratory in Pasadena, California, has been covered in parts, components and test equipment for the Mars 2020 spacecraft, scheduled for launch toward the Red Planet in July of 2020. But over the past few weeks, some of these components — the spacecraft-rocket-laden landing system and even the stand-in for the rover (christened "surrogate-rover") — have seemingly disappeared.


In [5]:
# Visit second scraping target [JPL Mars Space Images - Featured Image]
images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(images_url)
browser.find_by_id("full_image").click()

# The "more info" button may not be present immediately after the click above.
# Wait for it with an upper bound instead of spinning forever: an unbounded
# `while ...: pass` would hang the kernel if the button never appears.
# NOTE: the trailing spaces are part of the text being matched -- do not trim.
more_info_label = "more info     "
if not browser.is_element_present_by_text(more_info_label, wait_time=30):
    raise RuntimeError("'more info' button did not appear within 30 seconds")
browser.find_by_text(more_info_label).click()

# Select full size image in order to obtain url
browser.find_by_css(".main_image").click()
featured_image_url = browser.url

# Test results
print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16105_hires.jpg


In [6]:
# Visit third scraping target [Mars Weather]
weather_url = "https://twitter.com/marswxreport?lang=en"
browser.visit(weather_url)

# Set up parser
weather_html = browser.html
soup = BeautifulSoup(weather_html, 'html.parser')

# Locate the first tweet body once and reuse it, rather than repeating the
# long class string in two separate lookups.
tweet_text_class = "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
weather_tweet = soup.find("p", {"class": tweet_text_class})
if weather_tweet is None:
    raise RuntimeError("No tweet text found on " + weather_url)

# Remove child <a> in order to exclude twitter url.
# A tweet without a link has no <a> child; skip the removal in that case
# instead of failing with AttributeError on None.
if weather_tweet.a is not None:
    weather_tweet.a.extract()

# Get weather tweet
mars_weather = weather_tweet.get_text()

# Test results
print(mars_weather)

InSight sol 138 (2019-04-17) low -97.7ºC (-143.9ºF) high -17.3ºC (0.9ºF)
winds from the W at 4.3 m/s (9.5 mph) gusting to 12.6 m/s (28.1 mph)
pressure at 7.30 hPa


In [7]:
# Fourth scraping target [Mars Facts]
facts_url = "https://space-facts.com/mars/"

# Let pandas fetch the page and parse only the table(s) whose id is
# "tablepress-mars"; the first match is the facts table.
facts_tables = pd.read_html(facts_url, attrs={"id": "tablepress-mars"})
facts_df = facts_tables[0]

# Render the table back to an HTML string, leaving out the index column
facts_html = facts_df.to_html(index=False)

# Test results
print(facts_html)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>0</th>
      <th>1</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Equatorial Diameter:</td>
      <td>6,792 km</td>
    </tr>
    <tr>
      <td>Polar Diameter:</td>
      <td>6,752 km</td>
    </tr>
    <tr>
      <td>Mass:</td>
      <td>6.42 x 10^23 kg (10.7% Earth)</td>
    </tr>
    <tr>
      <td>Moons:</td>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <td>Orbit Distance:</td>
      <td>227,943,824 km (1.52 AU)</td>
    </tr>
    <tr>
      <td>Orbit Period:</td>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <td>Surface Temperature:</td>
      <td>-153 to 20 °C</td>
    </tr>
    <tr>
      <td>First Record:</td>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <td>Recorded By:</td>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>


In [8]:
# Fifth scraping target [Mars Hemispheres]
hemispheres_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

hemisphere_titles = [
    "Cerberus Hemisphere Enhanced",
    "Schiaparelli Hemisphere Enhanced",
    "Syrtis Major Hemisphere Enhanced",
    "Valles Marineris Hemisphere Enhanced",
]
hemisphere_image_urls = []

for title in hemisphere_titles:
    # Start from the search results each time, then follow the link whose
    # text is the hemisphere title.
    browser.visit(hemispheres_url)
    browser.find_by_text(title).click()

    # The full-size image URL is the href of the element wrapping the
    # "Sample" text on the hemisphere's page.
    hemisphere_html = browser.html
    soup = BeautifulSoup(hemisphere_html, 'html.parser')
    sample_link = soup.find(string="Sample").findParent()
    hemisphere_image_urls.append({"title": title, "img_url": sample_link["href"]})

# Test results
for hemisphere in hemisphere_image_urls:
    for field, value in hemisphere.items():
        print(field, ":", value)

title : Cerberus Hemisphere Enhanced
img_url : http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
title : Schiaparelli Hemisphere Enhanced
img_url : http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg
title : Syrtis Major Hemisphere Enhanced
img_url : http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg
title : Valles Marineris Hemisphere Enhanced
img_url : http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg
