In [1]:
# --- dependencies and setup ---
from io import StringIO
import re
import time
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import pandas as pd
from splinter import Browser

In [2]:
# --- pages to scrape: one URL per section of this notebook ---
MarsNews_url = 'https://mars.nasa.gov/news/'
JPLimage_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
MarsWeather_url = 'https://twitter.com/marswxreport'
MarsFacts_url = 'https://space-facts.com/mars/'
MarsHemImage_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

# --- start a visible (non-headless) Chrome session through splinter ---
# NOTE: the chromedriver location is machine-specific; adjust it for your install.
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

## Scraping Mars News
- save the latest news title 
- save the latest news paragraph

In [5]:
# --- open the Mars News page and pause briefly so it can render ---
browser.visit(MarsNews_url)
time.sleep(1)

# --- snapshot the rendered page and parse it with BeautifulSoup ---
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# --- the first <li class="slide"> in the headline list holds the latest article ---
first_li = soup.find('li', attrs={'class': 'slide'})

# --- headline text lives in the <div class="content_title"> of that item ---
title_div = first_li.find('div', attrs={'class': 'content_title'})
news_title = title_div.get_text()
print(news_title)

# --- teaser paragraph lives in the <div class="article_teaser_body"> of that item ---
teaser_div = first_li.find('div', attrs={'class': 'article_teaser_body'})
news_para = teaser_div.get_text()
print(news_para)

NASA's MAVEN Observes Martian Night Sky Pulsing in Ultraviolet Light
Vast areas of the Martian night sky pulse in ultraviolet light, according to images from NASA’s MAVEN spacecraft. The results are being used to illuminate complex circulation patterns in the Martian atmosphere.


## Scraping JPL Featured Image URL
- save the current Featured Mars Image url and title

In [3]:
# --- open the JPL Featured Space Image page and pause briefly so it can render ---
browser.visit(JPLimage_url)
time.sleep(1)

# --- capture the rendered HTML, then parse it with BeautifulSoup ---
html = browser.html
soup = BeautifulSoup(markup=html, features='html.parser')


In [7]:
# --- the <div class="carousel_container"> holds the current featured image details ---
# (`soup` is the parsed JPL page built in the previous cell)
carousel = soup.find('div', class_='carousel_container')

# --- the carousel's first <a> tag carries both the image title and an image path ---
carousel_link = carousel.find('a')
featuredimage_title = carousel_link['data-title']

# --- every link scraped below is resolved against the site root with urljoin ---
# Plain string concatenation produced 'https://www.jpl.nasa.gov//...' whenever the
# href already started with '/', and needed a separate 'https:' prefix for
# protocol-relative hrefs ('//host/path'). urljoin handles both forms.
base_url = 'https://www.jpl.nasa.gov/'

# --- use splinter to click the 'full image' button, which opens the image overlay ---
browser.find_by_id('full_image').click()
time.sleep(1)

# --- the overlay's title bar holds the 'more info' button; if it is not visible, ---
# --- fall back to the image path stored on the carousel link ---
if not browser.is_element_visible_by_css('div.fancybox-title'):

    # --- image path found under the <a> tag in the carousel ---
    image_url = carousel_link['data-fancybox-href']

    # --- complete the featured image url ---
    featuredimage_url = urljoin(base_url, image_url)

# --- otherwise follow 'more info' to the image detail page for the full-size download ---
else:

    # --- click the 'more info' button to go to the image detail page ---
    browser.links.find_by_partial_text('more info').click()
    time.sleep(1)

    # --- create a beautiful soup object with the image detail page's html ---
    img_detail_html = browser.html
    imagesoup = BeautifulSoup(img_detail_html, 'html.parser')

    # --- the second 'download_tiff' div is the one used for the full-size jpg link ---
    download_div = imagesoup.find_all('div', class_='download_tiff')[1]
    fullsize_img = download_div.find('a')['href']

    # --- complete the featured image url ---
    featuredimage_url = urljoin(base_url, fullsize_img)

print(featuredimage_url)
print(featuredimage_title)


https://photojournal.jpl.nasa.gov/jpeg/PIA18328.jpg
Dark Side of the Moon: Enceladus


## Scraping Mars Weather
- Scrape the latest Mars weather tweet 

In [5]:
# --- visit the Mars Weather twitter account and pause briefly so it can render ---
browser.visit(MarsWeather_url)
time.sleep(1)

# --- NOTE: everything below is a disabled (commented-out) alternative. It would
# --- parse the page with BeautifulSoup and look for a <span> whose text matches
# --- "sol". Nothing in this cell beyond the visit above is executed. ---

# --- create HTML object ---
#html = browser.html

# --- parse HTML with BeautifulSoup ---
#soup = BeautifulSoup(html, 'html.parser')

#pattern = re.compile(r"sol")
#sol = soup.find('span', text=pattern)
#sol

In [7]:
# --- get a list of all <span> elements on the page through splinter ---
all_spans = browser.find_by_css('span')

# --- diagnostic used to locate the first tweet's <span>; it was found at index 48 ---
# --- (re-run this if the page layout changes and the index no longer matches) ---
#for span in range(len(all_spans)):
#    print(span, all_spans[span].value)

# --- save the latest tweet (at index 48 of the span element list) in a variable ---
latest_tweet = all_spans[48].value

# --- clean up the tweet (remove newlines) ---
# str.replace returns a new string; it must be assigned back, otherwise
# `latest_tweet` keeps its newlines and only the cell's displayed output is clean.
latest_tweet = latest_tweet.replace('\n', '')

latest_tweet

'InSight sol 605 (2020-08-09) low -92.7ºC (-134.8ºF) high -18.4ºC (-1.1ºF)winds from the WNW at 8.8 m/s (19.7 mph) gusting to 22.5 m/s (50.4 mph)pressure at 7.90 hPa'

## Scraping Mars Facts
- Scrape the table containing facts about the planet including Diameter, Mass, etc.
- Use Pandas to convert the data to an HTML table and save it to `marsfacts.html`.

In [9]:
# --- visit the Mars Facts website and pause briefly so it can render ---
browser.visit(MarsFacts_url)
time.sleep(1)

# --- create HTML object ---
html = browser.html

# --- use Pandas to scrape the tables on the page ---
# read_html returns a list with one DataFrame per <table> it finds. The markup is
# wrapped in StringIO because passing a literal HTML string is deprecated in
# recent pandas releases; a file-like object works on old and new versions.
table = pd.read_html(StringIO(html))

# --- the first table on the page is the facts table; give its two columns names ---
facts_df = table[0]
facts_df.columns = ['Description', 'Value']
facts_df

Unnamed: 0,Description,Value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [11]:
# --- render the facts dataframe as an HTML table (row index omitted) and write it to marsfacts.html ---
facts_df.to_html(buf='marsfacts.html', index=False)

## Scraping Mars Hemisphere images
- Find and save the image url and title of the full resolution image for each of Mars' hemispheres in a Python dictionary
- Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [3]:
# --- open the USGS Astrogeology search results for the Mars hemispheres ---
browser.visit(MarsHemImage_url)
time.sleep(1)

# --- capture the rendered HTML, then parse it with BeautifulSoup ---
html = browser.html
soup = BeautifulSoup(markup=html, features='html.parser')


In [11]:
# --- trial run on one hemisphere: open the first result's detail page ---
first_hem_heading = browser.find_by_css("a.product-item h3")[0]
first_hem_heading.click()
time.sleep(1)

# --- parse the image detail page's html with BeautifulSoup ---
img_detail_html = browser.html
imagesoup = BeautifulSoup(img_detail_html, 'html.parser')

# --- the <img class="wide-image"> src is a site-relative path; prefix the site root ---
base_url = 'https://astrogeology.usgs.gov'
hem_url = imagesoup.find('img', attrs={'class': 'wide-image'})['src']
img_url = ''.join((base_url, hem_url))
print(img_url)

# --- return to the search results page ---
browser.back()

https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg


In [4]:
# --- every <div class="item"> in the parsed results page stands for one hemisphere ---
# (`soup` is the parsed search-results page built in an earlier cell)
hemisphere_divs = soup.find_all('div', attrs={'class': 'item'})

# --- one {"title", "img_url"} dictionary per hemisphere is collected here ---
hemisphere_image_data = list()

# --- walk the hemispheres by position ---
for hemisphere in range(len(hemisphere_divs)):

    # --- look the result links up again on every pass (the browser has been sent ---
    # --- back to the results page), then open the one at the current position ---
    hem_link = browser.find_by_css("a.product-item h3")
    hem_link[hemisphere].click()
    time.sleep(1)

    # --- parse the image detail page's html with BeautifulSoup ---
    img_detail_html = browser.html
    imagesoup = BeautifulSoup(img_detail_html, 'html.parser')

    # --- the <img class="wide-image"> src is a site-relative path; prefix the site root ---
    base_url = 'https://astrogeology.usgs.gov'
    hem_url = imagesoup.find('img', attrs={'class': 'wide-image'})['src']
    img_url = ''.join((base_url, hem_url))

    # --- the hemisphere's name is read from the element with the 'title' class ---
    img_title = browser.find_by_css('.title').text

    # --- record this hemisphere's title and image url ---
    hemisphere_record = {"title": img_title, "img_url": img_url}
    hemisphere_image_data.append(hemisphere_record)

    # --- go back to the results page before the next pass ---
    browser.back()

hemisphere_image_data

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]