In [5]:
# Import 
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [50]:
# Install a chromedriver through webdriver_manager, then start a visible
# (non-headless) Chrome session driven by Splinter.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Downloading: 100%|█████████████████| 8.84M/8.84M [00:00<00:00, 35.7MB/s]


# Step 1 - Scraping

## NASA Mars News
- Scrape the Mars News Site and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.

In [57]:
# Visit the Mars NASA news site in the Splinter-controlled browser
url = 'https://redplanetscience.com/'
browser.visit(url)

# Delay for loading the page: wait (wait_time=1) for a `div.list_text` element
# to be present. The boolean result is only displayed as this cell's output; it
# is not checked, so a False result does not stop the notebook here.
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

In [58]:
# Parse the browser's current HTML, take the first `div.list_text` block as the
# latest article, and read the headline text from its `content_title` div.
html = browser.html
news_soup = soup(html, 'html.parser')
latest_news_elem = news_soup.select_one('div.list_text')
latest_news_title = latest_news_elem.select_one('div.content_title').text
latest_news_title

"NASA's Mars 2020 Rover Closer to Getting Its Name"

In [59]:
# Read the teaser paragraph from the same article element the title came from
news_para = latest_news_elem.select_one('div.article_teaser_body').text
news_para

"155 students from across the U.S. have been chosen as semifinalists in NASA's essay contest to name the Mars 2020 rover, and see it launch from Cape Canaveral this July."

## JPL Mars Space Images - Featured Image

In [60]:
# Visit the space-images site.
# The scheme separator must be "://". The earlier value
# 'https:/spaceimages-mars.com/' was missing a slash, which only works if the
# browser repairs the malformed URL, and it disagreed with the base URL used
# when the absolute image URL is built later in this notebook.
url = 'https://spaceimages-mars.com/'
browser.visit(url)

In [61]:
# Find and click the full image button.
# NOTE(review): the button is selected by position (index 1, i.e. the second
# <button> element found on the page), so this depends on the page layout --
# confirm the index if the site markup changes.
image_button_elem = browser.find_by_tag('button')[1]
image_button_elem.click()

In [62]:
# Parse the browser's current HTML with BeautifulSoup's 'html.parser'.
# `html` is re-bound here; it previously held the news page's HTML.
html = browser.html
full_img_soup = soup(html, 'html.parser')

In [63]:
# Read the `src` attribute of the first <img class="fancybox-image"> element;
# the value is a path relative to the site root (see the output below).
img_url = full_img_soup.select_one('img.fancybox-image').get('src')
img_url

'image/featured/mars1.jpg'

In [64]:
# Create the absolute URL by prefixing the relative `img_url` with the site's
# base URL, then display it.
featured_img_url = f'https://spaceimages-mars.com/{img_url}'
featured_img_url

'https://spaceimages-mars.com/image/featured/mars1.jpg'

## Mars Facts

In [65]:
# Visit URL and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
url = 'https://galaxyfacts-mars.com/'
# read_html returns a list of DataFrames; take the first one, using its first
# row as the header (header=0). set_index is chained instead of called with
# inplace=True, so `df` is built in a single assignment and never mutated.
df = pd.read_html(url, header=0)[0].set_index('Mars - Earth Comparison')
df

Unnamed: 0_level_0,Mars,Earth
Mars - Earth Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [66]:
# Convert the data to an HTML table string. The string is only displayed as
# this cell's output; it is not assigned to a variable here.
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>\n</table>'

## Mars Hemispheres

In [67]:
# Visit the marshemispheres site (the landing page the hemisphere loop below
# clicks through)
url = 'https://marshemispheres.com/'
browser.visit(url)

In [68]:
# Accumulates one {'img_url': ..., 'title': ...} dict per hemisphere.
hemisphere_image_urls = []

# Walk through four hemisphere detail pages, one per pass.
for i in range(4):
    # Look the links up afresh on every pass, then open the i-th match
    hemisphere_links = browser.links.find_by_partial_text('Hemisphere')
    hemisphere_links[i].click()

    # Parse the detail page that is now loaded
    html = browser.html
    hemi_soup = soup(html, 'html.parser')

    # Title comes from <h2 class="title">; the image path is the href of the
    # anchor inside the first <li> on the page
    title = hemi_soup.find('h2', class_='title').get_text()
    img_url = hemi_soup.li.a.get('href')

    # Record the absolute image URL together with the title
    hemispheres = {
        'img_url': f'https://marshemispheres.com/{img_url}',
        'title': title,
    }
    hemisphere_image_urls.append(hemispheres)

    # Return to the landing page before the next pass
    browser.back()


In [69]:
# Quit the browser: ends the Splinter browser session created at the top of the
# notebook.
browser.quit()

In [70]:
# Display the collected list of hemisphere dicts (keys: 'img_url', 'title').
hemisphere_image_urls

[{'img_url': 'https://marshemispheres.com/images/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]