In [1]:
pip install splinter webdriver_manager


Note: you may need to restart the kernel to use updated packages.


In [1]:
# Import Splinter, BeautifulSoup, and Pandas and other dependencies
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import pymongo
import requests
import pathlib
import pprint
import time

# Web Scraping

In [2]:
# Set the executable path and initialize the chrome browser in splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [C:\Users\jabuk\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache






## Visit the NASA mars news site

In [3]:
# Visit the mars nasa news site
url_nasa = 'https://mars.nasa.gov/news/'
browser.visit(url_nasa)
# Optional delay for loading the page
time.sleep(5)

In [4]:
# Convert the browser html to a soup object and then quit the browser
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
# print(soup.prettify())

In [5]:
# Scrape latest news title
# Latest news title as of today, 4/9/2021: "NASA's Mars helicopter to make first flight attempt Sunday"
latest_news = soup.findAll('div', class_="content_title")
news_title= latest_news[1].text
news_title

"NASA's Mars Helicopter to Make First Flight Attempt Sunday"

In [6]:
# Scrape Latest news description text
descriptions = soup.findAll('div', class_="article_teaser_body")
news_desc = descriptions[0].text
news_desc

'The small rotorcraft’s “Wright brothers moment” is two Mars days away.'

## JPL Space Images Featured Image

In [7]:
# Visit JPL space images Mars URL to find the featured 
url_jpl = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url_jpl)

In [8]:
# Convert the browser html to a soup object and then quit the browser
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [9]:
#Scrape relative image URL of the featured image from the jpl site using soup
images = soup.findAll('img', class_="headerimage fade-in")
featured_img= images[0].attrs['src']
featured_img

'image/featured/mars3.jpg'

In [10]:
# Featured image URL
featured_img_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/'+featured_img
featured_img_url

'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars3.jpg'

## Mars Facts

In [11]:
# Visit space-facts.com 
url_mars = 'https://space-facts.com/mars/'
browser.visit(url_mars)

In [12]:
# Reading html
rawdata_mars = pd.read_html(url_mars)[1]
rawdata_mars

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-87 to -5 °C,-88 to 58°C


#### Data Cleaning Stage

In [13]:
# Rename column Mars - Earth Comparison to Attributes
mars_df = rawdata_mars.rename(columns= {'Mars - Earth Comparison': 'Attributes'}).drop(columns = ["Earth"])
mars_df

Unnamed: 0,Attributes,Mars
0,Diameter:,"6,779 km"
1,Mass:,6.39 × 10^23 kg
2,Moons:,2
3,Distance from Sun:,"227,943,824 km"
4,Length of Year:,687 Earth days
5,Temperature:,-87 to -5 °C


In [14]:
# Export data to HTML
mars_html_table = mars_df.to_html("mars_facts.html", header=False, index=False)

## Mars Hemispheres

In [3]:
# Visit the USGS astrogeology page for hemisphere data from Mars
base_url ="https://astrogeology.usgs.gov"
url_hemisphere = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url_hemisphere)

In [4]:
# Extract the .html out of the browser & create beautifulsoup object
hem_html = browser.html
hem_soup = BeautifulSoup(hem_html,'html.parser')
# print(hem_soup.prettify())

In [5]:
# Finds images in divs with class=item
results = hem_soup.findAll('div', class_='item')
results

[<div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png"/></a><div class="description"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><h3>Cerberus Hemisphere Enhanced</h3></a><span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/><p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p></div> <!-- end description --></div>,
 <div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/08eac6e22c07fb1fe72223a79252de20_schiapa

In [11]:
hem_img_urls = []
image_titles = hem_soup.findAll('h3')

for i in range(len(image_titles)):
    hemisphere = {}
    
    img_title = hem_soup.findAll('h3')[i].text
    
    img_url = base_url + hem_soup.findAll('a', class_='itemLink product-item')[i]['href']
    browser.visit(img_url)
    
    time.sleep(1)
    
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    
    final_img_url = soup.find('div', class_='downloads').find('a')['href']
    
    hemisphere["title"] = img_title
    hemisphere["img_url"] = final_img_url
    
    hem_img_urls.append(hemisphere)
    
hem_img_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}]

In [19]:
# Quit browser when finished
browser.quit()