In [1]:
# Import dependencies
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def init_browser():
    '''Start a headless Chrome session through splinter.

    The chromedriver binary is located (and installed if needed) by
    ChromeDriverManager, and its path is handed to splinter's Browser.
    '''
    driver_path = ChromeDriverManager().install()
    return Browser('chrome', executable_path=driver_path, headless=True)

In [3]:
def browse(url):
    '''Scrape a single webpage and return its parsed HTML.

    A new browser is opened with init_browser(), pointed at ``url``, and
    always closed again before returning.

    Parameters
    ----------
    url : str
        Address of the page to visit.

    Returns
    -------
    bs4.BeautifulSoup
        The HTML reported by the browser after the visit, parsed with
        'html.parser'.
    '''
    browser = init_browser()
    try:
        browser.visit(url)

        # Scrape html code
        html = browser.html
    finally:
        # Quit the browser even when the visit fails, so that no orphaned
        # Chrome / chromedriver process is left running.
        browser.quit()

    # Create BeautifulSoup object; parse with 'html.parser'
    return BeautifulSoup(html, "html.parser")

In [4]:
# Start with an empty dictionary that will collect the web scraping results
mars_scrape_dict = dict()

## NASA Mars News

In [5]:
# NASA Mars News listing; the query string asks for page 0 with 40 items,
# ordered by publish date (descending)
nasa_url = (
    "https://mars.nasa.gov/news/"
    "?page=0&per_page=40"
    "&order=publish_date+desc%2Ccreated_at+desc"
    "&search="
    "&category=19%2C165%2C184%2C204"
    "&blank_scope=Latest"
)

# Load the listing in a browser and parse the resulting html
nasa_soup = browse(nasa_url)



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Get LATEST driver version for 94.0.4606
Trying to download new driver from https://chromedriver.storage.googleapis.com/94.0.4606.61/chromedriver_mac64.zip
Driver has been saved in cache [/Users/mochi/.wdm/drivers/chromedriver/mac64/94.0.4606.61]


In [6]:
# Text of the first <div> carrying the title class
headline_tag = nasa_soup.find("div", class_="bottom_gradient")
first_article_title = headline_tag.text

# Text of the first <div> carrying the teaser class
teaser_tag = nasa_soup.find("div", class_="article_teaser_body")
first_article_teaser = teaser_tag.text

# Record both pieces of text in mars_scrape_dict
mars_scrape_dict.update(
    {
        "article_title": first_article_title,
        "article_teaser": first_article_teaser,
    }
)

## JPL Mars Space Images - Featured Image

In [8]:
# URL of the JPL Mars space-images page (a copy hosted on S3)
jpl_url = "https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html"

# Open jpl_url in a browser and keep the page's parsed html
jpl_soup = browse(jpl_url)



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [/Users/mochi/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache


In [9]:
# Scrape the link to the featured image (the href of the first matching <a>)
jpl_image_link = jpl_soup.find("a", class_="showimg fancybox-thumbs")["href"]

# Build the full image url by resolving the link against the page it was
# scraped from, instead of repeating the site root in a second literal.
# urljoin also returns an already-absolute href unchanged.
featured_image_url = urljoin(jpl_url, jpl_image_link)

In [10]:
# Store the featured image url with the other scrape results
mars_scrape_dict.update({"featured_image": featured_image_url})

## Mars Facts

In [12]:
# Let pandas fetch the page and parse every html <table> it finds;
# read_html returns them as a list of DataFrames
space_facts_url = "https://space-facts.com/mars/"
tables = pd.read_html(space_facts_url)

In [13]:
# Build a dataframe from the first scraped table and relabel its two columns:
# column 0 gets a blank name, column 1 becomes "values"
stats_table_df = pd.DataFrame(tables[0]).rename(columns={0: "", 1: "values"})

In [14]:
# Use the blank-named first column as the index
stats_table_new_index_df = stats_table_df.set_index("")

In [15]:
# Generate html code for the table (the "\n" characters are stripped in the next cell)
stats_table_html = stats_table_new_index_df.to_html()

In [16]:
# Drop every "\n" so the table markup is a single line
stats_table_html = "".join(stats_table_html.split("\n"))

In [17]:
# Store the table markup with the other scrape results
mars_scrape_dict.update({"table_html": stats_table_html})

## Mars Hemispheres

In [19]:
# USGS Astrogeology search results for the enhanced Mars hemisphere images
usgs_url = (
    "https://astrogeology.usgs.gov/search/results"
    "?q=hemisphere+enhanced&k1=target&v1=Mars"
)

# Open a browser instance and load the results page; the browser stays
# open so that later cells can navigate from it
usgs_browser = init_browser()
usgs_browser.visit(usgs_url)



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [/Users/mochi/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache


In [20]:
# Scrape the page of each Mars hemisphere, starting from the results page
# that usgs_browser already has open
hemispheres = ["Cerberus", "Schiaparelli", "Syrtis", "Valles"]
soup_objects = []

try:
    for hemisphere in hemispheres:
        # Follow the link whose text contains the hemisphere name
        usgs_browser.links.find_by_partial_text(hemisphere).click()

        # Scrape html code
        usgs_html = usgs_browser.html

        # Create BeautifulSoup object; parse with 'html.parser'
        usgs_soup = BeautifulSoup(usgs_html, "html.parser")

        # Also expose the soup as a notebook-level variable named
        # soup_<hemisphere>. globals() is used because writing through
        # locals() only works by accident at module level; soup_objects
        # is the preferred way to reach these.
        globals()["soup_" + hemisphere] = usgs_soup

        # Append soup object to list (same order as `hemispheres`)
        soup_objects.append(usgs_soup)

        # Go back to the results page for the next hemisphere
        usgs_browser.back()
finally:
    # Close the browser even if a link is missing or a click fails, so no
    # Chrome / chromedriver process is left running.
    usgs_browser.quit()

In [21]:
# Collect one {"title", "img_url"} record per scraped hemisphere page
hemispheres_list = []

for soup_object in soup_objects:
    # Page heading with the " Enhanced" suffix removed
    title = soup_object.find("h2", class_="title").text.replace(" Enhanced", "")

    # href of the fourth <a target="_blank"> on the page
    blank_target_links = soup_object.find_all("a", target="_blank")
    img_url = blank_target_links[3]["href"]

    hemispheres_list.append({"title": title, "img_url": img_url})

In [23]:
# Store the hemisphere records with the other scrape results
mars_scrape_dict.update({"hemispheres": hemispheres_list})

In [24]:
# Display the assembled scrape results
mars_scrape_dict

{'article_title': "NASA's Mars Fleet Lies Low with Sun Between Earth and Red Planet",
 'article_teaser': 'The missions will continue collecting data about the Red Planet, though engineers back on Earth will stop sending commands to them until mid-October.',
 'featured_image': 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars3.jpg',
 'table_html': '<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>values</th>    </tr>    <tr>      <th></th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <t