# Web Scraping Homework - Mission to Mars

Build a web application that scrapes various websites for data related to the Mission to Mars and displays the information in a single HTML page.

# Step 1 - Scraping

In [1]:
# Dependencies
from bs4 import BeautifulSoup
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
import time
import requests
import pandas as pd

1 NASA Mars News

Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.

In [2]:
# NASA Mars news listing to scrape; the query string (ordering and category
# filters) is split across adjacent literals purely for readability
url_news = (
    'https://mars.nasa.gov/news/'
    '?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc'
    '&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
)

In [3]:
# Open the news page in a Selenium-driven Chrome window, then build a
# BeautifulSoup object from the browser's page source; parse with 'html.parser'
from selenium import webdriver
browser_sele = webdriver.Chrome()
browser_sele.get(url_news)
# NOTE(review): page_source is read immediately after get(); presumably the
# article list has rendered by then -- confirm, or add an explicit wait
soup_selenium = BeautifulSoup(browser_sele.page_source, "html.parser")

In [4]:
# First <div class="content_title"> on the page -> headline text, whitespace trimmed
title_div = soup_selenium.find('div', class_="content_title")
news_title = title_div.get_text().strip()
news_title

"Media Get a Close-Up of NASA's Mars 2020 Rover"

In [5]:
# First <div class="article_teaser_body"> on the page -> teaser paragraph, whitespace trimmed
teaser_div = soup_selenium.find('div', class_="article_teaser_body")
news_p = teaser_div.get_text().strip()
news_p

"The clean room at NASA's Jet Propulsion Laboratory was open to the media to see NASA's next Mars explorer before it leaves for Florida in preparation for a summertime launch."

In [6]:
# Close the Selenium Chrome session; the page has already been parsed into soup_selenium
browser_sele.quit()

2 JPL Mars Space Images - Featured Image

Find the image url for the current Featured Mars Image and assign the url string to a variable called featured_image_url.

In [7]:
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
import time

In [8]:
# Start a visible (non-headless) Chrome window through splinter, driven by
# the 'chromedriver' executable
executable_path = dict(executable_path='chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

In [9]:
# Open the JPL space-images page (Mars category) in the splinter browser
url_pic = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url_pic)

In [10]:
# Click the 'FULL IMAGE' link on the featured-image page.
# The lookup and the click are one call chain: the element list returned by
# browser.links.find_by_partial_text() forwards .click() to its first match
# and raises ElementDoesNotExist when nothing matches. This replaces a lookup
# whose result was thrown away followed by the deprecated
# browser.click_link_by_partial_text().
browser.links.find_by_partial_text('FULL IMAGE').click()



In [11]:
# Snapshot the page as it stands after the 'FULL IMAGE' click and parse it
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# The <img class="fancybox-image"> src is appended to the JPL host, so it is
# presumably site-relative (the recorded output starts with /spaceimages/...)
fancybox_img = soup.find('img', class_='fancybox-image')
image_src = fancybox_img['src']
featured_image_url = "http://www.jpl.nasa.gov" + image_src
featured_image_url

'http://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA18048_ip.jpg'

In [12]:
# Close the splinter browser that was opened for the featured image
browser.quit()

3 Mars Weather

Visit the Mars Weather Twitter account (https://twitter.com/marswxreport?lang=en) and scrape the latest Mars weather tweet from the page. Save the tweet text for the weather report in a variable called mars_weather.

In [13]:
# Fetch the Mars Weather Twitter page over plain HTTP and parse the returned HTML
url_twit = 'https://twitter.com/marswxreport?lang=en'
# Without a timeout requests waits indefinitely on an unresponsive server,
# which would hang the whole notebook run
response_tw = requests.get(url_twit, timeout=30)
# Stop here with the HTTP status on a 4xx/5xx response instead of failing
# later while searching an error page for tweets
response_tw.raise_for_status()
soup_tw = BeautifulSoup(response_tw.text, 'html.parser')

In [14]:
# Take the <p> of the first tweet container on the page as the latest tweet
tweet_container = soup_tw.find('div', class_="js-tweet-text-container")
last_twit = tweet_container.find('p')
tweet_text = last_twit.get_text()

# The weather report is everything before the text of the tweet's first link.
# A tweet without any <a>, or whose link text is empty, is used whole:
# calling .text on a missing link raises AttributeError and
# str.split('') raises ValueError.
tweet_link = last_twit.find('a')
link_text = tweet_link.get_text().strip() if tweet_link is not None else ''
if link_text:
    tweet_text = tweet_text.split(link_text)[0]

# Line breaks inside the tweet become comma separators
mars_weather = tweet_text.replace('\n', ', ')
mars_weather

'InSight sol 390 (2020-01-01) low -100.2ºC (-148.3ºF) high -17.8ºC (-0.1ºF), winds from the SSE at 6.5 m/s (14.4 mph) gusting to 27.8 m/s (62.1 mph), pressure at 6.40 hPa'

4 Mars Facts

Visit the Mars Facts webpage (https://space-facts.com/mars/) and use Pandas to scrape the table containing facts about the planet.

In [15]:
# URL of the Mars facts page whose tables are read with pandas
url_facts = 'https://space-facts.com/mars/'

In [16]:
# read_html returns one DataFrame per <table> found on the page; the first
# one is used as the facts table
tables = pd.read_html(url_facts)

# Label the two columns and index by the fact name in a single chain
df = (
    tables[0]
    .set_axis(['Description', 'Value'], axis=1)
    .set_index('Description')
)
df.head(10)

Unnamed: 0_level_0,Value
Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [17]:
# Render the facts DataFrame as an HTML <table> string
html_table = df.to_html()
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Value</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

In [18]:
# str.replace returns a new string and leaves the original untouched, so the
# result has to be bound back to html_table; otherwise the markup stored
# later still contains the '\n' characters
html_table = html_table.replace('\n', '')
html_table

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Value</th>    </tr>    <tr>      <th>Description</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-87 to -5 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

5 Mars Hemispheres

Visit the USGS Astrogeology site (https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) to obtain high-resolution images for each of Mars's hemispheres.

In [19]:
# Start a fresh visible (non-headless) Chrome window through splinter for the
# hemisphere pages, driven by the 'chromedriver' executable
executable_path = dict(executable_path='chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

In [20]:
# Open the USGS Astrogeology search results for the enhanced Mars hemisphere images
url_hemisphere = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url_hemisphere)

In [21]:
# Parse the search-results page currently loaded in the browser
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Keep only the first <div class="full-content">; the hemisphere titles are looked up inside it
results_hemisphere = soup.find('div', class_='full-content')

In [22]:
# Every <h3> inside the results container is one hemisphere title;
# collect their text with surrounding whitespace removed
titles_hemisphere = results_hemisphere.find_all('h3')
title_list = [heading.text.strip() for heading in titles_hemisphere]

In [24]:
# Collect one {"title", "img_url"} record per hemisphere listed in title_list
hemisphere_image_urls = []

# Search-results page listing every hemisphere. It is reloaded before each
# click because following a hemisphere link navigates the browser away from it.
url_hemisphere = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

for hemisphere_title in title_list:
    browser.visit(url_hemisphere)
    time.sleep(1)  # brief pause after loading the results page

    try:
        # Follow this hemisphere's link; the lookup raises ElementDoesNotExist
        # on .click() when no link text matches the title
        browser.links.find_by_partial_text(hemisphere_title).click()
    except ElementDoesNotExist:
        # Skip only the hemisphere whose link is missing, and say which one,
        # rather than silently abandoning every hemisphere after it
        print("No link found for hemisphere, skipping:", hemisphere_title)
        continue

    soup_pic = BeautifulSoup(browser.html, 'html.parser')

    # The first <a> inside <div class="downloads"> supplies the image URL
    eachpic_hemisphere = soup_pic.find('div', class_='downloads')
    eachpic_link = eachpic_hemisphere.find('a')['href']

    hemisphere_image_urls.append({"title": hemisphere_title, "img_url": eachpic_link})

    time.sleep(2)  # pause between hemispheres

hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

In [25]:
# Close the splinter browser that was opened for the hemisphere pages
browser.quit()

In [26]:
# Gather every scraped value into one dictionary (a single document)
mars = {
    "news_title": news_title,
    "news_p": news_p,
    "mars_weather": mars_weather,
    "featured_image_url": featured_image_url,
    "hemisphere_image_urls": hemisphere_image_urls,
    "mars_facts": html_table,
}

In [27]:
# Print the assembled dictionary as plain text to check every field
print(mars)

{'news_title': "Media Get a Close-Up of NASA's Mars 2020 Rover", 'news_p': "The clean room at NASA's Jet Propulsion Laboratory was open to the media to see NASA's next Mars explorer before it leaves for Florida in preparation for a summertime launch.", 'mars_weather': 'InSight sol 390 (2020-01-01) low -100.2ºC (-148.3ºF) high -17.8ºC (-0.1ºF), winds from the SSE at 6.5 m/s (14.4 mph) gusting to 27.8 m/s (62.1 mph), pressure at 6.40 hPa', 'featured_image_url': 'http://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA18048_ip.jpg', 'hemisphere_image_urls': [{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major

In [28]:
import pymongo
# Initialize PyMongo to work with MongoDBs
# (connects to a MongoDB server on localhost, port 27017)
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [29]:
# Dictionary to be inserted as a MongoDB document
db = client.mars_db
db.mars_db.drop()
collection = db.info

collection.insert_one(mars)

<pymongo.results.InsertOneResult at 0x19029872448>

In [30]:
# Display items in MongoDB collection
listings = db.info.find()

for listing in listings:
    print(listing)

{'_id': ObjectId('5e0d281f544bb904c1f4d55d'), 'news_title': "Media Get a Close-Up of NASA's Mars 2020 Rover", 'news_p': "The clean room at NASA's Jet Propulsion Laboratory was open to the media to see NASA's next Mars explorer before it leaves for Florida in preparation for a summertime launch.", 'mars_weather': 'InSight sol 389 (2019-12-30) low -99.2ºC (-146.5ºF) high -18.6ºC (-1.6ºF), winds from the SSE at 5.6 m/s (12.6 mph) gusting to 23.1 m/s (51.6 mph), pressure at 6.40 hPa', 'featured_image_url': 'www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA17449_ip.jpg', 'hemisphere_image_urls': [{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.