# Web Scraping - Mission to Mars

### Step 1: Build Scrape

#### NASA Mars News

In [97]:
# Set up Dependencies
import pandas as pd
from bs4 import BeautifulSoup as bs
import pymongo
import requests

# use Google Chrome with Splinter
from splinter import Browser
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [98]:
# ADD Splinter: an abstraction layer on top of existing browser automation tools
executable_path = {'executable_path':ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/nepanji/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


In [99]:
# Set scraper to retreive image from Mars url 
mars_url = 'https://redplanetscience.com/'
browser.visit(mars_url)

html = browser.html

mars_soup = bs(html, 'html.parser')

In [100]:
# Pull news title and paragraph teaser info

title = mars_soup.find('div', class_='content_title').text
teaser = mars_soup.find('div', class_='article_teaser_body').text

In [101]:
# Display results
print(title)
print(' ')
print(teaser)

NASA's Mars Helicopter Attached to Mars 2020 Rover 
 
The helicopter will be first aircraft to perform flight tests on another planet.


#### JPL Mars Space Images - Featured Image

In [102]:
# Set scraper to retreive image from JPL url 

jpl_url = 'https://spaceimages-mars.com/'
browser.visit(jpl_url)

html = browser.html

jpl_soup = bs(html, 'lxml')

In [103]:
# Pull image
image = jpl_soup.find_all('img', class_= "headerimage fade-in")[0]["src"]

In [104]:
# Create featured image url
featured_image_url = jpl_url + image
print(featured_image_url)

https://spaceimages-mars.com/image/featured/mars1.jpg


#### Mars Facts

In [105]:
# Use the read_html function in Pandas to automatically scrape tabular data from the Mars Facts url
url = 'https://galaxyfacts-mars.com/'

# Read table
mars_table = pd.read_html(url)
mars_table

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [106]:
# Slice off any Mars dataframe using normal indexing
facts_df = mars_table[0]
facts_df.columns = ['Description', 'Mars', 'Earth']
facts_df

Unnamed: 0,Description,Mars,Earth
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [107]:
# Create html table from Dataframe
mars_html_table = facts_df.to_html()
mars_html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Description</th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Mars - Earth Comparison</td>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Diameter:</td>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Distance from Sun:</td>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Length of Year:</td>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Temperature:</td>\n      <

In [108]:
# Remove newlines
mars_html_table.replace('\n', '')

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Description</th>      <th>Mars</th>      <th>Earth</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Mars - Earth Comparison</td>      <td>Mars</td>      <td>Earth</td>    </tr>    <tr>      <th>1</th>      <td>Diameter:</td>      <td>6,779 km</td>      <td>12,742 km</td>    </tr>    <tr>      <th>2</th>      <td>Mass:</td>      <td>6.39 × 10^23 kg</td>      <td>5.97 × 10^24 kg</td>    </tr>    <tr>      <th>3</th>      <td>Moons:</td>      <td>2</td>      <td>1</td>    </tr>    <tr>      <th>4</th>      <td>Distance from Sun:</td>      <td>227,943,824 km</td>      <td>149,598,262 km</td>    </tr>    <tr>      <th>5</th>      <td>Length of Year:</td>      <td>687 Earth days</td>      <td>365.24 days</td>    </tr>    <tr>      <th>6</th>      <td>Temperature:</td>      <td>-87 to -5 °C</td>      <td>-88 to 58°C</td>    </tr>  </tbody></table>'

In [109]:
# View html file
print(mars_html_table)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Description</th>
      <th>Mars</th>
      <th>Earth</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Mars - Earth Comparison</td>
      <td>Mars</td>
      <td>Earth</td>
    </tr>
    <tr>
      <th>1</th>
      <td>Diameter:</td>
      <td>6,779 km</td>
      <td>12,742 km</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Mass:</td>
      <td>6.39 × 10^23 kg</td>
      <td>5.97 × 10^24 kg</td>
    </tr>
    <tr>
      <th>3</th>
      <td>Moons:</td>
      <td>2</td>
      <td>1</td>
    </tr>
    <tr>
      <th>4</th>
      <td>Distance from Sun:</td>
      <td>227,943,824 km</td>
      <td>149,598,262 km</td>
    </tr>
    <tr>
      <th>5</th>
      <td>Length of Year:</td>
      <td>687 Earth days</td>
      <td>365.24 days</td>
    </tr>
    <tr>
      <th>6</th>
      <td>Temperature:</td>
      <td>-87 to -5 °C</td>
      <td>-88 to 58°C</td>
  

#### Mars Hemispheres

In [110]:
# Set scraper to retreive images from Mars Hemisheres url
hems_url = 'https://marshemispheres.com/'
browser.visit(hems_url)

html = browser.html

hems_soup = bs(html, 'lxml')

In [111]:
# Locate and pull hemisphere data
all_hemispheres = hems_soup.find('div', class_= "collapsible results")

# Locate and pull individual hemisphere data
each_hemisphere = all_hemispheres.find_all('div', class_='item')

In [120]:
hems_link = []

# Iterate through to locate each hemisphere title and image

for each in each_hemisphere:
    # Find the title of the image
    hems_title = each.find("h3").text
    hems_title = hems_title.replace("Enhanced", "")
    
    #Find the image url
    image = each.find('img', class_= "thumb")["src"]

    # Create featured image url
    img_url = hems_url + image
    
    # Create dictionary for title and image info
    hems_dict = {}
    hems_dict['title'] = hems_title
    hems_dict['img_url'] = img_url
    
    hems_link.append(hems_dict)
    
print(hems_link)
#     print(image)

[{'title': 'Cerberus Hemisphere ', 'img_url': 'https://marshemispheres.com/images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png'}, {'title': 'Schiaparelli Hemisphere ', 'img_url': 'https://marshemispheres.com/images/08eac6e22c07fb1fe72223a79252de20_schiaparelli_enhanced.tif_thumb.png'}, {'title': 'Syrtis Major Hemisphere ', 'img_url': 'https://marshemispheres.com/images/55a0a1e2796313fdeafb17c35925e8ac_syrtis_major_enhanced.tif_thumb.png'}, {'title': 'Valles Marineris Hemisphere ', 'img_url': 'https://marshemispheres.com/images/4e59980c1c57f89c680c0e1ccabbeff1_valles_marineris_enhanced.tif_thumb.png'}]


In [121]:
# Store all scraped data in a mars dictionary
mars_data_dict = {
    "news_title": title,
    "news_info": teaser,
    "featured_image_url": featured_image_url,
    "fun_facts_table": mars_html_table,
    "hemisphere_info": hems_link
}

In [122]:
mars_data_dict

{'news_title': "NASA's Mars Helicopter Attached to Mars 2020 Rover ",
 'news_info': 'The helicopter will be first aircraft to perform flight tests on another planet.',
 'featured_image_url': 'https://spaceimages-mars.com/image/featured/mars1.jpg',
 'fun_facts_table': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Description</th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Mars - Earth Comparison</td>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Diameter:</td>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Distance from Sun:</td>\