# 1. Scraping first news article from https://redplanetscience.com/

In [1]:
# Dependencies
import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser
import requests
# from flask import Flask, render_template, redirect
# from flask_pymongo import pymongo
# import scrape
from webdriver_manager.chrome import ChromeDriverManager
import os

In [2]:
# Launch a visible (non-headless) Chrome window through splinter, using the
# chromedriver binary that webdriver_manager downloads or finds in its cache.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/96.0.4664.45/chromedriver_mac64.zip
Driver has been saved in cache [/Users/normanadkins/.wdm/drivers/chromedriver/mac64/96.0.4664.45]


In [3]:
# Point the browser at the Mars news site so its rendered HTML can be read
# from `browser.html` in the next cell.
url = 'https://redplanetscience.com/'
browser.visit(url)

In [4]:
# Snapshot the page source as currently rendered by the browser.
html = browser.html
# Build a Beautiful Soup tree from that snapshot.
soup = BeautifulSoup(markup=html, features='html.parser')
# Collect every article container, then display the first one as a preview.
rows = soup.find_all("div", attrs={"class": "list_text"})
rows[0]

<div class="list_text">
<div class="list_date">December 6, 2021</div>
<div class="content_title">Three New Views of Mars' Moon Phobos</div>
<div class="article_teaser_body">Taken with the infrared camera aboard NASA's Odyssey orbiter, they reveal temperature variations on the small moon as it drifts into and out of Mars’ shadow.</div>
</div>

In [5]:
# Retrieve the latest title text. `find` returns the first match in document
# order, and the page lists articles newest-first, so this is the latest one.
title_tag = soup.find("div", class_="content_title")
news_title = title_tag.text

# Grab the teaser paragraph of that same first article.
teaser_tag = soup.find("div", class_="article_teaser_body")
news_p = teaser_tag.text

# Print the results: title, a separator rule, then the teaser.
print(
    news_title,
    "----------------------------------------------------------------------",
    news_p,
    sep="\n",
)

Three New Views of Mars' Moon Phobos
----------------------------------------------------------------------
Taken with the infrared camera aboard NASA's Odyssey orbiter, they reveal temperature variations on the small moon as it drifts into and out of Mars’ shadow.


In [6]:
# Close the browser session opened for the news scrape so the Chrome window
# and driver do not linger.
browser.quit()

# 2. Scraping featured image from https://spaceimages-mars.com/

In [7]:
# Start a fresh visible Chrome session for the featured-image scrape.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/normanadkins/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


In [8]:
# Point the browser at the space-images site. Note this URL has no trailing
# slash; the image URL built later uses its own hard-coded base string.
url = "https://spaceimages-mars.com"
browser.visit(url)

In [9]:
# Snapshot the rendered page source.
html = browser.html
# Build a Beautiful Soup tree from that snapshot.
soup = BeautifulSoup(markup=html, features="html.parser")
# Locate the single featured <img> tag (first match only) and show it.
image = soup.find("img", attrs={"class": "headerimage fade-in"})
print(image)

<img class="headerimage fade-in" src="image/featured/mars2.jpg"/>


In [10]:
# Retrieve the current image URL: the tag's `src` is a relative path, so
# prefix it with the site root to get an absolute URL.
src = image["src"]
featured_image_url = f"https://spaceimages-mars.com/{src}"
print(featured_image_url)

https://spaceimages-mars.com/image/featured/mars2.jpg


In [11]:
# Close the browser session opened for the featured-image scrape.
browser.quit()

# 3. Scraping a table into a DataFrame from https://galaxyfacts-mars.com

In [12]:
# Start a fresh visible Chrome session for the Mars facts table scrape.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/normanadkins/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


In [13]:
# Point the browser at the Mars facts site; `url` is also what pandas reads
# the tables from in a later cell.
url = "https://galaxyfacts-mars.com/"
browser.visit(url)

In [14]:
# Snapshot the rendered page source.
html = browser.html
# Build a Beautiful Soup tree from that snapshot.
soup = BeautifulSoup(markup=html, features="html.parser")

In [15]:
# Read every <table> on the page into a list of DataFrames. pandas is given
# `url` here, not the `html` captured from the browser, so it fetches the page
# itself. The bare `tables` expression displays the whole list.
tables = pd.read_html(url)
tables

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [16]:
# Check the container type returned by `pd.read_html` before indexing into it.
type(tables)

list

In [17]:
# Take the first scraped table (the Mars/Earth comparison) and preview its
# top five rows to decide how, or whether, to slice it for an index.
df = tables[0]
df.head(n=5)

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"


In [18]:
# Render the DataFrame as an HTML <table> string (no path given, so
# `to_html` returns the markup instead of writing a file) and display it.
html_table = df.to_html()
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Mars - Earth Comparison</td>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Diameter:</td>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Distance from Sun:</td>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Length of Year:</td>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Temperature:</td>\n      <td>-87 to -5 °C</

In [19]:
# Strip the newline characters from the generated table markup.
# `str.replace` returns a new string and leaves `html_table` untouched, so the
# result is stored under its own name; otherwise it is lost once displayed.
html_table_clean = html_table.replace('\n', '')
html_table_clean

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>0</th>      <th>1</th>      <th>2</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Mars - Earth Comparison</td>      <td>Mars</td>      <td>Earth</td>    </tr>    <tr>      <th>1</th>      <td>Diameter:</td>      <td>6,779 km</td>      <td>12,742 km</td>    </tr>    <tr>      <th>2</th>      <td>Mass:</td>      <td>6.39 × 10^23 kg</td>      <td>5.97 × 10^24 kg</td>    </tr>    <tr>      <th>3</th>      <td>Moons:</td>      <td>2</td>      <td>1</td>    </tr>    <tr>      <th>4</th>      <td>Distance from Sun:</td>      <td>227,943,824 km</td>      <td>149,598,262 km</td>    </tr>    <tr>      <th>5</th>      <td>Length of Year:</td>      <td>687 Earth days</td>      <td>365.24 days</td>    </tr>    <tr>      <th>6</th>      <td>Temperature:</td>      <td>-87 to -5 °C</td>      <td>-88 to 58°C</td>    </tr>  </tbody></table>'

In [20]:
# Save the table to an HTML file.
# Create the output folder first: `to_html` does not create missing parent
# directories, so on a fresh checkout the write would raise FileNotFoundError.
os.makedirs('Script_Outputs', exist_ok=True)
df.to_html('Script_Outputs/table.html')

In [21]:
# Confirm the saved file by opening it with the shell's `open` command
# (IPython `!` shell escape).
# NOTE(review): `open` is presumably the macOS launcher; this cell will likely
# not work on other platforms — confirm before sharing.
!open Script_Outputs/table.html

In [22]:
# Close the browser session opened for the facts-table scrape.
browser.quit()

# 4. Scrape high-resolution images from https://marshemispheres.com/

In [23]:
# Start a fresh visible Chrome session for the hemisphere image scrape.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/normanadkins/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


In [24]:
# Point the browser at the hemispheres site. `url` keeps its trailing slash
# because later cells build absolute links with `url + <relative href>`.
url = "https://marshemispheres.com/"
browser.visit(url)

In [25]:
# Snapshot the rendered page source.
html = browser.html
# Build a Beautiful Soup tree from that snapshot.
soup = BeautifulSoup(markup=html, features="html.parser")

In [27]:
# Locate the single container <div> that wraps all hemisphere result items
# (`find` returns only the first match) and print it for inspection.
divs = soup.find("div", attrs={"class": "collapsible results"})
print(divs)

<div class="collapsible results">
<div class="item">
<a class="itemLink product-item" href="cerberus.html"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png"/></a>
<div class="description">
<a class="itemLink product-item" href="cerberus.html">
<h3>Cerberus Hemisphere Enhanced</h3>
</a>
<span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/>
<p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p>
</div>
<!-- end description -->
</div>
<div class="item">
<a class="itemLink product-item" href="schiaparelli.html"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="images/08eac6e22c07fb1fe72223a79252de20_schiaparelli_enhanced.tif_thumb.png"/></a>
<div class="descripti

In [60]:
# Narrow in on the section with the data to collect: one "item" <div> per
# hemisphere inside the results container found in the previous cell.
m_hems = divs.find_all("div", class_="item")

# Result list: one {"title": ..., "img_url": ...} dictionary per hemisphere.
sphere_img_url = []

# Use a for loop to navigate and grab each hemisphere's information.
# The loop header is required: without it the indented body does not parse
# and `each` is never bound.
for each in m_hems:

    # Gathering the Hemisphere name in the "H3" portion.
    hemisphere = each.find("div", class_="description")
    name = hemisphere.h3.text

    # For each of the hemispheres, navigate to its detail page; the href is
    # relative, so it is appended to the site root held in `url`.
    hem_url = hemisphere.a["href"]
    browser.visit(url + hem_url)

    # Boilerplate for parsing the newly loaded page's HTML.
    link = browser.html
    link_conn = BeautifulSoup(link, "html.parser")

    # Grab the URL for the enhanced image: the first link inside the detail
    # page's "description" <div>, made absolute with the site root.
    img1_url = link_conn.find("div", class_="description")
    img2_url = url + img1_url.a["href"]

    # Create a dictionary for this hemisphere (drop the "Enhanced" label).
    image_dictionary = {}
    image_dictionary["title"] = name.replace(" Enhanced", "")
    image_dictionary["img_url"] = img2_url

    # Append the dictionary to the result list.
    sphere_img_url.append(image_dictionary)

# Print the list of dictionaries to confirm the changes.
print(sphere_img_url)

[{'title': 'Cerberus Hemisphere', 'img_url': 'https://marshemispheres.com/images/cerberus_enhanced.tif'}, {'title': 'Schiaparelli Hemisphere', 'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced.tif'}, {'title': 'Syrtis Major Hemisphere', 'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced.tif'}, {'title': 'Valles Marineris Hemisphere', 'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced.tif'}]
