In [1]:
# Dependencies
from bs4 import BeautifulSoup
from time import sleep

In [2]:
# Import Splinter and set the chromedriver path
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 84.0.4147
[WDM] - Get LATEST driver version for 84.0.4147
[WDM] - Driver [C:\Users\Boss\.wdm\drivers\chromedriver\win32\84.0.4147.30\chromedriver.exe] found in cache


 


## NASA Mars News scraping

In [3]:
# URL of page to be scraped
url = "https://mars.nasa.gov/news/"

# Use splinter to navigate the site
browser.visit(url)

In [4]:
# Let the script to wait for the browser to be fully loaded
sleep(1)

In [5]:
# Create BeautifulSoup object; parse with 'lxml'
html = browser.html
soup = BeautifulSoup(html, 'lxml')

In [6]:
# Select the first element containing the latest news
latest_news = soup.select_one("ul.item_list li.slide")
print(latest_news.prettify())

<li class="slide">
 <div class="image_and_description_container">
  <a href="/news/8742/follow-nasas-perseverance-rover-in-real-time-on-its-way-to-mars/" target="_self">
   <div class="rollover_description">
    <div class="rollover_description_inner">
     A crisply rendered web application can show you where the agency's Mars 2020 mission is right now as it makes its way to the Red Planet for a Feb. 18, 2021, landing.
    </div>
    <div class="overlay_arrow">
     <img alt="More" src="/assets/overlay-arrow.png"/>
    </div>
   </div>
   <div class="list_image">
    <img alt="Illustration of Mars 2020 spacecraft" src="/system/news_items/list_view_images/8742_Mars2020-Earth-226.jpg"/>
   </div>
   <div class="bottom_gradient">
    <div>
     <h3>
      Follow NASA's Perseverance Rover in Real Time on Its Way to Mars
     </h3>
    </div>
   </div>
  </a>
  <div class="list_text">
   <div class="list_date">
    August 21, 2020
   </div>
   <div class="content_title">
    <a href="/news

In [7]:
# Retrieve the latest news title
news_title = latest_news.find('div', class_="content_title").get_text()
news_title

"Follow NASA's Perseverance Rover in Real Time on Its Way to Mars"

In [8]:
# Retrieve the latest news paragraph
news_p = latest_news.find('div', class_='article_teaser_body').get_text()
news_p

"A crisply rendered web application can show you where the agency's Mars 2020 mission is right now as it makes its way to the Red Planet for a Feb. 18, 2021, landing."

## JPL Mars Space Images - Featured Image

In [9]:
# URL of page to be scraped
image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

# Use splinter to navigate the site
browser.visit(image_url)

In [10]:
# Click the link under id "full_image" to access the featured image
browser.click_link_by_id("full_image")

In [11]:
# Click the link under text "more info" to access the largesize featured image
browser.links.find_by_partial_text("more info").click()

In [12]:
# Create BeautifulSoup object; parse with 'lxml'
html = browser.html
soup = BeautifulSoup(html, 'lxml')

In [13]:
# Retrieve the path to the largesize featured image
large_image_url = soup.find(class_="main_image")["src"]

In [14]:
# Assign the url string to a variable called featured_image_url
featured_image_url = f"https://www.jpl.nasa.gov{large_image_url}"
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16192_hires.jpg'

## Mars Facts

In [15]:
# Dependencies
import pandas as pd

In [16]:
# URL of page to be scraped
fact_url = "https://space-facts.com/mars/"

# Use Panda's `read_html` to parse the url
scraped_dfs = pd.read_html(fact_url)

In [17]:
# Overview of the list of the scraped dataframes
for df in scraped_dfs:
    display(df)

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-87 to -5 °C,-88 to 58°C


Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [18]:
# Select the dataframe containing facts about the planet including Diameter, Mass, etc.
df = scraped_dfs[0]
df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [19]:
# Assign the columns Description and Mars to the dataframe and set Description as index
df.columns = ["Description", "Mars"]
df.set_index("Description", inplace=True)
df

Unnamed: 0_level_0,Mars
Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [20]:
# Use Pandas to convert the data to a HTML table string
html_table = df.to_html()
print(html_table)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Mars</th>
    </tr>
    <tr>
      <th>Description</th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Equatorial Diameter:</th>
      <td>6,792 km</td>
    </tr>
    <tr>
      <th>Polar Diameter:</th>
      <td>6,752 km</td>
    </tr>
    <tr>
      <th>Mass:</th>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <th>Moons:</th>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <th>Orbit Distance:</th>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <th>Orbit Period:</th>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <th>Surface Temperature:</th>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <th>First Record:</th>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <th>Recorded By:</th>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>


In [21]:
df.to_html('table.html')

## Mars Hemispheres

In [22]:
# URL of page to be scraped
hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

# Use splinter to navigate the site
browser.visit(hemisphere_url)

In [23]:
# Extract the hemisphere titles with embedded links
html = browser.html
soup = BeautifulSoup(html, 'lxml')
text_to_search = soup.find_all("h3")

In [24]:
# Overview of the hemisphere titles
for text in text_to_search:
    print(text)

<h3>Cerberus Hemisphere Enhanced</h3>
<h3>Schiaparelli Hemisphere Enhanced</h3>
<h3>Syrtis Major Hemisphere Enhanced</h3>
<h3>Valles Marineris Hemisphere Enhanced</h3>


In [25]:
hemisphere_image_urls = list()

for text in text_to_search:
    
    # Click each of the links to the hemispheres in order to find the image url to the full resolution image.
    browser.links.find_by_partial_text(text.get_text()).click()
    sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'lxml')
    
    # Retrieve the Hemisphere title containing the hemisphere name
    title = text.get_text().split("Enhanced")[0]
    
    # Retrieve the image url string for the full resolution hemisphere image
    img_url = soup.select_one("dd a")["href"]
    
    # Append the dictionary with the image url string and the hemisphere title to a list
    hemisphere_image_urls.append({
        "title": title,
        "img_url": img_url
    })
    
    browser.visit(hemisphere_url) 
    sleep(1)

In [26]:
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere ',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'},
 {'title': 'Schiaparelli Hemisphere ',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif'},
 {'title': 'Syrtis Major Hemisphere ',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif'},
 {'title': 'Valles Marineris Hemisphere ',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif'}]

In [27]:
# Quit the browser after scraping
browser.quit()