## Initial setup
- Libraries
- Splinter Browser config

In [1]:
#dependencies and setup
import time
from io import StringIO
from pprint import pprint

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# start a visible (headless=False) Chrome window through splinter, pointing it at
# the driver path returned by ChromeDriverManager().install()
chromedriver_location = ChromeDriverManager().install()
executable_path = {'executable_path': chromedriver_location}
browser = Browser('chrome', headless=False, **executable_path)

# short pause before the browser is used by the following cells
time.sleep(1)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\mosab\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache






## Scraping Mars News
- save the latest news title 
- save the latest news paragraph

In [3]:
# visit the Mars News url
MarsNews_url = 'https://mars.nasa.gov/news/'
browser.visit(MarsNews_url)

# wait (up to NEWS_WAIT_SECONDS) until the first headline item exists instead of
# sleeping for a fixed time; if it never shows up, stop with a clear message rather
# than failing further down with an AttributeError on None
NEWS_WAIT_SECONDS = 10
if not browser.is_element_present_by_css('li.slide', wait_time=NEWS_WAIT_SECONDS):
    raise RuntimeError(
        "No <li class='slide'> headline found at {} after {} seconds".format(
            MarsNews_url, NEWS_WAIT_SECONDS
        )
    )

# create HTML object
html = browser.html

# parse HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# get the first <li> item with class 'slide': this contains the latest news
# title, date and paragraph text
first_li = soup.find('li', class_='slide')

# save the news title under the <div> tag with a class of 'content_title'
news_title = first_li.find('div', class_='content_title').text
print(news_title)

# save the news date under the <div> tag with a class of 'list_date'
news_date = first_li.find('div', class_='list_date').text
print(news_date)

# save the paragraph text under the <div> tag with a class of 'article_teaser_body'
news_para = first_li.find('div', class_='article_teaser_body').text
print(news_para)

NASA's Perseverance Pays Off Back Home
February  2, 2021
Even as the Perseverance rover approaches Mars, technology on board is paying off on Earth.


## Scraping JPL Featured Image URL
- save the current Featured Mars Image url and title

In [4]:
# open the JPL Featured Space Image page in the splinter browser
JPL_index_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(JPL_index_url)
time.sleep(2)

# capture the page's HTML and hand it to BeautifulSoup for parsing
html = browser.html
soup = BeautifulSoup(markup=html, features='html.parser')

In [5]:
# find the 'full image' button by its text and click it with splinter,
# so that a full-size jpg url can be retrieved afterwards
full_image_button = browser.find_by_text(' FULL IMAGE')
full_image_button.click()
time.sleep(1)

In [6]:
# parse the browser's current HTML (after the 'full image' click) with BeautifulSoup
full_img_html = browser.html
full_img_soup = BeautifulSoup(markup=full_img_html, features='html.parser')

# the <img> tag with class 'fancybox-image' holds the image path in its src attribute
fancybox_img_tag = full_img_soup.find('img', class_='fancybox-image')
header_img_url_partial = fancybox_img_tag['src']
print(header_img_url_partial)

image/featured/mars2.jpg


In [7]:
# build the final JPL featured image URL: base url + '/' + scraped image path
JPL_base_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space'
featured_image_url = '/'.join([JPL_base_url, header_img_url_partial])
print(featured_image_url)

https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars2.jpg


In [8]:
# read the featured image title from the <h1> tag with class 'media_feature_title'
featured_title_tag = soup.find('h1', class_='media_feature_title')
featured_img_title = featured_title_tag.get_text()
featured_img_title

'Dusty Space Cloud'

## Scraping Mars Facts
- Scrape the table containing facts about the planet including Diameter, Mass, etc.
- Use Pandas to convert the data to an HTML table (saved to `mars_facts.html`).

In [9]:
# visit the Mars Facts website
MarsFacts_url = 'https://space-facts.com/mars/'
browser.visit(MarsFacts_url)
time.sleep(2)

# create HTML object
html = browser.html

# use Pandas to scrape the tables on the page into a list of dataframes;
# the markup is wrapped in StringIO because passing a literal HTML string
# to read_html is deprecated in recent pandas versions
table = pd.read_html(StringIO(html))

# take the first table as the facts dataframe; copy it so that the changes
# below do not modify the dataframe stored in `table`
facts_df = table[0].copy()
facts_df.columns = ['Description', 'Value']

# remove the colons from the labels; regex=False treats ':' as a literal string
facts_df['Description'] = facts_df['Description'].str.replace(':', '', regex=False)
facts_df

Unnamed: 0,Description,Value
0,Equatorial Diameter,"6,792 km"
1,Polar Diameter,"6,752 km"
2,Mass,6.39 × 10^23 kg (0.11 Earths)
3,Moons,2 (Phobos & Deimos)
4,Orbit Distance,"227,943,824 km (1.38 AU)"
5,Orbit Period,687 days (1.9 years)
6,Surface Temperature,-87 to -5 °C
7,First Record,2nd millennium BC
8,Recorded By,Egyptian astronomers


In [10]:
# --- write the facts dataframe to 'mars_facts.html' as an HTML table ---
# (no index column, header row kept, border of 1, left-justified column labels)
facts_df.to_html(
    'mars_facts.html',
    index=False,
    header=True,
    border=1,
    justify='left',
)

## Scraping Mars Hemisphere images
- Find and save the image url and title of the full resolution image for each of Mars's hemispheres in a Python dictionary
- Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [11]:
# open the USGS Astrogeology search results page for the Mars hemispheres
MarsHemImage_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(MarsHemImage_url)
time.sleep(2)

# capture the page's HTML and hand it to BeautifulSoup for parsing
html = browser.html
soup = BeautifulSoup(markup=html, features='html.parser')


In [12]:
# retrieve all the parent div tags for each hemisphere
hemisphere_divs = soup.find_all('div', class_="item")

# create an empty list to store one {"title", "img_url"} dictionary per hemisphere
hemisphere_image_data = []

# base url that each scraped image src is appended to; it never changes,
# so it is defined once instead of on every pass of the loop
base_url = 'https://astrogeology.usgs.gov'

# loop through each div item (by position) to get hemisphere data
for hemisphere in range(len(hemisphere_divs)):

    # the hemisphere links are looked up again on every pass, then the one at the
    # current position is clicked to open that hemisphere's image detail page
    hem_link = browser.find_by_css("a.product-item h3")
    hem_link[hemisphere].click()
    time.sleep(1)

    # create a beautiful soup object with the image detail page's html
    img_detail_html = browser.html
    imagesoup = BeautifulSoup(img_detail_html, 'html.parser')

    # retrieve the src of the <img> tag with class 'wide-image' and
    # complete the full-res image url by prefixing it with the base url
    hem_url = imagesoup.find('img', class_="wide-image")['src']
    img_url = base_url + hem_url

    # retrieve the image title using the title class and save into variable
    img_title = browser.find_by_css('.title').text

    # add the key value pairs to python dictionary and append to the list
    hemisphere_image_data.append({"title": img_title,
                                  "img_url": img_url})

    # go back to the main page
    browser.back()

hemisphere_image_data

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

In [13]:
# close the browser session that was opened in the setup cell at the top of the notebook
browser.quit()