### Import Dependencies

In [1]:
# Import Splinter, BeautifulSoup, the Chrome driver manager, and pandas
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [2]:
# Start a Splinter-driven Chrome session in a visible (non-headless) window.
# The driver location is whatever ChromeDriverManager().install() returns.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [/Users/maggie/.wdm/drivers/chromedriver/mac64/91.0.4472.101/chromedriver] found in cache


### Visit the NASA Mars News Site

In [3]:
# Open the Mars news site in the Splinter browser
url = 'https://redplanetscience.com'
browser.visit(url)

# Optional delay for loading the page: check (with wait_time=1) for an element
# matching the CSS selector 'div.list_text', i.e. <div class="list_text">.
# Selectors combine a tag and an attribute, e.g. 'ul.item_list' would match
# the HTML <ul class="item_list">. The boolean result is the cell output.
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

In [4]:
# Take the browser's current HTML and parse it into a soup object
html = browser.html
news_soup = soup(html, 'html.parser')
# Parent element for the scraped data ("slide_elem"): select_one returns only
# the first element matching the CSS selector 'div.list_text'
slide_elem = news_soup.select_one('div.list_text')

In [5]:
# Within the parent element, locate the <div class="content_title"> tag that
# holds the first article's title
slide_elem.find('div', class_='content_title')

<div class="content_title">AI Is Helping Scientists Discover Fresh Craters on Mars</div>

In [6]:
# From the parent slide_elem, take the text of the 'content_title' div and
# save it as `news_title`
news_title = slide_elem.find('div', class_='content_title').get_text()
news_title

'AI Is Helping Scientists Discover Fresh Craters on Mars'

In [7]:
# From the parent slide_elem, take the text of the 'article_teaser_body' div
# (the article's teaser text) and save it as `news_p`
news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
news_p

"It's the first time machine learning has been used to find previously unknown craters on the Red Planet."

### JPL Space Images Featured Image

In [8]:
# Visit the space images site.
# Note: this rebinds `url`, which previously held the news site address.
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [9]:
# Find and click the full image button: index [1] picks the second <button>
# element found on the page.
# NOTE(review): this relies on the order of buttons in the page markup.
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

In [10]:
# Parse the resulting html with soup (the browser's HTML as it stands after
# the full image button was clicked)
html = browser.html
img_soup = soup(html, 'html.parser')

In [11]:
# Find the relative image url: the `src` attribute of the first
# <img class="fancybox-image"> tag in the parsed page
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

'image/featured/mars3.jpg'

In [12]:
# Prefix the relative path found above with the site's base url to create an
# absolute url
img_url = f'https://spaceimages-mars.com/{img_url_rel}'
img_url

'https://spaceimages-mars.com/image/featured/mars3.jpg'

### Mars Facts

In [13]:
# Scrape the facts table.
# pd.read_html searches the page for HTML tables and returns a list of
# DataFrames; [0] keeps only the first one.
df = pd.read_html('https://galaxyfacts-mars.com')[0]
# Assign column names to the DataFrame for clarity
df.columns = ['Description', 'Mars', 'Earth']
# Turn the Description column into the index. The result is rebound to `df`
# instead of using inplace=True, so the statement is an ordinary expression.
df = df.set_index('Description')
df.head()

Unnamed: 0_level_0,Mars,Earth
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"


In [14]:
# Convert our DF back to HTML-ready code; to_html() returns the table markup
# as a string, shown as the cell output
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

# D1: Scrape High-Resolution Mars’ Hemisphere Images and Titles

### Hemispheres

In [15]:
# 1. Use browser to visit the URL
url = 'https://marshemispheres.com/'
# visit() is called only to navigate the browser; its return value is not
# needed, so it is not bound to a name.
browser.visit(url)

In [16]:
# 2. Create a list to hold the images and titles.
# It starts empty; dictionaries with 'img_url' and 'title' keys are appended
# to it in later cells.
hemisphere_image_urls = []
hemisphere_image_urls

[]

In [17]:
# Parse the html of the page currently loaded in the browser
html_hemisphere = browser.html
img_title_soup = soup(html_hemisphere, 'html.parser')

In [18]:
# Collect every <div class="item"> element from the parsed page
items = img_title_soup.find_all('div', class_='item')

In [43]:
# Take the href of the first <a class="itemLink product-item"> link in the
# parsed page and open it, prefixed with the site's base url
image_page = img_title_soup.find('a', class_='itemLink product-item')['href']
browser.visit(f'https://marshemispheres.com/{image_page}')

In [20]:
# Parse the image page html (the page the browser was just sent to)
image_html = browser.html
img_page_soup = soup(image_html, 'html.parser')

In [21]:
# Get the url from the sample link: the href of the first <a> inside the
# first <li> of the first <ul> on the image page
image = img_page_soup.find('ul').li.a['href']
image

'images/full.jpg'

In [22]:
# Build the full url by prefixing the relative path with the site's base url.
# Note: this rebinds `img_url`, which earlier held the featured image url.
img_url = f'https://marshemispheres.com/{image}'
img_url

'https://marshemispheres.com/images/full.jpg'

In [23]:
# Get the title: the text of the first <h2> element on the image page
title = img_page_soup.find('h2').text
title

'Cerberus Hemisphere Enhanced'

In [24]:
# Bundle the scraped image url and title into a single dictionary
data_dict = {'img_url': img_url, 'title': title}
data_dict

{'img_url': 'https://marshemispheres.com/images/full.jpg',
 'title': 'Cerberus Hemisphere Enhanced'}

In [25]:
# Add the dictionary to the results list and display the list.
# Note: append mutates the list in place, so re-running this cell adds the
# same dictionary again.
hemisphere_image_urls.append(data_dict)
hemisphere_image_urls

[{'img_url': 'https://marshemispheres.com/images/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'}]

In [None]:
# List every <a class="itemLink product-item"> link in the parsed page
img_title_soup.find_all('a',class_='itemLink product-item')

In [26]:
# Send the browser back to the hemispheres main page (`url` still holds its
# address) so the next cell parses that page. visit() is used only for
# navigation, so its return value is not bound to a name.
browser.visit(url)

In [46]:
# 2. Create a list to hold the images and titles.
hemisphere_image_urls = []

# 3. Write code to retrieve the image urls and titles for each hemisphere.

# Parse the html of the page currently loaded in the browser
html_hemisphere = browser.html
img_title_soup = soup(html_hemisphere, 'html.parser')

# Each <div class="item"> contains the link to one image page
items = img_title_soup.find_all('div', class_='item')

for i, item in enumerate(items):
    # Open image page
    image_page = item.find('a', class_='itemLink product-item')['href']
    browser.visit(f'https://marshemispheres.com/{image_page}')
    # Parse the image page html
    image_html = browser.html
    img_page_soup = soup(image_html, 'html.parser')
    # Get the url from the sample link: the href of the first <a> inside the
    # first <li> of the first <ul> on the image page
    image = img_page_soup.find('ul').li.a['href']
    # Build the full url
    img_url = f'https://marshemispheres.com/{image}'
    # Get the title
    title = img_page_soup.find('h2').text
    # Progress trace: item index and the resolved image url
    print(i, img_url)
    # Add to dictionary
    data_dict = {
        'img_url': img_url,
        'title': title
    }
    # Append to list
    hemisphere_image_urls.append(data_dict)
    # Return to main page
    browser.back()

0 https://marshemispheres.com/images/full.jpg
1 https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg
2 https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg
3 https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg


In [48]:
# 4. Display the list that holds one dictionary (image url and title) per
# hemisphere; as the cell's last expression it is shown as the output.
hemisphere_image_urls

[{'img_url': 'https://marshemispheres.com/images/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

In [49]:
# 5. Quit the browser, ending the Splinter session created in the setup cell
browser.quit()