# Step 1 - Scraping

Complete your initial scraping using Jupyter Notebook, BeautifulSoup, Pandas, and Requests/Splinter.

### NASA Mars News

- Collect the latest News Title and Paragraph Text

In [None]:
# Dependencies and setup
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import os
import time

In [None]:
# Start a visible (non-headless) Chrome session through splinter, pointing it at
# the driver path returned by ChromeDriverManager().install()
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser("chrome", headless=False, **executable_path)

In [None]:
# URL of page to be scraped: load the Mars news site in the splinter-controlled browser
# so the following cell can read the article title and teaser from the live page
newsURL = "https://redplanetscience.com"
browser.visit(newsURL)

In [None]:
# Give the page a chance to finish loading: poll (for up to 10 seconds) for the first
# article title instead of assuming it is already present right after visit()
if not browser.is_element_present_by_css(".content_title", wait_time=10):
    raise RuntimeError("No '.content_title' element found - the news page may not have loaded")

# Store the Title and Teaser of the first listed article (assumed to be the latest one)
newsTitle = browser.find_by_css(".content_title").first.text
paragraphText = browser.find_by_css(".article_teaser_body").first.text
print("------------------------------------------------------------------")
print(f"Article:  {newsTitle}")
print(f"Teaser:   {paragraphText}")

### JPL Mars Space Images - Featured Image

- Find the image url for the current featured Mars image

In [None]:
# URL of page to be scraped: load the featured-image site in the browser;
# the page source is captured and parsed in the next cell
imageURL = "https://spaceimages-mars.com/"
browser.visit(imageURL)

In [None]:
# Capture the source of the page currently loaded in the browser
imageHTML = browser.html

# Turn that source into a searchable BeautifulSoup tree using the built-in parser
imageSoup = bs(imageHTML, features="html.parser")

In [None]:
# Find the anchor tag element that links to the featured image.
# find() returns None when nothing matches, so fail with a clear message instead of
# an opaque "'NoneType' object is not subscriptable" error.
featuredAnchor = imageSoup.find("a", class_="showimg fancybox-thumbs")
if featuredAnchor is None or not featuredAnchor.get("href"):
    raise ValueError("Featured image link not found - the page layout may have changed or the page did not load")
featuredImage = featuredAnchor["href"]

# Combine the site URL with the link (the href is treated as relative to the site root).
# Slashes are normalised so exactly one separates the two parts, and the base comes from
# imageURL rather than a second hard-coded copy of the address.
featured_image_url = f"{imageURL.rstrip('/')}/{featuredImage.lstrip('/')}"
print(f"Featured Image URL:  {featured_image_url}")

### Mars Facts

- Use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
- Use Pandas to convert the data to an HTML table string.

In [None]:
# Use the read_html function in pandas to automatically scrape tabular data.
# read_html returns a list of DataFrames (one per table found at the URL);
# the list is displayed here so the table of interest can be picked by position.
tableURL = "https://galaxyfacts-mars.com"
marsTable = pd.read_html(tableURL)
marsTable

In [None]:
# Slice off DataFrame that we want using normal indexing:
# the first table in the scraped list is kept as the facts table
marsFacts_df = marsTable[0]
marsFacts_df

In [None]:
# Promote the first row of the scraped table to the column header.
# The work always starts from the untouched scraped table (marsTable[0]) rather than
# from marsFacts_df itself, so re-running this cell gives the same result instead of
# consuming one more data row as a header on every run.
factsRaw_df = marsTable[0]
new_header = factsRaw_df.iloc[0]

# Keep every row after the header row; copy so the relabelled frame is independent
# of the scraped original
marsFacts_df = factsRaw_df.iloc[1:].copy()
marsFacts_df.columns = new_header
marsFacts_df

In [None]:
# Export DataFrame to HTML: called without a target, to_html() returns the table
# markup as a string, which is later stored in the combined results dictionary
marsFacts_table = marsFacts_df.to_html()

### Mars Hemispheres

- Obtain high resolution images for each of Mars's hemispheres (find the image url to the full resolution image).
- Save both the image URL string for the full-resolution hemisphere image and the hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys "img_url" and "title".
- Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [None]:
# URL of page to be scraped: load the hemispheres index page; the loop further down
# clicks through from this page to each hemisphere and navigates back to it
hemispheresURL = "https://marshemispheres.com/"
browser.visit(hemispheresURL)

In [None]:
# Create empty list to be appended with dictionaries
# (the loop cell below adds one dictionary with "img_url" and "title" keys per hemisphere)
hemisphere_image_urls = []

In [None]:
# Declare variable to store scraped links: the <img> elements nested inside the
# hemisphere anchors on the index page. Only the number of matches is used afterwards,
# to decide how many hemisphere pages the loop visits.
imageLinks = browser.find_by_css("a.itemLink img")

In [None]:
# Visit every hemisphere detail page and record its image URL and title.
# The result list is rebuilt from scratch here so that re-running this cell does not
# append duplicate entries to a list left over from a previous run.
hemisphere_image_urls = []

# Same selector that produced imageLinks, so the index used below always stays
# within the range the loop was sized for
thumbnailSelector = "a.itemLink img"

# Iterate through each element by position
for link in range(len(imageLinks)):

    # The page is reloaded after every browser.back(), so the images are looked up
    # again on each pass and the one at the current position is clicked
    browser.find_by_css(thumbnailSelector)[link].click()

    # Locate the "Sample" link on the detail page (assumed to point at the full-size image)
    image = browser.links.find_by_text("Sample").first

    # Build the dictionary with the scraped image URL and scraped Title
    imageDict = {
        "img_url": image["href"],
        "title": browser.find_by_css("h2.title").text,
    }

    # Append list with scraped dictionary
    hemisphere_image_urls.append(imageDict)

    # Go back to the index page so the next image can be clicked
    browser.back()

In [None]:
# Display the collected hemisphere dictionaries to check the scrape results
hemisphere_image_urls

In [None]:
# Gather every scraped value into one dictionary keyed by result name
scrapedData = dict(
    News_Title=newsTitle,
    News_Text=paragraphText,
    Featured_Image=featured_image_url,
    Mars_Facts=marsFacts_table,
    Hemisphere_Images=hemisphere_image_urls,
)

# All scraping is finished, so shut down the automated browser session
browser.quit()

In [None]:
# Display the combined dictionary holding all scraped results
scrapedData