# Step 1 - Scraping

Complete your initial scraping using Jupyter Notebook, BeautifulSoup, Pandas, and Requests/Splinter.

### NASA Mars News

- Collect the latest News Title and Paragraph Text

In [None]:
# Dependencies and setup
import os
from urllib.parse import urljoin

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Start a visible (non-headless) Chrome session through splinter, handing it
# the driver path returned by ChromeDriverManager().install()
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser("chrome", headless=False, **executable_path)

In [None]:
# Open a PyMongo client against the MongoDB server at localhost:27017
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(host=conn)

In [None]:
# Select the "mars_db" database and its "headlines" collection
# (bracket access is interchangeable with attribute access in PyMongo)
db = client["mars_db"]
collection = db["headlines"]

In [None]:
# URL of the Mars news page to be scraped; the splinter browser is navigated
# there so that later cells can read the page source from `browser.html`
newsURL = "https://redplanetscience.com"
browser.visit(newsURL)

In [None]:
# Scrape every news entry from the homepage currently loaded in the browser,
# print each title/teaser pair, and store it as a MongoDB document.

# The article list may not be in the DOM immediately after `visit` returns
# (NOTE(review): presumably populated by page scripts - confirm), so wait up
# to a few seconds for the first entry before snapshotting the page source.
browser.is_element_present_by_css("div.list_text", wait_time = 5)

# HTML object
html = browser.html

# Parse HTML with BeautifulSoup
soup = bs(html, "html.parser")

# Retrieve all elements that contain News Titles and Paragraph Text
articles = soup.find_all("div", class_ = "list_text")

if not articles:
    print("No news articles were found on the page.")

# Iterate through each headline
for article in articles:

    # Use BeautifulSoup's find() method to navigate and retrieve attributes
    titleTag = article.find("div", class_ = "content_title")
    teaserTag = article.find("div", class_ = "article_teaser_body")

    # Skip an incomplete entry instead of failing on `None.text`
    if titleTag is None or teaserTag is None:
        continue

    newsTitle = titleTag.text
    paragraphText = teaserTag.text
    print("------------------------------------------")
    print(f"Article:  {newsTitle}")
    print(f"Teaser:  {paragraphText}")

    # Dictionary to be inserted as a MongoDB document
    post = {
        "Title": newsTitle,
        "Paragraph Content": paragraphText,
    }

    collection.insert_one(post)

### JPL Mars Space Images - Featured Image

- Find the image url for the current featured Mars image

In [None]:
# URL of the space-images page to be scraped; the splinter browser is
# navigated there so the next cell can read the page source
imageURL = "https://spaceimages-mars.com/"
browser.visit(imageURL)

In [None]:
# Snapshot the page source currently held by the browser
imageHTML = browser.html

# Build a BeautifulSoup tree from that snapshot with the "html.parser" backend
imageSoup = bs(markup=imageHTML, features="html.parser")

In [None]:
# Locate the anchor element that links to the featured image
featuredAnchor = imageSoup.find("a", class_ = "showimg fancybox-thumbs")

# Fail with a clear message instead of an opaque "NoneType is not
# subscriptable" error when the anchor (or its href) is missing
if featuredAnchor is None or not featuredAnchor.get("href"):
    raise ValueError(f"Featured image link not found on {imageURL}")

# Find href within anchor tag element
featuredImage = featuredAnchor["href"]

# Resolve the href against the page URL that was actually visited; urljoin
# copes with relative, root-relative, and absolute hrefs alike, and avoids
# repeating the site root as a second hard-coded literal
featured_image_url = urljoin(imageURL, featuredImage)
print(f"Featured Image URL:  {featured_image_url}")

### Mars Facts

- Use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
- Use Pandas to convert the data to an HTML table string.

In [None]:
# Have pandas fetch the facts page and parse the HTML tables it contains
tableURL = "https://galaxyfacts-mars.com"
tables = pd.read_html(io=tableURL)

In [None]:
# Slice off the DataFrame that we want using normal indexing
marsFacts_df = tables[0]

# Convert the facts table to an HTML table string, as this section calls for
marsFacts_html = marsFacts_df.to_html()

# Show the DataFrame as the cell output
marsFacts_df

### Mars Hemispheres

- Obtain high resolution images for each of Mars's hemispheres (find the image url to the full resolution image).
- Save both the image url string for the full resolution hemisphere image, and the hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys "img_url" and "title".
- Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [None]:
# URL of the hemispheres index page to be scraped; it is also reused later as
# the prefix for the detail-page links and image sources found on the site
hemispheresURL = "https://marshemispheres.com/"
browser.visit(hemispheresURL)

In [None]:
# Snapshot the page source currently held by the browser
hemispheresHTML = browser.html

# Build a BeautifulSoup tree from that snapshot with the "html.parser" backend
hemispheresSoup = bs(markup=hemispheresHTML, features="html.parser")

# Gather every <div class="description"> element; each one is later searched
# for a hemisphere title and the link to its detail page
links = hemispheresSoup.find_all("div", attrs={"class": "description"})

# Accumulator for the per-hemisphere {"title", "img_url"} dictionaries
hemisphere_image_urls = list()

In [None]:
# Visit each hemisphere's detail page and record its title and image URL
for link in links:

    # Title text and detail-page href come from the description element
    imageTitle = link.find("h3").text
    imageLink = link.find("a", class_ = "itemLink product-item")["href"]

    # Load the detail page and parse its source
    browser.visit(hemispheresURL + imageLink)
    linkHTML = browser.html
    linkSoup = bs(linkHTML, "html.parser")

    # The full resolution image is the <img class="wide-image"> element;
    # its src is appended to the site root
    wideImage = linkSoup.find("img", class_ = "wide-image")
    fullURL = hemispheresURL + wideImage["src"]

    # Store one record per hemisphere
    hemisphere_image_urls.append(dict(title=imageTitle, img_url=fullURL))

    # Report what was just collected
    print(
        "------------------------------------------",
        f"Title:  {imageTitle}",
        f"Link:   {fullURL}",
        sep="\n",
    )

# Closing message once every hemisphere has been processed
print(" ", "Scraping Complete.", sep="\n")

In [None]:
# Close the splinter-controlled browser now that all scraping cells have run;
# cells that use `browser` cannot be re-run after this without re-creating it
browser.quit()