## Step 1 - Scraping

#### Dependencies

In [1]:
import numpy as np                                                           # numpy library

In [2]:
import pandas as pd                                                          # pandas library

In [3]:
from splinter import Browser                                                 # browser module from splinter library

In [4]:
from selenium import webdriver                                               # webdriver module from selenium library

In [5]:
from bs4 import BeautifulSoup as bs                                          # BeautifulSoup module from bs4 library

In [6]:
import requests as req                                                       # requests library

#### NASA Mars News - Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.

In [7]:
# Containers filled in by the NASA Mars News cells that follow.
paragraph_text = list()                                                      # cleaned text of each article paragraph
news_data = dict()                                                           # scraped title and paragraph text, keyed by name

In [8]:
base_url = "https://mars.nasa.gov/"                                          # site root, used later to resolve the article link
nasa_url = "https://mars.nasa.gov/news/"                                     # news listing page scraped first

# A timeout keeps this cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page.
response_1 = req.get(nasa_url, timeout=30)
response_1.raise_for_status()

nasa_soup = bs(response_1.text, 'html.parser')                               # parsed HTML of the news listing

In [9]:
# First element on the listing page carrying the "slide" class.
soup_div = nasa_soup.find(class_="slide")

# Every anchor inside that slide; the title is read from the second one.
soup_news = soup_div.find_all("a")
title_anchor = soup_news[1]
news_title = title_anchor.get_text().strip()

In [10]:
from urllib.parse import urljoin                                             # stdlib; imported here so the cell is self-contained

# Anchors in the slide that actually carry an href; the first one is used as
# the link to the full article.
soup_p = soup_div.find_all('a', href=True)
soup_p_url = soup_p[0]['href']

# urljoin resolves the href against the site root. Plain concatenation would
# produce "https://mars.nasa.gov//..." if the href starts with "/", and would
# build a broken URL if the href were already absolute.
paragraph_url = urljoin(base_url, soup_p_url)

# Same timeout / status handling as any other request: fail fast and loudly.
response_2 = req.get(paragraph_url, timeout=30)
response_2.raise_for_status()

para_soup = bs(response_2.text, "html.parser")                               # parsed HTML of the article page
ww_paragraphs = para_soup.find(class_='wysiwyg_content')                     # first element with the article-body class
paragraphs = ww_paragraphs.find_all('p')                                     # all <p> elements inside it

In [11]:
# Rebuild the list in a single step instead of appending to the list created
# in an earlier cell: re-running this cell now gives the same result rather
# than duplicating every paragraph.
paragraph_text = [paragraph.get_text().strip() for paragraph in paragraphs]

In [12]:
news_data.update(news_title=news_title)                                      # stores the scraped headline under "news_title"

In [13]:
# Raises IndexError if no paragraph text was collected.
news_data["paragraph_text_1"] = paragraph_text[0]                            # first scraped paragraph, kept as the summary

In [14]:
# Raises IndexError if fewer than two paragraphs were collected.
news_data["paragraph_text_2"] = paragraph_text[1]                            # second scraped paragraph, kept as the detail

In [15]:
news_data                                                                    # cell output: the assembled title/paragraph dictionary

{'news_title': 'A Piece of Mars is Going Home',
 'paragraph_text_1': 'A chunk of Mars will soon be returning home.',
 'paragraph_text_2': "A piece of a meteorite called Sayh al Uhaymir 008 (SaU008) will be carried on board NASA's Mars 2020 rover mission, now being built at the agency's Jet Propulsion Laboratory in Pasadena, California. This chunk will serve as target practice for a high-precision laser on the rover's arm."}

#### JPL Mars Space Images - Visit the url for JPL's Featured Space Image. Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called featured_image_url.

##### Developer note: One of the gotchas I ran into with this assignment is that the URL provided does NOT consistently provide images of Mars. In response, I resorted to a brute-force approach, in light of assignment time constraints.

In [16]:
# Endpoints used for the JPL featured-image lookup.
jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"      # search page filtered to the Mars category
jpl_fullsize_url = "https://photojournal.jpl.nasa.gov/jpeg/"                 # prefix for full-size image URLs

# Chrome session driven through splinter, opened with a visible window.
browser = Browser("chrome", headless=False)

In [17]:
# Load the search page in the automated browser, then parse the HTML the
# browser currently holds.
browser.visit(jpl_url)
jpl_html = browser.html
jpl_soup = bs(jpl_html, features="html.parser")

In [18]:
# Collect the src of the <img> inside every <div class="img"> on the page.
# Containers without an <img>, or whose <img> has no src, are skipped: the
# original loop raised AttributeError on the former and stored None for the
# latter, which would break the string handling in the next cell.
image_tags = (container.find('img') for container in jpl_soup.find_all('div', class_="img"))

featured_image_list = [
    tag.get('src')
    for tag in image_tags
    if tag is not None and tag.get('src')
]

In [19]:
# First image src found on the search page.
feature_image = featured_image_list[0]

# Keep what precedes the first '-' (drops the size suffix), then keep what
# follows the last '/' (drops the directory part) to get the bare image id.
image_id = feature_image.partition('-')[0].rpartition('/')[2]

# Full-size image URL: fixed prefix + image id + extension.
featured_image_url = jpl_fullsize_url + image_id + '.jpg'

In [20]:
featured_image_url                                                           # cell output: the constructed full-size image URL

'https://photojournal.jpl.nasa.gov/jpeg/PIA22273.jpg'

In [21]:
browser.quit()                                                               # shuts down the browser session opened for the JPL scrape

#### Mars Weather - Visit the Mars Weather twitter account and scrape the latest Mars weather tweet from the page. Save the tweet text for the weather report as a variable called mars_weather

In [22]:
# Mars Weather account page (English-language view).
tweet_url = "https://twitter.com/marswxreport?lang=en"

# Fresh visible Chrome session for this section, pointed at the account page.
browser = Browser("chrome", headless=False)
browser.visit(tweet_url)

In [23]:
# HTML currently held by the automated browser, parsed for searching.
tweet_html = browser.html
tweet_soup = bs(tweet_html, features="html.parser")

In [24]:
# <p> elements carrying the tweet-text classes.
tweet_paragraphs = tweet_soup.find_all(
    'p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
)

# Whitespace-stripped text of each tweet, in page order.
weather_info_list = [tweet.text.strip() for tweet in tweet_paragraphs]

In [25]:
# Pick the first tweet in page order whose text starts with 'Sol' (the weather
# reports; presumably the page lists newest first -- confirm). This is the same
# tweet the original reversed loop ended on, but the name is now always bound:
# it is None when no tweet matches, instead of being left undefined (NameError
# in the next cell) or holding a stale value from an earlier run.
mars_weather = next(
    (tweet for tweet in weather_info_list if tweet.startswith('Sol')),
    None,
)

In [26]:
mars_weather                                                                 # cell output: the selected weather tweet text

'Sol 1962 (Feb 12, 2018), Sunny, high -14C/6F, low -78C/-108F, pressure at 7.38 hPa, daylight 05:40-17:27'

In [27]:
browser.quit()                                                               # shuts down the browser session opened for the Twitter scrape

#### Mars Facts - Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.

In [28]:
facts_url = 'https://space-facts.com/mars/'                                  # page whose tables are read with pandas below

In [29]:
fact_list = pd.read_html(facts_url)                                          # reads the HTML tables at the URL; the result is indexed like a list below

In [30]:
facts_df = fact_list[0]                                                      # selects the first table returned (no conversion takes place)

In [31]:
# Render the facts as an HTML <table> string, without the header row and
# without the index column, then print the raw markup.
facts_table = facts_df.to_html(index=False, header=False)
print(facts_table)

<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>Equatorial Diameter:</td>
      <td>6,792 km</td>
    </tr>
    <tr>
      <td>Polar Diameter:</td>
      <td>6,752 km</td>
    </tr>
    <tr>
      <td>Mass:</td>
      <td>6.42 x 10^23 kg (10.7% Earth)</td>
    </tr>
    <tr>
      <td>Moons:</td>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <td>Orbit Distance:</td>
      <td>227,943,824 km (1.52 AU)</td>
    </tr>
    <tr>
      <td>Orbit Period:</td>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <td>Surface Temperature:</td>
      <td>-153 to 20 °C</td>
    </tr>
    <tr>
      <td>First Record:</td>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <td>Recorded By:</td>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>


#### Mars Hemispheres - Visit the USGS Astrogeology site to obtain high resolution images for each of Mars' hemispheres.

In [32]:
# USGS Astrogeology search for the enhanced Mars hemisphere products.
usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

# Fresh visible Chrome session for this section, pointed at the search results.
browser = Browser("chrome", headless=False)
browser.visit(usgs_url)

In [33]:
# HTML of the search-results page as currently held by the browser, parsed
# for searching.
usgs_html = browser.html
usgs_soup = bs(usgs_html, features="html.parser")

In [34]:
hemisphere_image_urls = []                                                   # one {'title', 'img_url'} dict per hemisphere

# Result entries on the search page. They are collected up front, so the loop
# can navigate away from the page and come back without re-parsing it.
products = usgs_soup.find('div', class_='result-list')
hemispheres = products.find_all('div', class_='item')

for hemisphere in hemispheres:
    title = hemisphere.find('div', class_='description')

    # Link text of the entry with the ' Enhanced' suffix removed; the shortened
    # text is both the stored title and the partial text used to find the link.
    title_text = title.a.text.replace(' Enhanced', '')
    browser.click_link_by_partial_text(title_text)                           # opens the hemisphere's detail page

    # Parse the detail page into its own names. Reusing usgs_html / usgs_soup
    # here would overwrite the search-results soup, so re-running this cell on
    # its own would look for 'result-list' in a detail page and likely fail.
    detail_html = browser.html
    detail_soup = bs(detail_html, 'html.parser')

    # First <li> of the first <ul> inside the 'downloads' div holds the image link.
    image = detail_soup.find('div', class_='downloads').find('ul').find('li')
    img_url = image.a['href']

    hemisphere_image_urls.append({'title': title_text, 'img_url': img_url})

    browser.click_link_by_partial_text('Back')                               # returns to the results page for the next entry

In [35]:
hemisphere_image_urls                                                        # cell output: the collected title / image-URL dictionaries

[{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
  'title': 'Cerberus Hemisphere'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
  'title': 'Schiaparelli Hemisphere'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
  'title': 'Syrtis Major Hemisphere'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
  'title': 'Valles Marineris Hemisphere'}]

In [36]:
browser.quit()                                                               # shuts down the browser session opened for the USGS scrape