In [76]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

# Step 1 - Scraping

### NASA Mars News

In [77]:
# Parsing a plain fetch of this page with bs4 did not yield the right content,
# so a real Chrome session is driven through splinter instead.
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit('https://mars.nasa.gov/news')

# Wait (up to 10 s) for a headline element to appear before snapshotting the HTML.
# NOTE(review): the page appears to build its article list after the initial
# load - confirm. The call only returns True/False, so on timeout the cell
# proceeds exactly as it did before.
browser.is_element_present_by_css('div.content_title', wait_time=10)

# Snapshot the rendered DOM and hand it to Beautiful Soup for the cells below
html = browser.html
soup = bs(html, 'html.parser')

In [78]:
# Gather every <div class="content_title"> in the parsed page and keep the
# whitespace-trimmed text of the first match as the headline.
results = soup.find_all('div', attrs={'class': 'content_title'})
news_title = results[0].text.strip()
news_title

"Curiosity Tastes First Sample in 'Clay-Bearing Unit'"

In [79]:
# Article teasers live in <div class="rollover_description_inner"> elements
# inside <body>; keep the whitespace-trimmed text of the first one.
results = soup.body.find_all('div', attrs={'class': 'rollover_description_inner'})
news_p = results[0].text.strip()
news_p

'This new region on Mars might reveal more about the role of water on Mount Sharp.'

### JPL Mars Space Images - Featured Image

In [80]:
# Reuse the open browser session and navigate to the JPL space images page (Mars category)
browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")

In [81]:
# Parse the page currently loaded in the browser
soup = bs(browser.html, 'html.parser')

# The <a class="button fancybox"> element carries the image's site-relative
# path in its data-fancybox-href attribute.
results = soup.find('a', attrs={'class': 'button fancybox'})
tail = results['data-fancybox-href']

# Prefix the site root to turn the relative path into an absolute URL
featured_image_url = 'https://www.jpl.nasa.gov' + tail
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA17044_ip.jpg'

### Mars Weather

In [82]:
# Use the requests library to read the URL and pass the returned text to Beautiful Soup
url = 'https://twitter.com/marswxreport?lang=en'
# requests waits indefinitely by default; a timeout keeps a stalled connection
# from hanging the cell.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error status instead of parsing an error page further down
response.raise_for_status()
soup = bs(response.text, 'html.parser')

In [83]:
# Every tweet body sits in a <div class="js-tweet-text-container">; take the
# first one on the page and keep its whitespace-trimmed text.
tweets = soup.find_all('div', attrs={'class': 'js-tweet-text-container'})
mars_weather = tweets[0].text.strip()
mars_weather

'InSight sol 133 (2019-04-11) low -96.8ºC (-142.3ºF) high -15.7ºC (3.8ºF)\nwinds from the W at 4.2 m/s (9.3 mph) gusting to 11.7 m/s (26.2 mph)\npressure at 7.30 hPapic.twitter.com/kmh5FXGbBL'

### Mars Facts

In [84]:
# Download the HTML of the space-facts Mars page
url = 'https://space-facts.com/mars/'
# requests waits indefinitely by default; a timeout keeps a stalled connection
# from hanging the cell.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error status instead of handing an error page to pandas
response.raise_for_status()

In [85]:
# Parse the table with id="tablepress-mars" into pandas. The markup is wrapped
# in StringIO because passing a literal HTML string to read_html is deprecated
# in recent pandas releases; a file-like object is the supported form.
facts = pd.read_html(StringIO(response.text), attrs={'id': "tablepress-mars"})
facts_df = facts[0]

# Render the DataFrame back out of pandas as an HTML table, without header row or index
facts_table = facts_df.to_html(header=False, index=False)

### Mars Hemispheres

In [86]:
# Load the USGS search results for the enhanced Mars hemisphere products, parse
# the rendered page, and collect each result's <div class="description"> block
# (the title and link are read from these in the next cell).
browser.visit('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')
soup = bs(browser.html, 'html.parser')
hemispheres = soup.find_all('div', attrs={'class': 'description'})

In [88]:
# Loop through the search results, open each hemisphere's detail page, and
# record the href of its "Sample" link keyed by the hemisphere title.
hemisphere_image_urls = []
for item in hemispheres:
    title = item.find('h3').text
    url_tail = item.find('a')['href']
    url = 'https://astrogeology.usgs.gov' + url_tail
    browser.visit(url)
    # find_by_text returns a list of matches; the href is read from the first one
    img_url = browser.find_by_text('Sample').first['href']
    hemisphere_image_urls.append({title: img_url})

# All scraping that needs the browser is finished, so close it
browser.quit()
hemisphere_image_urls

[{'Cerberus Hemisphere Enhanced': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'Schiaparelli Hemisphere Enhanced': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'Syrtis Major Hemisphere Enhanced': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'Valles Marineris Hemisphere Enhanced': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

In [90]:
import pymongo

# Connect to the MongoDB server on localhost:27017 and select the "mars" database
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(conn)
db = client["mars"]

In [91]:
# Read one document back from the collection named "collection" and show the
# featured image URL stored in it.
data = db["collection"].find_one()

print(data["featured_image_url"])

https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA22911_ip.jpg
