In [1]:
# Dependencies
import shutil
from urllib.parse import urljoin

import lxml
import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from splinter import Browser

# Step 1 - Scraping

In [2]:
# Open a PyMongo client against the MongoDB server on localhost, port 27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [3]:
# Select the 'mars_db' database and its 'articles' collection
db = client['mars_db']
collection = db['articles']

# 1.a. NASA Mars News

In [4]:
# URL of page to be scraped. The query string asks for 40 items per page
# ordered by publish date, descending. Adjacent string literals are joined
# at compile time, which avoids backslash continuations inside the literal.
url = (
    'https://mars.nasa.gov/news/?page=0&per_page='
    '40&order=publish_date+desc%2Ccreated_at+desc&search='
    '&category=19%2C165%2C184%2C204&blank_scope=Latest'
)

# Retrieve page with the requests module. The timeout keeps the cell from
# hanging indefinitely on a stalled connection, and raise_for_status()
# surfaces an HTTP error here instead of as an empty parse further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [5]:
# Parse the response body with the lxml parser
soup = BeautifulSoup(response.text, features='lxml')

# Collect the parent <div class="slide"> element of every article
results = soup.find_all('div', attrs={'class': 'slide'})

In [6]:
# Keep only the first article on the page. The request URL asks for
# publish_date descending, so results[0] is the latest story; looping over
# every result would leave these variables holding the last (oldest) one.
if not results:
    # Fail here with a clear message rather than with a NameError later on.
    raise ValueError("No 'slide' divs found in the NASA news page")

latest = results[0]
news_title = latest.find('div', class_='content_title').a.text
news_p = latest.find('div', class_='rollover_description_inner').text

# 1.b. JPL Mars Space Images - Featured Image

In [7]:
# Locate chromedriver on PATH so the notebook is not tied to one machine's
# versioned install directory; fall back to the original location otherwise.
chromedriver_path = (
    shutil.which('chromedriver')
    or '/usr/local/Cellar/chromedriver/2.37/bin/chromedriver'
)
executable_path = {'executable_path': chromedriver_path}

# Start a Chrome session through splinter (headless mode is switched off).
browser = Browser('chrome', **executable_path, headless=False)

# Open the JPL space-images listing filtered to the Mars category.
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [8]:
# Parse the page currently loaded in the browser
html = browser.html
soup = BeautifulSoup(html, features='html.parser')

# Take the first <img class="thumb"> and prefix its src with the site host.
# NOTE(review): the recorded output is a 640x350 wallpaper rendition -
# confirm that a thumbnail is what is wanted rather than the full-size image.
image_url = soup.find('img', attrs={'class': 'thumb'})
featured_url = 'https://www.jpl.nasa.gov{}'.format(image_url['src'])
print(featured_url)

https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA22374-640x350.jpg


# 1.c. Mars Weather

In [9]:
# Point the browser at the Mars weather report Twitter account
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)

# Parse the loaded page so the most recent weather tweet can be extracted
html = browser.html
soup = BeautifulSoup(html, features='html.parser')

In [10]:
# Store the text of the first tweet paragraph on the page as mars_weather
tweet_classes = 'TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
mars_weather = soup.find('p', attrs={'class': tweet_classes}).get_text()

# 1.d. Mars Facts

In [11]:
# reset url variable to Mars facts webpage (its table is read with pandas)
url = 'https://space-facts.com/mars/'

In [12]:
# Read every HTML table found at the facts URL into a list of DataFrames,
# then display the list
tables = pd.read_html(io=url)
tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.42 x 10^23 kg (10.7% Earth)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.52 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                  -153 to 20 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers]

In [13]:
# Keep the first table from the page as df
df = tables[0]

# Render that same table as an HTML string
html_table = tables[0].to_html()

# 1.e. Mars Hemispheres

In [14]:
# Point the browser at the USGS Astrogeology search results for
# enhanced Mars hemisphere images
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

# Parse the results page so the individual hemisphere links can be found
html = browser.html
soup = BeautifulSoup(html, features='html.parser')

In [15]:
# List of {"title": ..., "img_url": ...} dicts, one per search result
hemisphere_image_urls = []

results = soup.find_all('div', class_='item')

# For each search result: follow its link, read the enhanced image's
# location from the detail page, record it, then go back to the results page.
for result in results:
    link = result.find('h3').text
    title = link.replace(' Enhanced', '')
    browser.click_link_by_partial_text(link)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    result2 = soup.find('img', class_='wide-image')
    # urljoin resolves src against the site root, so a src that already
    # starts with '/' does not produce a doubled slash after the host.
    url = urljoin('https://astrogeology.usgs.gov/', result2["src"])
    hemisphere_image_urls.append({"title": title, "img_url": url})
    browser.back()

In [25]:
# Show the title recorded for the first hemisphere entry.
# NOTE(review): the saved output below is an image URL rather than a title,
# and the execution count jumps from 15 to 25 - this cell looks stale;
# re-run it from a fresh kernel before relying on the displayed value.
hemisphere_image_urls[0]['title']

'https://astrogeology.usgs.gov//cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'