In [1]:
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
import os
import time
import re
import pandas as pd

## 1. Set up the URLs to be scraped

In [2]:
# Lookup table of every Mars-related page this notebook scrapes,
# keyed by a short name for the source site.
url_dict = {
    "NASA": "https://mars.nasa.gov/news/",
    "JPL": "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars",
    "Twitter": "https://twitter.com/marswxreport?lang=en",
    "space-facts": "https://space-facts.com/mars/",
    "astrogeology": "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars",
}

## 2. Scrape NASA and extract latest news _title_ and _text_

In [3]:
# Download the NASA Mars news page. requests.get() only returns once the whole
# response body has been received, so no sleep is needed before parsing; the
# timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url=url_dict["NASA"], timeout=30)

# Fail loudly on an HTTP error instead of silently parsing an error page
# into an empty news list.
response.raise_for_status()

soup = bs(response.text, "html.parser")

# Every <div class="slide"> is treated as one news item.
results = soup.find_all("div", class_="slide")

# List of {'title': ..., 'description': ...} dicts, in page order.
mars_latest_news = []

for result in results:
    try:
        news_title = result.find("div", class_="content_title").text
        news_description = result.find("div", class_="rollover_description_inner").text
    except AttributeError as e:
        # find() returned None: this slide lacks a title or a description,
        # so report it and move on to the next slide.
        print(e)
        continue

    mars_latest_news.append({
        'title': news_title.strip(),
        'description': news_description.strip(),
    })

In [4]:
# Sanity check: show each scraped headline and its description,
# with a blank line after every article.
for news in mars_latest_news:
    print(
        'Title: ' + news['title'],
        'Descirption: ' + news['description'],
        "",
        sep="\n",
    )

Title: Virginia Middle School Student Earns Honor of Naming NASA's Next Mars Rover
Descirption: NASA chose a seventh-grader from Virginia as winner of the agency's "Name the Rover" essay contest. Alexander Mather's entry for "Perseverance" was voted tops among 28,000 entries.

Title: NASA Prepares for Moon and Mars With New Addition to Its Deep Space Network
Descirption: Robotic spacecraft will be able to communicate with the dish using radio waves and lasers.

Title: NASA Administrator Statement on Moon to Mars Initiative, FY 2021 Budget
Descirption: Jim Bridenstine addresses NASA's ambitious plans for the coming years, including Mars Sample Return.

Title: NASA's Mars 2020 Rover Closer to Getting Its Name
Descirption: 155 students from across the U.S. have been chosen as semifinalists in NASA's essay contest to name the Mars 2020 rover, and see it launch from Cape Canaveral this July.

Title: NASA Invites Students to Name Mars 2020 Rover
Descirption: Through Nov. 1, K-12 students in 

## 3. Scrape JPL and extract latest Mars featured image (using Splinter)

In [6]:
# Open a Chrome session through Splinter to load the JPL page.
browser = Browser('chrome', headless=False)

try:
    browser.visit(url_dict['JPL'])

    # Wait for 20 sec while the page loads
    time.sleep(20)

    # Parse the page as currently rendered in the browser
    html = browser.html
    soup = bs(html, 'html.parser')

    # The first <li class="slide"> is used; its link carries the relative
    # path of the image in the 'data-fancybox-href' attribute.
    featured_slide = soup.find("li", class_="slide")
    if featured_slide is None or featured_slide.a is None:
        raise ValueError("Featured image slide not found on the JPL page")
    featured_image = featured_slide.a['data-fancybox-href']
finally:
    # Always close the browser, even when the scrape fails, so no Chrome
    # process is left running.
    browser.quit()

# Build the full image URL: site root (everything before '/spaceimages')
# followed by the relative image path.
featured_image_url = url_dict['JPL'].split('/spaceimages')[0] + featured_image

## 4. Scrape Twitter to extract Mars Weather

In [7]:
# Download the Mars weather Twitter page. requests.get() blocks until the
# whole response has been received, so no sleep is needed before parsing;
# the timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url=url_dict["Twitter"], timeout=30)

# Parse whatever body came back; the HTTP status is not checked here.
soup = bs(response.text, "html.parser")

In [8]:
# Locate weather reports with a regular expression
# (see https://docs.python.org/3/library/re.html): collect every text node
# on the page that contains 'InSight sol'.
weather_reports = soup(text=re.compile(r'InSight sol'))

if weather_reports:
    # Use the first match, assumed to be the most recent report.
    mars_weather = str(weather_reports[0])

    # Drop the first 8 characters (the length of "InSight "), capitalise the
    # first letter (this lower-cases the rest) and put the report on one line.
    mars_weather = mars_weather[8:].capitalize().replace("\n", "; ")
else:
    # No matching text on the page: fall back to a placeholder rather than
    # hiding unrelated errors behind a bare except.
    mars_weather = "N/A"
    print("Weather Report not available.")

# Sanity-check
print(mars_weather)

Sol 457 (2020-03-10) low -95.7ºc (-140.3ºf) high -9.1ºc (15.6ºf); winds from the sse at 6.5 m/s (14.5 mph) gusting to 21.0 m/s (46.9 mph); pressure at 6.30 hpa


## 5. Scrape _space-facts.com_ to obtain Mars parameters

In [9]:
# pd.read_html() downloads and parses the page before it returns, so no
# sleep is needed afterwards. It returns a list with one DataFrame per
# HTML table found on the page.
tables = pd.read_html(url_dict["space-facts"])

# Keep the first table on the page.
mars_table = tables[0]

In [10]:
# Label the two positional columns and index the table by parameter name.
# Re-running this cell hits a KeyError in set_index(), because 'Parameter'
# is already the index by then; only that case is caught.
try:
    mars_table = mars_table.rename(columns={0: 'Parameter', 1: "Value"})
    mars_table = mars_table.set_index('Parameter')
except KeyError:
    print('Columns already renamed')

# Call to_html() to get the HTML string; without the parentheses the
# variable would hold the bound method, not the markup.
mars_table_html = mars_table.to_html()

## 6. Scrape _astrogeology.usgs.gov_ for hemisphere images

In [148]:
# Open a Chrome session through Splinter for the astrogeology search page.
browser = Browser('chrome', headless=False)

# List of {"title": ..., "img_url": ...} dicts, one per search result.
mars_hemisphere_image_urls = []

try:
    browser.visit(url_dict["astrogeology"])

    # Give the search results time to render
    time.sleep(10)

    html = browser.html
    soup = bs(html, 'html.parser')

    # Every <div class="item"> is treated as one search result.
    items = soup.find_all("div", class_="item")

    # Site root (everything before '/search'), used to resolve the relative
    # links of the results; computed once instead of on every iteration.
    base_url = url_dict['astrogeology'].split('/search')[0]

    for item in items:
        full_url = base_url + item.a['href']

        # Open the result's detail page and parse it under its own names, so
        # the search-page `html`/`soup` are not overwritten mid-loop.
        browser.visit(full_url)
        detail_soup = bs(browser.html, 'html.parser')

        downloads = detail_soup.find("div", class_="downloads")
        if downloads is None:
            raise ValueError("No downloads section found at " + full_url)

        mars_hemisphere_image_urls.append({
            # Title is the heading text up to (not including) ' Enhanced'
            "title": item.h3.text.split(' Enhanced')[0],
            # First link in the downloads list
            "img_url": downloads.li.a['href'],
        })
finally:
    # Always close the browser, even when a page fails to parse, so no
    # Chrome process is left running.
    browser.quit()

In [149]:
# Sanity check: display the collected hemisphere titles and image URLs
mars_hemisphere_image_urls

[{'title': 'Cerberus Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]