## Matt Byrd - Scraping Homework

In [13]:
# Dependencies
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import os

In [14]:
# Read the saved NASA Mars news page from disk into a single string.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode (or fail on) non-ASCII
# characters in the page.
# NOTE(review): assumes the saved page is UTF-8 - confirm against the file.
file = os.path.join('NewsNASAMarsExplorationProgram', 'News_NASA_Mars_Exploration_Program.html')
with open(file, encoding='utf-8') as f:
    html = f.read()

In [15]:
# Parse the raw HTML text into a BeautifulSoup tree using the lxml parser
soup = bs(markup=html, features='lxml')

In [16]:
# Pull the headline and the article teaser out of the static HTML page.
# The headline is the text of the link inside the first 'content_title' div;
# the teaser is the text of the first 'article_teaser_body' div with its
# newlines removed.
title = soup.find('div', class_='content_title')
news_title = title.a.get_text()
news_text = (
    soup.find('div', class_='article_teaser_body')
    .get_text()
    .replace('\n', '')
)

In [17]:
# Fetch the JPL space-images page (Mars category) and parse it into its own
# soup object, separate from the news-page soup.
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
# The timeout keeps the notebook from hanging indefinitely on an unresponsive
# server; raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
image_soup = bs(response.text, 'lxml')

In [18]:
# JPL site root; the scraped image path is appended to this to form the final URL
base_image_url = "https://www.jpl.nasa.gov"

In [7]:
# Read the image path from the 'button fancybox' link's data-fancybox-href
# attribute, then prefix it with the site root to get the full image URL
stem_image_url = image_soup.find(
    'a',
    class_='button fancybox',
)['data-fancybox-href']
featured_image_url = ''.join((base_image_url, stem_image_url))

In [8]:
# Fetch the Mars weather report Twitter page and parse it for weather scraping.
tweets_url = 'https://twitter.com/marswxreport?lang=en'
# Bound the wait with a timeout and fail immediately on an HTTP error rather
# than parsing whatever error page came back.
tweets_response = requests.get(tweets_url, timeout=30)
tweets_response.raise_for_status()
tweet_soup = bs(tweets_response.text, 'lxml')

In [9]:
# Keep the text of the first <p> matching class 'TweetTextSize' as the weather report
mars_weather = tweet_soup.find('p', class_='TweetTextSize').get_text()

In [10]:
# Let pandas parse the HTML tables found on the Mars facts page
facts_url = "https://space-facts.com/mars/"
tables = pd.read_html(io=facts_url)

In [11]:
# Turn the first scraped table into a two-column facts table indexed by
# 'description', and render it to an HTML string.
# Work on a copy: renaming columns and setting the index directly on
# tables[0] would mutate the scraped table, so re-running this cell would fail
# when assigning two column names to a frame that has only one column left.
df = tables[0].copy()
df.columns = ['description', 'value']
df = df.set_index('description')
facts_table = df.to_html()

In [12]:
# Preview the first five rows of the facts table
df.head(n=5)

Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"


In [None]:
# USGS astrogeology site root used to build absolute links, and the (initially
# empty) list that collects one record per Mars hemisphere image
hem_base_url = "https://astrogeology.usgs.gov"
hemisphere_image_urls = []

In [None]:
# Fetch the USGS search-results page for the enhanced Mars hemisphere images
# and collect every <div class="item"> result block from it.
hems_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
# Bound the wait with a timeout and stop on an HTTP error instead of parsing
# an error page (which would just yield an empty result list).
hems_response = requests.get(hems_url, timeout=30)
hems_response.raise_for_status()
hems_soup = bs(hems_response.text, 'lxml')
hspheres = hems_soup.find_all('div', class_ = 'item')

In [None]:
# Follow each result's link with BeautifulSoup to reach its detail page. This
# costs one extra HTTP request per hemisphere, which may be very inefficient at
# large scale but works for this project's purposes. Each pass appends a
# {'title': ..., 'img_url': ...} dict to hemisphere_image_urls.
for hem in hspheres:
    hem_dict = {}
    # Use the result heading as the title, without the ' Enhanced' suffix.
    hem_dict['title'] = hem.find('h3').text.replace(' Enhanced', '')
    # The result's first link is site-relative, so prefix it with the site root.
    img_url = hem_base_url + hem.find('a')['href']
    # Bound the wait and fail fast on an HTTP error rather than parsing an
    # error page and crashing later on a missing link.
    img_response = requests.get(img_url, timeout=30)
    img_response.raise_for_status()
    img_soup = bs(img_response.text, 'lxml')
    # The first link with target="_blank" on the detail page is taken as the image link.
    hem_dict['img_url'] = img_soup.find('a', target = '_blank')['href']
    hemisphere_image_urls.append(hem_dict)