In [1]:
# Declare Dependencies 
from bs4 import BeautifulSoup
from splinter import Browser
import requests
import pandas as pd
from pprint import pprint

Windows User setup for Splinter

In [2]:
# Choose the executable path to driver 
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

# Step 1 - Scraping

## NASA Mars News

In [3]:
# Visit Nasa news url through splinter module
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

In [4]:
# HTML Object
html = browser.html

# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html, 'lxml')

# Retrieve the latest element that contains news title and news_paragraph
news_title = soup.find('div', class_='content_title').find('a').text
news_p = soup.find('div', class_='article_teaser_body').text

# Display scrapped data 
print(news_title)
print("-"*60)
print(news_p)

NASA's MRO Completes 60,000 Trips Around Mars
------------------------------------------------------------
The orbiting spacecraft is also about to set a record for data relayed from the Martian surface.


## JPL Mars Space Images - Featured Image

In [5]:
# Visit Mars Space Images through splinter module
image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(image_url)

In [6]:
# Click in 'FULL IMAGE' link to get a full image
full_image_elem=browser.find_by_id('full_image').click()

In [7]:
# splinter seems to need more time for loading text 'more info'
browser.is_text_present('more info',wait_time=5)

True

In [8]:
# Click in 'more info' for full size image
more_info_elem=browser.click_link_by_partial_text('more info')

In [9]:
# HTML Object 
html_image = browser.html

# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_image, 'lxml')

# Retrieve image url
src_image_url = soup.select_one('figure.lede a img')["src"]

# Website main url 
JPL='https://www.jpl.nasa.gov'

# Concatenate website url with scrapped route
featured_image_url=JPL+src_image_url

print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA20464_hires.jpg


## Mars Weather

In [10]:
# Visit Mars Weather Twitter through splinter module
weather_url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(weather_url)

In [11]:
# HTML Object 
html_weather = browser.html

# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_weather, 'lxml')

# Find all tweets with p tag and class 'TweetTextSize'
tweets = soup.find_all('p', class_='TweetTextSize')
tweets

[<p class="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" data-aria-label-part="0" lang="en">InSight sol 167 (2019-05-17) low -100.5ºC (-148.9ºF) high -20.4ºC (-4.6ºF)
 winds from the SW at 4.7 m/s (10.6 mph) gusting to 13.5 m/s (30.3 mph)
 pressure at 7.50 hPa<a class="twitter-timeline-link u-hidden" data-pre-embedded="true" dir="ltr" href="https://t.co/0Eqt9nN21o">pic.twitter.com/0Eqt9nN21o</a></p>,
 <p class="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" data-aria-label-part="0" lang="en">InSight sol 166 (2019-05-15) low -100.5ºC (-148.8ºF) high -20.5ºC (-4.8ºF)
 winds from the W at 4.2 m/s (9.4 mph) gusting to 11.7 m/s (26.2 mph)
 pressure at 7.50 hPa</p>,
 <p class="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" data-aria-label-part="0" lang="de">This is what a sunset on Earth looks like from above. / So sieht ein irdischer Sonnenuntergang aus, von oben gesehen. <a class="twitter-hashtag pretty-link js-nav" data-query-source="hashtag_c

In [12]:
# Search entries with weather-related words and collect them in a list
weather_tweets=[]

for t in tweets:
    weather_tweet = t.text.strip()
    if ('sol' and 'low' and 'high') in weather_tweet:
        # Split picture address
        weather_tweet_split=weather_tweet.split('pic')[0]
        weather_tweets.append(weather_tweet_split)
    else:pass

weather_tweets

['InSight sol 167 (2019-05-17) low -100.5ºC (-148.9ºF) high -20.4ºC (-4.6ºF)\nwinds from the SW at 4.7 m/s (10.6 mph) gusting to 13.5 m/s (30.3 mph)\npressure at 7.50 hPa',
 'InSight sol 166 (2019-05-15) low -100.5ºC (-148.8ºF) high -20.5ºC (-4.8ºF)\nwinds from the W at 4.2 m/s (9.4 mph) gusting to 11.7 m/s (26.2 mph)\npressure at 7.50 hPa',
 'InSight sol 165 (2019-05-15) low -100.3ºC (-148.6ºF) high -18.2ºC (-0.7ºF)\nwinds from the SW at 4.6 m/s (10.4 mph) gusting to 13.7 m/s (30.6 mph)\npressure at 7.50 hPa',
 'InSight sol 164 (2019-05-13) low -100.0ºC (-147.9ºF) high -16.6ºC (2.1ºF)\nwinds from the W at 4.1 m/s (9.1 mph) gusting to 15.1 m/s (33.7 mph)\npressure at 7.50 hPa',
 'InSight sol 163 (2019-05-13) low -99.9ºC (-147.7ºF) high -17.7ºC (0.2ºF)\nwinds from the SW at 4.3 m/s (9.7 mph) gusting to 15.2 m/s (34.0 mph)\npressure at 7.50 hPa',
 'InSight sol 162 (2019-05-12) low -100.2ºC (-148.3ºF) high -20.3ºC (-4.5ºF)\nwinds from the SW at 4.5 m/s (10.1 mph) gusting to 14.3 m/s (32.0

In [13]:
# Latest Mars weather tweet
mars_weather = weather_tweets[0]
print(mars_weather)

InSight sol 167 (2019-05-17) low -100.5ºC (-148.9ºF) high -20.4ºC (-4.6ºF)
winds from the SW at 4.7 m/s (10.6 mph) gusting to 13.5 m/s (30.3 mph)
pressure at 7.50 hPa


## Mars Facts

In [14]:
# Visit the Mars Facts webpage through splinter module
facts_url = 'https://space-facts.com/mars/'
browser.visit(facts_url)

In [15]:
tables = pd.read_html(facts_url)
tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.42 x 10^23 kg (10.7% Earth)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.52 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                  -153 to 20 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers]

In [16]:
df=tables[0]
df.columns=['Mars Planet','Profile']
mars_df=df.set_index('Mars Planet')
mars_df

Unnamed: 0_level_0,Profile
Mars Planet,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-153 to 20 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [17]:
# DataFrames to HTML tables. to_html() generates tables
html_table = mars_df.to_html(index=False)
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th>Profile</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

In [18]:
# to strip unwanted newlines to clean up the table
html_table.replace('\n', '')

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th>Profile</th>    </tr>  </thead>  <tbody>    <tr>      <td>6,792 km</td>    </tr>    <tr>      <td>6,752 km</td>    </tr>    <tr>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <td>227,943,824 km (1.52 AU)</td>    </tr>    <tr>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <td>-153 to 20 °C</td>    </tr>    <tr>      <td>2nd millennium BC</td>    </tr>    <tr>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

In [19]:
# save
df.to_html('mars_table.html', index=False)

## Mars Hemispheres

In [20]:
# Visit the USGS Astrogeology site through splinter module
usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(usgs_url)

In [21]:
# HTML Object 
html_hemispheres = browser.html

# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_hemispheres,'lxml')

# Retreive all items that contain mars hemispheres information
items=soup.find_all('div',class_='item')

In [22]:
# Create an empty list to store scrapped data as dictionaries
hemisphere_image_urls = []

# main_url for usgs 
main_url = 'https://astrogeology.usgs.gov'

# Loop through the items previously stored
for i in items: 
    
    # Find link that leads to full image page
    hemi_url = i.find('a', class_='itemLink product-item')['href']
    
    # Visit the link that contains the full image website 
    browser.visit(main_url + hemi_url)
    
    # HTML Object of hemispheres
    soup = BeautifulSoup(browser.html,'lxml')
    
    # Hemisphere title
    title=soup.select_one('div .content section h2').text
    
    # Hemisphere img_url
    src_img_url=soup.find('img', class_='wide-image')['src']
    # Concat
    img_url=main_url+src_img_url
    
    # Dictionary
    hemi_dict={'title':title, 'img_url':img_url}
    
    # Append to the list
    hemisphere_image_urls.append(hemi_dict)

hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}]