In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd 

In [2]:
# Windows users - Make sure your chromedriver.exe is in the same folder as this program
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

### Article Scraping

In [3]:
# Visit the mars nasa news site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
# Optional delay for loading the page
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

In [4]:
# set up the HTML parser
html = browser.html
news_soup = BeautifulSoup(html, 'html.parser')
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [5]:
slide_elem.find("div", class_='content_title')

<div class="content_title"><a href="/news/8731/nasas-maven-observes-martian-night-sky-pulsing-in-ultraviolet-light/" target="_self">NASA's MAVEN Observes Martian Night Sky Pulsing in Ultraviolet Light</a></div>

In [6]:
#use the parent element to find the first <a> tag (.find) and save as "news title"
news_title =slide_elem.find('div', class_='content_title').get_text()
news_title

"NASA's MAVEN Observes Martian Night Sky Pulsing in Ultraviolet Light"

In [7]:
# Use the parent element to find the paragraph text
news_p = slide_elem.find('div', class_="article_teaser_body").get_text()
news_p

'Vast areas of the Martian night sky pulse in ultraviolet light, according to images from NASA’s MAVEN spacecraft. The results are being used to illuminate complex circulation patterns in the Martian atmosphere.'

### Featured Images 

In [8]:
# Visit URL
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [9]:
# Find and click the full image button
full_image_elem = browser.find_by_id('full_image')
full_image_elem.click()

In [10]:
# Find the "more info" button and click it
browser.is_element_present_by_text('more info', wait_time=1)
more_info_elem = browser.links.find_by_partial_text('more info')
#more_info_elem.click()
more_info_elem

<splinter.element_list.ElementList at 0x25eb64e0b20>

In [11]:
more_info_elem.click()

In [12]:
# Parse the resulting HTML with soup
html = browser.html
img_soup = BeautifulSoup(html, 'html.parser')
#img_soup

In [13]:
# Find the relative image url
img_url_rel = img_soup.select_one('figure.lede a img').get("src")
img_url_rel

'/spaceimages/images/largesize/PIA22831_hires.jpg'

In [14]:
# Use the base URL to create an absolute URL
img_url = f'https://www.jpl.nasa.gov{img_url_rel}'
img_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA22831_hires.jpg'

### Scrape a table 

In [15]:
# Visit URL and
# scrape the entire table from website with Pandas
# searches for and returns first table ([0]) found in the HTML
df = pd.read_html('http://space-facts.com/mars/')[0]
df.columns=['description', 'value']
# the updated index will remain in place, without having to reassign the DataFrame to a new variable
df.set_index('description', inplace=True)
df

Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [16]:
#Convert dataframe to usable HTML
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

In [17]:
browser.quit() 

In [18]:
def image(browser):
    # ### Scrape Featured Image from JPL site

    # Visit URL
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    
    html = browser.html
    img_soup = BeautifulSoup(html, 'html.parser')
    image_url = img_soup.find('div', class_ = 'carousel_items')('article')[0]['style'].\
    replace('background-image: url(','').replace(');', '')[1:-1]

    # Find and click the full image button
    
    
    
#     full_image_elem = browser.find_by_id('full_image')

#     # Find the "more info" button and click it
#     browser.is_element_present_by_text('more info', wait_time=1)
#     more_info_elem = browser.links.find_by_partial_text('more info')

#     # Parse the resulting HTML with soup
#     html = browser.html
#     img_soup = BeautifulSoup(html, 'html.parser')
    
#     try:
#         # Find the relative image url
#         img_url_rel = img_soup.select_one('figure.lede a img')
#         src = img_url_rel.get("src")
#     except AttributeError:
#         return None

    # Use the base URL to create an absolute URL
    img_url = f'https://www.jpl.nasa.gov{image_url}'

    return img_url

In [19]:
print(image(browser))

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=54799): Max retries exceeded with url: /session/235edd0366f0320aa5c6de2674761612/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000025EB915B4F0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

### Get Mars Hemisphere images

In [3]:
# Visit URL: USGS
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
html= browser.html

In [4]:
print(html)

<html lang="en"><head>
		<link rel="stylesheet" type="text/css" href="//ajax.googleapis.com/ajax/libs/jqueryui/1.11.3/themes/smoothness/jquery-ui.css">
<title>Astropedia Search Results | USGS Astrogeology Science Center</title>
		<meta name="description" content="USGS Astrogeology Science Center Astropedia search results.">
		<meta name="keywords" content="USGS,Astrogeology Science Center,Cartography,Geology,Space,Geological Survey,Mapping">
		<meta http-equiv="X-UA-Compatible" content="IE=edge">
		<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
		<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
		<meta name="google-site-verification" content="x61hXXVj7wtfBSNOPnTftajMsZ5yB2W-qRoyr7GtOKM">
		<!--<link rel="stylesheet" href="http://fonts.googleapis.com/css?family=Open+Sans:400italic,400,bold"/>-->
		<link rel="stylesheet" media="screen" href="/css/main.css">
		<link rel="stylesheet" media="print" href="/css/print.css">
		<!--[if lt

In [5]:
# Find and click the full image button 
#img_soup.find('div', class_ = 'carousel_items')('article')[0]['style']
imgs_soup = BeautifulSoup(html, 'html.parser')
imgs_soup

<html lang="en"><head>
<link href="//ajax.googleapis.com/ajax/libs/jqueryui/1.11.3/themes/smoothness/jquery-ui.css" rel="stylesheet" type="text/css"/>
<title>Astropedia Search Results | USGS Astrogeology Science Center</title>
<meta content="USGS Astrogeology Science Center Astropedia search results." name="description"/>
<meta content="USGS,Astrogeology Science Center,Cartography,Geology,Space,Geological Survey,Mapping" name="keywords"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
<meta content="x61hXXVj7wtfBSNOPnTftajMsZ5yB2W-qRoyr7GtOKM" name="google-site-verification"/>
<!--<link rel="stylesheet" href="http://fonts.googleapis.com/css?family=Open+Sans:400italic,400,bold"/>-->
<link href="/css/main.css" media="screen" rel="stylesheet"/>
<link href="/css/print.css" media="print" rel="stylesheet"/>
<!--[if lt IE 9]>
			<s

In [74]:
collapsible_results = imgs_soup.find('div', class_ = 'collapsible results')
item_one = collapsible_results.find('div', class_ = 'item')('a')[1]['href']
#click(item_one)
print(item_one)

/search/map/Mars/Viking/cerberus_enhanced


In [75]:
# Use the base URL to create an absolute URL
item_one_url = f'https://astrogeology.usgs.gov{item_one}'
item_one_url

'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'

In [76]:
# Parse the resulting HTML with soup
browser.visit(item_one_url)
html= browser.html
img1_soup = BeautifulSoup(html, 'html.parser')
img1_soup

<html lang="en"><head>
<link href="//ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/smoothness/jquery-ui.css" rel="stylesheet" type="text/css"/>
<title>Cerberus Hemisphere Enhanced | USGS Astrogeology Science Center</title>
<meta content="Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from…" name="description"/>
<meta content="USGS,Astrogeology Science Center,Cartography,Geology,Space,Geological Survey,Mapping" name="keywords"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
<meta content="x61hXXVj7wtfBSNOPnTftajMsZ5yB2W-qRoyr7GtOKM" name="google-site-verification"/>
<!--<link rel="stylesheet" href="http://fonts.googleapis.com/css?family=Open+Sans:400italic,400,bold"/>-->
<link href="/css/main.css" media="screen" rel="stylesheet"/>
<link href="/css

In [85]:
# Find the relative image url
img1_elem = img1_soup.select_one('ul li a').get('href')
#img1_url_abs = img1_url_rel.find_all(ul)
print(img1_elem)

https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg


In [96]:
# Find Title <h2 class="title">Cerberus Hemisphere Enhanced</h2>
title1_elem = img1_soup.select('div', class_ = 'content')[0]('h2')[0].text

print(title1_elem)

Cerberus Hemisphere Enhanced
