In [32]:
import os
import pprint
from urllib.parse import urljoin

import bs4
import pandas as pd
from bs4 import BeautifulSoup as soup

# Needed for dynamic web pages
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from splinter import Browser

In [33]:
# Fetch the Mars news page with a headless Chrome and parse it with Beautiful Soup.

# Run the browser in the background (no visible window).
# The switch must be exactly "--headless"; the previous value had a stray
# leading space (" --headless"), so it was not the literal headless flag.
opts = Options()
opts.add_argument("--headless")

# Location of the chromedriver executable
# NOTE(review): machine-specific absolute path - consider moving it to a config value.
chrome_driver='C:\\Users\\srobi\\Projects\\chromedriver.exe'

# Page to scrape
my_url = 'https://redplanetscience.com/'

# Instantiate a webdriver
driver = webdriver.Chrome(options=opts,executable_path=chrome_driver)

try:
    driver.get(my_url)

    # Put the page source into a variable and create a BS object from it, parse HTML
    page_html = driver.page_source
    page_soup = soup(page_html, "html.parser")
finally:
    # Always close the browser, even if loading or parsing fails,
    # so no Chrome process is left running
    driver.quit()

In [34]:
# Sanity check: print a few elements to confirm the HTML was parsed
print('\nTesting html parsing...')
print(page_soup.find('title').get_text())  # page title
print(page_soup.find('h1').get_text())  # first <h1> heading
print(page_soup.find('p').get_text())  # first <p> paragraph
print(page_soup.find('body').find('span').get_text())  # first <span> inside <body>
print('Testing completed.\n')


Testing html parsing...
News - Mars Exploration Program
News
NASA to Broadcast Mars 2020 Perseverance Launch, Prelaunch Activities
MARS Planet Science
Testing completed.



In [35]:
# Collect every news headline (div.content_title) on the page.
# find_all is the current Beautiful Soup name; findAll is a deprecated alias.
titles = page_soup.find_all("div",{"class":"content_title"})
print(f'There are {len(titles)} titles total.') # number of headlines found
# Fail with a clear message instead of a bare IndexError when nothing matched
assert titles, 'No div.content_title elements found in page_soup'
news_title = titles[0].text # first headline listed on the page
print(f'Here is the first title: \n{news_title}\n')

# Collect every teaser paragraph (div.article_teaser_body) on the page
paragraphs = page_soup.find_all("div",{"class":"article_teaser_body"})
print(f'There are {len(paragraphs)} paragraphs total.') # number of teasers found
assert paragraphs, 'No div.article_teaser_body elements found in page_soup'
news_p = paragraphs[0].text # first teaser listed on the page
print(f'Here is the first paragraph: \n{news_p}\n')

There are 15 titles total.
Here is the first title: 
Space History Is Made in This NASA Robot Factory

There are 15 paragraphs total.
Here is the first paragraph: 
From rockets to rovers, JPL's Spacecraft Assembly Facility has been at the center of robotic spaceflight. Here's a closer look at what makes it so special.



In [36]:
# Fetch the Mars space-images page with a headless Chrome and parse it.
# Note: this rebinds page_html / page_soup to the image page.

# Run the browser in the background (no visible window).
# The switch must be exactly "--headless"; the previous value had a stray
# leading space (" --headless"), so it was not the literal headless flag.
opts = Options()
opts.add_argument("--headless")

# Location of the chromedriver executable
# NOTE(review): machine-specific absolute path - consider moving it to a config value.
chrome_driver='C:\\Users\\srobi\\Projects\\chromedriver.exe'

# Page to scrape
mars_image_url = 'https://spaceimages-mars.com/'

# Instantiate a webdriver
driver = webdriver.Chrome(options=opts,executable_path=chrome_driver)

try:
    driver.get(mars_image_url)

    # Put the page source into a variable and create a BS object from it, parse HTML
    page_html = driver.page_source
    page_soup = soup(page_html, "html.parser")

    # Print the page title as a check that the HTML was parsed
    print('Testing that html has been parsed...')
    print(page_soup.title.get_text())
    print('Testing completed.')
finally:
    # Always close the browser, even if loading or parsing fails,
    # so no Chrome process is left running
    driver.quit()

Testing that html has been parsed...
Space Image
Testing completed.


In [91]:
# Preview the Mars / Earth facts table.
# This cell was executed out of order (In [91]) and relied on a `df` already
# present in the kernel; `df` is otherwise first assigned in a later cell, so a
# fresh Restart & Run All raised NameError here. Load the table in this cell
# (same source and arguments as the later pandas scraping step) so it runs
# top to bottom.
df = pd.read_html('https://galaxyfacts-mars.com/', header=0)[0]
df

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-87 to -5 °C,-88 to 58°C


In [37]:


# Build and display the featured image URL.
# find_all is the current Beautiful Soup name; findAll is a deprecated alias.
# NOTE(review): index 1 assumes the featured image is the second <img> in the
# page - confirm, or select it by a class/id instead.
images = page_soup.find_all('img')[1]
# urljoin resolves the src against the site URL, so it stays correct whether
# or not src starts with "/" (plain concatenation would produce "//").
featured_image_url = urljoin(mars_image_url, images['src'])
print(featured_image_url)

# Scrape the Mars facts table with pandas
url = 'https://galaxyfacts-mars.com/'

# read_html returns a list of DataFrames; row 0 of each table is used as the header
tables = pd.read_html(url, header=0)
df = tables[0]
print('Dataframe created from table.')

# Column names and row values as plain lists (for a Bootstrap-styled HTML table)
table_headers = df.columns.to_list()
table_rows = df.to_numpy().tolist()

# Render the DataFrame straight to HTML and strip the newlines
html_table = df.to_html().replace('\n','')
print('Dataframe converted to HTML table called html_table.')

# Landing page for the Mars hemispheres site (opened in Chrome)
mars_hemispheres_url = 'https://marshemispheres.com/'

# Configure Chrome for Splinter: maximized window, notifications turned off
executable_path = {'executable_path':'../chromedriver'}
options = webdriver.ChromeOptions()
for chrome_flag in ("--start-maximized", "--disable-notifications"):
    options.add_argument(chrome_flag)
browser = Browser('chrome', headless=False, options=options, **executable_path)

# Open the landing page
browser.visit(mars_hemispheres_url)

# Find the product links and remember how many were matched
submit = browser.find_by_css('a[class="itemLink product-item"]')
original_submit_count = len(submit)
print(f'There are {original_submit_count} items in the submit variable.')

https://spaceimages-mars.com/image/featured/mars1.jpg
Dataframe created from table.
Dataframe converted to HTML table called html_table.
There are 9 items in the submit variable.


In [38]:
# Visit the hemisphere detail pages and record each title and image URL.
# Only every second matched link is followed (0-based indices 1, 3, 5, ...),
# exactly as the previous `(x+1) % 2 == 0` filter did.
# NOTE(review): presumably the skipped matches are duplicate links to the same
# pages - confirm against the page markup.
hemisphere_image_urls = []

for x in range(1, original_submit_count, 2):
    # Look the links up again on every pass (the page is reloaded by
    # browser.back() at the end of the previous pass), then open the detail page
    submit = browser.find_by_css('a[class="itemLink product-item"]')
    submit[x].click()

    # URL of the detail page Splinter is now showing
    current_url = browser.url
    print(current_url)

    # Parse the page Splinter already has loaded instead of launching a second
    # Chrome instance per hemisphere just to download the same page again
    page_html = browser.html
    page_soup = soup(page_html, "html.parser")

    # Print the page title as a test
    print(page_soup.title.get_text())

    # Find title of image
    hemisphere_titles = page_soup.find_all("h2", {"class":"title"})
    title = hemisphere_titles[0].text
    print(title)

    # Find image_url; urljoin resolves the src against the site URL so it stays
    # correct whether or not src starts with "/"
    hemisphere_images = page_soup.find_all("img", {"class":"wide-image"})
    image_url = urljoin(mars_hemispheres_url, hemisphere_images[0]['src'])
    print(image_url)

    # Append title and image_url to the results list
    dictionary = {'title':title,'img_url':image_url}
    hemisphere_image_urls.append(dictionary)

    # Blank line between hemispheres in the output
    print()

    # Return to the landing page for the next link
    browser.back()

https://marshemispheres.com/cerberus.html
Astropedia Search Results | GUSS Astrogeology Science Center
Cerberus Hemisphere Enhanced
https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg

https://marshemispheres.com/schiaparelli.html
Astropedia Search Results | GUSS Astrogeology Science Center
Schiaparelli Hemisphere Enhanced
https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg

https://marshemispheres.com/syrtis.html
Astropedia Search Results | GUSS Astrogeology Science Center
Syrtis Major Hemisphere Enhanced
https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg

https://marshemispheres.com/valles.html
Astropedia Search Results | GUSS Astrogeology Science Center
Valles Marineris Hemisphere Enhanced
https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg



In [39]:
# Pretty-print the collected hemisphere records, followed by a blank line
print(pprint.pformat(hemisphere_image_urls), end='\n\n')

[{'img_url': 'https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]



In [40]:
# Gather every scraped value into a single dictionary (same keys, same order)
final_dictionary = dict([
    ("news_title", news_title),
    ("news_paragraph", news_p),
    ("featured_image", featured_image_url),
    ("mars_facts", html_table),
    ("mars_hemispheres", hemisphere_image_urls),
])
pprint.pprint(final_dictionary)

{'featured_image': 'https://spaceimages-mars.com/image/featured/mars1.jpg',
 'mars_facts': '<table border="1" class="dataframe">  <thead>    <tr '
               'style="text-align: right;">      <th></th>      <th>Mars - '
               'Earth Comparison</th>      <th>Mars</th>      '
               '<th>Earth</th>    </tr>  </thead>  <tbody>    <tr>      '
               '<th>0</th>      <td>Diameter:</td>      <td>6,779 km</td>      '
               '<td>12,742 km</td>    </tr>    <tr>      <th>1</th>      '
               '<td>Mass:</td>      <td>6.39 × 10^23 kg</td>      <td>5.97 × '
               '10^24 kg</td>    </tr>    <tr>      <th>2</th>      '
               '<td>Moons:</td>      <td>2</td>      <td>1</td>    </tr>    '
               '<tr>      <th>3</th>      <td>Distance from Sun:</td>      '
               '<td>227,943,824 km</td>      <td>149,598,262 km</td>    '
               '</tr>    <tr>      <th>4</th>      <td>Length of '
               'Year:</td>      <td>6