In [1]:
#Dependencies
from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd
import time


In [2]:
#Function for browser

def initial_browser(executable_path=None, headless=False):
    """Start a splinter-driven Chrome browser.

    Parameters
    ----------
    executable_path : str, optional
        Location of the chromedriver binary. When omitted, the
        CHROMEDRIVER_PATH environment variable is used if it is set;
        otherwise the original default "/Users/Nataliia/chromedriver"
        is kept so existing calls behave as before.
    headless : bool, optional
        Passed straight to splinter's Browser; defaults to False, which is
        the value the original code always used.

    Returns
    -------
    The object returned by splinter's ``Browser("chrome", ...)``.
    """
    # Function-scope import keeps this cell self-contained.
    import os

    if executable_path is None:
        # Lets another machine run the notebook without editing the code.
        executable_path = os.environ.get(
            "CHROMEDRIVER_PATH", "/Users/Nataliia/chromedriver"
        )
    return Browser("chrome", executable_path=executable_path, headless=headless)

In [3]:
#Helper function for parsing the HTML code of a web page.
#Added because the same steps are repeated in the scrape_info function.
#Arguments: browser (the result of the "initial_browser" function) and
#url (the address of the web page to visit)

def parser(browser, url, wait_seconds=1):
    """Visit a page with the browser and return its parsed HTML.

    Parameters
    ----------
    browser : object
        Browser object exposing ``visit(url)`` and an ``html`` attribute
        (here, the result of ``initial_browser``).
    url : str
        Address of the page to visit.
    wait_seconds : int or float, optional
        Pause between visiting the page and reading its HTML. Defaults to 1,
        the delay the original code always used; raise it for slow pages.

    Returns
    -------
    The BeautifulSoup object built from ``browser.html`` with "html.parser".
    """
    browser.visit(url)
    # Pause before reading the HTML, as the original did with a fixed 1 second.
    time.sleep(wait_seconds)
    return bs(browser.html, "html.parser")

In [4]:
#Helper function for scraping the HTML code for Hemispheres images.
#Added because the same steps are repeated in the scrape_info function.
#Arguments: soup (the result of the "parser" function),
#url (the base url that is prepended to the image path found on the page) and
#list (the list to which the full image url is appended).

def find_image_url(soup, url, list):
    """Append the full url of a page's first "wide-image" to ``list``.

    soup -- parsed page offering ``find_all``
    url  -- base url placed in front of the image's ``src`` value
    list -- list that receives the combined url (modified in place)

    Returns the same list object that was passed in.
    Raises IndexError when the page has no ``img`` with class "wide-image".
    """
    wide_images = soup.find_all('img', class_="wide-image")
    # Only the first match is used.
    relative_src = wide_images[0].attrs["src"]
    list.append(url + relative_src)
    return list

In [5]:
#Empty dictionary that collects every piece of scraped data
mars_data = dict()

#Opening the browser used by all scraping cells below
browser = initial_browser()

In [6]:
#Scraping the latest Mars news (title and teaser) and adding it to the dictionary
url_news = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
soup = parser(browser, url_news)

title_blocks = soup.find_all('div', class_="content_title")
teaser_blocks = soup.find_all('div', class_="article_teaser_body")

#NOTE(review): the title is taken at index 1 while the teaser is taken at
#index 0 - presumably the first "content_title" div is not an article title;
#confirm against the live page.
latest_title = title_blocks[1].text
latest_paragraph = teaser_blocks[0].text

mars_data["latest_title"] = latest_title
mars_data["latest_paragraph"] = latest_paragraph

print(mars_data)

{'latest_title': "A Martian Roundtrip: NASA's Perseverance Rover Sample Tubes", 'latest_paragraph': "Marvels of engineering, the rover's sample tubes must be tough enough to safely bring Red Planet samples on the long journey back to Earth in immaculate condition. "}


In [7]:
#Scraping the featured image url
url_image_search = "https://www.jpl.nasa.gov"
url_image = url_image_search + "/spaceimages/?search=&category=Mars"
soup = parser(browser, url_image)

#The anchor with id "full_image" stores the image path in "data-fancybox-href"
featured_image = soup.find_all("a", id="full_image")
first_anchor = featured_image[0]
featured_image_link = first_anchor.attrs["data-fancybox-href"]

#The stored path is joined to the site root to build the full url
featured_image_url = url_image_search + featured_image_link
mars_data["featured_image_url"] = featured_image_url

print(mars_data)

{'latest_title': "A Martian Roundtrip: NASA's Perseverance Rover Sample Tubes", 'latest_paragraph': "Marvels of engineering, the rover's sample tubes must be tough enough to safely bring Red Planet samples on the long journey back to Earth in immaculate condition. ", 'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA17044_ip.jpg'}


In [8]:
#Scraping the Mars facts table.
#pd.read_html returns a list of DataFrames, one per table it finds at the url
url = "https://space-facts.com/mars/"
tables = pd.read_html(url)
#Keeping the first table from that list
Mars_facts = tables[0]
Mars_facts

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [9]:
#Renaming the first two columns for a more user-friendly look.
#rename() returns a new DataFrame, so the scraped table held in "tables"
#keeps its original column labels instead of being modified in place.
Mars_facts = Mars_facts.rename(columns={Mars_facts.columns[0]: "Parameters",
                                        Mars_facts.columns[1]: "Numbers"})
Mars_facts

Unnamed: 0,Parameters,Numbers
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [10]:
#Using the "Parameters" column as the index (replacing the default integer
#index) for a more user-friendly look.
#The check makes the cell safe to re-run: once "Parameters" is the index it is
#no longer a column, and calling set_index again would raise a KeyError.
if Mars_facts.index.name != "Parameters":
    Mars_facts = Mars_facts.set_index("Parameters")
Mars_facts

Unnamed: 0_level_0,Numbers
Parameters,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [11]:
#Converting the table into an html-string.
#to_html() is called without arguments, so the "Parameters" index is
#rendered as the row headers of the table.
html_table = Mars_facts.to_html()
#Printing the raw markup to check it
print(html_table)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Numbers</th>
    </tr>
    <tr>
      <th>Parameters</th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Equatorial Diameter:</th>
      <td>6,792 km</td>
    </tr>
    <tr>
      <th>Polar Diameter:</th>
      <td>6,752 km</td>
    </tr>
    <tr>
      <th>Mass:</th>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <th>Moons:</th>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <th>Orbit Distance:</th>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <th>Orbit Period:</th>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <th>Surface Temperature:</th>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <th>First Record:</th>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <th>Recorded By:</th>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>


In [12]:
#Storing the html table in the dictionary under the "Mars_facts" key
mars_data["Mars_facts"] = html_table
mars_data

{'latest_title': "A Martian Roundtrip: NASA's Perseverance Rover Sample Tubes",
 'latest_paragraph': "Marvels of engineering, the rover's sample tubes must be tough enough to safely bring Red Planet samples on the long journey back to Earth in immaculate condition. ",
 'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA17044_ip.jpg',
 'Mars_facts': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Numbers</th>\n    </tr>\n    <tr>\n      <th>Parameters</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,94

In [13]:
#Scraping Hemispheres images with the helper functions
url_astrogeology = "https://astrogeology.usgs.gov"
list_hemispheres_img_url = []

url_Cerberus = "/search/map/Mars/Viking/cerberus_enhanced"
url_Schiaparelli = "/search/map/Mars/Viking/schiaparelli_enhanced"
url_Syrtis_Major = "/search/map/Mars/Viking/syrtis_major_enhanced"
url_Valles_Marineris = "/search/map/Mars/Viking/valles_marineris_enhanced"

#One loop replaces the four copy-pasted visit/scrape pairs.
#The order matters: titles are matched to these urls by position in the next cell.
for hemisphere_path in (url_Cerberus, url_Schiaparelli,
                        url_Syrtis_Major, url_Valles_Marineris):
    soup = parser(browser, url_astrogeology + hemisphere_path)
    list_hemispheres_img_url = find_image_url(soup, url_astrogeology, list_hemispheres_img_url)

print(list_hemispheres_img_url)

['https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg', 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg', 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg', 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg']


In [14]:
#Pairing each hemisphere title with the image url scraped at the same position
hemisphere_titles = (
    "Cerberus Hemisphere",
    "Schiaparelli Hemisphere",
    "Syrtis Major Hemisphere",
    "Valles Marineris Hemisphere",
)
#Indexing (rather than zip) keeps the IndexError if an image url is missing
hemisphere_image_urls = [
    {"title": title, "img_url": list_hemispheres_img_url[position]}
    for position, title in enumerate(hemisphere_titles)
]
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

In [15]:
#Storing the list of title/image-url dictionaries in the main dictionary,
#which holds all scraped data to be loaded into Mongo DB
mars_data["Hemispheres"] = hemisphere_image_urls
mars_data

{'latest_title': "A Martian Roundtrip: NASA's Perseverance Rover Sample Tubes",
 'latest_paragraph': "Marvels of engineering, the rover's sample tubes must be tough enough to safely bring Red Planet samples on the long journey back to Earth in immaculate condition. ",
 'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA17044_ip.jpg',
 'Mars_facts': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Numbers</th>\n    </tr>\n    <tr>\n      <th>Parameters</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,94

In [16]:
#Closing the browser opened with initial_browser() now that scraping is done
browser.quit()