In [1]:
# Imports
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
import pandas as pd
import time


## 1/ Scraping NASA Mars news

In [2]:
# URL of the NASA Mars news listing (query string asks for publish date, newest first)
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
# Retrieve page with the requests module.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page in the cells below.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Create BeautifulSoup object; parse with 'lxml'
soup = bs(response.text, 'lxml')

In [3]:
# Only the first 'slide' div of the NASA Mars news page is needed
# (soup.find_all('div', class_='slide') would return all of them).
result = soup.find('div', class_='slide')
# Locate the two inner divs first, then pull out their stripped text
title_div = result.find('div', class_="content_title")
teaser_div = result.find('div', class_="rollover_description_inner")
news_title = title_div.get_text().strip()
news_p = teaser_div.get_text().strip()

In [4]:
# Building a list of dictionaries containing the title and paragraph of each article
# mars_news = []
# for rr in results:
#     mars_news.append({'news_title':  rr.find('div', class_="content_title").text.strip(),
#                      'news_p': rr.find('div', class_="rollover_description_inner").text.strip()})

## 2/ JPL Mars Space Images

In [5]:
# Start a visible (headless=False) Chrome session through splinter, pointing
# it at the chromedriver binary given by the relative path 'chromedriver.exe'.
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

In [6]:
# JPL space-images page, pre-filtered through its query string
# (search=featured, category=Mars)
url = "https://www.jpl.nasa.gov/spaceimages/?search=featured&category=Mars#submit"
browser.visit(url)
# Fixed 3-second pause before the page HTML is read in a later cell
time.sleep(3)

In [7]:
# Let's try to fill the search field
# browser.fill('search', 'featured')
# couldn't make it work
# >> WebDriverException: Message: unknown error: call function result missing 'value'

# Let's try to select the category
# browser.choose('category', 'Mars')
# couldn't make it work either: no error but nothing happens

# If we are not going to interact with the webpage, I don't really see why we would need splinter.

In [8]:
# Scrape page into Soup: parse the HTML currently held by the splinter browser
html = browser.html
soup = bs(html, "html.parser")
# Site root; it is prepended to the image path extracted in the next cell
url_base = "https://www.jpl.nasa.gov"

In [9]:
# Take the first <a class="fancybox"> on the page; its data-fancybox-href
# attribute is appended to url_base to form the featured image URL.
result = soup.find('a', class_='fancybox')
featured_image_url = url_base+result["data-fancybox-href"]

In [10]:
# look for the url of the fullsize images
# a class="fancybox"
# data-fancybox-href
# results = soup.find_all('a', class_='fancybox')
# 
# featured_image_url = []
# for rr in results:
#     featured_image_url.append(url_base+rr["data-fancybox-href"])
    
# featured_image_url



In [11]:

# Close the browser after scraping; section 5 below opens a new Browser of its own
browser.quit()

## 3/ Mars Weather

In [12]:
# Twitter timeline of the Mars weather report account
url = "https://twitter.com/marswxreport?lang=en"
# Retrieve page with the requests module.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page in the cells below.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Create BeautifulSoup object; parse with 'lxml'
soup = bs(response.text, 'lxml')

In [13]:
# First div with class 'js-tweet-text-container'; the stripped text of its
# first <p> is kept as the weather report.
result = soup.find('div', class_='js-tweet-text-container')
mars_weather = result.find('p').text.strip()

In [14]:
# Alternative kept for reference: collect the text of every
# 'js-tweet-text-container' div instead of only the first one.
# results = soup.find_all('div', class_='js-tweet-text-container')
# mars_weather = []
# for rr in results:
#     mars_weather.append(rr.find('p').text.strip())
# mars_weather

# Display the single weather string scraped in the previous cell
mars_weather

'Sol 2230 (2018-11-14), high -5C/23F, low -72C/-97F, pressure at 8.59 hPa, daylight 06:22-18:39'

## 4/ Mars Fact

In [15]:
# Read every HTML table found on the Mars facts page into a list of DataFrames
url = "https://space-facts.com/mars/"
tables = pd.read_html(url)

# The first table on the page is the one used as the facts table
mars_fun_facts = tables[0]
# Adding column names
mars_fun_facts.columns = ['Description', 'Value']
# Remove the ":" at the end of the descriptions of the values.
# rstrip(':') only removes a trailing colon when one is present, whereas
# slicing with [:-1] would chop the last character of any label lacking it.
mars_fun_facts['Description'] = mars_fun_facts['Description'].str.rstrip(':')

mars_fun_facts


Unnamed: 0,Description,Value
0,Equatorial Diameter,"6,792 km"
1,Polar Diameter,"6,752 km"
2,Mass,6.42 x 10^23 kg (10.7% Earth)
3,Moons,2 (Phobos & Deimos)
4,Orbit Distance,"227,943,824 km (1.52 AU)"
5,Orbit Period,687 days (1.9 years)
6,Surface Temperature,-153 to 20 °C
7,First Record,2nd millennium BC
8,Recorded By,Egyptian astronomers


## 5/ Mars Hemispheres

In [61]:
# Site root; also prepended further down to the hrefs and image paths scraped from the pages
url_base = "https://astrogeology.usgs.gov"
# Search-results page for the query "hemisphere enhanced" with target=Mars
url = url_base+"/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

In [62]:
# Start a visible (headless=False) Chrome session through splinter, pointing
# it at the chromedriver binary given by the relative path 'chromedriver.exe'.
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

In [63]:
# Open the search-results page, then pause 3 seconds before its HTML is read
browser.visit(url)
time.sleep(3)

In [64]:
# Scrape page into Soup: parse the HTML currently held by the splinter browser
html = browser.html
soup = bs(html, "html.parser")

# Reference URL noted during exploration (looks like a full-size image link — TODO confirm):
# http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg

In [70]:
# Every search hit is described by a <div class="description">; build one
# {"title", "img_url"} dict per hit.
results = soup.find_all('div', class_='description')
hemisphere_image_urls = []
for hit in results:
    # Title of the picture, taken from the hit's <h3>
    title = hit.find('h3').get_text()
    # The hit's first link leads to the page showing the larger picture
    detail_url = url_base + hit.find('a')['href']
    browser.visit(detail_url)
    # Pause 3 seconds before reading the page
    time.sleep(3)
    # Parse that page and locate its <img class="wide-image">
    html_pic = browser.html
    soup_pic = bs(html_pic, 'html.parser')
    res_pic = soup_pic.find('img', class_="wide-image")
    # The src attribute is a site-relative path, so prefix it with the site root
    url_img = url_base + res_pic['src']
    # Record the scraped title together with the absolute image URL
    hemisphere_image_urls.append(dict(title=title, img_url=url_img))
    


In [71]:
# Display the list of {'title', 'img_url'} dicts collected by the loop above
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}]

In [67]:
# Open the detail page of the first search hit.
# url_pic is built right here (from `results` and `url_base` defined in the
# cells above) so that this cell no longer depends on a cell located further
# down in the notebook having been executed first.
url_pic = url_base + results[0].find('a')['href']
browser.visit(url_pic)
# sleep a little bit
time.sleep(3)

In [None]:
# Scrape into soup: parse whatever page the browser currently shows
html_pic = browser.html
soup_pic = bs(html_pic, 'html.parser')
# look for the link: first <img class="wide-image">; its src is appended to url_base
res_pic = soup_pic.find('img',  class_="wide-image")
url_img = url_base+res_pic['src']

In [None]:
# Close the browser opened for the hemisphere scraping
browser.quit()

In [43]:


# I couldn't make the clicking approach work:
# browser.click_link_by_href(results[0].find('a')['href'])
# so let's build the URL of the page with the picture instead
url_pic = url_base + results[0].find('a')['href']

# Scrape into soup
# NOTE(review): this cell never calls browser.visit(url_pic), so browser.html is
# whatever page the browser is currently on. It also sits below a cell that
# calls browser.quit(); in top-to-bottom order the browser is presumably no
# longer usable at this point — confirm before relying on this cell.
html_pic = browser.html
soup_pic = bs(html_pic, 'html.parser')

In [46]:
# Earlier attempts / notes kept for reference:
# browser.click_link_by_href('http://www.the_site.com/my_link')
# browser.back()
# https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced
# First <img class="wide-image"> of the parsed page
res_pic = soup_pic.find('img',  class_="wide-image")
# NOTE(review): this bare expression is not the last statement of the cell, so
# its value is not displayed when the cell runs as written.
res_pic['src']
# Absolute image URL: site root + the image's src path
url_img = url_base+res_pic['src']

'/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'

In [38]:
# Debug: print the result of the 'wide-image' lookup
print(res_pic)

[]


In [39]:
# Debug: display the whole parsed page to inspect which page was actually loaded
soup_pic

<!DOCTYPE html>
<html lang="en" xmlns="http://www.w3.org/1999/xhtml"><head>
<link href="//ajax.googleapis.com/ajax/libs/jqueryui/1.11.3/themes/smoothness/jquery-ui.css" rel="stylesheet" type="text/css"/>
<script async="" src="https://ssl.google-analytics.com/ga.js" type="text/javascript"></script><script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js" type="text/javascript"></script>
<title>Astropedia Search Results | USGS Astrogeology Science Center</title>
<meta content="USGS Astrogeology Science Center Astropedia search results." name="description"/>
<meta content="USGS,Astrogeology Science Center,Cartography,Geology,Space,Geological Survey,Mapping" name="keywords"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
<meta content="x61hXXVj7wtfBSNOPnTftajMsZ5yB2W-qRoyr7GtOKM" name="google-site-verifica