In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [2]:
#set the executable path and initialize a browser.
executable_path = {'executable_path': 'C:/Users/Mobeen/.wdm/drivers/chromedriver/win32/89.0.4389.23/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [3]:
#Visit the mars nasa news site
url = 'https://data-class-mars.s3.amazonaws.com/Mars/index.html'
browser.visit(url)
#Optional delay for loading the page
browser.is_element_present_by_css('div.list_text', wait_time=1)
#With the following line, browser.is_element_present_by_css('div.list_text', wait_time=1), we are accomplishing two things.
#One is that we're searching for elements with a specific combination of tag (div) and attribute (list_text). 
#As an example, ul.item_list would be found in HTML as <ul class="item_list">.
#Secondly, we're also telling our browser to wait one second before searching for components. 
#The optional delay is useful because sometimes dynamic pages take a little while to load, 
#especially if they are image-heavy.

True

In [4]:
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('div.list_text')

In [5]:
slide_elem.find('div', class_='content_title')

<div class="content_title">Scientists Explore Outback as Testbed for Mars </div>

In [6]:
news_title = slide_elem.find('div', class_='content_title').get_text()
news_title

'Scientists Explore Outback as Testbed for Mars '

In [7]:
news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
news_p
#There are two methods used to find tags and attributes with BeautifulSoup:
#.find() is used when we want only the first class and attribute we've specified.
#.find_all() is used when we want to retrieve all of the tags and attributes.
#For example, if we were to use .find_all() instead of .find() when pulling the summary, we would retrieve all of the summaries on the page instead of just the first one.

"Australia provides a great place for NASA's Mars 2020 and the ESA-Roscosmos ExoMars scientists to hone techniques in preparation for searching for signs ancient life on Mars."

In [8]:
#Scrape Mars Data: Featured Image
#Ultimately, with each item we scrape, we'll also save and then serve it on our own website. 
#We're basically using pieces from other websites to piece together our own website, 
#with news and images custom tailored to Robin's taste.

### Featured Images 

In [9]:
# Visit Url 
url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url)

In [10]:
# Find and click the full image button 
full_image_elem = browser.find_by_tag('button')[1] # we want to click on the second button 
full_image_elem.click()

In [11]:
#Parse the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')

In [12]:
# Find the relative image url 
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel
#We'll use the image tag and class (<img />and fancybox-img) to build the URL to the full-size image.
#Let's break it down:
#An img tag is nested within this HTML, so we've included it.
#.get('src') pulls the link to the image.
#What we've done here is tell BeautifulSoup to look inside the <img /> tag for an image with a class of fancybox-image. 
#Basically we're saying, "This is where the image we want lives—use the link that's inside these tags."

'image/featured/mars1.jpg'

In [13]:
#We pulled the link of the image 
#this is the partial link, base url isn't included 
img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'
img_url
#We're using an f-string for this print statement because it's a cleaner way to create print statements; 
#they're also evaluated at run-time. This means that it, and the variable it holds, 
#doesn't exist until the code is executed and the values are not constant. This works well for our scraping app because the data we're scraping is live and will be updated frequently.

'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars1.jpg'

# Scrape Mars Data: Mars Facts

In [14]:
# In this case all the facts we want are conveniently stored into a table 
# Instead of scraping each row we will scrape the entire table with pandas read._html() function
df = pd.read_html('https://data-class-mars-facts.s3.amazonaws.com/Mars_Facts/index.html')[0]
#y specifying an index of 0, we're telling Pandas to pull only the first table it encounters, or the first item in the list. 
#Then, it turns the table into a DataFrame.
df.columns=['description', 'Mars', 'Earth']
#Here, we assign columns to the new DataFrame for additional clarity.
df.set_index('description', inplace=True)
#we're turning the Description column into the DataFrame's index. 
#inplace=True means that the updated index will remain in place, without having to reassign the DataFrame to a new variable.
df

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [15]:
#This is exactly what Robin is looking to add to her web application. 
#How do we add the DataFrame to a web application? 
#Thankfully, Pandas also has a way to easily convert our DataFrame back into HTML-ready code using the .to_html() function. 
#Add this line to the next cell in your notebook and then run the code.
df.to_html

<bound method DataFrame.to_html of                                     Mars            Earth
description                                              
Mars - Earth Comparison             Mars            Earth
Diameter:                       6,779 km        12,742 km
Mass:                    6.39 × 10^23 kg  5.97 × 10^24 kg
Moons:                                 2                1
Distance from Sun:        227,943,824 km   149,598,262 km
Length of Year:           687 Earth days      365.24 days
Temperature:                -87 to -5 °C      -88 to 58°C>

In [16]:
browser.quit()
#To fully automate jupyter notebook need to convert into .py file 