## Web Scraping - Mission to Mars

## Background: 
* Build a flask web application that scrapes various websites for data related to the Mission to Mars and displays the information in a single HTML page.

### Final Output Screens:

![final_app_part1.png](images/final_app_part1.png)
![final_app_part2.png](images/final_app_part2.png)

In [1]:
#Import dependecies
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
#Note: Please replace the path with your actual path to the chromedriver
executable_path = {'executable_path': '/Users/anuaj/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=True)

### NASA Mars News

In [3]:
#Collect the latest News Title and Paragraph Text from https://mars.nasa.gov/. 
#news_title = "NASA's Next Mars Mission to Investigate Interior of Red Planet"
#news_p = "Preparation of NASA's next...West Coast."

In [4]:
#Mars news URL and visit the browser
mars_news_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
browser.visit(mars_news_url)

In [5]:
#Add wait time to load browser
browser.wait_time = 10

In [6]:
#Get the latest news title and news paragraph

#Set beautifulsoup object
mars_news_html = browser.html
mars_news_soup = BeautifulSoup(mars_news_html, 'html.parser')

#Review webpage and get html tag and class for searcg
mars_news = mars_news_soup.find_all('div', class_='list_text')

#Loop through news
for news in mars_news:
    try:
        #Save news tilte and news paragraph
        news_title=news.find('div', class_='content_title').text
        news_p=news.find('div', class_='article_teaser_body').text
        
        #Print results
        print("news_title = ", news_title,"\n")
        print("news_p = ", news_p,"\n")
        
        #Exit loop as we got the latest news
        break
    except:
        #If error continue to next news - this is simply to make sure we have news title and news paragraph 
        pass

news_title =  NASA's Perseverance Rover Goes Through Trials by Fire, Ice, Light and Sound 

news_p =  The agency's new Mars rover is put through a series of tests in vacuum chambers, acoustic chambers and more to get ready for the Red Planet. 



### JPL Mars Space Images - Featured Image

In [7]:
#Use splinter to navigate the site https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars
#Featured Mars Image and assign the url string to a variable called featured_image_url
#image url to the full size .jpg image - url string for this image
#imageurl = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16225_hires.jpg'

In [8]:
#JPL base url
jpl_base_url ='https://www.jpl.nasa.gov'

In [9]:
#Set JPL seach url and visit the browser
jpl_url = 'https://www.jpl.nasa.gov/spaceimages/details.php?id=PIA02570'
browser.visit(jpl_url)

In [10]:
#Add wait time to load browser
browser.wait_time = 10

In [11]:
#Get soup oject
jpl_html = browser.html
jpl_soup = BeautifulSoup(jpl_html, 'html.parser')

In [12]:
#Find the figure tage with ledge class with soup oject 
jpl_image_urls = jpl_soup.find_all('figure', class_='lede')

#Get to specific URL
for jpg_image_url in jpl_image_urls:
    featured_image_url = jpl_base_url+jpg_image_url.find("a")["href"]
    print("featuredimageurl = ", featured_image_url)

featuredimageurl =  https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA02570_hires.jpg


### Mars Weather

In [13]:
#Visit the Mars Weather twitter account and scrape the latest Mars weather tweet from the page. 
#Save the tweet text for the weather report as a variable called mars_weather.
#mars_weather = 'Sol 1801 (Aug 30, 2017), Sunny, high -21C/-5F, low -80C/-112F, pressure at 8.82 hPa, daylight 06:09-17:55' 

In [14]:
#Mars weather tweeter handle
mars_weather_twit_handle="https://twitter.com/marswxreport?lang=en"
browser.visit(mars_weather_twit_handle)

In [15]:
#Add wait time to load browser
browser.wait_time = 30

In [16]:
#Get the beautifulsoup object
mars_weather_twit_html = browser.html
mars_weather_twit_soup = BeautifulSoup(mars_weather_twit_html, 'html.parser')

In [17]:
#Parse tweet page using span tag and class
mars_weather_tweets= mars_weather_twit_soup.find_all("span",class_="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0")

In [19]:
#Loop through tweets - we just need latest
for mars_weather_tweet in mars_weather_tweets:
    #Get the Insight text - this is where weather info starts
    try:
        if (mars_weather_tweet.text.split()[0]=="InSight"):
            mars_weather =mars_weather_tweet.text
            print(mars_weather)
            break
    except:
        pass

InSight sol 524 (2020-05-18) low -92.5ºC (-134.4ºF) high 0.5ºC (32.9ºF)
winds from the SW at 4.8 m/s (10.7 mph) gusting to 15.3 m/s (34.3 mph)
pressure at 7.00 hPa


### Mars Facts

In [20]:
#Visit the Mars Facts webpage https://space-facts.com/mars/ 
#use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
#Use Pandas to convert the data to a HTML table string.

In [21]:
#Set url value
mars_fact_url="https://space-facts.com/mars/"

In [22]:
#Use Pandas to get html table 
mars_fact_dfs = pd.read_html(mars_fact_url)

In [23]:
#Get first table and display
mars_fact_df=mars_fact_dfs[0]
mars_fact_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [24]:
#Rename df columns
mars_fact_df= mars_fact_df.rename(columns={0:"Description",1:"Value"})

In [25]:
#Set index= False to avoid getting index column.
mars_fact_df = mars_fact_df.to_html(index=False)

#Remove new line character
mars_fact_df.replace("\n","")

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th>Description</th>      <th>Value</th>    </tr>  </thead>  <tbody>    <tr>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <td>Mass:</td>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <td>Moons:</td>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <td>Surface Temperature:</td>      <td>-87 to -5 °C</td>    </tr>    <tr>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <td>Recorded By:</td>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

In [26]:
#Display results
mars_fact_df

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th>Description</th>\n      <th>Value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

### Mars Hemispheres

In [27]:
#Visit the USGS Astrogeology site to obtain high resolution images for each of Mar's hemispheres.
# Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. 
# Use a Python dictionary to store the data using the keys img_url and title.

In [28]:
#Set base url
mars_hemispheres_base_url="https://astrogeology.usgs.gov"

In [29]:
#Set Mars Hemispheres URL and visit the site
mars_hemispheres_url="https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(mars_hemispheres_url)

In [30]:
#Add wait time to load browser
browser.wait_time = 10

In [31]:
#Set beacutifulsoup oject
mars_hemispheres_html = browser.html
mars_hemispheres_soup = BeautifulSoup(mars_hemispheres_html, 'html.parser')

In [32]:
#Search through html document to get to list of items
mars_hemispheres= mars_hemispheres_soup.find_all("div",class_="item")

In [33]:
#Let's build title list and page name list that will give us high res image
titles=[]
hemisphere_urls=[]

#Loop through beautifulsoup results
for mars_hemisphere in mars_hemispheres:
    #Search for h3 tag to get title
    mars_title =mars_hemisphere.find("h3").text
    #Append title to list
    titles.append(mars_title)
    #Get page reference
    pagelink=mars_hemispheres_base_url +mars_hemisphere.a['href']
    #Sppend page link to list
    hemisphere_urls.append(pagelink)
    
    #Display results - just for sanity check
    print(mars_title)
    print(pagelink,"\n")

Cerberus Hemisphere Enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced 

Schiaparelli Hemisphere Enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced 

Syrtis Major Hemisphere Enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced 

Valles Marineris Hemisphere Enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced 



In [34]:
#Get List of high res image
hemisphere_full_img=[]

#Let's loop through each URL
for hemisphere_url in hemisphere_urls:
    #Visit browser
    browser.visit(hemisphere_url)
    hemisphere_full_html = browser.html
    
    #Add wait time to load browser
    browser.wait_time = 10
    
    #Get beautifulsoup oject
    hemisphere_full_soup = BeautifulSoup(hemisphere_full_html, 'html.parser')
    
    #Parse div tages of download class
    mars_hemisphere_full_images= hemisphere_full_soup.find_all("div",class_="downloads")
    
    #Loop through list
    for mars_hemisphere_full_image in mars_hemisphere_full_images:
        #Get the a tag and href value
        #imgurls = mars_image.find_all("li")
        image_url = mars_hemisphere_full_image.a['href']
        #Append url in to list 
        hemisphere_full_img.append(image_url)
        print(image_url)

https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg


In [35]:
#Build disctionary object
hemisphere_full_imgs=[]

#Pandas to convert lists to dictionary 
hemisphereimageurls = {} 

#Create df
hemisphereimageurls_df=pd.DataFrame({"title":titles,"img_url":hemisphere_full_img})

#Convert df to dictionary
hemisphereimageurls = hemisphereimageurls_df.to_dict(orient='records')

#Show Disctionary object - final resuts
print("hemisphereimageurls =", hemisphereimageurls)

hemisphereimageurls = [{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'}, {'title': 'Valles Marineris Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]


In [36]:
#Close browser
browser.quit()