# 6.3 NASA satellite images - solution 1 (with Selenium; slow-ish)

1. Suppose we want to build a computer vision dataset that involves satellite images.
2. Your tasks are the following:
    * Collect satellite images from https://earthobservatory.nasa.gov/images
    * Make sure to render the whole page using Selenium and then use BeautifulSoup to scrape the data.
    * Create a repo (folder) named Images and save the crawled images in it, naming each file after the image's title.
    * Create a dictionary where the keys are the images/titles and the values are the images’ descriptions.


# Class with scraping functions

In [4]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium import webdriver
import time
import os
import requests
from bs4 import BeautifulSoup
import json

class NasaScraper:
    ''' Scrape satellite images and their captions from the NASA Earth Observatory.

    The image overview page is rendered with a Selenium-driven Firefox, parsed
    with BeautifulSoup, and every thumbnail found is saved to IMAGE_DIR as
    "<category>_<sanitized title>.jpg".  Title -> description pairs are
    collected in ``self.image_dict``.

    Attributes:
        image_dict: maps image title (the thumbnail's alt text) to its caption text.
        driver: the Selenium webdriver used to render the page.
        max_download_num: upper bound on the number of files downloaded by this object.
        n_downloaded: number of files actually downloaded so far (skips not counted).
    '''

    # Machine-specific locations; override them in a subclass for another setup.
    GECKODRIVER_PATH = r"C:\Webdriver\geckodriver.exe"
    FIREFOX_BINARY_PATH = r"C:\Program Files\Mozilla Firefox\firefox.exe"
    # Page that lists the images, and where results are stored.
    START_URL = "https://earthobservatory.nasa.gov/images"
    IMAGE_DIR = "./nasa_images"
    JSON_PATH = 'nasa_images.json'
    # Timeout (seconds) passed to requests so a stalled server cannot hang the run.
    REQUEST_TIMEOUT = 30

    # constructor
    def __init__(self, max_download_num = 100):
        ''' Initialize the driver and dictionary with meta info '''
        self.image_dict = {}
        self.max_download_num = max_download_num
        self.n_downloaded = 0
        self.driver = self.create_webdriver()

    def __enter__(self):
        ''' Allow "with NasaScraper() as scraper:" so the driver is always closed '''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        ''' Close the driver when leaving the with-block; exceptions are not suppressed '''
        self.close_driver()
        return False

    def save_dictionary_to_JSON(self):
        ''' Save the dictionary with meta info to JSON file '''
        print("Saving dictionary as JSON")
        # json.dump escapes non-ASCII characters by default, so the file content is
        # plain ASCII; the encoding is spelled out only to be platform independent.
        with open(self.JSON_PATH, 'w', encoding="utf-8") as json_file:
            json.dump(self.image_dict, json_file)

    def create_webdriver(self):
        ''' Create and return a Firefox webdriver (could move to a shared helper/superclass, since it is used in multiple projects) '''
        options = Options()
        options.binary_location = self.FIREFOX_BINARY_PATH
        service = Service(executable_path=self.GECKODRIVER_PATH)
        return webdriver.Firefox(service=service, options=options)

    @staticmethod
    def _build_filename(category, title):
        ''' Return the file name "<category>_<title>.jpg" reduced to filesystem-safe characters '''
        stem = title.strip().replace(' ', '_').replace('.', '_').replace('?', '')
        raw_name = category + "_" + stem + ".jpg"
        # keep only letters, digits, whitespace and . _ -
        return ''.join(c for c in raw_name if c.isalnum() or c.isspace() or c in ('.', '_', '-'))

    # download and save an image
    def download_image(self, url, category, title):
        ''' Download the image at the specified url and save it in IMAGE_DIR.

        Does nothing (besides printing a note) when the target file already
        exists.  ``n_downloaded`` is incremented only after the file has been
        written completely.

        Args:
            url: address of the image file.
            category: name of the image category; used as file name prefix.
            title: image title; used (sanitized) as the rest of the file name.

        Raises:
            requests.RequestException: if the request fails, times out or
                returns an HTTP error status.
        '''
        file_path = os.path.join(self.IMAGE_DIR, self._build_filename(category, title))
        # do nothing if file was already downloaded earlier
        if os.path.exists(file_path):
            print(f"Skip (already downloaded): {title}")
            return
        # otherwise, download and save the image
        print(f"Downloading image: {title}")
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        # Write to a temporary name first: an interrupted download must not leave a
        # truncated .jpg behind that later runs would skip as "already downloaded".
        partial_path = file_path + ".part"
        try:
            # the with-block releases the streamed connection even on errors
            with requests.get(url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(partial_path, file_path)
        finally:
            # after a successful os.replace the temporary file no longer exists
            if os.path.exists(partial_path):
                os.remove(partial_path)
        self.n_downloaded += 1

    def _open_category(self, button_name):
        ''' Click the filter button of the given category; return True on success '''
        selector = ".btn.btn-filter.btn-" + button_name + ".no-underline.hvr-rectangle-out"
        try:
            menu_button = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
            menu_button.click()
        except Exception as e:
            print("An error occurred:", e)
            return False
        return True

    def _click_explore_more(self):
        ''' Click the "Explore More" button; return False when it cannot be clicked '''
        try:
            explore_more_button = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".explore-more"))
            )
            explore_more_button.click()
            time.sleep(1)   # give the page a moment to append the new thumbnails
        except Exception as e:
            print("No more content to load or an error occurred:", e)
            return False
        return True

    def _process_thumbnail(self, div, category, seen_image_ids):
        ''' Record the description of one thumbnail div and download its image if it is new '''
        from urllib.parse import urljoin   # stdlib; imported here so the class is self-contained

        image_tag = div.find("img")
        if image_tag is None:
            return
        src = image_tag.get("src")
        image_title = image_tag.get("alt")
        if not src or not image_title:
            # without a source or a title there is nothing to download or to name the file after
            return
        # absolute URLs pass through unchanged; relative ones are resolved against the site
        image_url = urljoin(self.START_URL, src)
        image_id = image_url.split('/')[-1]  # Extract image ID from the URL

        # a missing caption yields an empty description instead of a crash
        caption_div = div.find_next_sibling("div", class_="caption")
        paragraph = caption_div.find("p") if caption_div is not None else None
        self.image_dict[image_title] = paragraph.text if paragraph is not None else ""

        if image_id in seen_image_ids or self.n_downloaded >= self.max_download_num:
            return
        # mark as handled first, so a failing image is not retried on every page pass
        seen_image_ids.add(image_id)
        try:
            self.download_image(image_url, category, image_title)
        except requests.RequestException as e:
            # one broken image must not abort the whole scrape
            print(f"Download failed for {image_title}: {e}")

    def scrape_images(self):
        ''' Navigate through the image categories of the website and download their images.

        For every category the filter button is clicked, all thumbnails on the
        page are processed, and "Explore More" is clicked repeatedly until it
        is no longer clickable or ``max_download_num`` downloads were made.
        A category whose button cannot be clicked is skipped, so images are
        never stored under the wrong category name.
        '''
        self.driver.get(self.START_URL)
        # define the pages (to do: read this list automatically from the page instead of hard-code it)
        button_names = ["atmosphere", "heat", "human", "land", "life", "naturalevent", "remote", "snowice", "water"]
        # loop over image pages and download the images
        for button_name in button_names:
            # stop early: no need to wait for and click further buttons once the limit is reached
            if self.n_downloaded >= self.max_download_num:
                break
            # press the menu button to open the current image page
            if not self._open_category(button_name):
                print(f"Skipping category: {button_name}")
                continue
            # download the images and press 'explore more' button until there is no new content
            downloaded_image_ids = set()   # keep track of downloaded images to avoid duplicate downloads
            while self.n_downloaded < self.max_download_num:
                # the page grows with every 'explore more' click, so it is parsed again each pass
                soup = BeautifulSoup(self.driver.page_source, "html.parser")
                for div in soup.find_all("div", class_="thumbnail-image"):
                    self._process_thumbnail(div, button_name, downloaded_image_ids)
                if self.n_downloaded >= self.max_download_num:
                    break
                # Try to click the "Explore More" button
                if not self._click_explore_more():
                    break
        print(f"Downloaded {self.n_downloaded} images.\nThe repo now contains {len(self.image_dict)} images.")

    def print_image_dict(self):
        ''' Print every collected title together with its description '''
        for title, description in self.image_dict.items():
            print(f"Title: {title}\nDescription: {description}\n\n")

    def close_driver(self):
        ''' Quit the webdriver '''
        self.driver.quit()


# Main script

In [6]:
# instantiate the class (this opens the browser window)
print("Instantiating NasaScraper object...")
scraper = NasaScraper()

try:
    # scrape images
    scraper.scrape_images()
finally:
    # we are done -- close the driver, also when scraping fails or the cell is
    # interrupted, so no browser process is left running
    scraper.close_driver()

Instantiating NasaScraper object...
Skip (already downloaded): Deadly Blooms in the Gulf of Mannar
Skip (already downloaded): Popocatépetl Volcano Keeps on Puffing
Skip (already downloaded): Espíritu Santo Archipelago
Skip (already downloaded): Tulare Lake Grows
Skip (already downloaded): Freddy Brings Lean Times to Malawi
Skip (already downloaded): An Awesome Aurora
Skip (already downloaded): Wave Clouds Over the Crozet Islands
Skip (already downloaded): Swirly Clouds in the Canaries
Skip (already downloaded): Cyclone Ilsa Reaches Western Australia
Skip (already downloaded): Kamchatka Erupts
Skip (already downloaded): Tornado Razes a Path Through Wynne
Skip (already downloaded): How Dust Affects the World’s Health
Skip (already downloaded): For the Longest Time
Skip (already downloaded): Taking Stock of Carbon Dioxide Emissions
Skip (already downloaded): Nitrogen Dioxide in the Neighborhood
Skip (already downloaded): A Dazzling Aurora Borealis
Skip (already downloaded): Dust Blows Acr


KeyboardInterrupt



In [None]:
# print every collected title with its description (uses the `scraper` object created in the main script cell)
scraper.print_image_dict()