# 6.3 Nasa Satellite images - solution 2 (without selenium; faster)

1. Suppose we want to build a Computer vision dataset that involves satellite images. 
2. Your tasks are the following:
    * Collect satellite images from  https://earthobservatory.nasa.gov/images
    * Make sure to render the whole page using selenium and then use BeautifulSoup to scrape the data.
    * Create a repo named Images and save the crawled images under filenames based on their titles.
    * Create a dictionary where the keys are the images/titles and the values are the images’ descriptions.


# Scraping functions

In [1]:
import requests
import json
import os
import os.path
import time
import random
from concurrent.futures import ThreadPoolExecutor

def download_image(image_info, image_directory="./nasa_images", timeout=30):
    """Download one image and store it under a filename derived from its title.

    Args:
        image_info: dict with at least the keys 'url' (image location) and
            'title' (used to build the filename).
        image_directory: folder the image is written to; created if missing.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread forever.

    Returns:
        None. If the target file already exists nothing is downloaded.

    Raises:
        requests.RequestException: on connection problems, timeouts or an
            HTTP error status.
        OSError: if the file cannot be written.
    """
    import threading  # local import: only needed to build a per-thread temp name

    url = image_info['url']
    title = image_info['title']
    # keep only filesystem-friendly characters, then normalise separators
    filename = ''.join(c for c in title if c.isalnum() or c.isspace() or c in (';', '&', '.', '_', '-')).rstrip() + ".jpg"
    filename = filename.replace(';', '-').replace('&', '_').replace(' ', '_').replace('\r', '').replace('\n', '')

    file_path = os.path.join(image_directory, filename)
    if os.path.exists(file_path):
        return

    os.makedirs(image_directory, exist_ok=True)
    # Write to a private temp file first and rename it once complete, so an
    # interrupted download never leaves a truncated image that the
    # exists-check above would skip on the next run.
    tmp_path = f"{file_path}.{os.getpid()}.{threading.get_ident()}.part"
    try:
        # the context manager returns the streamed connection to the pool
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(tmp_path, file_path)
    except BaseException:
        # best-effort cleanup of the partial file; the original error matters more
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

def process_image_page(page_number, max_retries=10):
    """Fetch one listing page, download its images and return their metadata.

    Args:
        page_number: number of the listing page to request.
        max_retries: how many times the page request is attempted before
            the page is given up.

    Returns:
        A list of dicts with the keys 'url', 'title' and 'caption_short',
        one per record on the page. An empty list is returned when the page
        could not be fetched or parsed.
    """
    sleep_time = 1
    # sleep briefly to avoid getting blocked
    time.sleep(sleep_time)
    url = f"https://earthobservatory.nasa.gov/images/getRecords?page={page_number}"
    # try up to max_retries times - double sleep_time each time a request fails
    for _ in range(max_retries):
        try:
            # a timeout keeps a stalled server from blocking this worker forever
            response = requests.get(url, timeout=30)
        except requests.RequestException as error:
            sleep_time *= 2
            print(f"Request for page {page_number} failed ({error}), retrying in {sleep_time} seconds...")
            time.sleep(sleep_time)  # Exponential backoff
            continue
        # handle a 503 error (service unavailable -> likely because of being rate-limited by the server)
        if response.status_code == 503:
            sleep_time *= 2
            print(f"Received 503 error, retrying in {sleep_time} seconds...")
            time.sleep(sleep_time)  # Exponential backoff
            continue
        # handle other errors:
        if response.status_code != 200:
            print(f"Error: status code {response.status_code}, skipping page {page_number}")
            return []
        try:
            data = json.loads(response.text)
        except ValueError:
            # json.JSONDecodeError is a ValueError; a malformed body should
            # cost us one page, not the whole crawl
            print(f"Error: invalid JSON response, skipping page {page_number}")
            return []
        break
    else:
        print(f"Failed to process page {page_number} after {max_retries} retries.")
        return []
    # we got data -> process it
    image_data = []
    for record in data['data']:
        # get the image data
        image_url = record['image_path'] + record['thumbnail_file']
        title = record['title']
        caption_short = record['caption_short']
        image_info = {'url': image_url, 'title': title, 'caption_short': caption_short}
        image_data.append(image_info)
        # download the image; one failed download must not abort the other
        # images of this page (or the other pages running in parallel)
        try:
            download_image(image_info)
        except (requests.RequestException, OSError) as error:
            print(f"Error: could not download image '{title}': {error}")
    return image_data

def save_dictionary_to_JSON(data_dict, file_name):
    """Serialise data_dict as JSON (4-space indentation) into file_name.

    An existing file with that name is overwritten.
    """
    encoder = json.JSONEncoder(indent=4)
    with open(file_name, 'w') as out:
        # stream the encoded pieces straight into the file
        out.writelines(encoder.iterencode(data_dict))

# Main script

In [2]:
# Crawl the image listing in batches of n_workers pages, downloading the
# images and writing the collected metadata to nasa_image_data.json after
# every batch.

# some settings
image_directory = "./nasa_images"
page_number = 1          # first page of the next batch
all_image_data = {}      # title -> {'url', 'title', 'caption_short'}
n_workers = 16           # pages fetched in parallel per batch
n_seconds_total = 0

# make sure the image directory exists (exist_ok avoids a check-then-create race)
os.makedirs(image_directory, exist_ok=True)

# process image pages in parallel
with ThreadPoolExecutor(max_workers=n_workers) as executor:
    while True:
        print(f"Processing pages {page_number} to {page_number+n_workers-1}")
        start_time = time.time()

        # pause between batches; note that this pause is part of the
        # per-batch time reported below
        time.sleep(10)

        # process the batch of pages in parallel and flatten the per-page lists
        page_numbers = range(page_number, page_number + n_workers)
        image_data = [
            image_info
            for page_images in executor.map(process_image_page, page_numbers)
            for image_info in page_images
        ]

        # check if we are done: no page of the batch returned any image
        if not image_data:
            print("No more images found -> exiting")
            break

        # add meta data to dictionary (images sharing a title overwrite each other)
        for image_info in image_data:
            all_image_data[image_info['title']] = image_info

        # save meta data after every batch so an aborted run keeps its progress
        save_dictionary_to_JSON(all_image_data, 'nasa_image_data.json')

        # show some progress info
        n_seconds = time.time() - start_time
        n_seconds_total += n_seconds
        print(f" took {round(n_seconds,1)} seconds; total number of images = {len(all_image_data)}; images per second = {round(len(all_image_data)/n_seconds_total,1)}")

        # increase starting page number for next batch
        page_number += n_workers

Processing pages 1 to 16
 took 16.4 seconds; total number of images = 80; images per second = 4.9
Processing pages 17 to 32
 took 15.6 seconds; total number of images = 160; images per second = 5.0
Processing pages 33 to 48
 took 15.5 seconds; total number of images = 240; images per second = 5.1
Processing pages 49 to 64
 took 16.1 seconds; total number of images = 320; images per second = 5.0
Processing pages 65 to 80
 took 16.2 seconds; total number of images = 399; images per second = 5.0
Processing pages 81 to 96
 took 15.4 seconds; total number of images = 479; images per second = 5.0
Processing pages 97 to 112
 took 16.0 seconds; total number of images = 559; images per second = 5.0
Processing pages 113 to 128
 took 16.2 seconds; total number of images = 639; images per second = 5.0
Processing pages 129 to 144
 took 16.1 seconds; total number of images = 719; images per second = 5.0
Processing pages 145 to 160
 took 16.0 seconds; total number of images = 799; images per second =