In [None]:
# Load Dependencies

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time
import json
import csv
import os

In [67]:
# Start a Chrome session using the chromedriver path returned by webdriver_manager
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

In [68]:
# Open the USGS Astrogeology search results for enhanced Mars hemisphere images
base_url = "https://astrogeology.usgs.gov"
search_path = "/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
search_url = base_url + search_path
driver.get(search_url)
time.sleep(2)  # fixed pause after navigation, before the page source is read

In [69]:
# Snapshot the current page source and collect every <div class="description"> element
page_html = driver.page_source
soup = bs(page_html, 'html.parser')
items = soup.find_all('div', attrs={'class': 'description'})

In [70]:
# Accumulator for the scraped results: a list that receives one dict per hemisphere
hemisphere_image_urls = list()

In [71]:
for item in items:
    # Each result block is expected to hold an <h3> title and a link to the detail page;
    # skip blocks missing either instead of failing on a None dereference.
    heading = item.find('h3')
    link = item.find('a')
    if heading is None or link is None or not link.get('href'):
        print("Skipping result without a title or detail link.")
        continue

    # Extract and clean the title (drop the " Enhanced" suffix)
    title = heading.text.strip().replace(" Enhanced", "").strip()

    # Build full subpage URL
    full_link = base_url + link['href']

    # Navigate to the subpage and wait for the link this loop actually reads
    driver.get(full_link)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, 'Sample')))

    # Parse subpage; `string=` is the supported spelling of the deprecated `text=` filter
    sub_soup = bs(driver.page_source, 'html.parser')
    sample_link = sub_soup.find('a', string='Sample')
    if sample_link is None or not sample_link.get('href'):
        print(f"Skipping {title}: no 'Sample' link found on {full_link}")
        continue
    img_url = sample_link['href']

    # Append to list
    hemisphere_image_urls.append({
        'title': title,
        'page_url': full_link,
        'img_url': img_url
    })


In [72]:
import json
# Pretty-print the collected records as indented JSON
hemisphere_json = json.dumps(hemisphere_image_urls, indent=2)
print(hemisphere_json)


[]


In [73]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
import time

from selenium.common.exceptions import WebDriverException

hemisphere_image_urls = []
base_url = "https://astrogeology.usgs.gov"
search_url = f"{base_url}/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

# Navigate explicitly so this cell parses the search results rather than whatever
# page an earlier cell left the driver on.
driver.get(search_url)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
soup = bs(driver.page_source, 'html.parser')
items = soup.find_all('div', class_='item')

# Sanity check
print(f"Found {len(items)} items")

for item in items:
    # Build the full link to the hemisphere page; skip items that carry no link
    link = item.find('a')
    if link is None or not link.get('href'):
        print("Error processing item: no detail link found")
        continue
    full_link = base_url + link['href']

    try:
        # Go to the hemisphere's detail page
        driver.get(full_link)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'h2')))
        time.sleep(1)  # Slight delay to ensure page is loaded
    except WebDriverException as e:
        # Only browser/navigation failures (timeouts included) are treated as skippable;
        # programming errors are left to surface normally.
        print(f"Error processing item: {e}")
        continue

    # Parse the subpage; `string=` is the supported spelling of the deprecated `text=` filter
    sub_soup = bs(driver.page_source, 'html.parser')
    heading = sub_soup.find('h2', class_='title')
    sample_link = sub_soup.find('a', string='Sample')
    if heading is None or sample_link is None or not sample_link.get('href'):
        print(f"Error processing item: title or 'Sample' link missing on {full_link}")
        continue

    title = heading.text.replace(' Enhanced', '').strip()
    img_url = sample_link['href']

    # Store in list
    hemisphere_image_urls.append({
        'title': title,
        'img_url': img_url
    })
    print(f"Added: {title}")

# Final check
print(hemisphere_image_urls)


Found 0 items
[]


# Debugging For Loop

In [74]:
# Show which page the driver is on: its URL first, then the document title
current_url = driver.current_url
print(current_url)
page_title = driver.title
print(page_title)


https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars



In [87]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

def hemisphere_scrape(browser):
    """Scrape Mars hemisphere titles and sample-image URLs from the USGS search page.

    Args:
        browser: a splinter ``Browser`` (any object exposing ``visit``, ``html``
            and ``back`` works).

    Returns:
        list[dict]: one ``{'title': ..., 'img_url': ...}`` entry per hemisphere whose
        detail page has a "Sample" link. Items without a title, a detail link or a
        "Sample" link are reported and skipped. The list is empty when the search
        page contains no ``div.item`` elements.
    """
    # Imported locally under its own name: other cells in this notebook rebind the
    # global `soup` to a parsed document, which would break a call relying on the alias.
    from bs4 import BeautifulSoup

    # Step 1: Visit the starting page
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(1)
    print("Visited main page.")

    hemisphere_image_urls = []

    # Step 2: Parse the page and find all hemisphere items
    hemi_soup = BeautifulSoup(browser.html, 'html.parser')
    items = hemi_soup.find_all('div', class_='item')
    print(f"Found {len(items)} hemisphere items.")

    base_url = 'https://astrogeology.usgs.gov'

    # Step 3: Loop through each item
    for index, item in enumerate(items):
        print(f"\nProcessing hemisphere {index + 1}")

        heading = item.find('h3')
        link = item.find('a')
        if heading is None or link is None or not link.get('href'):
            # Nothing to visit for this item, so move on without touching the browser.
            print("Skipping item without a title or detail link.")
            continue

        title = heading.text.strip()
        full_link = f"{base_url}{link['href']}"
        print(f"Title: {title}")
        print(f"Link to detail page: {full_link}")

        # Step 4: Visit the hemisphere detail page
        browser.visit(full_link)
        time.sleep(1)
        img_soup = BeautifulSoup(browser.html, 'html.parser')

        # Step 5: Extract the image URL
        # (`string=` is the supported spelling of the deprecated `text=` filter)
        sample_link = img_soup.find('a', string='Sample')
        img_url = sample_link.get('href') if sample_link is not None else None
        if img_url:
            print(f"Image URL: {img_url}")
        else:
            print("Image link not found.")

        # Step 6: Save the result if image is found
        if img_url:
            hemisphere_image_urls.append({
                'title': title,
                'img_url': img_url
            })
        else:
            print(f"Skipping {title} due to missing image.")

        # Return to the search page, as the original flow did after each detail page.
        browser.back()

    print("\nFinal hemisphere image list:")
    print(hemisphere_image_urls)
    return hemisphere_image_urls

# Step 0: Set up the browser
service = Service(ChromeDriverManager().install())
browser = Browser('chrome', service=service, headless=False)

try:
    # Run the function
    hemisphere_data = hemisphere_scrape(browser)
finally:
    # Close browser even when scraping raises, so the browser session is not left running
    browser.quit()


Visited main page.
Found 0 hemisphere items.

Final hemisphere image list:
[]


# Debugging Code

In [90]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
# Try running without GUI. The `Options.headless` setter was deprecated and later
# removed in Selenium 4, so the Chrome command-line flag is passed directly instead.
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
driver.get("https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars")


In [91]:
import time
from bs4 import BeautifulSoup as bs

# Pause five seconds, then print the first 1500 characters of the prettified page source
time.sleep(5)
rendered_html = driver.page_source
soup = bs(rendered_html, 'html.parser')
preview = soup.prettify()[:1500]
print(preview)


<html>
 <head>
  <meta content="light dark" name="color-scheme"/>
  <meta charset="utf-8"/>
 </head>
 <body>
  <pre>{"collections":[{"abstract":"This page introduces the Kaguya Multiband Imager derived spectral and derived mineral maps by the Japan Aerospace Exploration Agency (JAXA) and the University of Hawaii. The mosaics were created from topographically-corrected MI","geoform":["Collection"],"name":"lunar-kaguya-multiband-imager-mosaics","onlink":"https://astrogeology.usgs.gov/search/map/lunar-kaguya-multiband-imager-mosaics","pubdate":"2015-01-01","thumb":"https://astrogeology.usgs.gov/ckan/dataset/a53c9dbf-e03a-4d7a-8641-fc7af97a75c2/resource/a7e1a610-f8ef-4080-99e0-1b637d6c269f/download/moon-selene-kaguya-mi-thumb.jpg","title":"Lunar Kaguya Multiband Imager Mosaics"},{"abstract":"&lt;span class=\"intro\"&gt;The following products were the first step of cartography planning&lt;/span&gt; in support of the Cassini-Huygens mission to the Saturian System. Five of Saturans moons are 

During the web scraping process, I initially attempted to extract data from the USGS Mars hemispheres webpage using traditional HTML parsing with BeautifulSoup. However, repeated attempts to load the page source resulted in errors, including crashes tied to Selenium's `page_source` property. Upon closer inspection, I noticed that the response was not the expected HTML document but a JSON payload wrapped in a `<pre>` tag. This indicated that the endpoint no longer serves HTML content for direct browser rendering; instead, it now returns structured data in JSON format, intended for programmatic consumption via API calls. This discovery prompted a shift in approach, from HTML scraping to requesting the endpoint with the `requests` library and parsing the JSON response, which allowed for more efficient and reliable access to the data.