# National Parks Brochure Scraper

This notebook scrapes park brochures from the U.S. National Park Service website (nps.gov), extracts text and metadata from each brochure PDF, and writes the results to Google Sheets.

## Features
- Scrapes 20 National Park brochures (configurable)
- Extracts park name, state, established year, and size
- Writes to Google Sheets
- Waits 10 seconds between parks (configurable via `DELAY_SECONDS`)
- Comprehensive error handling

## Setup

First, install the required dependencies and import libraries.

In [None]:
# Install required packages
# The leading "!" is notebook (IPython/Colab) syntax that hands the line to the
# shell, so this cell only runs inside a notebook. "-q" keeps pip's output quiet.
!pip install PyPDF2 gspread google-auth google-auth-oauthlib google-auth-httplib2 -q

# NOTE: printed unconditionally; pip's exit status is not checked here.
print("‚úì Dependencies installed successfully!")

In [None]:
# Import libraries
# Standard library
import requests
import time
import re
import io
import json
from typing import Dict, List, Optional
from datetime import datetime

# Third-party: PDF parsing, Google Sheets client, Google credentials.
from PyPDF2 import PdfReader
import gspread
from google.auth import default
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import auth

print("‚úì Libraries imported successfully!")

## Google Authentication

Authenticate with Google to access Google Sheets.

In [None]:
# Authenticate with Google
# Run the Colab sign-in flow for the current user.
auth.authenticate_user()
# default() is unpacked as a pair; only the first item (the credentials) is kept.
# It is later handed to write_to_google_sheets.
creds, _ = default()

print("‚úì Google authentication successful!")

## Configuration

Set your Google Sheets URL and scraping parameters.

In [None]:
# Scraper settings: target spreadsheet, number of parks, and pause length.
LIMIT = 20  # Number of parks to scrape
DELAY_SECONDS = 10  # Delay between requests
SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1QfsxIUok_5owSTJvI1_V5GNuzTsAOShHxktxh9w_jHA/edit?usp=sharing"

# Echo the active settings, one per line.
print(
    "Configuration:",
    f"  Spreadsheet: {SPREADSHEET_URL}",
    f"  Parks to scrape: {LIMIT}",
    f"  Delay: {DELAY_SECONDS} seconds",
    sep="\n",
)

## Scraper Class Definition

Define the main scraper class.

In [None]:
class NationalParksScraper:
    """Scrape National Park brochure PDFs and pull basic facts out of them.

    For each entry in ``PARK_CODES`` the scraper locates a brochure PDF on
    nps.gov, downloads it, extracts its text, and parses state, established
    year, and size. Parsed records accumulate in ``self.results`` and can be
    written to a Google Sheet with ``write_to_google_sheets``.
    """

    # Major National Parks with their 4-letter codes
    PARK_CODES = [
        ('yell', 'Yellowstone'),
        ('yose', 'Yosemite'),
        ('grca', 'Grand Canyon'),
        ('zion', 'Zion'),
        ('acad', 'Acadia'),
        ('glac', 'Glacier'),
        ('romo', 'Rocky Mountain'),
        ('olym', 'Olympic'),
        ('grsm', 'Great Smoky Mountains'),
        ('shen', 'Shenandoah'),
        ('arch', 'Arches'),
        ('cany', 'Canyonlands'),
        ('brca', 'Bryce Canyon'),
        ('jotr', 'Joshua Tree'),
        ('deva', 'Death Valley'),
        ('seki', 'Sequoia'),
        ('redw', 'Redwood'),
        ('noca', 'North Cascades'),
        ('mora', 'Mount Rainier'),
        ('grte', 'Grand Teton'),
        ('badl', 'Badlands'),
        ('cave', 'Carlsbad Caverns'),
        ('pefo', 'Petrified Forest'),
        ('thro', 'Theodore Roosevelt'),
        ('meve', 'Mesa Verde'),
        ('crla', 'Crater Lake'),
        ('lavo', 'Lassen Volcanic'),
        ('chis', 'Channel Islands'),
        ('pinn', 'Pinnacles'),
        ('kova', 'Kobuk Valley'),
    ]

    def __init__(self, delay_seconds: int = 10):
        """Create a scraper.

        Args:
            delay_seconds: Seconds to sleep between consecutive parks.
        """
        self.delay_seconds = delay_seconds
        # One shared session so the browser-like headers apply to every request.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        })
        self.results = []

    def find_brochure_url(self, park_code: str, park_name: str) -> Optional[str]:
        """Find the brochure URL for a given park.

        First looks for the first ``.pdf`` link on the park's brochures page;
        if that yields nothing, probes a few conventional upload paths with
        HEAD requests.

        Args:
            park_code: 4-letter nps.gov park code (e.g. ``'yell'``).
            park_name: Display name, used to build the fallback file names.

        Returns:
            An absolute PDF URL, or ``None`` when no brochure could be located.
        """
        # Local import keeps this class self-contained in the notebook cell.
        from urllib.parse import urljoin

        brochure_page = f"https://www.nps.gov/{park_code}/planyourvisit/brochures.htm"
        try:
            response = self.session.get(brochure_page, timeout=30)
        except requests.RequestException:
            # Best effort: a network failure here just means we try the fallbacks.
            response = None

        if response is not None and response.status_code == 200:
            pdf_links = re.findall(r'href="([^"]*\.pdf)"', response.text)
            if pdf_links:
                # urljoin resolves absolute, root-relative, page-relative and
                # protocol-relative hrefs against the brochures page.
                return urljoin(brochure_page, pdf_links[0])

        slug = park_name.replace(' ', '-')
        candidates = [
            f"https://www.nps.gov/{park_code}/planyourvisit/upload/{slug}-Brochure.pdf",
            f"https://www.nps.gov/{park_code}/planyourvisit/upload/{slug}-brochure.pdf",
            f"https://www.nps.gov/{park_code}/learn/upload/{slug}-Brochure.pdf",
        ]
        for candidate in candidates:
            try:
                response = self.session.head(candidate, timeout=10, allow_redirects=True)
            except requests.RequestException:
                continue
            if response.status_code == 200:
                return candidate
        return None

    def download_pdf(self, url: str) -> Optional[bytes]:
        """Download PDF from URL.

        Returns:
            The response body as bytes, or ``None`` if the request failed or
            returned an HTTP error status (the error is printed).
        """
        try:
            response = self.session.get(url, timeout=60)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error downloading: {e}")
            return None
        return response.content

    def extract_text_from_pdf(self, pdf_content: bytes) -> str:
        """Extract text from PDF content.

        Returns:
            The text of every page, each followed by a newline, or ``""`` if
            the PDF could not be parsed (the error is printed).
        """
        try:
            reader = PdfReader(io.BytesIO(pdf_content))
            # ``or ""`` guards against pages for which no text is returned;
            # join avoids repeated string concatenation.
            return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
        except Exception as e:
            # The PDF library can raise many error types on malformed files;
            # treat them all as "no text" and report.
            print(f"Error extracting text: {e}")
            return ""

    def parse_park_info(self, text: str, park_name: str) -> Dict[str, str]:
        """Parse park information from extracted text.

        Args:
            text: Brochure text to search.
            park_name: Name stored verbatim in the result (not parsed from text).

        Returns:
            Dict with keys ``park_name``, ``state``, ``established_year`` and
            ``size``; fields that cannot be found are empty strings.
        """
        info = {
            'park_name': park_name,
            'state': '',
            'established_year': '',
            'size': ''
        }

        # Extract state: first U.S. state name mentioned anywhere in the text.
        state_pattern = r'\b(Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming)\b'
        state_match = re.search(state_pattern, text, re.IGNORECASE)
        if state_match:
            # The match is case-insensitive, so normalise e.g. "WYOMING" to
            # the canonical "Wyoming" spelling.
            info['state'] = state_match.group(1).title()

        # Extract established year
        year_patterns = [
            r'[Ee]stablished[:\s]+(?:in\s+)?(\d{4})',
            r'[Dd]esignated[:\s]+(?:in\s+)?(\d{4})',
            r'[Cc]reated[:\s]+(?:in\s+)?(\d{4})',
        ]
        current_year = datetime.now().year
        for pattern in year_patterns:
            # Check every occurrence, not just the first: an early implausible
            # year must not hide a later valid one for the same keyword.
            plausible_year = next(
                (
                    match.group(1)
                    for match in re.finditer(pattern, text)
                    if 1850 <= int(match.group(1)) <= current_year
                ),
                None,
            )
            if plausible_year is not None:
                info['established_year'] = plausible_year
                break

        # Extract size: acres take precedence over square miles.
        size_patterns = [
            r'(\d+[\d,]*)\s+acres',
            r'(\d+[\d,]*)\s+square\s+miles',
        ]
        for pattern in size_patterns:
            size_match = re.search(pattern, text, re.IGNORECASE)
            if size_match:
                # Keep the unit with the number, e.g. "2,219,791 acres".
                info['size'] = size_match.group(0)
                break

        return info

    def scrape_parks(self, limit: int = 20) -> List[Dict[str, str]]:
        """Scrape park brochures.

        Walks ``PARK_CODES`` in order until ``limit`` parks have been parsed
        successfully or the list is exhausted. Parks whose brochure cannot be
        found, downloaded, or read are reported and skipped.

        Args:
            limit: Maximum number of successfully parsed parks.

        Returns:
            ``self.results``, with one dict per parsed park appended.
        """
        print(f"Starting to scrape up to {limit} parks...")
        print("-" * 60)

        count = 0
        attempted = 0
        for park_code, park_name in self.PARK_CODES:
            if count >= limit:
                break

            # Pause before every park after the first one, whether the previous
            # park succeeded or failed, so requests are always spaced out.
            if attempted:
                print(f"  ‚è≥ Waiting {self.delay_seconds} seconds...")
                time.sleep(self.delay_seconds)
            attempted += 1

            print(f"\n[{count + 1}/{limit}] {park_name}...")

            try:
                brochure_url = self.find_brochure_url(park_code, park_name)
                if not brochure_url:
                    print(f"  ‚ùå No brochure found")
                    continue

                print(f"  üìÑ {brochure_url}")

                pdf_content = self.download_pdf(brochure_url)
                if not pdf_content:
                    print(f"  ‚ùå Download failed")
                    continue

                print(f"  ‚úì Downloaded ({len(pdf_content)} bytes)")

                text = self.extract_text_from_pdf(pdf_content)
                if not text:
                    print(f"  ‚ùå Text extraction failed")
                    continue

                print(f"  ‚úì Extracted {len(text)} chars")

                info = self.parse_park_info(text, park_name)
                info['brochure_url'] = brochure_url
                self.results.append(info)
                count += 1

                print(f"  ‚úì State: {info['state']}, Year: {info['established_year']}, Size: {info['size']}")

            except Exception as e:
                # Per-park boundary: one bad park must not abort the whole run.
                print(f"  ‚ùå Error: {e}")
                continue

        print(f"\n{'='*60}")
        print(f"‚úì Complete! Processed {len(self.results)} parks")
        return self.results

    def write_to_google_sheets(self, spreadsheet_url: str, creds):
        """Write results to Google Sheets.

        Replaces the contents of the ``'Park Data'`` worksheet (created if
        missing) with a header row plus one row per entry in ``self.results``.
        Failures are printed rather than raised.

        Args:
            spreadsheet_url: Spreadsheet URL containing ``/d/<id>/``.
            creds: Google credentials passed to ``gspread.authorize``.
        """
        try:
            gc = gspread.authorize(creds)
            spreadsheet_id = spreadsheet_url.split('/d/')[1].split('/')[0]
            spreadsheet = gc.open_by_key(spreadsheet_id)

            try:
                worksheet = spreadsheet.worksheet('Park Data')
            except gspread.exceptions.WorksheetNotFound:
                worksheet = spreadsheet.add_worksheet(title='Park Data', rows=100, cols=10)

            headers = ['Park Name', 'State', 'Established Year', 'Size', 'Brochure URL', 'Scraped Date']
            # One timestamp for the whole batch so every row carries the same date.
            scraped_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            data = [headers] + [
                [
                    result['park_name'],
                    result['state'],
                    result['established_year'],
                    result['size'],
                    result['brochure_url'],
                    scraped_at,
                ]
                for result in self.results
            ]

            worksheet.clear()
            # Keyword arguments: the positional order of (range, values) differs
            # between gspread major versions.
            worksheet.update(range_name='A1', values=data)

            print(f"\n‚úì Wrote {len(self.results)} rows to Google Sheets")
            print(f"  Sheet: {spreadsheet.title}")

        except Exception as e:
            # Top-level boundary for the notebook cell: report instead of raising.
            print(f"\n‚ùå Error writing to Google Sheets: {e}")

# Cell-level confirmation: reached only if the class statement above ran without error.
print("‚úì Scraper class defined!")

## Run the Scraper

Execute the scraping process.

In [None]:
# Build the scraper with the configured delay, then scrape up to LIMIT parks.
scraper = NationalParksScraper(delay_seconds=DELAY_SECONDS)
results = scraper.scrape_parks(limit=LIMIT)

# Summary banner: rule, count of scraped parks, rule.
print(
    f"\n{'='*60}",
    f"Scraped {len(results)} parks successfully!",
    f"{'='*60}",
    sep="\n",
)

## Write to Google Sheets

Save the results to your Google Sheet.

In [None]:
# Push the scraped rows to the spreadsheet; nothing is written when the list is empty.
if not results:
    print("No results to write to Google Sheets")
else:
    scraper.write_to_google_sheets(SPREADSHEET_URL, creds)

## Preview Results

Display a preview of the scraped data.

In [None]:
# Show the parsed fields (without the brochure URL) as a plain-text table.
import pandas as pd

if not results:
    print("No results to display")
else:
    df = pd.DataFrame(results)[['park_name', 'state', 'established_year', 'size']]
    print("\nResults Preview:")
    print(df.to_string(index=False))

## Save to JSON (Optional)

Save a backup copy as JSON and download it from Colab.

In [None]:
# Back up the results as indented JSON, then hand the file to the browser.
if results:
    with open('parks_data.json', 'w') as f:
        f.write(json.dumps(results, indent=2))
    print("‚úì Results saved to parks_data.json")

    # Trigger a download of the file through Colab's file helper.
    from google.colab import files
    files.download('parks_data.json')