In [None]:
!pip install bs4 tqdm

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import os
from concurrent.futures import ThreadPoolExecutor
import time
from tqdm import tqdm  # For progress bar
import math

## Harvest Zip URLs

In [None]:


# Local import keeps this cell self-contained; urljoin turns relative hrefs
# into absolute URLs and leaves already-absolute ones untouched.
from urllib.parse import urljoin

# URL of the listing page to scrape
url = "https://disasters.geoplatform.gov/USA_Structures/"

# Fetch the page. The timeout prevents the cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from silently parsing
# an HTTP error page (which would just yield an empty link list).
response = requests.get(url, timeout=60)
response.raise_for_status()

# Keep a local copy of the raw HTML
with open('USA_Structures.html', 'w', encoding='utf-8') as file:
    file.write(response.text)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Collect every anchor whose href ends with .zip. Each href is resolved
# against the final page URL (after any redirect) so that the later download
# step always receives a complete, fetchable URL.
zip_links = [
    urljoin(response.url, link['href'])
    for link in soup.find_all('a', href=True)
    if re.search(r'\.zip$', link['href'])
]

# Print the list of zip file URLs, followed by how many were found
for zip_link in zip_links:
    print(zip_link)

print(len(zip_links))



# Download to Q Drive

In [None]:
# Directory that receives the downloaded archives (SMB path)
destination_folder = "/Volumes/GIS/FEMA_USA_Structures_10_17_2024"

# Create the directory (and any missing parents) up front; it is not an
# error if the folder is already there.
os.makedirs(name=destination_folder, exist_ok=True)

def download_zip(url, index, total_files):
    """Download one file into ``destination_folder`` with a progress bar.

    The data is streamed to ``<name>.part`` and only renamed to its final
    name once the transfer has finished, so an interrupted download never
    leaves behind a truncated archive under the real file name.

    Args:
        url: Address of the file to download.
        index: Zero-based position of this file in the batch (display only).
        total_files: Number of files in the batch (display only).

    Returns:
        True if the file was written completely, False if the server
        answered with a status other than 200 or the transfer failed.
    """
    from urllib.parse import urlparse

    start_time = time.time()
    # Use only the path component of the URL so a query string or fragment
    # cannot end up in the file name.
    file_name = os.path.join(destination_folder, os.path.basename(urlparse(url).path))
    part_name = file_name + ".part"
    # 1 MiB chunks: far fewer write calls and bar updates than 1 KiB blocks.
    block_size = 1024 * 1024
    bytes_written = 0

    try:
        # The context manager releases the connection even if we bail out
        # early; the (connect, read) timeout stops a stalled server from
        # blocking a worker forever.
        with requests.get(url, stream=True, timeout=(10, 120)) as response:
            if response.status_code != 200:
                print(f"Failed to download: {url}")
                return False

            total_size = int(response.headers.get('content-length', 0))
            # total=None tells tqdm the size is unknown when the server
            # sends no content-length header.
            with open(part_name, 'wb') as f, tqdm(
                total=total_size or None,
                unit='B',
                unit_scale=True,
                desc=f"File {index+1}/{total_files}: {file_name}",
            ) as t:
                for data in response.iter_content(block_size):
                    f.write(data)
                    bytes_written += len(data)
                    t.update(len(data))

        # Transfer finished: publish the file under its real name.
        os.replace(part_name, file_name)
    except (requests.RequestException, OSError) as exc:
        # Best-effort cleanup of the partial file; it may not exist yet.
        try:
            os.remove(part_name)
        except OSError:
            pass
        print(f"Failed to download: {url} ({exc})")
        return False

    # Report the size actually written rather than the advertised header,
    # which may be missing.
    download_time = time.time() - start_time
    file_size_MB = bytes_written / (1024 * 1024)
    print(f"Downloaded: {file_name} | Size: {file_size_MB:.2f} MB | Time: {download_time:.2f} s")
    return True

def download_zips_in_parallel(zip_links, max_workers=4):
    """Download every URL in ``zip_links`` using a pool of worker threads.

    Args:
        zip_links: Sequence of URLs; each is passed to ``download_zip``
            together with its position and the batch size.
        max_workers: Maximum number of simultaneous downloads.

    Returns:
        List of URLs whose download raised an exception or explicitly
        returned ``False``. Empty when nothing was reported as failed.
    """
    total_files = len(zip_links)
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            (url, executor.submit(download_zip, url, index, total_files))
            for index, url in enumerate(zip_links)
        ]

    # Leaving the ``with`` block waits for every task, so all futures are
    # finished here. Inspect them: an exception raised inside a worker is
    # stored on its future and would otherwise vanish without a trace.
    failed = []
    for url, future in futures:
        error = future.exception()
        if error is not None:
            print(f"Download raised an error: {url} ({error!r})")
            failed.append(url)
        elif future.result() is False:
            failed.append(url)

    # Total time calculation
    total_time_elapsed = time.time() - start_time
    print(f"All downloads completed in {total_time_elapsed:.2f} seconds")
    if failed:
        print(f"{len(failed)} of {total_files} downloads failed")
    return failed

# Kick off the batch download, allowing at most four transfers at a time
download_zips_in_parallel(zip_links=zip_links, max_workers=4)


# Unzip contents