In [1]:
!pip install bs4 tqdm



In [6]:
import requests
from bs4 import BeautifulSoup
import re
import os
from concurrent.futures import ThreadPoolExecutor
import time
from tqdm import tqdm  # For progress bar
import math
import zipfile
from osgeo import ogr
import json


## Harvest Zip URLs

In [7]:


# URL of the page to scrape
url = "https://disasters.geoplatform.gov/USA_Structures/"

# Fetch the index page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of saving and parsing an error page as if it were the listing.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Keep a local copy of the HTML for reference
with open('USA_Structures.html', 'w', encoding='utf-8') as file:
    file.write(response.text)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Collect, in page order, the target of every <a href> that ends in ".zip"
# (assumed to be the downloadable archives)
zip_links = [
    link['href']
    for link in soup.find_all('a', href=True)
    if re.search(r'\.zip$', link['href'])
]

# Print each zip URL, then how many were found
for zip_link in zip_links:
    print(zip_link)

print(len(zip_links))



https://fema-femadata.s3.amazonaws.com/Partners/ORNL/USA_Structures/Alabama/Deliverable20230526AL.zip
https://fema-femadata.s3.amazonaws.com/Partners/ORNL/USA_Structures/Alaska/Deliverable20230728AK.zip
https://fema-femadata.s3.amazonaws.com/Partners/ORNL/USA_Structures/American+Samoa/Deliverable20230831AS.zip
https://fema-femadata.s3.amazonaws.com/Partners/ORNL/USA_Structures/Arizona/Deliverable20230502AZ.zip
https://fema-femadata.s3.amazonaws.com/Partners/ORNL/USA_Structures/Arkansas/Deliverable20230630AR.zip
https://fema-femadata.s3.amazonaws.com/Partners/ORNL/USA_Structures/California/Deliverable20230728CA.zip
https://fema-femadata.s3.amazonaws.com/Partners/ORNL/USA_Structures/Colorado/Deliverable20230630CO.zip
https://fema-femadata.s3.amazonaws.com/Partners/ORNL/USA_Structures/Connecticut/Deliverable20230502CT.zip
https://fema-femadata.s3.amazonaws.com/Partners/ORNL/USA_Structures/Delaware/Deliverable20230630DE.zip
https://fema-femadata.s3.amazonaws.com/Partners/ORNL/USA_Structure

# Download to Q Drive

In [None]:
# Folder the downloaded archives are written to (a path on a mounted SMB share).
# download_zip() reads this module-level name when it builds the local file path.
destination_folder = "/Volumes/GIS/FEMA_USA_Structures_10_17_2024"

# Create the folder (and any missing parents); no error if it already exists
os.makedirs(destination_folder, exist_ok=True)

# Function to download a single file with progress feedback
def download_zip(url, index, total_files):
    """Download one file into ``destination_folder`` while showing a progress bar.

    The body is streamed to ``<name>.part`` and renamed to its final name only
    after the whole response has been written, so an interrupted transfer
    never leaves a truncated file that looks like a finished ``.zip``.

    Args:
        url: Address of the file; its basename becomes the local file name.
        index: Zero-based position of this file in the batch (display only).
        total_files: Number of files in the batch (display only).

    Returns:
        None. Progress and the outcome are printed to stdout. A response with
        a status other than 200 is reported and nothing is written.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If the local file cannot be written or renamed.
    """
    start_time = time.time()
    file_name = os.path.join(destination_folder, os.path.basename(url))
    partial_name = file_name + ".part"

    # With stream=True the connection stays checked out until the body has been
    # consumed; the context manager releases it on every exit path.
    # timeout=(connect, read) in seconds; the read value limits the wait between
    # chunks, not the duration of the whole download.
    with requests.get(url, stream=True, timeout=(10, 120)) as response:
        if response.status_code != 200:
            print(f"Failed to download: {url}")
            return

        # The header may be absent, in which case the bar gets a total of 0
        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024 * 1024  # 1 MiB per chunk
        bytes_written = 0

        try:
            with open(partial_name, 'wb') as f, tqdm(
                total=total_size,
                unit='B',
                unit_scale=True,
                desc=f"File {index+1}/{total_files}: {file_name}",
            ) as t:
                for data in response.iter_content(block_size):
                    f.write(data)
                    bytes_written += len(data)
                    t.update(len(data))
        except BaseException:
            # Remove the incomplete file, then let the error propagate
            if os.path.exists(partial_name):
                os.remove(partial_name)
            raise

    # Only a completely written file gets the final name
    os.replace(partial_name, file_name)

    # Report the bytes actually written rather than the advertised length
    download_time = time.time() - start_time
    file_size_MB = bytes_written / (1024 * 1024)
    print(f"Downloaded: {file_name} | Size: {file_size_MB:.2f} MB | Time: {download_time:.2f} s")

# Function to download files in parallel with a control on simultaneous downloads
def download_zips_in_parallel(zip_links, max_workers=4):
    """Download every URL in ``zip_links`` using a bounded thread pool.

    Args:
        zip_links: Sequence of URLs; each one is handed to ``download_zip``
            together with its position and the batch size.
        max_workers: Maximum number of downloads running at the same time.

    Returns:
        None. A line is printed for every download that raised an exception,
        followed by a summary with the total elapsed time.
    """
    total_files = len(zip_links)
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            (url, executor.submit(download_zip, url, index, total_files))
            for index, url in enumerate(zip_links)
        ]

    # Leaving the ``with`` block waits for every task. An exception raised in a
    # worker is stored on its future and stays invisible until it is asked for,
    # so inspect each future instead of letting failures pass silently.
    failed = []
    for url, future in futures:
        error = future.exception()
        if error is not None:
            failed.append(url)
            print(f"Failed to download: {url} ({error!r})")

    # Total time calculation
    total_time_elapsed = time.time() - start_time
    if failed:
        print(
            f"Finished in {total_time_elapsed:.2f} seconds; "
            f"{len(failed)} of {total_files} downloads raised an error"
        )
    else:
        print(f"All downloads completed in {total_time_elapsed:.2f} seconds")

# Start the downloads for every harvested URL; max_workers=4 caps how many
# files are fetched at the same time
download_zips_in_parallel(zip_links, max_workers=4)


# Unzip folders

In [None]:


# Folder that holds the downloaded .zip archives; unzip_files_in_folder()
# extracts them into this same folder
folder_path = "/Volumes/GIS/FEMA_USA_Structures_10_17_2024"

# Function to unzip all zip files in the folder
def unzip_files_in_folder(folder_path):
    """Extract every ``.zip`` archive found directly inside ``folder_path``.

    Archives are extracted into ``folder_path`` itself. Entries that are not
    regular files, or whose name does not end in ``.zip`` (compared without
    regard to case), are ignored. A file that cannot be read as a zip archive
    is reported and skipped, so one bad file does not abort the whole batch.

    Args:
        folder_path: Directory that contains the archives; sub-folders are
            not searched.

    Returns:
        None. Progress is printed to stdout.
    """
    # Sorted so the processing order does not depend on the filesystem
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not file_name.lower().endswith('.zip') or not os.path.isfile(file_path):
            continue

        # Archives are unpacked next to the zip files themselves
        extract_path = folder_path

        try:
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                print(f"Unzipping {file_name}...")
                zip_ref.extractall(extract_path)
                print(f"Unzipped {file_name} to {extract_path}")
        except zipfile.BadZipFile as error:
            print(f"Skipping {file_name}: not a valid zip archive ({error})")

# Extract every archive in folder_path (the contents land in that same folder)
unzip_files_in_folder(folder_path)


# Convert GDB to GeoJSON with Fiona

In [None]:
# Root folder that traverse_and_convert() walks recursively looking for
# directories whose name ends in ".gdb"
root_folder = "/Volumes/GIS/FEMA_USA_Structures_10_17_2024"

# Function to convert feature layer to newline-delimited GeoJSON
def convert_to_ndjson(input_gdb, layer_name, output_file):
    """Dump one GDB layer to a file with one JSON-encoded feature per line.

    Args:
        input_gdb: Path of the geodatabase, passed to ``fiona.open``.
        layer_name: Name of the layer to read.
        output_file: Path of the text file to create or overwrite.

    Returns:
        None. A confirmation line is printed once all features are written.

    NOTE(review): ``fiona`` is used here but does not appear in the import
    cell visible in this notebook - confirm it is imported before this runs.
    NOTE(review): ``json.dumps`` needs every record yielded by the collection
    to be JSON-serialisable - confirm this for the installed Fiona version.
    """
    # Open the layer first, then the output file; both close when the block ends
    with fiona.open(input_gdb, layer=layer_name) as source, open(output_file, 'w') as sink:
        # One feature per line, in the order the layer yields them
        sink.writelines(json.dumps(record) + '\n' for record in source)
        print(f"Converted {layer_name} to {output_file}")

# Function to traverse directories and find GDB files
def traverse_and_convert(root_folder):
    """Find every ``*.gdb`` directory under ``root_folder`` and export its layers.

    Each layer is written by ``convert_to_ndjson`` to ``<layer>.ndjson`` in the
    directory that contains the ``.gdb`` folder.

    Args:
        root_folder: Directory tree to search with ``os.walk``.

    Returns:
        None. Progress is printed to stdout.

    NOTE(review): a later cell of this notebook defines another function with
    the same name; whichever cell ran last is the one a call will use.
    NOTE(review): ``fiona`` does not appear in the visible import cell.
    """
    for parent, subdirs, _files in os.walk(root_folder):
        for subdir in subdirs:
            # Geodatabases are directories, recognised by their suffix
            if not subdir.endswith('.gdb'):
                continue

            gdb_path = os.path.join(parent, subdir)
            print(f"Found GDB: {gdb_path}")

            # Enumerate and convert the layers inside one Fiona environment
            with fiona.Env():
                for layer in fiona.listlayers(gdb_path):
                    print(f"Processing layer: {layer}")
                    # The output is placed next to the .gdb directory
                    target = os.path.join(parent, f"{layer}.ndjson")
                    convert_to_ndjson(gdb_path, layer, target)

# Start the traversal and conversion process.
# NOTE: a later cell defines another function named traverse_and_convert, so
# this call uses whichever definition was executed most recently in the kernel.
traverse_and_convert(root_folder)


# Use GDAL to Convert GDB to GeoJSON

In [None]:
# Root folder that is walked recursively looking for ".gdb" directories
root_folder = "/Volumes/GIS/FEMA_USA_Structures_10_17_2024"
# Folder for the GeoJSON output; a relative path, so it is resolved against
# the current working directory
output_folder = "./geojson"

# Create the output folder if needed; no error if it already exists
os.makedirs(output_folder, exist_ok=True)

# Function to convert feature layer to valid GeoJSON
def convert_to_geojson(input_gdb, layer_name, output_file):
    """Write one layer of a File Geodatabase as a GeoJSON FeatureCollection.

    Features are serialised and written one at a time, so memory use does not
    grow with the number of features in the layer. The bytes written are the
    same as ``json.dump`` of a dict holding the complete feature list.

    Args:
        input_gdb: Path of the ``.gdb`` directory, opened read-only.
        layer_name: Name of the layer to export.
        output_file: Path of the GeoJSON file to create or overwrite.

    Returns:
        None. A missing driver, a GDB that cannot be opened and a missing
        layer are reported on stdout rather than raised.
    """
    driver = ogr.GetDriverByName('OpenFileGDB')
    # Explicit None checks: "could not be opened / not found" is signalled by
    # None, and truthiness of a wrapper object is not the same question.
    gdb = driver.Open(input_gdb, 0) if driver is not None else None  # 0 means read-only
    if gdb is None:
        print(f"Failed to open GDB: {input_gdb}")
        return

    layer = gdb.GetLayerByName(layer_name)
    if layer is None:
        print(f"Layer {layer_name} not found in GDB: {input_gdb}")
        return

    # Stream the collection: opening text, comma-separated features, closing text
    with open(output_file, 'w') as output_geojson:
        output_geojson.write('{"type": "FeatureCollection", "features": [')
        for position, feature in enumerate(layer):
            if position:
                output_geojson.write(', ')
            # Re-encode through json so each feature is formatted exactly as
            # json.dump would format it inside the full collection
            json.dump(json.loads(feature.ExportToJson()), output_geojson)
        output_geojson.write(']}')
        print(f"Converted {layer_name} to {output_file}")

# Function to traverse directories and find GDB files
def traverse_and_convert(root_folder):
    """Convert every layer of every ``*.gdb`` under ``root_folder`` to GeoJSON.

    Each layer is written by ``convert_to_geojson`` to
    ``<output_folder>/<layer name>.geojson``, where ``output_folder`` is the
    module-level setting.

    Args:
        root_folder: Directory tree to search with ``os.walk``.

    Returns:
        None. Progress, and any GDB that cannot be opened, are printed.

    NOTE(review): the output name is built from the layer name only, so two
    GDBs containing a layer with the same name write to the same file.
    NOTE(review): an earlier cell of this notebook defines a Fiona-based
    function with the same name; running this cell replaces it.
    """
    # Walk through the root folder and collect the .gdb directories
    gdb_files = [
        os.path.join(dirpath, dirname)
        for dirpath, dirnames, _filenames in os.walk(root_folder)
        for dirname in dirnames
        if dirname.endswith('.gdb')
    ]

    total_gdbs = len(gdb_files)
    print(f"Found {total_gdbs} GDB files to process.")

    # The driver does not depend on the GDB being processed: look it up once
    driver = ogr.GetDriverByName('OpenFileGDB')
    if driver is None:
        print("OGR driver 'OpenFileGDB' is not available; nothing converted.")
        return

    # Process each GDB file
    for gdb_index, gdb_path in enumerate(tqdm(gdb_files, desc="Processing GDB files", unit="gdb")):
        gdb = driver.Open(gdb_path, 0)  # Read-only mode

        if gdb is None:
            # Report the problem instead of skipping the GDB without a trace
            print(f"Failed to open GDB: {gdb_path}")
            continue

        layer_count = gdb.GetLayerCount()
        print(f"Processing GDB {gdb_index + 1}/{total_gdbs}: {gdb_path}, {layer_count} layers")

        # Process each layer in the GDB
        for i in range(layer_count):
            layer_name = gdb.GetLayerByIndex(i).GetName()
            print(f"Processing layer: {layer_name}")

            # Define the output path for the GeoJSON
            output_geojson = os.path.join(output_folder, f"{layer_name}.geojson")
            # Convert the feature layer to valid GeoJSON
            convert_to_geojson(gdb_path, layer_name, output_geojson)

# Start the traversal and conversion process with progress reporting.
# NOTE: an earlier cell also defines traverse_and_convert (the Fiona variant);
# this call uses whichever definition was executed most recently in the kernel.
traverse_and_convert(root_folder)


Found 56 GDB files to process.


Processing GDB files:   0%|          | 0/56 [00:00<?, ?gdb/s]

Processing GDB 1/56: /Volumes/GIS/FEMA_USA_Structures_10_17_2024/Deliverable20230502AZ/AZ_Structures.gdb, 1 layers
Processing layer: AZ_Structures


