In [112]:
%run set_up.py 

import boto3
from botocore import UNSIGNED
from botocore.client import Config
import py7zr
import re
import requests
import shutil
from urllib.parse import urlparse
from urllib.parse import urljoin
from zipfile import ZipFile

In [149]:
def download_and_extract_zip(in_url, out_dir, out_zip_name=None, verbose=True,
                             timeout=(30, 300)):
    """
    Download an archive (zip or 7z) and extract it into out_dir

    The download is skipped when the archive file is already present in
    out_dir. The archive is extracted right after a download, and also when
    the archive is already present but the path this function returns does
    not exist yet (e.g. a previous run was interrupted before extraction).

    Parameters:
    in_url (str): URL to download the archive from
    out_dir (str): Directory to save the archive and extract the files into
    out_zip_name (str, optional): File name of the saved archive. When None,
        the last component of the URL path is used; it must end in .zip or .7z
    verbose (bool): Print progress messages when True
    timeout (float or tuple): Forwarded to requests.get as the
        (connect, read) timeout in seconds

    Returns:
    str: out_dir joined with the directory part of the first archive member
        (out_dir itself when that member sits at the archive root or the
        archive is empty)

    Raises:
    ValueError: if out_zip_name is None and the URL path does not end in
        .zip or .7z
    requests.HTTPError: if the server answers with an error status code
    """

    # Create output directory if it doesn't exist
    if not os.path.exists(out_dir):
        if verbose:
            print(f'Creating {out_dir}')
        os.makedirs(out_dir, exist_ok=True)

    # Get filename from URL if not provided
    if out_zip_name is None:
        if verbose:
            print('Did not provide output zip file name, extracting from URL')
        out_zip_name = os.path.basename(urlparse(in_url).path)

        # Check if file has zip or 7z extension
        if not re.match(r'.*\.(zip|7z)$', out_zip_name):
            raise ValueError("Could not extract file name with zip or 7z extension from URL")

    out_zip_path = os.path.join(out_dir, out_zip_name)
    is_7z = out_zip_name.lower().endswith('.7z')

    # Download the file
    downloaded = False
    if not os.path.exists(out_zip_path):
        if verbose:
            print(f"Downloading from {in_url}...")

        # Write to a temporary name first so that an interrupted download is
        # never mistaken for a complete archive on the next run.
        part_path = out_zip_path + '.part'
        try:
            with requests.get(in_url, stream=True, timeout=timeout) as response:
                response.raise_for_status()  # Raise an error for bad status codes
                with open(part_path, 'wb') as f:
                    # iter_content undoes any Content-Encoding (gzip/deflate),
                    # which a raw socket copy would write to disk as-is.
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(part_path, out_zip_path)
            downloaded = True
        finally:
            if os.path.exists(part_path):
                os.remove(part_path)
    elif verbose:
        print(f'{out_zip_path} already exists. skipping....')

    # Open the archive with the reader that matches its extension
    if is_7z:
        archive = py7zr.SevenZipFile(out_zip_path, mode='r')
    else:
        archive = ZipFile(out_zip_path, 'r')

    with archive:
        members = archive.getnames() if is_7z else archive.namelist()
        # Get the name of the first directory in the archive
        unzipped_dir = os.path.dirname(members[0]) if members else ''
        extracted_path = os.path.join(out_dir, unzipped_dir)

        if downloaded or not os.path.exists(extracted_path):
            if verbose:
                print("Extracting zip file...")
            archive.extractall(path=out_dir)

    # Return the path to the extracted GDB directory
    return extracted_path
        

In [120]:
# Download the national WBD (Watershed Boundary Dataset) file geodatabase
wbd_url = "https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrography/WBD/National/GDB/WBD_National_GDB.zip"
nhd_dir = os.path.join(datdir, "nhd")  # Adjust this path as needed

try:
    # Archive and extracted files both land in nhd_dir; progress output is off
    extracted_path = download_and_extract_zip(wbd_url, nhd_dir, verbose=False)
except Exception as e:
    # Best-effort cell: report the failure rather than stopping the notebook
    print(f"An error occurred: {str(e)}")

In [122]:
#Download NHD
def download_nhdplus_hr_hu4(hu4, out_dir, verbose=True):
    """
    Download and extract the NHDPlus HR geodatabase of one HU4 unit

    The archive name is built as NHDPLUS_H_<hu4>_HU4_GDB.zip and fetched from
    the hard-coded NHDPlusHR/Beta/GDB staging URL via download_and_extract_zip.

    Parameters:
    hu4 (str): 4-digit HU4 code, given as a string (e.g. "0101")
    out_dir (str): Directory to save the archive and extract the files into
    verbose (bool): Forwarded to download_and_extract_zip

    Returns:
    str: Path returned by download_and_extract_zip

    Raises:
    TypeError: if hu4 is not a string made of exactly four digits
    """
    # Reject non-digit strings here instead of letting them surface later as
    # a failed download of a file name that cannot exist.
    if (not isinstance(hu4, str)) or (re.fullmatch(r'[0-9]{4}', hu4) is None):
        raise TypeError("hu4 argument must be a 4-digit string")

    root_url = "https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrography/NHDPlusHR/Beta/GDB/"

    zip_name = f"NHDPLUS_H_{hu4}_HU4_GDB.zip"
    full_url = urljoin(root_url, zip_name)

    out_path = download_and_extract_zip(
        in_url=full_url,
        out_dir=out_dir,
        out_zip_name=zip_name,
        verbose=verbose)

    return out_path

In [128]:
#Download AWS S3 bucket
#- For multiple patterns: `"Contents[?contains(Key, 'vpu-boundaries.gpkg') || contains(Key, 'other-pattern')]"`
#- For exact suffix match: `"Contents[?ends_with(Key, 'vpu-boundaries.gpkg')]"`

def _download_s3_object(s3, bucket_name, object_key, out_dir, verbose, label=None):
    """Download one S3 object into out_dir unless a file with its base name is already there."""
    fname = os.path.basename(object_key)
    if not fname:
        # Keys ending in "/" are folder placeholders; there is no file to save.
        return

    outf_path = os.path.join(out_dir, fname)
    if os.path.exists(outf_path):
        if verbose:
            print(f'{outf_path} already exists. Skipping...')
        return

    if verbose:
        print(f"Downloading: {label if label is not None else object_key}")
    s3.download_file(bucket_name, object_key, Filename=outf_path)
    if verbose:
        print("Download completed successfully")


def download_s3_bucket_contents(in_bucket_name, in_prefix, out_dir, key=None,
                               verbose=True, region_name='us-west-2'):
    """
    Download objects from a public S3 bucket using unsigned (anonymous) requests

    Every object is saved directly in out_dir under the base name of its key,
    so objects that share a base name under different sub-prefixes map to the
    same local file and only the first one is downloaded. Files that already
    exist locally are skipped.

    Parameters:
    in_bucket_name (str): Name of the S3 bucket
    in_prefix (str): Key prefix to list, or to prepend to key
    out_dir (str): Local directory to save the files into (created if missing)
    key (str, optional): Key of a single object, relative to in_prefix. When
        omitted, every object listed under in_prefix is downloaded
    verbose (bool): Print progress messages when True
    region_name (str): Region passed to the S3 client

    Returns:
    None. Errors raised while listing or downloading are caught and printed,
    not propagated.
    """
    # Local import: S3 keys always use "/" as separator, whatever the host OS.
    import posixpath

    # Create the S3 client with the unsigned configuration
    s3 = boto3.client(
        's3',
        region_name=region_name,
        config=Config(signature_version=UNSIGNED) # Create an unsigned ("anonymous") configuration
    )

    # Create output directory if it doesn't exist
    if not os.path.exists(out_dir):
        if verbose:
            print(f'Creating {out_dir}')
        os.makedirs(out_dir, exist_ok=True)

    try:
        if key:
            # Single object: build its full key from prefix + relative key
            _download_s3_object(s3, in_bucket_name,
                                posixpath.join(in_prefix, key),
                                out_dir, verbose, label=key)
        else:
            # List all objects under the prefix and download each one
            paginator = s3.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=in_bucket_name,
                                           Prefix=in_prefix):
                # Pages without a 'Contents' entry hold no objects
                for obj in page.get('Contents', []):
                    _download_s3_object(s3, in_bucket_name, obj['Key'],
                                        out_dir, verbose)

    except Exception as e:
        # Best-effort: report the failure instead of raising to the caller
        print(f"An error occurred: {str(e)}")

In [132]:
# Download geoglows data from the "geoglows-v2" S3 bucket

geoglows_dir = os.path.join(datdir, 'geoglows')

# Download VPU boundaries to know which streams/catchments to use
# http://geoglows-v2.s3-us-west-2.amazonaws.com/streams-global/vpu-boundaries.gpkg
download_s3_bucket_contents(
    in_bucket_name="geoglows-v2",
    in_prefix='streams-global/',
    out_dir=geoglows_dir,
    key='vpu-boundaries.gpkg',
    verbose=False,
)

# Download streams, catchments and tables, each into its own sub-directory
# "http://geoglows-v2.s3-website-us-west-2.amazonaws.com/#streams/"
for bucket_prefix, local_subdir in (
    ('streams/', 'streams'),
    ('catchments/', 'catchments'),
    ('tables/', 'tables'),
):
    download_s3_bucket_contents(
        in_bucket_name="geoglows-v2",
        in_prefix=bucket_prefix,
        out_dir=os.path.join(geoglows_dir, local_subdir),
        verbose=False,
    )

In [138]:
# Download GADM (world GeoPackage archive)
# https://gadm.org/download_world.html
gadm_url = "https://geodata.ucdavis.edu/gadm/gadm4.1/gadm_410-gpkg.zip"
gadm_dir = os.path.join(datdir, "gadm")  # Adjust this path as needed

try:
    # Archive name is taken from the URL; progress output is off
    gadm_path = download_and_extract_zip(gadm_url, gadm_dir, verbose=False)
except Exception as e:
    # Best-effort cell: report the failure rather than stopping the notebook
    print(f"An error occurred: {str(e)}")

In [140]:
# Download HydroATLAS (BasinATLAS and RiverATLAS geodatabases)
# https://www.hydrosheds.org/hydroatlas
basinatlas_url = "https://figshare.com/ndownloader/files/20082137"
riveratlas_url = "https://figshare.com/ndownloader/files/20087321"
hydroatlas_dir = os.path.join(datdir, 'hydroatlas')

# The figshare URLs carry no file name, so the archive names are given explicitly.
basinatlas_zip_name = 'BasinATLAS_Data_v10.gdb.zip'
riveratlas_zip_name = 'RiverATLAS_Data_v10.gdb.zip'
basinatlas_zip_path = os.path.join(hydroatlas_dir, basinatlas_zip_name)
riveratlas_zip_path = os.path.join(hydroatlas_dir, riveratlas_zip_name)

# Pass the bare file name: download_and_extract_zip joins out_zip_name onto
# out_dir itself, so passing a path that already contains hydroatlas_dir
# duplicates the directory whenever datdir is a relative path.
basinatlas_path = download_and_extract_zip(
    in_url=basinatlas_url,
    out_dir=hydroatlas_dir,
    out_zip_name=basinatlas_zip_name,
    verbose=False
)

riveratlas_path = download_and_extract_zip(
    in_url=riveratlas_url,
    out_dir=hydroatlas_dir,
    out_zip_name=riveratlas_zip_name,
    verbose=False
)