# Cloud Native Geospatial Capstone Project

## Project Goals

### Summary

Create a summary of a file collection (e.g. number of data points / deployments) by running SQL queries with DuckDB against any Parquet dataset stored in `aodn-cloud-optimised`. Dask is now used as well, to read the dataset schema and partition layout.

### Test collection

slocum_glider_delayed_qc.parquet/ -> 553 objects, ~34.2GB 

### Motivation

Using familiar tools such as xarray and pandas is slow!



Import packages

In [76]:
import duckdb
import dask.dataframe as dd
import s3fs
import sys
import os
from io import StringIO
from datetime import datetime
import time
from datetime import timedelta

# Start of the notebook's wall-clock timer; html_summary_content() subtracts
# this from a later time.perf_counter() reading to report the total runtime.
notebook_start = time.perf_counter()


Get a summary of the data set using dask

In [77]:
# Dataset to summarise; swap in the commented alternative to target another one.
dataset_name = "slocum_glider_delayed_qc"
# dataset_name = "autonomous_underwater_vehicle"

# Root prefix of the dataset's parquet store (trailing slash included so that
# glob patterns can be appended directly).
s3_uri = f"s3://aodn-cloud-optimised/{dataset_name}.parquet/"

In [78]:
# get s3 size of dataset and number of files
def get_s3_size_and_file_count(s3_uri):
    """Print and return the total size and count of parquet files under *s3_uri*.

    The bucket is accessed anonymously (``anon=True``).

    Parameters
    ----------
    s3_uri : str
        Root of the parquet dataset. The glob pattern ``**/*.parquet`` is
        appended to it verbatim, so it should end with a slash.

    Returns
    -------
    total_size : int
        Combined size of all matched files, in bytes.
    file_count : int
        Number of matched files.
    files : list of str
        Paths of the matched files, in the order returned by ``fs.glob``.
    """
    fs = s3fs.S3FileSystem(anon=True)

    # detail=True returns {path: info}, so the sizes come from the same
    # listing that finds the files rather than one ``fs.info`` call per file.
    listing = fs.glob(s3_uri + '**/*.parquet', detail=True)

    files = list(listing)
    file_count = len(files)
    # "size" is fsspec's standard key; the capitalised "Size" is only present
    # on info dicts that s3fs built from a bucket listing.
    total_size = sum(info['size'] for info in listing.values())

    print(f"Total size of dataset: {total_size / (1024**3):.2f} GB")
    print(f"Total number of files: {file_count}")

    return total_size, file_count, files

# Only the list of parquet paths is needed later; size and count are printed by the call.
_, _, files = get_s3_size_and_file_count(s3_uri)

Total size of dataset: 34.16 GB
Total number of files: 552


In [79]:
# Find the earliest and latest "LastModified" timestamps across every object
# under the dataset root. They are reported as the dataset's creation and
# last-update dates.
# NOTE(review): the earliest LastModified only approximates the creation date
# if objects have been rewritten since -- confirm this is acceptable.

fs = s3fs.S3FileSystem()

# detail=True yields {path: info} straight from the listing, avoiding a
# separate ``fs.info`` lookup for every object.
modified_times = [
    info["LastModified"] for info in fs.find(s3_uri, detail=True).values()
]

# default=None keeps both values as None when the prefix holds no objects.
first = min(modified_times, default=None)
latest = max(modified_times, default=None)
        

In [80]:
# extract NetCDF files appended to the parquet dataset

# Each parquet file name embeds its source NetCDF name: keep everything up to
# the first ".nc" and put the extension back. One entry is produced per
# parquet file, so a NetCDF name repeats when it maps to several parquet files.
nc_files = [
    f'{os.path.basename(parquet_path).partition(".nc")[0]}.nc'
    for parquet_path in files
]
nc_files

['IMOS_ANFOG_BCEOPSTUVN_20151021T035731Z_SL416_FV01_timeseries_END-20151027T015319Z.nc',
 'IMOS_ANFOG_BCEOPSTUVN_20151126T035130Z_SL416_FV01_timeseries_END-20151215T231233Z.nc',
 'IMOS_ANFOG_BCEOPSTUV_20160301T234750Z_SL286_FV01_timeseries_END-20160321T002220Z.nc',
 'IMOS_ANFOG_BCEOPSTUV_20160301T234750Z_SL286_FV01_timeseries_END-20160321T002220Z.nc',
 'IMOS_ANFOG_BCEOPSTUV_20170321T001243Z_SL286_FV01_timeseries_END-20170407T013118Z.nc',
 'IMOS_ANFOG_BCEOPSTUV_20170321T001243Z_SL286_FV01_timeseries_END-20170407T013118Z.nc',
 'IMOS_ANFOG_BCEOPSTUV_20170321T001243Z_SL286_FV01_timeseries_END-20170407T013118Z.nc',
 'IMOS_ANFOG_BCEOPSTUV_20180305T090817Z_SL287_FV01_timeseries_END-20180403T234348Z.nc',
 'IMOS_ANFOG_BCEOPSTUV_20180305T090817Z_SL287_FV01_timeseries_END-20180403T234348Z.nc',
 'IMOS_ANFOG_BCEOPSTUV_20180305T090817Z_SL287_FV01_timeseries_END-20180403T234348Z.nc',
 'IMOS_ANFOG_BCEOPSTUV_20180405T234400Z_SL287_FV01_timeseries_END-20180514T010916Z.nc',
 'IMOS_ANFOG_BCEOPSTUV_2018040

In [81]:
def summarise_ds(s3_uri, storage_options=None):
    """Print the schema and partition layout of a parquet dataset on S3.

    Parameters
    ----------
    s3_uri : str
        Root of the parquet dataset.
    storage_options : dict, optional
        Credentials/options used both for the dask read and for the s3fs
        listing (e.g. ``{"anon": True}``). When omitted, the original
        behaviour is kept: dask reads with the ``"edge-admin"`` profile and
        the listing uses s3fs's default credentials.

    Returns
    -------
    partition_names : list of str
        One entry per item returned by ``fs.ls(s3_uri)``: the text after the
        last ``"="`` in its path. Entries without ``"="`` (presumably
        metadata files -- confirm) are returned as their full path.
    partition_column_names : list of str
        Columns whose dask dtype is ``category``; the notebook treats these
        as the partition columns.
    column_names : list of str
        All remaining (non-category) columns.
    """
    if storage_options is None:
        read_options = {"profile": "edge-admin"}
        fs = s3fs.S3FileSystem()
    else:
        read_options = storage_options
        fs = s3fs.S3FileSystem(**storage_options)

    # use dask to read the parquet dataset's metadata from S3
    ddf = dd.read_parquet(s3_uri, engine='pyarrow',
                          storage_options=read_options)
    for col, dtype in ddf.dtypes.items():
        print(f"{col:<35} {dtype}")

    # summarise partitions
    print(' ' * 150)
    print(f"This data set has {ddf.npartitions} partitions.")
    print(' ' * 150)

    # Top-level entries of the dataset: "<column>=<value>" directories plus
    # any other objects stored at the root.
    partitions = fs.ls(s3_uri)
    partition_names = [p.split("=")[-1] for p in partitions]

    partition_column_names = list(
        ddf.select_dtypes(include="category").columns
    )
    column_names = list(
        ddf.select_dtypes(exclude="category").columns
    )
    print(f"Partition column names: {', '.join(partition_column_names)}")

    return partition_names, partition_column_names, column_names

# Run the summary once now: prints the schema and keeps the partition/column
# lists that the SQL cell below depends on.
(
    partition_names,
    partition_column_names,
    column_names,
) = summarise_ds(s3_uri)
    

PLATFORM                            string
DEPLOYMENT                          string
SENSOR1                             string
SENSOR2                             string
SENSOR3                             string
SENSOR4                             string
LATITUDE                            float64
LATITUDE_quality_control            float32
LONGITUDE                           float64
LONGITUDE_quality_control           float32
TIME                                datetime64[ns]
TIME_quality_control                float32
HEAD                                float64
HEAD_quality_control                float32
UCUR                                float64
UCUR_quality_control                float32
VCUR                                float64
VCUR_quality_control                float32
UCUR_GPS                            float64
UCUR_GPS_quality_control            float32
VCUR_GPS                            float64
VCUR_GPS_quality_control            float32
PHASE                          

Use DuckDB to get information

In [82]:
# Initialise a DuckDB connection configured for reading parquet from S3

con = duckdb.connect()

# httpfs supplies the s3:// reader used by read_parquet
con.execute("INSTALL httpfs; LOAD httpfs;")
con.execute("SET s3_region='ap-southeast-2';")
con.execute("PRAGMA enable_progress_bar")  # show progress bar for long-running queries
# Use every core the machine reports instead of a hard-coded thread count
con.execute(f"PRAGMA threads={os.cpu_count() or 1}")

<_duckdb.DuckDBPyConnection at 0x71e19428cfb0>

In [83]:
# SQL query: one summary row per partition value.

if not partition_column_names:
    raise ValueError(
        "No partition columns were detected; cannot build a per-partition summary"
    )

# The listing of the dataset root also contains non-partition entries (e.g.
# metadata files). Those have no "=" and were therefore kept as full paths,
# which contain "/"; a partition value taken from a single path segment should
# not. Filtering on that is safer than assuming exactly one such entry sits
# first in the listing.
partition_values = [name for name in partition_names if "/" not in name]

# Double embedded single quotes so a value cannot end its SQL string literal.
partition_names_sql = ", ".join(
    "'{}'".format(value.replace("'", "''")) for value in partition_values
)

partition_column_name = partition_column_names[0]

select_columns = [partition_column_name]

if {'TIME', 'LONGITUDE', 'LATITUDE'} <= set(column_names):
    # This is an IMOS dataset: also report its time range and bounding box.
    select_columns += [
        "MIN(TIME) AS start_time",
        "MAX(TIME) AS end_time",
        "MIN(LONGITUDE) AS min_longitude",
        "MAX(LONGITUDE) AS max_longitude",
        "MIN(LATITUDE) AS min_latitude",
        "MAX(LATITUDE) AS max_latitude",
    ]
# Otherwise this is not an IMOS dataset: variable names will be different,
# so only the row count per partition is reported.

select_columns.append("COUNT(*)  AS n_rows")
select_sql = ",\n    ".join(select_columns)

# An empty IN () list is invalid SQL, so drop the filter when nothing is left.
where_sql = (
    f"WHERE {partition_column_name} IN ({partition_names_sql})"
    if partition_values
    else ""
)

sql = f"""
    SELECT
    {select_sql}
    FROM read_parquet(
    '{s3_uri}**/*.parquet',
    hive_partitioning=1
    )
    {where_sql}
    GROUP BY {partition_column_name}
    ORDER BY {partition_column_name}
    """
    

In [None]:
# Run the summary query and pull the whole result into a pandas DataFrame
stats = con.execute(sql).df()

# Testing 
# hive partitioning, LIMIT 1, group/order by deployment code, count rows, min/max time = 6m45s
# Specifying a list of partition names to loop through increased speed to 1.5s (5m when trying later, but for all deployments)



  4% ▕█▌                                    ▏ (~37.0 minutes remaining) 

In [None]:
# Format start_time and end_time as strings for better display in HTML.
# Only IMOS-style summaries contain these columns, and a column that was
# already formatted (cell re-run) is no longer datetime-typed, so each one is
# checked before converting.
for time_column in ("start_time", "end_time"):
    if time_column in stats.columns and stats[time_column].dtype.kind == "M":
        stats[time_column] = stats[time_column].dt.strftime("%Y-%m-%d %H:%M")

stats

In [None]:
# Grand total of rows across all partitions in the summary table
rows_per_partition = stats["n_rows"]
total_rows = rows_per_partition.sum()
print(total_rows)

Export the information to an HTML file

In [None]:
def capture_prints_to_html(func, dataset_name, output_file='output.html', *args, **kwargs):
    """
    Execute a function and save everything it prints to an HTML file.

    The captured text is HTML-escaped before being embedded, so characters
    such as ``<`` and ``&`` in the output are shown literally instead of
    being interpreted as markup.

    Parameters:
    -----------
    func : callable
        The function to execute
    dataset_name : str
        Name of the dataset; used as the page title and heading
    output_file : str
        Path to the output HTML file
    *args, **kwargs
        Arguments to pass to the function

    Returns:
    --------
    result : any
        The return value of the function

    Notes:
    ------
    If ``func`` raises, the exception propagates and no file is written.
    """
    # Imported locally so the function does not depend on imports elsewhere.
    import contextlib
    import html

    # redirect_stdout restores sys.stdout on exit, even if func raises
    captured_output = StringIO()
    with contextlib.redirect_stdout(captured_output):
        result = func(*args, **kwargs)

    # Escape so printed text cannot break or inject into the page markup.
    # Quotes are left alone in the body; they are harmless inside <pre>.
    output_text = html.escape(captured_output.getvalue(), quote=False)
    page_title = html.escape(str(dataset_name))

    # Create HTML content
    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{page_title}</title>
    <style>
        body {{
            font-family: 'Courier New', monospace;
            font-size: 16px;
            background-color: #f5f5f5;
            padding: 20px;
            max-width: 95vw;
            margin: 0 auto;
        }}
        .container {{
            background-color: white;
            padding: 30px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }}
        h1 {{
            color: #333;
            font-size: 28px;
            border-bottom: 2px solid #4CAF50;
            padding-bottom: 10px;
        }}
        .timestamp {{
            color: #666;
            font-size: 14px;
            margin-bottom: 20px;
        }}
        pre {{
            background-color: #f8f8f8;
            border: 1px solid #ddd;
            border-radius: 4px;
            padding: 15px;
            overflow-x: auto;
            line-height: 1.6;
            font-size: 15px;
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>{page_title}</h1>
        <div class="timestamp">Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
        <pre>{output_text}</pre>
    </div>
</body>
</html>"""

    # Write to file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"Output saved to: {output_file}")

    return result


In [None]:
# Combined analysis function
def html_summary_content():
    """Print the full dataset report, section by section.

    Everything is written with ``print`` so the caller can capture it. The
    report re-runs ``get_s3_size_and_file_count`` and ``summarise_ds`` (for
    their printed output) and reads the notebook-level ``s3_uri``, ``first``,
    ``latest``, ``total_rows``, ``stats``, ``nc_files`` and ``notebook_start``.
    """
    rule = "=" * 150
    spacer = ' ' * 150

    def banner(title):
        # Section heading framed by two full-width rules
        print(rule)
        print(title)
        print(rule)

    # S3 summary
    banner("PARQUET FILE SUMMARY")
    print(spacer)
    _, _, files = get_s3_size_and_file_count(s3_uri)
    print(spacer)
    print(f"Parquet file created: {first}")
    print(f"Parquet file last updated: {latest}")
    print(spacer)

    # Dask summary
    banner("DATASET SUMMARY")
    print(spacer)
    summarise_ds(s3_uri)
    print(spacer)

    # Deployment stats
    banner("Deployment Statistics")
    print(f"\nTotal rows: {total_rows}")
    print(spacer)
    print(stats.to_string())
    print(spacer)

    # File listings: the parquet files, then the NetCDF files they came from
    listings = (
        ("PARQUET FILES IN DATASET", files),
        ("NETCDF FILES APPENDED", nc_files),
    )
    for title, names in listings:
        banner(title)
        print(spacer)
        for name in names:
            print(name)
        print(rule)
        print(spacer)

    # Total notebook runtime, split into whole minutes and remaining seconds
    elapsed = time.perf_counter() - notebook_start
    whole_minutes, seconds = divmod(elapsed, 60)
    print(spacer)
    print(f"Total notebook runtime: {int(whole_minutes)}m {seconds:.1f}s")
    print(spacer)


# Build the report once, diverting everything it prints into
# "<dataset_name>_summary.html"
capture_prints_to_html(
    html_summary_content,
    dataset_name,
    output_file=f'{dataset_name}_summary.html',
)
