# Cloud Native Geospatial Capstone Project

## Project Goals

### Summary

Create a file collection summary (e.g. number of datapoints / deployments)  by applying SQL commands to any parquet data set stored in `aodn-cloud-optimised` using DuckDB. (I now use dask as well) 

### Test collection

slocum_glider_delayed_qc.parquet/ -> 553 objects, ~34.2GB 

### Motivation

Using familiar tools such as xarray and pandas are slow!



Import packages

In [None]:
import duckdb
import dask.dataframe as dd
import s3fs
import sys
from io import StringIO
from datetime import datetime


Get a summary of the data set using dask

In [29]:
dataset_name = 'slocum_glider_delayed_qc'
s3_uri = f's3://aodn-cloud-optimised/{dataset_name}.parquet/'

In [61]:
def summarise_ds(s3_uri):
    # use dask to read the parquet file from S3
    ddf = dd.read_parquet(s3_uri, engine='pyarrow',
                        storage_options={"profile": "edge-admin"})
    print(ddf) # print structure of dataset
    
    # summarise partitions
    print(f"This data set has {ddf.npartitions} partitions.")
    fs = s3fs.S3FileSystem()
    partitions = fs.ls(s3_uri)
    partition_names = [p.split("=")[-1] for p in partitions]
    print("\n".join(p.split("=")[-1] for p in partitions))

    return partition_names  

# Or just run it directly to see output:
partition_names = summarise_ds(s3_uri)
    

Dask DataFrame Structure:
                PLATFORM DEPLOYMENT SENSOR1 SENSOR2 SENSOR3 SENSOR4 LATITUDE LATITUDE_quality_control LONGITUDE LONGITUDE_quality_control            TIME TIME_quality_control     HEAD HEAD_quality_control     UCUR UCUR_quality_control     VCUR VCUR_quality_control UCUR_GPS UCUR_GPS_quality_control VCUR_GPS VCUR_GPS_quality_control    PHASE PHASE_quality_control  PROFILE PROFILE_quality_control     PRES PRES_quality_control    DEPTH DEPTH_quality_control     TEMP TEMP_quality_control     CNDC CNDC_quality_control     PSAL PSAL_quality_control     DOX2 DOX2_quality_control     DOX1 DOX1_quality_control     CPHL CPHL_quality_control     CDOM CDOM_quality_control     VBSC VBSC_quality_control     NTRA NTRA_quality_control filename IRRAD443 IRRAD443_quality_control IRRAD490 IRRAD490_quality_control IRRAD555 IRRAD555_quality_control IRRAD670 IRRAD670_quality_control      BBP BBP_quality_control  deployment_code        timestamp          polygon
npartitions=552      

Use DuckDB to get information

In [3]:
# Initialize DuckDB connection and setup

con = duckdb.connect()

con.execute("INSTALL httpfs; LOAD httpfs;")
con.execute("""
            SET s3_region='ap-southeast-2';
            """) 
con.execute("PRAGMA enable_progress_bar") # show progress bar for long-running queries
con.execute("PRAGMA threads=14") # use all available threads for query execution

<_duckdb.DuckDBPyConnection at 0x76d6210f37f0>

In [68]:
# SQL query

partition_names_sql = ", ".join(f"'{d}'" for d in partition_names[1::]) # skip the first partition which is metadata


sql = f"""
SELECT
  deployment_code,
  MIN(TIME) AS start_time,
  MAX(TIME) AS end_time,
  COUNT(*)  AS n_rows
FROM read_parquet(
  's3://aodn-cloud-optimised/slocum_glider_delayed_qc.parquet/**/*.parquet',
  hive_partitioning=1
)
WHERE deployment_code IN ({partition_names_sql})
GROUP BY deployment_code
ORDER BY deployment_code
"""

In [67]:
partition_names_sql

"'AIMS20151021', 'AIMS20151127', 'BassStrait20160302', 'BassStrait20170321', 'BassStrait20180305', 'BassStrait20180406', 'BassStrait20190123', 'BassStrait20190216', 'BassStrait20200220', 'BassStrait20200706', 'BassStrait20210617', 'BassStrait20210716', 'BassStrait20220422', 'BassStrait20230518', 'BassStrait20231109', 'BassStrait20240321', 'Bremer20130221', 'Cairns20151130', 'Cairns20160324', 'Cairns20160427', 'Cairns20160726', 'Cairns20180621', 'Cairns20190618', 'Cairns20200514', 'Cairns20230608', 'Cairns20230628', 'CapricornBunker20200307', 'Challenger20180812', 'CharlotteBay20151124', 'Coffs20111112', 'Cooktown20160217', 'Cooktown20160322', 'Cooktown20160503', 'Cooktown20160727', 'Cooktown20191113', 'Cooktown20221117', 'CrowdyHead20091002', 'Dampier20190523', 'Forster120230714', 'Forster20170911', 'Forster20180205', 'Forster20180505', 'Forster20180629', 'Forster20181201', 'Forster20190312', 'Forster20190614', 'Forster20190905', 'Forster20191108', 'Forster20200228', 'Forster20200613',

In [None]:
stats = con.execute(sql).fetchdf()

# Testing 
# hive partitioning, LIMIT 1, group/order by deployment code, count rows, min/max time = 6m45s
# Specifying a list of partition names to loop through increased speed to 1.5s 



In [70]:
stats

Unnamed: 0,deployment_code,start_time,end_time,n_rows
0,AIMS20151021,2015-10-21 03:57:31.734053632,2015-10-27 01:33:01.493292288,76193
1,AIMS20151127,2015-11-26 03:51:30.948281856,2015-12-15 20:21:05.045032704,239474
2,BassStrait20160302,2016-03-01 23:53:20.618420480,2016-03-21 00:22:10.832089600,784590
3,BassStrait20170321,2017-03-21 00:12:43.747376128,2017-04-07 01:31:18.178131456,702002
4,BassStrait20180305,2018-03-05 09:08:35.163017472,2018-04-03 23:42:02.840785152,1225939
...,...,...,...,...
326,Yamba20181031,2018-10-30 23:28:12.828922368,2018-11-24 20:38:16.580535808,1029125
327,Yamba20191108,2019-11-07 23:52:46.675363840,2019-11-28 21:30:20.796973568,839453
328,Yamba20201030,2020-10-29 22:33:10.812895744,2020-11-26 19:33:23.487641088,1113261
329,Yamba20211102,2021-11-02 03:22:01.777531904,2021-11-29 00:49:13.624967168,1015935


In [7]:
con.execute("""
DESCRIBE
SELECT *
FROM parquet_metadata(
    's3://aodn-cloud-optimised/slocum_glider_delayed_qc.parquet/**/*.parquet'
)
""").fetchdf()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,file_name,VARCHAR,YES,,,
1,row_group_id,BIGINT,YES,,,
2,row_group_num_rows,BIGINT,YES,,,
3,row_group_num_columns,BIGINT,YES,,,
4,row_group_bytes,BIGINT,YES,,,
5,column_id,BIGINT,YES,,,
6,file_offset,BIGINT,YES,,,
7,num_values,BIGINT,YES,,,
8,path_in_schema,VARCHAR,YES,,,
9,type,VARCHAR,YES,,,


In [10]:
count = con.execute("""
    SELECT SUM(row_group_num_rows)
    FROM parquet_metadata(
        's3://aodn-cloud-optimised/slocum_glider_delayed_qc.parquet/**/*.parquet'
    )
""").fetchone()[0]


100% ▕██████████████████████████████████████▏ (00:01:23.35 elapsed)     
Number of rows: 15946447312


In [11]:
print(f"Number of rows: {count}")

Number of rows: 15946447312


Export information to a html 

In [None]:
def capture_prints_to_html(func, dataset_name, output_file='output.html', *args, **kwargs):
    """
    Execute a function and capture all print statements to an HTML file.
    
    Parameters:
    -----------
    func : callable
        The function to execute
    dataset_name : str
        Name of the dataset
    output_file : str
        Path to the output HTML file
    *args, **kwargs
        Arguments to pass to the function
    
    Returns:
    --------
    result : any
    func : callable
        The function to execute
    output_file : str
        Path to the output HTML file
    *args, **kwargs
        Arguments to pass to the function
    
    Returns:
    --------
    result : any
        The return value of the function
    """
    # Create a StringIO object to capture stdout
    captured_output = StringIO()
    old_stdout = sys.stdout
    
    try:
        # Redirect stdout to our StringIO object
        sys.stdout = captured_output
        
        # Execute the function
        result = func(*args, **kwargs)
        
    finally:
        # Restore stdout
        sys.stdout = old_stdout
    
    # Get the captured output
    output_text = captured_output.getvalue()
    
    # Create HTML content
    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{dataset_name}</title>
    <style>
        body {{
            font-family: 'Courier New', monospace;
            font-size: 16px;
            background-color: #f5f5f5;
            padding: 20px;
            max-width: 1200px;
            margin: 0 auto;
        }}
        .container {{
            background-color: white;
            padding: 30px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }}
        h1 {{
            color: #333;
            font-size: 28px;
            border-bottom: 2px solid #4CAF50;
            padding-bottom: 10px;
        }}
        .timestamp {{
            color: #666;
            font-size: 14px;
            margin-bottom: 20px;
        }}
        pre {{
            background-color: #f8f8f8;
            border: 1px solid #ddd;
            border-radius: 4px;
            padding: 15px;
            overflow-x: auto;
            line-height: 1.6;
            font-size: 15px;
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>{dataset_name}</h1>
        <div class="timestamp">Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
        <pre>{output_text}</pre>
    </div>
</body>
</html>"""
    
    # Write to file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)
    
    print(f"Output saved to: {output_file}")
    
    return result


In [71]:
# Combined analysis function
def html_summary_content():
    # Part 1: Dask summary
    print("=" * 60)
    print("DASK DATASET SUMMARY")
    print("=" * 60)
    summarise_ds(s3_uri)
    
    # Part 2: Data summary
    print("\n\n")
    print("=" * 60)
    print("DATA SUMMARY REPORT")
    print("=" * 60)
    print(f"\nTotal rows: {count}")
    print(f"\nDeployment Statistics:")
    print(stats.to_string())

# Capture all output to a single HTML file
capture_prints_to_html(html_summary_content, dataset_name, f'{dataset_name}_summary.html')


Output saved to: slocum_glider_delayed_qc_summary.html
