In [1]:
# Import only the helpers this notebook uses so the origin of every name is
# explicit (a wildcard import hides that and can shadow other bindings).
from aws_tools import (
    copy_s3_dataset_to_azure,
    create_aws_resources,
    create_combined_ref,
    create_state_machine_input,
    get_dataset,
    run_state_machine,
)

# Update the state machine and the job definition before starting a run.
create_aws_resources()

In [2]:
# Get the S3 addresses for the dataset.

# Run configuration: these are the values to edit when processing another dataset.
source_bucket = 'nrel-pds-wtk'
prefix = 'south_atlantic'
resolution = '5min'
staging_bucket = 'kerchunk-staging'

# Suffix that distinguishes repeated runs of the same dataset
# (assumed to be an attempt counter — confirm).
run_attempt = 2
# Derived from prefix/resolution so the run name cannot drift out of sync with them.
run_name = f'{prefix}-{resolution}-{run_attempt}'

files = get_dataset(source_bucket, prefix=prefix, resolution=resolution)
# Left as the final expression so the resolved file list is displayed.
files

['nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-01.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-02.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-03.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-04.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-05.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-06.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-07.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-08.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-09.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-10.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-11.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2000-12.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2001-01.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/satlantic_2001-02.h5',
 'nrel-pds-wtk/south_atlantic/monthly/v1.0.0/sat

In [3]:
# Build the state machine input for this dataset.

# Paths of the combined reference files (one for S3, one for Azure), both keyed
# by the dataset prefix and the time resolution.
s3_comb_ref_file = f'wtk/{prefix}/kerchunk_{resolution}_ref_s3.json'
az_comb_ref_file = f'wtk/{prefix}/kerchunk_{resolution}_ref.json'

# Kept as the cell's final expression so the generated input is displayed.
create_state_machine_input(
    files,
    staging_bucket,
    s3_comb_ref_file,
    az_comb_ref_file,
    run_name=run_name,
)

'{"s3_files":["nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-01.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-02.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-03.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-04.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-05.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-06.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-07.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-08.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-09.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-10.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-11.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2000-12.h5","nrel-pds-wtk\\/south_atlantic\\/monthly\\/v1.0.0\\/satlantic_2001-01.h5","nrel-pds-wtk\\/south_at

In [4]:
# Start an execution of the state machine under this run's name.
state_machine = 'kerchunk-h5'
run_state_machine(state_machine, run_name=run_name)

{'executionArn': 'arn:aws:states:us-west-2:351672045885:execution:kerchunk-h5:south_atlantic-5min-2',
 'startDate': datetime.datetime(2023, 9, 7, 16, 18, 29, 364000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'e9ced2eb-9fdb-40f2-9e2a-83c369b986ee',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e9ced2eb-9fdb-40f2-9e2a-83c369b986ee',
   'date': 'Thu, 07 Sep 2023 16:18:29 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '129',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

If the state machine ran to completion without errors, the staging bucket should now contain a set of transformed h5 files, their S3 and Azure (az) reference files, and a combined S3 reference file. Use the `test_staging.ipynb` notebook to verify that the transformation succeeded by loading the combined S3 reference file.

Once you are satisfied, continue with the next two cells to copy the data to Azure and generate the combined Azure (az) reference file.

Before starting the copy, make sure the `.env` file has been updated with your AWS credentials!

In [None]:
# Copy the dataset from the staging bucket to Azure.
# dry_run=False presumably performs the real transfer rather than a preview — confirm.
copy_s3_dataset_to_azure(
    files,
    staging_bucket,
    dry_run=False,
)

In [5]:
# Generate the combined reference file for Azure (abfs remote protocol).
comb_ref_file = f'wtk/{prefix}/kerchunk_{resolution}_ref.json'
create_combined_ref(
    files,
    staging_bucket,
    comb_ref_file=comb_ref_file,
    remote_protocol='abfs',
)

INFO: Scanning...
INFO: Any empty folders will not be processed, because source and/or destination doesn't have full folder support

Job d5998f8f-4aad-1649-4924-6aa7e8fe7c7e has started
Log file is located at: /home/ec2-user/.azcopy/d5998f8f-4aad-1649-4924-6aa7e8fe7c7e.log

100.0 %, 1 Done, 0 Failed, 0 Pending, 0 Skipped, 1 Total,                                  


Job d5998f8f-4aad-1649-4924-6aa7e8fe7c7e summary
Elapsed Time (Minutes): 0.0667
Number of File Transfers: 1
Number of Folder Property Transfers: 0
Number of Symlink Transfers: 0
Total Number of Transfers: 1
Number of File Transfers Completed: 1
Number of Folder Transfers Completed: 0
Number of File Transfers Failed: 0
Number of Folder Transfers Failed: 0
Number of File Transfers Skipped: 0
Number of Folder Transfers Skipped: 0
TotalBytesTransferred: 507255254
Final Job Status: Completed



Once these tasks have finished, you can open the wtk example notebook and verify that the dataset can now be loaded from Azure.