diff --git a/LICENSE b/LICENSE index d716797..da7e522 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2023 Alliance for Sustainable Energy, LLC and Skye Analytics, Inc. +Copyright (c) 2024 Alliance for Sustainable Energy, LLC and Skye Analytics, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index 7c7ee54..29ef781 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Open Data Access Tools The Open Energy Data Initiative (OEDI) provides a number of tools to enable the use of the open data published through this initiative. The source is largely written in Python, including Jupyter notebooks. -Copyright (c) 2023 Alliance for Sustainable Energy, LLC and Skye Analytics, Inc. +Copyright (c) 2024 Alliance for Sustainable Energy, LLC and Skye Analytics, Inc. Open Data Access Tools: NREL SWR-20-57. Azure Data Tools: SWR-23-92. diff --git a/azure/dev-env.yml b/azure/dev-env.yml new file mode 100644 index 0000000..bdb2dad --- /dev/null +++ b/azure/dev-env.yml @@ -0,0 +1,18 @@ +name: oedi-azure-dev +channels: + - conda-forge + - defaults +dependencies: + - python=3.10.12 + - h5py=3.9.0 + - boto3 + - cftime + - kerchunk + - planetary-computer + - s3fs=2023.6.0 + - pandas + - ujson + - xarray + - zarr + - ipykernel + - adlfs diff --git a/azure/documentation/ pv_rooftop.md b/azure/documentation/ pv_rooftop.md new file mode 100644 index 0000000..2ce34c1 --- /dev/null +++ b/azure/documentation/ pv_rooftop.md @@ -0,0 +1,346 @@ +# PV Rooftop Database + +## Overview + +The National Renewable Energy Laboratory's (NREL) PV Rooftop Database (PVRDB) is a lidar-derived, geospatially-resolved dataset of suitable roof surfaces and their PV technical potential for 128 metropolitan regions in the United States. The source lidar data and building footprints were obtained by the U.S. Department of Homeland Security's Homeland Security Infrastructure Program for 2006-2014. Using GIS methods, NREL identified suitable roof surfaces based on their size, orientation, and shading parameters (Gagnon et al. 2016). Standard 2015 technical potential was then estimated for each plane using NREL's System Advisor Model. + +The PVRDB is downloadable by city and year of lidar collection. Four geospatial layers are available for each city and year: 1) the raster extent of the lidar collection, 2) buildings identified from the lidar data, 3) suitable developable planes for each building, and 4) aspect values of the developable planes. + +## Storage Resources + +The pv-rooftop dataset is made available in Parquet format in the following container: + +`https://nrel.blob.core.windows.net/oedi` + +### Data + +The data are located in the `pv-rooftops/` directory. The four main datasets are stored in the following subdirectories: + +Main datasets + - `/aspects` + - `/buildings` + - `/developable_planes` + - `/rasd` + +Each partition is stored in an individual folder within each subdirectory. + +Partitions + +- `/city_year=__` + +e.g. `/city_year=dover_de_09` + +### Data Format + +The PV Rooftops dataset is provided in geoparquet format partitioned by city_year.
There are 4 core datasets: + +#### `oedi/pv-rooftop/aspects` +field | data_type | description +-- | -- | -- +`gid` | bigint |   +`city` | string | city of source lidar dataset +`state` | string | state of source lidar dataset +`year` | bigint | year of source lidar dataset +`bldg_fid` | bigint | building id +`aspect` | bigint | aspect value +`the_geom_96703` | string | projected geometry ([US Contiguous Albers Equal Area Conic - SRID 6703](https://spatialreference.org/ref/sr-org/6703/)) +`the_geom_4326` | string | geometry ([WGS 1984 - SRID 4326](https://spatialreference.org/ref/epsg/4326/)) +`region_id` | bigint |   + + +#### `oedi/pv-rooftop/buildings` + +field | data_type | description +-- | -- | -- +`gid` | bigint |   +`bldg_fid` | bigint | the building fid +`the_geom_96703` | string | projected geometry ([US Contiguous Albers Equal Area Conic - SRID 6703](https://spatialreference.org/ref/sr-org/6703/)) +`the_geom_4326` | string | geometry ([WGS 1984 - SRID 4326](https://spatialreference.org/ref/epsg/4326/)) +`city` | string | the city of the source lidar data +`state` | string | the state of the source lidar data +`year` | bigint | the year of the source lidar data +`region_id` | bigint |   + + +#### `oedi/pv-rooftop/developable_planes` + +field | data_type | description +-- | -- | -- +`bldg_fid` | bigint | building ID associated with the developable plane +`footprint_m2` | double | developable plane footprint area (m2) +`slope` | bigint | slope value +`flatarea_m2` | double | flat area of the developable plane (m2) +`slopeconversion` | double | the slope conversion factor used to convert the flat area into the sloped area +`slopearea_m2` | double | sloped area of the developable plane (m2) +`zip` | string | zipcode +`zip_perc` | double |   +`aspect` | bigint | the aspect value of the developable plane +`gid` | bigint | unique developable plane ID +`city` | string | the city of the source lidar data +`state` | string | the state of the source lidar data +`year` | bigint | the year of the source lidar data +`region_id` | bigint |   +`the_geom_96703` | string | projected geometry ([US Contiguous Albers Equal Area Conic - SRID 6703](https://spatialreference.org/ref/sr-org/6703/)) +`the_geom_4326` | string | geometry ([WGS 1984 - SRID 4326](https://spatialreference.org/ref/epsg/4326/)) + + +#### `oedi/pv-rooftop/rasd` + +field | data_type | description +-- | -- | -- +`gid` | bigint | the unique geographic ID of the raster domain +`the_geom_96703` | string | projected geometry ([US Contiguous Albers Equal Area Conic - SRID 6703](https://spatialreference.org/ref/sr-org/6703/)) +`the_geom_4326` | string | geometry ([WGS 1984 - SRID 4326](https://spatialreference.org/ref/epsg/4326/)) +`city` | string | the city of the source lidar data +`state` | string | the state of the source lidar data +`year` | bigint | the year of the source lidar data +`region_id` | bigint |   +`serial_id` | bigint |   +`__index_level_0__` | bigint |   + + +Within each core dataset there are partitions by city_state_year(YY) that can be queried using pyarrow or Dask, or downloaded as individual geoparquet data files, as in the sketch below.
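As a minimal sketch, a single partition can be read with pyarrow over blob storage. It assumes a temporary SAS token from the Planetary Computer (the same pattern used in the accompanying sample notebook) and uses the `dover_de_09` example partition named above; adjust the column list to whichever fields you need:

```python
import planetary_computer
import pyarrow.parquet as pq
from adlfs import AzureBlobFileSystem
from pyarrow.fs import PyFileSystem, FSSpecHandler

# Temporary, read-only SAS token for the 'nrel' account / 'oedi' container
token = planetary_computer.sas.get_token('nrel', 'oedi').token
fs = PyFileSystem(FSSpecHandler(AzureBlobFileSystem('nrel', credential=token)))

# Read one city_year partition; columns follow the buildings schema above
buildings = pq.read_table(
    'oedi/pv-rooftop/buildings',
    filesystem=fs,
    filters=[('city_year', '=', 'dover_de_09')],
    columns=['gid', 'bldg_fid', 'city', 'state', 'year', 'the_geom_4326'],
)
print(buildings.num_rows)
```

The same path and `filters` argument also work with `geopandas.read_parquet` or Dask; the sample notebook referenced below walks through a full geopandas workflow.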
+ +Aspects Lookup: +``` +1 337.5 - 22.5 north +2 22.5 - 67.5 northeast +3 67.5 - 112.5 east +4 112.5 - 157.5 southeast +5 157.5 - 202.5 south +6 202.5 - 247.5 southwest +7 247.5 - 292.5 west +8 292.5 - 337.5 northwest +0 flat flat +``` + +Regions Lookup: +``` +1 Albany NY 2006-01-01 +2 Albany NY 2013-01-01 +3 Albuquerque NM 2006-01-01 +4 Albuquerque NM 2012-01-01 +5 Allentown PA 2006-01-01 +6 Amarillo TX 2008-01-01 +7 Anaheim CA 2010-01-01 +8 Arnold MO 2006-01-01 +9 Atlanta GA 2008-01-01 +10 Atlanta GA 2013-01-01 +11 Augusta GA 2010-01-01 +12 Augusta ME 2008-01-01 +13 Austin TX 2006-01-01 +14 Austin TX 2012-01-01 +15 Bakersfield CA 2010-01-01 +16 Baltimore MD 2008-01-01 +17 Baltimore MD 2013-01-01 +18 Baton Rouge LA 2006-01-01 +19 Baton Rouge LA 2012-01-01 +20 Birmingham AL 2008-01-01 +21 Bismarck ND 2008-01-01 +22 Boise ID 2007-01-01 +23 Boise ID 2013-01-01 +24 Boulder CO 2014-01-01 +25 Bridgeport CT 2006-01-01 +26 Bridgeport CT 2013-01-01 +27 Buffalo NY 2008-01-01 +28 Carson City NV 2009-01-01 +29 Charleston SC 2010-01-01 +30 Charleston WV 2009-01-01 +31 Charlotte NC 2006-01-01 +32 Charlotte NC 2012-01-01 +33 Cheyenne WY 2008-01-01 +34 Chicago IL 2008-01-01 +35 Chicago IL 2012-01-01 +36 Cincinnati OH 2010-01-01 +37 Cleveland OH 2012-01-01 +38 Colorado Springs CO 2006-01-01 +39 Colorado Springs CO 2013-01-01 +40 Columbia SC 2009-01-01 +41 Columbus GA 2009-01-01 +42 Columbus OH 2006-01-01 +43 Columbus OH 2012-01-01 +44 Concord NH 2009-01-01 +45 Corpus Christi TX 2012-01-01 +46 Dayton OH 2006-01-01 +47 Dayton OH 2012-01-01 +48 Denver CO 2012-01-01 +49 Des Moines IA 2010-01-01 +50 Detroit MI 2012-01-01 +51 Dover DE 2009-01-01 +52 El Paso TX 2007-01-01 +53 Flint MI 2009-01-01 +54 Fort Wayne IN 2008-01-01 +55 Frankfort KY 2012-01-01 +56 Fresno CA 2006-01-01 +57 Fresno CA 2013-01-01 +58 Ft Belvoir DC 2012-01-01 +59 Grand Rapids MI 2013-01-01 +60 Greensboro NC 2009-01-01 +61 Harrisburg PA 2009-01-01 +62 Hartford CT 2006-01-01 +63 Hartford CT 2013-01-01 +64 Helena MT 2007-01-01 +65 Helena MT 2013-01-01 +66 Houston TX 2010-01-01 +67 Huntsville AL 2009-01-01 +68 Indianapolis IN 2006-01-01 +69 Indianapolis IN 2012-01-01 +70 Jackson MS 2007-01-01 +71 Jacksonville FL 2010-01-01 +72 Jefferson City MO 2008-01-01 +73 Kansas City MO 2010-01-01 +74 Kansas City MO 2012-01-01 +75 LaGuardia JFK NY 2012-01-01 +76 Lancaster PA 2010-01-01 +77 Lansing MI 2007-01-01 +78 Lansing MI 2013-01-01 +79 Las Vegas NV 2009-01-01 +80 Lexington KY 2012-01-01 +81 Lincoln NE 2008-01-01 +82 Little Rock AR 2008-01-01 +83 Los Angeles CA 2007-01-01 +84 Louisville KY 2006-01-01 +85 Louisville KY 2012-01-01 +86 Lubbock TX 2008-01-01 +87 Madison WI 2010-01-01 +88 Manhattan NY 2007-01-01 +89 McAllen TX 2008-01-01 +90 Miami FL 2009-01-01 +91 Milwaukee WI 2007-01-01 +92 Milwaukee WI 2013-01-01 +93 Minneapolis MN 2007-01-01 +94 Minneapolis MN 2012-01-01 +95 Mission Viejo CA 2013-01-01 +96 Mobile AL 2010-01-01 +97 Modesto CA 2010-01-01 +98 Montgomery AL 2007-01-01 +99 Montpelier VT 2009-01-01 +100 Newark NJ 2007-01-01 +101 New Haven CT 2007-01-01 +102 New Haven CT 2013-01-01 +103 New Orleans LA 2008-01-01 +104 New Orleans LA 2012-01-01 +105 New York NY 2005-01-01 +106 New York NY 2013-01-01 +107 Norfolk VA 2007-01-01 +108 Oklahoma City OK 2007-01-01 +109 Oklahoma City OK 2013-01-01 +110 Olympia WA 2010-01-01 +111 Omaha NE 2007-01-01 +112 Omaha NE 2013-01-01 +113 Orlando FL 2009-01-01 +114 Oxnard CA 2010-01-01 +115 Palm Bay FL 2010-01-01 +116 Pensacola FL 2009-01-01 +117 Philadelphia PA 2007-01-01 +118 Pierre SD 2008-01-01 +119 Pittsburgh 
PA 2004-01-01 +120 Pittsburgh PA 2012-01-01 +121 Portland OR 2012-01-01 +122 Poughkeepsie NY 2012-01-01 +123 Providence RI 2004-01-01 +124 Providence RI 2012-01-01 +125 Raleigh-Durham NC 2010-01-01 +126 Reno NV 2007-01-01 +127 Richmond VA 2008-01-01 +128 Richmond VA 2013-01-01 +129 Rochester NY 2008-01-01 +130 Rochester NY 2014-01-01 +131 Sacramento CA 2012-01-01 +132 Salem OR 2008-01-01 +133 Salt Lake City UT 2012-01-01 +134 San Antonio TX 2008-01-01 +135 San Antonio TX 2013-01-01 +137 San Diego CA 2008-01-01 +138 San Diego CA 2013-01-01 +139 San Francisco CA 2013-01-01 +140 Santa Fe NM 2009-01-01 +141 Sarasota FL 2009-01-01 +142 Scranton PA 2008-01-01 +143 Seattle WA 2011-01-01 +144 Shreveport LA 2008-01-01 +145 Spokane WA 2008-01-01 +146 Springfield IL 2009-01-01 +147 Springfield MA 2007-01-01 +148 Springfield MA 2013-01-01 +149 St Louis MO 2008-01-01 +150 St Louis MO 2013-01-01 +151 Stockton CA 2010-01-01 +152 Syracuse NY 2008-01-01 +153 Tallahassee FL 2009-01-01 +154 Tampa FL 2008-01-01 +155 Toledo OH 2006-01-01 +156 Toledo OH 2012-01-01 +157 Topeka KS 2008-01-01 +158 Trenton NJ 2008-01-01 +159 Tucson AZ 2007-01-01 +160 Tulsa OK 2008-01-01 +161 Washington DC 2009-01-01 +162 Washington DC 2012-01-01 +163 Wichita KS 2012-01-01 +164 Winston-Salem NC 2009-01-01 +165 Worcester MA 2009-01-01 +166 Youngstown OH 2008-01-01 +167 Andrews AFB DC 2012-01-01 +136 San Bernardino-Riverside CA 2012-01-01 +168 Tampa FL 2013-01-01 +``` + + +## Sample code + +A complete Python example of accessing and visualizing some of these data is available in the accompanying [sample notebook](https://nbviewer.jupyter.org/github/microsoft/AIforEarthDataSets/blob/main/data/pv_rooftop.ipynb). + +## Mounting the container + +We also provide a read-only SAS (shared access signature) token to allow access via, e.g., [BlobFuse](https://github.com/Azure/azure-storage-fuse), which allows you to mount blob containers as drives: + +`https://nrel.blob.core.windows.net/oedi?sv=2019-12-12&si=oedi-ro&sr=c&sig=uslpLxKf3%2Foeu79ufIHbJkpI%2FTWDH3lblJMa5KQRPmM%3D` + +Mounting instructions for Linux are [here](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-how-to-mount-container-linux). + +## References + +Main References: +1. [Rooftop Solar Photovoltaic Technical Potential in the United States: A Detailed Assessment](https://www.nrel.gov/docs/fy16osti/65298.pdf) + +2. [Using GIS-based methods and lidar data to estimate rooftop solar technical potential in US cities](https://iopscience.iop.org/article/10.1088/1748-9326/aa7225/pdf) + +3. [Estimating rooftop solar technical potential across the US using a combination of GIS-based methods, lidar data, and statistical modeling](https://iopscience.iop.org/article/10.1088/1748-9326/aaa554/pdf) + +4. [Rooftop Photovoltaic Technical Potential in the United States](https://data.nrel.gov/submissions/121) + +5. [U.S. PV-Suitable Rooftop Resources](https://data.nrel.gov/submissions/47) + +Related Reference: + +1. [Rooftop Solar Technical Potential for Low-to-Moderate Income Households in the United States](https://www.nrel.gov/docs/fy18osti/70901.pdf) + +2. [Rooftop Energy Potential of Low Income Communities in America REPLICA](https://data.nrel.gov/submissions/81) + +3. [Puerto Rico Solar-for-All: LMI PV Rooftop Technical Potential and Solar Savings Potential](https://data.nrel.gov/submissions/144) + + +## Disclaimer and Attribution + +Copyright (c) 2020, Alliance for Sustainable Energy LLC, All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +## Contact + +For questions about this dataset, contact [`aiforearthdatasets@microsoft.com`](mailto:aiforearthdatasets@microsoft.com?subject=oedi%20question). + + +## Notices + +Microsoft provides this dataset on an "as is" basis. Microsoft makes no warranties (express or implied), guarantees, or conditions with respect to your use of the dataset. To the extent permitted under your local law, Microsoft disclaims all liability for any damages or losses * including direct, consequential, special, indirect, incidental, or punitive * resulting from your use of this dataset. This dataset is provided under the original terms that Microsoft received source data. \ No newline at end of file diff --git a/azure/documentation/PR100.md b/azure/documentation/PR100.md new file mode 100644 index 0000000..69e9052 --- /dev/null +++ b/azure/documentation/PR100.md @@ -0,0 +1,55 @@ +# NREL Puerto Rico 100 Dataset (PR100) + + +## Overview + +The [Puerto Rico Grid Resilience and Transitions to 100% Renewable Energy Study](https://www.energy.gov/gdo/puerto-rico-grid-resilience-and-transitions-100-renewable-energy-study-pr100) is a two-year study by the U.S. Department of Energy’s (DOE’s) Grid Deployment Office and six national laboratories to comprehensively analyze stakeholder-driven pathways to Puerto Rico’s clean energy future. + +The PR100 dataset is a collection of geospatial data that will be useful for renewable energy development in Puerto Rico. The dataset is curated by the National Renewable Energy Laboratory. + + +## Storage resources + +The data are stored in Azure Blob Storage, in the following container: + +`https://nrel.blob.core.windows.net/oedi` + + +### Data + +The data are located in the `PR100/` directory and have been categorized into five subdirectories: + +- `Boundaries/` +- `Habitat/` +- `Hazards/` +- `Infrastructure/` +- `Topography/` + + +### Data format + +Vector data are stored in the geoparquet format and rasters are stored as cloud-optimized GeoTIFFs.
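As a minimal sketch, one vector layer and one raster can be opened directly over blob storage. It reuses the file paths from the accompanying example notebook and a temporary SAS token from the Planetary Computer, and it assumes `geopandas` and `rasterio` are installed in addition to the packages in `dev-env.yml`:

```python
import planetary_computer
import geopandas
import rasterio
from adlfs import AzureBlobFileSystem
from pyarrow.fs import PyFileSystem, FSSpecHandler

# Temporary, read-only SAS token for the 'nrel' account / 'oedi' container
token = planetary_computer.sas.get_token('nrel', 'oedi').token
fs = PyFileSystem(FSSpecHandler(AzureBlobFileSystem('nrel', credential=token)))

# Vector layer (geoparquet) -> GeoDataFrame
protected = geopandas.read_parquet(
    'oedi/PR100/Boundaries/land_protected_areas.parquet', filesystem=fs
)

# Raster layer (cloud-optimized GeoTIFF)
with fs.open_input_file('oedi/PR100/Topography/elevation.tif') as f:
    with rasterio.open(f) as elevation:
        print(elevation.meta)
```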
+ + +## Sample code + +A complete Python example of accessing and visualizing some of these data is available in the accompanying [sample notebook](https://nbviewer.jupyter.org/github/microsoft/AIforEarthDataSets/blob/main/data/PR100.ipynb). + + +## Mounting the container + +We also provide a read-only SAS (shared access signature) token to allow access via, e.g., [BlobFuse](https://github.com/Azure/azure-storage-fuse), which allows you to mount blob containers as drives: + +`https://nrel.blob.core.windows.net/oedi?sv=2019-12-12&si=oedi-ro&sr=c&sig=uslpLxKf3%2Foeu79ufIHbJkpI%2FTWDH3lblJMa5KQRPmM%3D` + +Mounting instructions for Linux are [here](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-how-to-mount-container-linux). + + +## Contact + +For questions about this dataset, contact [`aiforearthdatasets@microsoft.com`](mailto:aiforearthdatasets@microsoft.com?subject=oedi%20question). + + +## Notices + +Microsoft provides this dataset on an "as is" basis. Microsoft makes no warranties (express or implied), guarantees, or conditions with respect to your use of the dataset. To the extent permitted under your local law, Microsoft disclaims all liability for any damages or losses * including direct, consequential, special, indirect, incidental, or punitive * resulting from your use of this dataset. This dataset is provided under the original terms that Microsoft received source data. \ No newline at end of file diff --git a/azure/documentation/az_cli_guide.md b/azure/documentation/az_cli_guide.md new file mode 100644 index 0000000..5e153be --- /dev/null +++ b/azure/documentation/az_cli_guide.md @@ -0,0 +1,19 @@ +## Azure CLI Guide + +OEDI data exist as blobs in Azure. Blobs live in containers. Containers live in storage accounts. For most of our data, the storage account is 'nrel' and the container is 'oedi'. There is a directory structure within the container to organize different data sets. Currently, the datasets present are 'PR100', 'pv-rooftop' and part of 'sup3rcc'. NSRDB lives in the 'nrel' storage account but in a different container called 'nrel-nsrdb'. + +In order to access data from the command line, you will need to obtain a temporary SAS token from the planetary computer. You can then use that token as an argument for any commands you make with the CLI. 
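If you are scripting this, the same flow works from Python: request a token from the Planetary Computer token endpoint and pass it to the CLI. A minimal sketch, assuming the Azure CLI is installed and on your PATH:

```python
import requests
import subprocess

# Request a temporary SAS token for the 'nrel' account / 'oedi' container
token = requests.get(
    'https://planetarycomputer.microsoft.com/api/sas/v1/token/nrel/oedi'
).json()['token']

# List blobs under the PR100 prefix, passing the token to the Azure CLI
subprocess.run(
    ['az', 'storage', 'blob', 'list',
     '--account-name', 'nrel', '--container-name', 'oedi',
     '--prefix', 'PR100', '--output', 'table',
     '--sas-token', token],
    check=True,
)
```

The equivalent curl and az commands are shown below.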
CLI reference for interacting with blobs: https://learn.microsoft.com/en-us/cli/azure/storage/blob?view=azure-cli-latest#az-storage-blob-download + +Finally, if the goal is to move large amounts of data from blob storage to S3 or local storage, the best tool is azcopy: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10 + +Obtain a Planetary Computer temporary access token: + +`curl https://planetarycomputer.microsoft.com/api/sas/v1/token/nrel/oedi > sas.json` + +View a list of blobs in the PR100 dataset: + +`az storage blob list --account-name nrel --container-name oedi --output table --prefix PR100 --sas-token ""` + +Download a blob from the PR100 dataset: + +`az storage blob download --account-name nrel --container-name oedi --name PR100/Infrastructure/setbacks_runway.parquet --file setbacks_runway.parquet --sas-token ""` diff --git a/azure/documentation/sup3rcc.md b/azure/documentation/sup3rcc.md new file mode 100644 index 0000000..10679b6 --- /dev/null +++ b/azure/documentation/sup3rcc.md @@ -0,0 +1,103 @@ +# Super-Resolution for Renewable Energy Resource Data with Climate Change Impacts (Sup3rCC) + + +## Overview + +The Super-Resolution for Renewable Energy Resource Data with Climate Change Impacts (Sup3rCC) data is a collection of 4km hourly wind, solar, temperature, humidity, and pressure fields for the contiguous United States under climate change scenarios. + +Sup3rCC is downscaled Global Climate Model (GCM) data. For example, the initial dataset "sup3rcc_conus_mriesm20_ssp585_r1i1p1f1" is downscaled from MRI ESM 2.0 for climate change scenario SSP5 8.5 and variant label r1i1p1f1. The downscaling process was performed using a generative machine learning approach called sup3r: Super-Resolution for Renewable Energy Resource Data ([Sup3r GitHub Repo](https://github.com/NREL/sup3r)). The data includes both historical and future weather years, although the historical years represent the historical average climate, not the actual historical weather that we experienced. + +The Sup3rCC data is intended to help researchers study the impact of climate change on energy systems with high levels of wind and solar capacity. Please note that all climate change data is only a representation of the *possible* future climate and contains significant uncertainty. Analysis of multiple climate change scenarios and multiple climate models can help quantify this uncertainty. + +For more information, see the [OEDI Sup3rCC catalog entry](https://data.openei.org/submissions/5839). + +## Storage Resources + +The Sup3rCC dataset is made available in h5 format in the following container: + +`https://nrel.blob.core.windows.net/oedi` + +### Data + +The data are located in the `sup3rcc/` directory. The initial dataset is in the subdirectory `conus_mriesm20_ssp585_r1i1p1f1/`. + +Each h5 file's name encodes the variables it contains and the year. + +e.g. `sup3rcc_conus_mriesm20_ssp585_r1i1p1f1_pressure_2015.h5` + +### Data Format + +The Sup3rCC dataset is provided in h5 format. A kerchunk reference file is also included to facilitate faster access.
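As a minimal sketch, an individual h5 file can be read directly over blob storage with `h5py` and `adlfs` using a temporary SAS token from the Planetary Computer. The path is assembled from the directory layout above, the dataset name follows the Variables table below, and the time-by-site array layout assumed here should be verified against the file itself:

```python
import h5py
import planetary_computer
from adlfs import AzureBlobFileSystem

# Temporary, read-only SAS token for the 'nrel' account / 'oedi' container
token = planetary_computer.sas.get_token('nrel', 'oedi').token
fs = AzureBlobFileSystem('nrel', credential=token)

# Path assembled from the directory layout and example file name above
path = ('oedi/sup3rcc/conus_mriesm20_ssp585_r1i1p1f1/'
        'sup3rcc_conus_mriesm20_ssp585_r1i1p1f1_pressure_2015.h5')

with fs.open(path, 'rb') as f, h5py.File(f, 'r') as h5:
    print(list(h5.keys()))  # e.g. pressure_0m plus metadata datasets
    # Assumed (time, site) layout: first 24 hours at the first 100 sites
    sample = h5['pressure_0m'][:24, :100]
```

For larger reads, opening the included kerchunk reference with `fsspec`, `zarr`, and `xarray` (all in `dev-env.yml`) avoids streaming whole files; the reference file's name can be found by listing the `sup3rcc/` directory.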
+ +#### `Dimensions:` +field | data_type +-- | -- +`time_index` | int +`latitude` | float +`longitude` | float + +#### `Location Metadata:` + +field | data_type +-- | -- +`country` | string +`state` | string +`county` | string +`timezone` | string +`eez` | string +`elevation` | string +`offshore` | string + +#### `Variables:` + +field | data_type +-- | -- +`dhi` | float +`dni` | float +`ghi` | float +`pressure_0m` | float +`relativehumidity_2m` | float +`temperature_2m` | float +`winddirection_100m` | float +`winddirection_10m` | float +`winddirection_200m` | float +`windspeed_100m` | float +`windspeed_10m` | float +`windspeed_200m` | float +`offshore` | float + +## Sample code + +A complete Python example of accessing and visualizing some of these data is available in the accompanying [sample notebook](https://nbviewer.jupyter.org/github/microsoft/AIforEarthDataSets/blob/main/data/sup3rcc.ipynb). + +## Mounting the container + +We also provide a read-only SAS (shared access signature) token to allow access via, e.g., [BlobFuse](https://github.com/Azure/azure-storage-fuse), which allows you to mount blob containers as drives: + +`https://nrel.blob.core.windows.net/oedi?sv=2019-12-12&si=oedi-ro&sr=c&sig=uslpLxKf3%2Foeu79ufIHbJkpI%2FTWDH3lblJMa5KQRPmM%3D` + +Mounting instructions for Linux are [here](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-how-to-mount-container-linux). + +## Disclaimer and Attribution + +Copyright (c) 2020, Alliance for Sustainable Energy LLC, All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +## Contact + +For questions about this dataset, contact [`aiforearthdatasets@microsoft.com`](mailto:aiforearthdatasets@microsoft.com?subject=oedi%20question). + + +## Notices + +Microsoft provides this dataset on an "as is" basis. Microsoft makes no warranties (express or implied), guarantees, or conditions with respect to your use of the dataset. To the extent permitted under your local law, Microsoft disclaims all liability for any damages or losses * including direct, consequential, special, indirect, incidental, or punitive * resulting from your use of this dataset. 
This dataset is provided under the original terms that Microsoft received source data. \ No newline at end of file diff --git a/azure/examples/PR100.ipynb b/azure/examples/PR100.ipynb new file mode 100644 index 0000000..476eb9f --- /dev/null +++ b/azure/examples/PR100.ipynb @@ -0,0 +1,183 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Demo Notebook for Accessing PR100 Data on Azure\n", + "\n", + "Launched on February 2, 2022, a two-year study entitled Puerto Rico Grid Resilience and Transitions to 100% Renewable Energy (PR100) will perform a comprehensive analysis of stakeholder-driven pathways to Puerto Rico’s clean energy future. For more information, please visit [https://www.energy.gov/gdo/puerto-rico-grid-resilience-and-transitions-100-renewable-energy-study-pr100].\n", + "\n", + "To support the PR100 project, the Open Energy Data Initiative has made an assortment of data sets available for free public access. This notebook will demonstrate how to access the PR100 data located in Azure BLOB storage." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get Access Token\n", + "\n", + "You do not need an Azure account to access public data. Instead, you can obtain a temporary access token via the Planetary Computer's API. This can be accomplished via either the requests or planetary_computer libraries. Both options are shown below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get a token with requests\n", + "import requests\n", + "\n", + "token = requests.get(\n", + " 'https://planetarycomputer.microsoft.com/api/sas/v1/token/nrel/oedi'\n", + ").json()['token']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get a token with planetary-computer\n", + "import planetary_computer\n", + "\n", + "token = planetary_computer.sas.get_token('nrel', 'oedi').token\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Explore Container\n", + "\n", + "Use the token to create a PyFileSystem object. You can explore the contents of the container using the get_file_info method. The PR100 data consists of geoparquet and geotiff files that are organized into directories:\n", + "- Boundaries\n", + "- Habitat\n", + "- Hazards\n", + "- Infrastructure\n", + "- Topography" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyarrow.fs import PyFileSystem, FSSpecHandler, FileSelector\n", + "from adlfs import AzureBlobFileSystem\n", + "\n", + "# Create file system\n", + "fs = PyFileSystem(\n", + " FSSpecHandler(\n", + " AzureBlobFileSystem('nrel', credential=token)\n", + " )\n", + ")\n", + "\n", + "# View files in the 'Boundaries' directory\n", + "fs.get_file_info(FileSelector('/oedi/PR100/Boundaries/'))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Vector Data\n", + "\n", + "Let's load one of those files into a geodataframe and visualize it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas\n", + "\n", + "df = geopandas.read_parquet('oedi/PR100/Boundaries/land_protected_areas.parquet', filesystem=fs)\n", + "df.explore()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Raster Data\n", + "\n", + "If we look in the Topography directory, we'll see some tif files. These are cloud optimized GeoTiffs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fs.get_file_info(FileSelector('/oedi/PR100/Topography/'))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can load these files with the rasterio package." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import rasterio\n", + "import rasterio.plot\n", + "\n", + "with fs.open_input_file('oedi/PR100/Topography/elevation.tif') as file:\n", + " raster = rasterio.open(file)\n", + " print(raster.meta)\n", + " rasterio.plot.show(raster, adjust=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pr100-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "d2e0ca302a5f5f673bd05f1fbb5f2420578af44ff0f5cef95f9f5d4b68b66ae3" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/azure/examples/pv_rooftop.ipynb b/azure/examples/pv_rooftop.ipynb new file mode 100644 index 0000000..1bcb163 --- /dev/null +++ b/azure/examples/pv_rooftop.ipynb @@ -0,0 +1,308 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Demo Notebook for Accessing PV Rooftop Data on Azure\n", + "\n", + "The National Renewable Energy Laboratory's (NREL) PV Rooftop Database (PVRDB) is a lidar-derived, geospatially-resolved dataset of suitable roof surfaces and their PV technical potential for 128 metropolitan regions in the United States. The source lidar data and building footprints were obtained by the U.S. Department of Homeland Security Homeland Security Infrastructure Program for 2006-2014. Using GIS methods, NREL identified suitable roof surfaces based on their size, orientation, and shading parameters Gagnon et al. (2016). Standard 2015 technical potential was then estimated for each plane using NREL's System Advisory Model.\n", + "\n", + "This notebook will demonstrate how to access the PV Rooftop data located in Azure BLOB storage." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get Access Token\n", + "\n", + "You do not need an Azure account to access public data. Instead, you can obtain a temporary access token via the Planetary Computer's API. This can be accomplished via either the requests or planetary_computer libraries. Both options are shown below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get a token with requests\n", + "import requests\n", + "\n", + "token = requests.get(\n", + " 'https://planetarycomputer.microsoft.com/api/sas/v1/token/nrel/oedi'\n", + ").json()['token']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get a token with planetary-computer\n", + "import planetary_computer\n", + "\n", + "token = planetary_computer.sas.get_token('nrel', 'oedi').token" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Explore Container\n", + "\n", + "First, we use the token to create a PyFileSystem object. We can then use ParquetDataset objects to explore the metadata for each table. pv_rooftop consists of 4 tables:\n", + "- buildings\n", + "- aspects\n", + "- developable-planes\n", + "- rasd\n", + "\n", + "Each table is partitioned by city_year." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyarrow.fs import PyFileSystem, FSSpecHandler\n", + "from adlfs import AzureBlobFileSystem\n", + "import pyarrow.parquet as pq\n", + "\n", + "# Create file system using token\n", + "fs = PyFileSystem(\n", + " FSSpecHandler(\n", + " AzureBlobFileSystem('nrel', credential=token)\n", + " )\n", + ")\n", + "\n", + "# Create ParquetDataset for the buildings table\n", + "buildings_dataset = pq.ParquetDataset('oedi/pv-rooftop/buildings', filesystem=fs)\n", + "\n", + "# View the partition keys\n", + "city_years = buildings_dataset.partitioning.dictionaries\n", + "city_years\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# View the schema for the buildings table\n", + "buildings_dataset.schema" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read Data\n", + "\n", + "pv_rooftop is a large data set. For the purposes of this example, we will read data from a single partition, city_year=albany_ny_13, and take a random sample of 100 buildings. We will read the tables directly into geodataframes. This may take several minutes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import geopandas as gpd\n", + "\n", + "# Read the bldg_fid column from the buildings table and take a random sample of 100 buildings.\n", + "bldg_fid_sample = pd.read_parquet(\n", + " 'oedi/pv-rooftop/buildings',\n", + " filesystem=fs,\n", + " filters=[('city_year', '=', 'albany_ny_13')],\n", + " columns=['bldg_fid']\n", + ").sample(100)['bldg_fid']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read buildings table using bldg_fid_sample as a filter\n", + "buildings = gpd.read_parquet(\n", + " 'oedi/pv-rooftop/buildings',\n", + " filesystem=fs,\n", + " filters=[\n", + " ('city_year', '=', 'albany_ny_13'),\n", + " ('bldg_fid', 'in', bldg_fid_sample)\n", + " ],\n", + " columns=['gid', 'city', 'state', 'year', 'bldg_fid', 'the_geom_4326']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read aspects table using bldg_fid_sample as a filter\n", + "aspects = gpd.read_parquet(\n", + " 'oedi/pv-rooftop/aspects',\n", + " filesystem=fs,\n", + " filters=[\n", + " ('city_year', '=', 'albany_ny_13'),\n", + " ('bldg_fid', 'in', bldg_fid_sample)\n", + " ],\n", + " columns=['gid', 'city', 'state', 'year', 'bldg_fid', 'aspect', 'the_geom_4326']\n", + ")\n", + "\n", + "# Add a column for the aspect_string\n", + "aspect_lookup = {\n", + " 0: 'flat',\n", + " 1: 'north',\n", + " 2: 'northeast',\n", + " 3: 'east',\n", + " 4: 'southeast',\n", + " 5: 'south',\n", + " 6: 'southwest',\n", + " 7: 'west',\n", + " 8: 'northwest'\n", + "}\n", + "aspects['aspect_string'] = aspects['aspect'].replace(aspect_lookup)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read developable-planes table using bldg_fid_sample as a filter\n", + "developable_planes = gpd.read_parquet(\n", + " 'oedi/pv-rooftop/developable-planes',\n", + " filesystem=fs,\n", + " filters=[\n", + " ('city_year', '=', 'albany_ny_13'),\n", + " ('bldg_fid', 'in', bldg_fid_sample)\n", + " ],\n", + " columns=['gid', 'city', 'state', 'year', 'bldg_fid', 'footprint_m2', 'slope', 'flatarea_m2', 'slopeconversion', 'slopearea_m2', 'aspect', 'the_geom_4326']\n", + ")\n", + "\n", + "# Add a column for the aspect_string\n", + "developable_planes['aspect_string'] = developable_planes['aspect'].replace(aspect_lookup)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read rasd table\n", + "rasd = gpd.read_parquet(\n", + " 'oedi/pv-rooftop/rasd',\n", + " filesystem=fs,\n", + " filters=[\n", + " ('city_year', '=', 'albany_ny_13')\n", + " ],\n", + " columns=['gid', 'city', 'state', 'year', 'the_geom_4326']\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualize Data\n", + "\n", + "We are now ready to visualize the data using folium." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import folium\n", + "\n", + "# Dictionary for coloring the polygons based on aspect\n", + "color_dict = {\n", + " 'flat': 'yellow',\n", + " 'north': 'red',\n", + " 'northeast': 'red',\n", + " 'east': 'yellow',\n", + " 'southeast': 'green',\n", + " 'south': 'green',\n", + " 'southwest': 'green',\n", + " 'west': 'yellow',\n", + " 'northwest': 'red'\n", + "}\n", + "color = aspects['aspect_string'].replace(color_dict)\n", + "m = buildings.explore(color='gray', name='buildings')\n", + "m = aspects.explore(m=m, name='aspects', color=color)\n", + "m = developable_planes.explore(m=m, name='developable-planes', color=color)\n", + "m = rasd.explore(m=m, name='rasd')\n", + "folium.LayerControl().add_to(m)\n", + "m" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Export Data\n", + "\n", + "There are many options for exporting the data for use in GIS software. Here, we demonstrate writing a geopackage." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file_name = 'pv_rooftop_albany_ny_13.gpkg'\n", + "buildings.to_file(file_name, layer='buildings', driver=\"GPKG\")\n", + "aspects.to_file(file_name, layer='aspects', driver=\"GPKG\")\n", + "developable_planes.to_file(file_name, layer='developable-planes', driver=\"GPKG\")\n", + "rasd.to_file(file_name, layer='rasd', driver=\"GPKG\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "oedi-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "4c7bf1489743dc7ac4eb5d54993539996d2b573f88c885c7af86ecea3199729c" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/azure/pipeline/ASL/job.json b/azure/pipeline/ASL/job.json new file mode 100644 index 0000000..dee78c2 --- /dev/null +++ b/azure/pipeline/ASL/job.json @@ -0,0 +1,21 @@ +{ + "jobName": "test3", + "jobDefinition": "arn:aws:batch:us-west-2:351672045885:job-definition/kerchunk-h5-new:1", + "jobQueue": "arn:aws:batch:us-west-2:351672045885:job-queue/kerchunk-h5", + "dependsOn": [], + "arrayProperties": {}, + "parameters": {}, + "containerOverrides": { + "resourceRequirements": [], + "environment": [ + { + "name": "staging_bucket", + "value": "kerchunk-staging" + }, + { + "name": "s3_file", + "value": "nrel-pds-wtk/south_atlantic/yearly_hr/v1.0.0/satlantic_2000_hr.h5" + } + ] + } + } \ No newline at end of file diff --git a/azure/pipeline/ASL/job_definition.json b/azure/pipeline/ASL/job_definition.json new file mode 100644 index 0000000..f831696 --- /dev/null +++ b/azure/pipeline/ASL/job_definition.json @@ -0,0 +1,31 @@ +{ + "jobDefinitionName": "kerchunk-h5", + "type": "container", + "containerProperties": { + "image": "351672045885.dkr.ecr.us-west-2.amazonaws.com/transform_h5_container", + "jobRoleArn": "arn:aws:iam::351672045885:role/ecsTaskExecutionRole", + "executionRoleArn": "arn:aws:iam::351672045885:role/ecsTaskExecutionRole", + "resourceRequirements": [ + { + "value": "1", + "type": "VCPU" + }, + { + "value": "100000", + "type": "MEMORY" + } + ], + "environment": [], + "secrets": [], + "linuxParameters": { + "tmpfs": [], + 
"devices": [] + }, + "mountPoints": [], + "ulimits": [] + }, + "platformCapabilities": [ + "EC2" + ], + "parameters": {} +} \ No newline at end of file diff --git a/azure/pipeline/ASL/kerchunk-1TB.json b/azure/pipeline/ASL/kerchunk-1TB.json new file mode 100644 index 0000000..c88cb75 --- /dev/null +++ b/azure/pipeline/ASL/kerchunk-1TB.json @@ -0,0 +1 @@ +{"EbsOptimized":true,"IamInstanceProfile":{"Arn":"arn:aws:iam::351672045885:instance-profile\/ec2-base-role"},"BlockDeviceMappings":[{"DeviceName":"\/dev\/xvda","Ebs":{"Encrypted":true,"DeleteOnTermination":true,"Iops":3000,"KmsKeyId":"arn:aws:kms:us-west-2:351672045885:key\/a14e1832-d4ca-4667-a986-631341c44db8","SnapshotId":"snap-0b98405d74debf232","VolumeSize":1000,"VolumeType":"gp3","Throughput":125}}],"NetworkInterfaces":[{"AssociatePublicIpAddress":false,"DeleteOnTermination":true,"Description":"","DeviceIndex":0,"Groups":["sg-0dd899f63f3874c77"],"InterfaceType":"interface","Ipv6Addresses":[],"PrivateIpAddresses":[{"Primary":true,"PrivateIpAddress":"172.18.37.24"}],"SubnetId":"subnet-002fd73ee4a6c6baf","NetworkCardIndex":0}],"ImageId":"ami-038c0c1c6c6b1fb07","InstanceType":"x2iedn.xlarge","KeyName":"matt-key","Monitoring":{"Enabled":false},"Placement":{"AvailabilityZone":"us-west-2b","GroupName":"","Tenancy":"default"},"DisableApiTermination":false,"InstanceInitiatedShutdownBehavior":"stop","TagSpecifications":[{"ResourceType":"instance","Tags":[{"Key":"Name","Value":"kerchunk-1TiB"}]}],"CpuOptions":{"CoreCount":2,"ThreadsPerCore":2},"CapacityReservationSpecification":{"CapacityReservationPreference":"open"},"HibernationOptions":{"Configured":false},"MetadataOptions":{"HttpTokens":"required","HttpPutResponseHopLimit":2,"HttpEndpoint":"enabled","HttpProtocolIpv6":"disabled","InstanceMetadataTags":"disabled"},"EnclaveOptions":{"Enabled":false},"PrivateDnsNameOptions":{"HostnameType":"ip-name","EnableResourceNameDnsARecord":false,"EnableResourceNameDnsAAAARecord":false},"MaintenanceOptions":{"AutoRecovery":"default"},"DisableApiStop":false} \ No newline at end of file diff --git a/azure/pipeline/ASL/state_machine_input.json b/azure/pipeline/ASL/state_machine_input.json new file mode 100644 index 0000000..79be9b9 --- /dev/null +++ b/azure/pipeline/ASL/state_machine_input.json @@ -0,0 +1 @@ 
+{"s3_files":["nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2000-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2001-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2002-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2003-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2004-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2004-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v
1.0.0\/satlantic_2004-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2004-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2004-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2004-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2004-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2004-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2004-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2004-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2004-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2004-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2005-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2006-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2007-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2008-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2008-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2008-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2008-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2008-05.h5","nrel-pds-wtk\/south_atlan
tic\/monthly\/v1.0.0\/satlantic_2008-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2008-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2008-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2008-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2008-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2008-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2008-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2009-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2010-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2011-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-08.h5","nrel-pds-w
tk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2012-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2013-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2014-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2015-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-11.
h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2016-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2017-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2018-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2019-12.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-01.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-02.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-03.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-04.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-05.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-06.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-07.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-08.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-09.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-10.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-11.h5","nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/satlantic_2020-12.h5"],"staging_bucket":"kerchunk-staging","s3_comb_ref_file":"wtk\/south_atlantic\/kerchunk_5min_ref_s3.json","az_comb_ref_fil
e":"wtk\/south_atlantic\/kerchunk_5min_ref.json","run_name":"south_atlantic-5min-2"} \ No newline at end of file diff --git a/azure/pipeline/ASL/state_machine_template.json b/azure/pipeline/ASL/state_machine_template.json new file mode 100644 index 0000000..8db9ed2 --- /dev/null +++ b/azure/pipeline/ASL/state_machine_template.json @@ -0,0 +1,74 @@ +{ + "Comment": "Takes a set of s3 paths to h5 files as input and transforms them to be compatible with kerchunk. Kerchunk reference files are generated for each file for both s3 and Azure,then the combined reference files are generated. All files are uploaded to an s3 staging bucket for testing.", + "StartAt": "Map", + "States": { + "Map": { + "Type": "Map", + "Parameters": { + "ContainerOverrides": { + "Command": ["python", "transform.py"], + "Environment": [ + { + "Name": "s3_file", + "Value.$": "$$.Map.Item.Value" + }, + { + "Name": "staging_bucket", + "Value.$": "$.staging_bucket" + } + ] + } + }, + "ItemProcessor": { + "ProcessorConfig": { + "Mode": "INLINE" + }, + "StartAt": "transform-h5-files", + "States": { + "transform-h5-files": { + "Type": "Task", + "Resource": "arn:aws:states:::batch:submitJob.sync", + "Parameters": { + "JobName": "transform-file", + "JobDefinition": "arn:aws:batch:us-west-2:351672045885:job-definition/kerchunk-h5", + "JobQueue": "arn:aws:batch:us-west-2:351672045885:job-queue/kerchunk-h5", + "ContainerOverrides.$": "$.ContainerOverrides" + }, + "End": true + } + } + }, + "ItemsPath": "$.s3_files", + "MaxConcurrency": 20, + "Next": "generate-references", + "ResultPath": null + }, + "generate-references": { + "Type": "Task", + "Resource": "arn:aws:states:::batch:submitJob.sync", + "Parameters": { + "JobName": "refjob", + "JobDefinition": "arn:aws:batch:us-west-2:351672045885:job-definition/kerchunk-h5", + "JobQueue": "arn:aws:batch:us-west-2:351672045885:job-queue/kerchunk-h5", + "ContainerOverrides": { + "Command": ["python", "gen_ref.py"], + "Environment": [ + { + "Name": "staging_bucket", + "Value.$": "$.staging_bucket" + }, + { + "Name": "run_name", + "Value.$": "$.run_name" + }, + { + "Name": "s3_comb_ref_file", + "Value.$": "$.s3_comb_ref_file" + } + ] + } + }, + "End": true + } + } +} diff --git a/azure/pipeline/__init__.py b/azure/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/azure/pipeline/aws_glob_patterns.json b/azure/pipeline/aws_glob_patterns.json new file mode 100644 index 0000000..0deedfc --- /dev/null +++ b/azure/pipeline/aws_glob_patterns.json @@ -0,0 +1 @@ +{"nrel-pds-wtk":{"pr100":{"hourly":"nrel-pds-wtk\/pr100\/hourly\/*.h5","5min":"nrel-pds-wtk\/pr100\/5min\/*.h5"},"south_atlantic":{"hourly":"nrel-pds-wtk\/south_atlantic\/yearly_hr\/v1.0.0\/*.h5","5min":"nrel-pds-wtk\/south_atlantic\/monthly\/v1.0.0\/*.h5"}}} \ No newline at end of file diff --git a/azure/pipeline/aws_tools.py b/azure/pipeline/aws_tools.py new file mode 100644 index 0000000..5b7f465 --- /dev/null +++ b/azure/pipeline/aws_tools.py @@ -0,0 +1,291 @@ +import boto3 +import ujson +import s3fs +from etl_tools import load_oedi_sas, gen_ref_comb +from azure.storage.blob import ContainerClient +from dotenv import load_dotenv +import subprocess +import h5py + +def get_tags(org, billingid, task, owner): + tags = [ + { + 'key': 'org', + 'value': org + }, + { + 'key': 'billingid', + 'value': billingid + }, + { + 'key': 'task', + 'value': task + }, + { + 'key': 'owner', + 'value': owner + } + ] + return tags + +def get_dataset(bucket, prefix=None, extension='.h5', resolution=None): + """ + This is a convenience 
function that generates a list of s3 bucket+key paths for a given dataset + + Parameters + ---------- + bucket : str (required) + Bucket in which the dataset lives (e.g. 'nrel-pds-wtk') + prefix : str + Prefix of all files in the dataset (e.g. 'Great_Lakes') + extension : str + File extension for all files in the dataset (e.g. '.h5') + resolution : str + For WIND data only. Options are 'hourly' or '5min' + + Returns + ------- + files : list + List of all bucket+key paths to all files in the bucket subject to the provided options. + """ + s3 = s3fs.S3FileSystem(anon=True) + with open('aws_glob_patterns.json') as f: + aws_glob_patterns = ujson.load(f) + + if prefix and resolution: + files = s3.glob(aws_glob_patterns[bucket][prefix][resolution]) + elif prefix: + files = s3.glob(aws_glob_patterns[bucket][prefix]) + else: + files = s3.glob(aws_glob_patterns[bucket]) + + files = [file for file in files if file.endswith(extension)] + + return files + +def get_StepFunctionRole(): + # TODO: Add code that creates the role if it doesn't exist + """ + This function obtains the StepFunctionRole to be used when creating a step function. + + Parameters + ---------- + None + + Returns + ------- + roleArn : str + Amazon resource number of the StepFunctionRole + """ + iam = boto3.client('iam') + roleArn = iam.get_role(RoleName='StepFunctionRole')['Role']['Arn'] + return roleArn + +def create_state_machine(name, definition='./ASL/state_machine_template.json', tags=None, region_name='us-west-2'): + """ + This is a convenience function that creates or updates a state machine in AWS from the definition. + + Parameters + ---------- + name : str (required) + The name given to the state machine in AWS. + definition : str + Path to the json file that contains the ASL definition of the state machine. + tags : dict + key-value pairs for tracking aws resources. Defaults will be used if none are provided (see get_tags). + Returns + ------- + stateMachineArn : str + The amazon resource number of the state machine. + """ + + sf = boto3.client('stepfunctions', region_name=region_name) + + sms = sf.list_state_machines()['stateMachines'] + stateMachineArn = '' + for sm in sms: + if sm['name'] == name: + stateMachineArn = sm['stateMachineArn'] + break + + if not tags: + tags = get_tags() + + with open(definition) as f: + if stateMachineArn: + sf.update_state_machine(stateMachineArn=stateMachineArn, definition=f.read()) + else: + roleArn = get_StepFunctionRole() + stateMachineArn = sf.create_state_machine(name=name, definition=f.read(), roleArn=roleArn, tags=tags)['stateMachineArn'] + return stateMachineArn + +def get_state_machine(name, region_name='us-west-2'): + """ + This function gets the ARN for a state machine by name. + + Parameters + ---------- + name : str (required) + The name of the state machine in AWS. + + Returns + ------- + stateMachineArn : str + The amazon resource number of the state machine. + """ + sf = boto3.client('stepfunctions', region_name=region_name) + sms = sf.list_state_machines()['stateMachines'] + stateMachineArn = '' + for sm in sms: + if sm['name'] == name: + stateMachineArn = sm['stateMachineArn'] + break + if not stateMachineArn: + raise Exception(f'State machine {name} not found.') + return stateMachineArn + +def create_state_machine_input(files, staging_bucket, s3_comb_ref_file, az_comb_ref_file, run_name=None, input_file='ASL/state_machine_input.json'): + # TODO: Check access/existence to/of staging bucket + """ + This function generates the state machine input to process a dataset. 
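+    The input is also written locally to input_file and uploaded to s3://{staging_bucket}/{run_name}.json,
+    which is where the generate-references batch job (gen_ref.py) reads the full file list from.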
+ + Parameters + ---------- + files : list (required) + A list of bucket+key paths to the files of the dataset + staging_bucket : str + Name of the bucket where transformed files and json references will be written + s3_comb_ref_file : str + Key for the combined kerchunk reference file that points to the dataset in staging + az_comb_ref_file : str + Key for the combined kerchunk reference file that points to the dataset in azure + run_name : str + The name of the run. This will be used to create a json file in S3 containing the inputs needed for the run + input_file : str + A path in which to store a local copy of the json inputs needed for the run. + Returns + ------- + input_data : str + A serialized copy of the input data + """ + smi = { + 's3_files': files, + 'staging_bucket': staging_bucket, + 's3_comb_ref_file': s3_comb_ref_file, + 'az_comb_ref_file': az_comb_ref_file, + 'run_name' : run_name + } + with open(input_file, 'w') as f: + ujson.dump(smi, f) + + s3 = s3fs.S3FileSystem() + s3.put_file(input_file, f'{staging_bucket}/{run_name}.json') + + input_data = ujson.dumps(smi) + return input_data + +def run_state_machine(name, run_name = 'sm_run', input_file='ASL/state_machine_input.json', region_name='us-west-2'): + sf = boto3.client('stepfunctions', region_name=region_name) + stateMachineArn = get_state_machine(name) + with open(input_file) as f: + input = f.read() + response = sf.start_execution(stateMachineArn=stateMachineArn, name=run_name, input=input) + return response + +def create_job_def(job_def_file='./ASL/job_definition.json', region_name='us-west-2'): + with open(job_def_file) as f: + job_def = ujson.load(f) + tags = get_tags() + job_def['tags'] = {} + for tag in tags: + job_def['tags'][tag['key']] = tag['value'] + job_def['propagateTags'] = True + batch = boto3.client('batch', region_name=region_name) + response = batch.register_job_definition(**job_def) + return response + +def create_launch_templates(): + # TODO: Need to add the 2TB and 3TB versions + ec2 = boto3.client('ec2') + LaunchTemplateNames = ['kerchunk-1TB'] + for LaunchTemplateName in LaunchTemplateNames: + with open(f'./ASL/{LaunchTemplateName}.json') as f: + LaunchTemplateData = ujson.load(f) + existing_template = ec2.describe_launch_templates(Filters=[{'Name': 'launch-template-name', 'Values': [LaunchTemplateName]}]) + if existing_template: + ec2.create_launch_template_version(LaunchTemplateName=LaunchTemplateName, LaunchTemplateData=LaunchTemplateData) + else: + ec2.create_launch_template(LaunchTemplateName=LaunchTemplateName, LaunchTemplateData=LaunchTemplateData) + +def create_cluster(): + batch = boto3.client('batch') + +def create_aws_resources(): + create_state_machine('kerchunk-h5') + create_job_def() + +def process_h5_dataset(files, staging_bucket, s3_comb_ref_file, az_comb_ref_file, state_machine_name='kerchunk_h5', region_name='us-west-2'): + smi = create_state_machine_input(files, staging_bucket, s3_comb_ref_file, az_comb_ref_file) + stateMachineArn = get_state_machine(state_machine_name) + sf = boto3.client('stepfunctions', region_name=region_name) + sf.start_execution(stateMachineArn=stateMachineArn, input=smi) + +def copy_s3_dataset_to_azure(files, staging_bucket, dry_run=False): + CONTAINER_NAME = 'oedi' + sas = load_oedi_sas() + load_dotenv() # Store AWS credentials in .env file + cmd = [ + 'azcopy', + 'copy', + f'https://s3.us-west-2.amazonaws.com/{staging_bucket}', + f'https://nrel.blob.core.windows.net/{CONTAINER_NAME}?{sas}', + '--include-path', + ';'.join(files) + ] + + if dry_run: + 
cmd.append('--dry-run') + + subprocess.run(cmd) + +def create_combined_ref(files, staging_bucket, comb_ref_file=None, remote_protocol='s3'): + s3 = s3fs.S3FileSystem() + f = h5py.File(s3.open(f'{staging_bucket}/{files[0]}')) + identical_dims = list(f.attrs['identical_dims']) + if remote_protocol == 's3': + ref_files = [file.replace('.h5', '_s3.json') for file in files] + elif remote_protocol == 'abfs': + ref_files = [file.replace('.h5', '.json') for file in files] + else: + raise Exception('remote_protocol must be "s3" or "abfs"') + refs = [] + for ref_file in ref_files: + with s3.open(f'{staging_bucket}/{ref_file}', 'rb') as f: + refs.append(ujson.load(f)) + ref_comb = gen_ref_comb(refs, identical_dims=identical_dims, remote_protocol=remote_protocol) + temp_file = 'temp.json' + with open(temp_file, 'wb') as f: + f.write(ujson.dumps(ref_comb).encode()) + s3.put_file(temp_file, f's3://{staging_bucket}/{comb_ref_file}') + if remote_protocol=='abfs': + sas = load_oedi_sas() + CONTAINER_NAME = 'oedi' + dest = f'https://nrel.blob.core.windows.net/{CONTAINER_NAME}/{comb_ref_file}?{sas}' + subprocess.run(['azcopy', 'copy', temp_file, dest]) + +def copy_s3_file_to_azure(source, dest, sas=None, container='oedi'): + s3 = s3fs.S3FileSystem() + if not sas: + sas = load_oedi_sas() + client = ContainerClient.from_container_url(f'https://nrel.blob.core.windows.net/{container}?{sas}') + blob = client.get_blob_client(dest) + with s3.open(source, 'rb') as f: + blob.upload_blob(f.read()) + +def copy_local_file_to_azure(source, dest, sas=None, container='oedi'): + if not sas: + sas = load_oedi_sas() + client = ContainerClient.from_container_url(f'https://nrel.blob.core.windows.net/{container}?{sas}') + blob = client.get_blob_client(dest) + with open(source, 'rb') as f: + blob.upload_blob(f.read()) diff --git a/azure/pipeline/azure_tools.py b/azure/pipeline/azure_tools.py new file mode 100644 index 0000000..c8568f1 --- /dev/null +++ b/azure/pipeline/azure_tools.py @@ -0,0 +1,34 @@ +import planetary_computer + +def get_fs(account='nrel', container='oedi'): + return planetary_computer.get_adlfs_filesystem(account, container) + +def get_size(path, units='B'): + fs = get_fs() + size = fs.du(path, total=True) + if units=='B': + pass + elif units=='kB': + size = size * 10 ** -3 + elif units=='MB': + size = size * 10 ** -6 + elif units=='GB': + size = size * 10 ** -9 + elif units=='TB': + size = size * 10 ** -12 + elif units=='PB': + size = size * 10 ** -15 + elif units=='kiB': + size = size * 2 ** -10 + elif units=='MiB': + size = size * 2 ** -20 + elif units=='GiB': + size = size * 2 ** -30 + elif units=='TiB': + size = size * 2 ** -40 + elif units=='PiB': + size = size * 2 ** -50 + else: + raise NotImplementedError(f'Units "{units}" not recognized.') + + return size diff --git a/azure/pipeline/blob_access_example.ipynb b/azure/pipeline/blob_access_example.ipynb new file mode 100644 index 0000000..ba24a9c --- /dev/null +++ b/azure/pipeline/blob_access_example.ipynb @@ -0,0 +1,83 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Deleting blobs\n", + "\n", + "from azure.storage.blob import ContainerClient\n", + "from etl_tools import load_oedi_sas\n", + "\n", + "sas_token = load_oedi_sas() # Loads oedi rw sas token\n", + "client = ContainerClient.from_container_url(f'https://nrel.blob.core.windows.net/oedi?{sas_token}')\n", + "for blob in client.list_blobs():\n", + " if \"wtk\" in blob.name and 'test' in blob.name:\n", + " 
print(blob.name)\n", + " #client.delete_blob(blob)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copying blobs within Azure\n", + "\n", + "source_blob = client.get_blob_client('wtk/wtk_bangladesh_hourly_ref.json')\n", + "dest_blob = client.get_blob_client('wtk/bangladesh/kerchunk_hourly_ref.json')\n", + "dest_blob.start_copy_from_url(source_blob.url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copy objects from S3 into Azure BLOB storage\n", + "\n", + "import s3fs\n", + "from azure.storage.blob import ContainerClient\n", + "from etl_tools import load_oedi_sas\n", + "\n", + "def copy_file_to_azure(source, dest, sas=None, container='oedi'):\n", + " s3 = s3fs.S3FileSystem()\n", + " if not sas:\n", + " sas = load_oedi_sas()\n", + "\n", + " client = ContainerClient.from_container_url(f'https://nrel.blob.core.windows.net/{container}?{sas}')\n", + " blob = client.get_blob_client(dest)\n", + " with s3.open(source, 'rb') as f:\n", + " blob.upload_blob(f.read())\n", + "\n", + "copy_file_to_azure('s3://kerchunk-staging/test.txt', 'wtk/test/test.txt')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/azure/pipeline/etl_tools.py b/azure/pipeline/etl_tools.py new file mode 100644 index 0000000..61e86bb --- /dev/null +++ b/azure/pipeline/etl_tools.py @@ -0,0 +1,381 @@ +import h5py +import pandas as pd +import numpy as np +from kerchunk.hdf import SingleHdf5ToZarr +from kerchunk.combine import MultiZarrToZarr +import ujson +import subprocess +import planetary_computer +import os +from time import time +import s3fs +import logging + +def time_index_bytestring_to_float(dset): + t = pd.Series(dset) + t = t.str.decode('utf8') + t = t.str.split('+', expand=True)[0] + t = np.array(t,dtype=np.datetime64) + t = t.astype('int') + return t + +def copy_attrs(obj1, obj2): + # Copy the attributes from obj1 to obj2, which may be h5 File objects or h5 dataset objects + for key in obj1.attrs.keys(): + obj2.attrs[key] = obj1.attrs[key] + +def copy_dataset(f_in, f_out, var, mem_limit_GB=80): + # Determine sizes of slices to read + dtype_size = f_in[var].dtype.itemsize + max_read_size = mem_limit_GB * 10 ** 9 # Read 80 GB at a time + time_index_read_size = f_in[var].shape[0] # Read all time values + gid_index_read_size = int(max_read_size / time_index_read_size // dtype_size) # Number of sites to read at a time + + # Create slices + end = f_in[var].shape[1] + starts = np.arange(0, end, gid_index_read_size) + stops = starts[1:] + stops = np.append(stops, end) + + # Copy slices + for start, stop in zip(starts, stops): + f_out[var][:, start:stop] = f_in[var][:, start:stop] + +def elapsed_time(st): + return f'{(time() - st) / 60:.2f} min' + +def load_oedi_sas(): + # read/write sas token must be stored in a plain text file located at $HOME/.sas or oedi_azure/.sas + home = os.path.expanduser('~') + if os.path.isfile(f'{home}/.sas'): + path = f'{home}/.sas' + elif os.path.isfile('./.sas'): + path = './.sas' + elif os.path.isfile('../.sas'): + path = '../.sas' + else: + path = None + + 
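+    # The stored token should be the bare SAS query string (no leading '?'), since callers
+    # interpolate it as 'https://nrel.blob.core.windows.net/<container>?{sas}'.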
if path: + with open(path) as f: + sas = f.read() + else: + raise Exception('.sas file not found. Please save your read/write .sas token to a file called .sas located in the oedi_azure directory.') + + return sas + +def transform_wtk_h5_file(in_file, out_file, chunk_size=2, weeks_per_chunk=None, in_file_on_s3=False): + # This is an updated version of transform_h5_file, designed for wtk. wtk does not have a nice rectangular coordinate grid, + # so the data will be left in 2 dims rather than be converted to 3 dims. + + # h5_file should be a path to a local h5 file. The file will be opened in write-mode, transformed and then closed. + # chunk_size is the desired size of each chunk in MiB + # weeks_per_chunk determines the length of chunks in the time_index dimension + + # Summary of data transformations: + + # 1. time_index is converted from byte-string to int (when read by xarray, this will automatically convert to np.datetime64) + # 2. A gid dataset is created to index the locations + # 3. time_index and gid are converted to dimension scales + # 4. Each variable is rechunked so that we will have consistent chunk sizes accross all files + # 5. The dimension scales are attached to each variable's dimensions + # 6. The scale_factor metadata is inverted (new_sf = 1 / old_sf) + # 7. The meta variable is unpacked + + # Notes: + # Once again, the download/upload steps are what will take all of the time here. To scale up to wtk, this transformation + # should either happen on Eagle (where the data are already local) or the transformation should be containerized for use + # with AWS batch. + + # Begin logging + st = time() + file_name = out_file.split('/')[-1] + logging.info(f'{elapsed_time(st)} - {file_name}: Starting transformation.') + + # Open input file + if in_file_on_s3: + s3 = s3fs.S3FileSystem() + f_in = h5py.File(s3.open(in_file)) + else: + f_in = h5py.File(in_file, 'r') + + # Delete output file if it exists, and then create it (note that 'w' mode for h5py would be better, but is unreliable) + if os.path.exists(out_file): + os.remove(out_file) + f_out = h5py.File(out_file, 'a') + + # Copy file attrs + copy_attrs(f_in, f_out) + logging.info(f'{elapsed_time(st)} - {file_name}: File attrs copied!') + + # Get the length of time_index and coordinates + time_len = f_in['time_index'].len() + nloc = len(f_in['coordinates']) + + # Convert time_index from bytes to float. + t = time_index_bytestring_to_float(f_in['time_index']) + + # Create time_index variable in new file. 'units' metadata required for xarray to interpret as datetime. 
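+    # (On read, xarray decodes these integers to np.datetime64 via the CF-style 'units' attribute set
+    #  below, e.g. when the combined reference is opened with xr.open_dataset("reference://", engine="zarr", ...)
+    #  as in hpc_gen_refs.py.)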
+ f_out.create_dataset('time_index', data=t) + copy_attrs(f_in['time_index'], f_out['time_index']) + f_out['time_index'].attrs['units'] = b'seconds since 1970-01-01' + + # Create gid variable + f_out.create_dataset('gid', data=np.arange(nloc, dtype=np.int32), fillvalue=-1) + logging.info(f'{elapsed_time(st)} - {file_name}: gid created.') + + # Convert to dimension scales + f_out['time_index'].make_scale() + f_out['gid'].make_scale() + + # Determine time_index chunksize + time_step = t[1] - t[0] + if not weeks_per_chunk: + if time_step == 5 * 60: # 5min data + weeks_per_chunk = 1 + elif time_step == 10 * 60: # 10min data + weeks_per_chunk = 2 + elif time_step == 15 * 60: # 15min data + weeks_per_chunk = 3 + elif time_step == 60 * 60: # hourly data + weeks_per_chunk = 12 + else: + weeks_per_chunk = 8 # other resolution + logging.info(f'Warning: Non-standard resolution of {time_step / 60} min detected.') + + time_index_chunk_len = int(min(weeks_per_chunk * 7 * 24 * 60 * 60 / time_step, time_len)) + + logging.info(f'{elapsed_time(st)} - {file_name}: time_index and gid created') + + # Get var names + vars = [var for var in f_in.keys() if var not in ['meta', 'time_index', 'latitude', 'longitude', 'gid', 'coordinates']] + + # Loop over vars copying them to the new file + for var in vars: + logging.info(f'{elapsed_time(st)} - {file_name}: Processing {var}...') + + # Check dims + if not f_in[var].shape[0] == time_len: + raise Exception(f'Dim 0 of {var} has different length than time_index.') + if not f_in[var].shape[1] == nloc: + raise Exception(f'Dim 1 of {var} has different length than gid.') + + # Determine location chunk size + element_size = f_in[var].dtype.itemsize # size of single element in bytes + gid_chunk_len = int(min(chunk_size * 2 ** 20 / time_index_chunk_len // element_size, nloc)) + + # Create dataset in new file + chunks=(time_index_chunk_len, gid_chunk_len) + f_out.create_dataset(var, shape=f_in[var].shape, dtype=f_in[var].dtype, chunks=chunks) + copy_dataset(f_in, f_out, var) + copy_attrs(f_in[var], f_out[var]) + + # Add chunks attribute + f_out[var].attrs['chunks'] = chunks + + # Fix scale_factor + if 'scale_factor' in f_out[var].attrs.keys(): + f_out[var].attrs['scale_factor'] = 1 / f_out[var].attrs['scale_factor'] + + # Attach scales to the dims + f_out[var].dims[0].attach_scale(f_out['time_index']) + f_out[var].dims[1].attach_scale(f_out['gid']) + + # Progress report + logging.info(f'{elapsed_time(st)} - {file_name}: Done!') + + logging.info(f'{elapsed_time(st)} - {file_name}: All variables transformed!') + + # Start tracking identical_dims (anything with only a gid dimension) + identical_dims = ['gid'] + + # Unpack metadata variables + for var in f_in['meta'].dtype.names: + logging.info(f'{elapsed_time(st)} - {file_name}: Unpacking {var} from meta...') + element_size = f_in['meta'][var].dtype.itemsize + gid_chunk_len = min(chunk_size * 2 ** 20 // element_size, nloc) + chunks = (gid_chunk_len,) + f_out.create_dataset(var, data=f_in['meta'][var], chunks=chunks) + + # Add chunks attribute + f_out[var].attrs['chunks'] = chunks + + # Attach dimension scales to the dimensions + f_out[var].dims[0].attach_scale(f_out['gid']) + + # Append to identical_dims + identical_dims.append(var) + + logging.info(f'{elapsed_time(st)} - {file_name}: Done!') + + logging.info(f'{elapsed_time(st)} - {file_name}: meta unpacked!') + + # Add identical_dims to file metadata so we can pass to kerchunk later + f_out.attrs['identical_dims'] = identical_dims + + # Close the datasets to ensure changes are 
written + f_in.close() + f_out.close() + + logging.info(f'{elapsed_time(st)} - {file_name}: Done with transormations!') + + return + +def transform_sup3rcc_h5_file(infile, outfile): + # This function is designed to transform h5 files for the Sup3rcc dataset, to prepare them for use with Kerchunk. + # infile and outfile should both be local file paths. infile is the original Sup3rcc h5 file. outfile will be created + # by copying and transforming the data from infile. + + # The Sup3rcc data uses a nice rectangular, evenly-spaced grid of lon/lat coordinates. This allowed for easy transformation + # from 2 dimensions to 3 dimensions, which results in improved user experience when loading the data with xarray. + + # Summary of data transformations: + + # 1. time_index is converted from byte-string to int (when read by xarray, this will automatically convert to np.datetime64) + # 2. latitude and longitude are given their own datsets + # 3. time_index, latitude and longitude are converted to dimension scales + # 4. Each variable is reshaped from 2 dims (time_index, location) to 3 dims (time_index, latitude, longitude) + # 5. Each variable is rechunked, resulting in about 1.8 MB per chunk + # 6. The dimension scales are attached to each variable's dimensions + # 7. The scale_factor metadata is inverted (new_sf = 1 / old_sf) + + # TODO + # 1. Future iterations of this transformation should modify the original h5 file, rather than copying the contents to a new file + # 2. Rechunking should be automated (currently the choice of chunk size is specific to the Sup3rcc dataset) + + # Open infile, create outfile + f1 = h5py.File(infile) + f2 = h5py.File(outfile, 'a') + + # Copy attributes + for attr in f1.attrs.keys(): + f2.attrs[attr] = f1.attrs[attr] + + # Get the length of time_index + time_len = f1['time_index'].len() + + # Convert time_index from bytes to float. + t = pd.Series(f1['time_index']) + t = t.str.decode('utf8') + t = t.str.split('+', expand=True)[0] + t = np.array(t,dtype=np.datetime64) + t = t.astype('int') + + # Grab the lat and lon coordinates from meta + lat = f1['meta']['latitude'].reshape(650, 1475)[:, 0] + lon = f1['meta']['longitude'].reshape(650, 1475)[0, :] + + # Add time_index dimension to the temp dataset. 'units' metadata required for xarray to interpret as datetime. + f2.create_dataset('time_index', data=t) + f2['time_index'].attrs['units'] = b'seconds since 1970-01-01' + + # Add lon/lat dimensions to temp dataset + f2.create_dataset('latitude', data=lat) + f2.create_dataset('longitude', data=lon) + + # Convert them to dimension scales + f2['time_index'].make_scale() + f2['latitude'].make_scale() + f2['longitude'].make_scale() + + logging.info('Dimension scales created.') + + # Get var names + vars = [var for var in f1.keys() if var not in ['meta', 'time_index']] + + # Loop over the variables and transfer them to the temp data set + for var in vars: + # Check dimensions + time_len = f1['time_index'].len() + assert f1[var].shape[0] == time_len + assert f1[var].shape[1] == 650 * 1475 + + # Copy data, reshape it and rechunk it. Now we have 3 dims, time, lat, lon + # Note that chunks=True will result in auto-chunking. 
This doesn't really work when + # data sets have different lengths for the time_index (as is the case for Sup3rcc) + chunks = (24, 130, 295) + f2.create_dataset(var, data=f1[var][:].reshape(time_len, 650, 1475), chunks=chunks) # Results in 1.8 MB chunks for pressure data + logging.info(f'{var} reshaped and transferred to new dataset.') + + # Add attributes + for attr in f1[var].attrs.keys(): + if attr == 'scale_factor': + f2[var].attrs[attr] = 1 / f1[var].attrs[attr] + elif attr != 'chunks': + f2[var].attrs[attr] = f1[var].attrs[attr] + f2[var].attrs['chunks'] = chunks + + # Label the dimensions of the main variable + f2[var].dims[0].label = 'time_index' + f2[var].dims[1].label = 'latitude' + f2[var].dims[2].label = 'longitude' + + # Attach dimension scales to the dimensions + f2[var].dims[0].attach_scale(f2['time_index']) + f2[var].dims[1].attach_scale(f2['latitude']) + f2[var].dims[2].attach_scale(f2['longitude']) + + logging.info(f'Dimension scales attached to {var}.') + + # Add metadata variables + for var in f1['meta'].dtype.names: + if var not in ['latitude', 'longitude']: + chunks = (130, 295) + f2.create_dataset(var, data=f1['meta'][var].reshape(650, 1475), chunks=chunks) + + # Add chunks attribute + f2[var].attrs['chunks'] = chunks + + # Label the dimensions of the main variable + f2[var].dims[0].label = 'latitude' + f2[var].dims[1].label = 'longitude' + + # Attach dimension scales to the dimensions + f2[var].dims[0].attach_scale(f2['latitude']) + f2[var].dims[1].attach_scale(f2['longitude']) + + # Close the new dataset to ensure changes are written + f1.close() + f2.close() + + return + +def gen_ref(local_path, storage_path, ref_file=None): + # local_path is the file to be analyzed. storage_path is the path to the same file in cloud storage. ref_file is + # an optional argument that can be used to save the kerchunk references as a json.\ + + with open(local_path, 'rb') as f: + ref = SingleHdf5ToZarr(f, storage_path, inline_threshold=300).translate() + + if ref_file: + with open(ref_file, 'wb') as f: + f.write(ujson.dumps(ref).encode()) + + return ref + +def gen_ref_comb(refs, ref_file=None, concat_dims=['time_index'], identical_dims=None, remote_protocol='abfs'): + # This function takes a list of kerchunk references and combines them into a single reference. + # For sup3rcc, we used identical_dims=['country', 'county', 'eez', 'elevation', 'latitude', 'longitude', 'offshore', 'state', 'timezone'], + # however, None would probably have been fine... 
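+    # Minimal composition sketch (hypothetical local paths and Azure keys), assuming per-file
+    # references were already produced by gen_ref():
+    #   refs = [gen_ref(p, f'abfs://oedi/wtk/example/{os.path.basename(p)}') for p in local_h5_paths]
+    #   gen_ref_comb(refs, ref_file='kerchunk_hourly_ref.json', remote_protocol='abfs')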
+ # Generate combo reference + + if remote_protocol not in ['s3', 'abfs']: + raise NotImplementedError() + + kwargs = { + 'remote_protocol': remote_protocol, + 'concat_dims': concat_dims, + 'identical_dims': identical_dims + } + if remote_protocol == 'abfs': + token = planetary_computer.sas.get_token('nrel', 'oedi').token + kwargs['remote_options'] = {'account_name': 'nrel', "credential": token} + + ref_comb = MultiZarrToZarr(refs, **kwargs).translate() + + # Write to json file + if ref_file: + with open(ref_file, 'wb') as f: + f.write(ujson.dumps(ref_comb).encode()) + + return ref_comb diff --git a/azure/pipeline/hpc_gen_refs.py b/azure/pipeline/hpc_gen_refs.py new file mode 100644 index 0000000..c629023 --- /dev/null +++ b/azure/pipeline/hpc_gen_refs.py @@ -0,0 +1,60 @@ +import ujson +from etl_tools import gen_ref_comb, load_oedi_sas +import xarray as xr +import planetary_computer +import os +import sys +import logging + +# Get input +# First arg should be the path for the combined ref file +# Next should be any number of paths to individual ref files + +args = sys.argv +comb_ref_file = args[1] +ref_paths = args[2:] + +USER = os.getenv('USER') +CONTAINER_NAME = 'oedi' + +az_path = comb_ref_file.replace(f'/scratch/{USER}/', '') + +if 'sup3rcc' in ref_paths[0]: + DATASET = 'sup3rcc' + identical_dims = ['country', 'county', 'eez', 'elevation', 'latitude', 'longitude', 'offshore', 'state', 'timezone'] +elif 'WIND' in ref_paths[0]: + DATASET = 'wtk' + az_path = az_path.replace('WIND/', 'wtk/') + # Open one dataset to get the identical_dims attribute + token = planetary_computer.sas.get_token('nrel', CONTAINER_NAME).token + ds = xr.open_dataset( + "reference://", engine="zarr", + backend_kwargs={ + "storage_options": { + "fo": ref_paths[0], + "remote_protocol": "abfs", + "remote_options": {'account_name': 'nrel', "credential": token} + }, + "consolidated": False, + } + ) + identical_dims = ds.attrs['identical_dims'] +else: + raise NotImplementedError('The only implemented Eagle datasets are sup3rcc and WIND.') + +logging.info(f'Identical dims: {identical_dims}') + +# Open all reference files +refs = [] +for rp in ref_paths: + with open(rp, 'rb') as f: + refs.append(ujson.load(f)) + +# Generate the combined reference file +gen_ref_comb(refs, ref_file=comb_ref_file, identical_dims=identical_dims) + +# Send to Azure +sas_token = load_oedi_sas() +blob_address = f'https://nrel.blob.core.windows.net/{CONTAINER_NAME}' +dest = f'{blob_address}/{az_path}?{sas_token}' +os.system(f'azcopy cp "{comb_ref_file}" "{dest}"') diff --git a/azure/pipeline/hpc_migration.ipynb b/azure/pipeline/hpc_migration.ipynb new file mode 100644 index 0000000..7e2ca2c --- /dev/null +++ b/azure/pipeline/hpc_migration.ipynb @@ -0,0 +1,154 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we demonstrate how to use this package to migrate h5 data from Eagle to Azure." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we need to identify the files that we want to migrate. On Eagle, data are located in the `/datasets` directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of files: 2\n" + ] + } + ], + "source": [ + "from pipeline.hpc_tools import get_dataset\n", + "\n", + "dataset = 'WIND/kazakhstan'\n", + "resolution = '15min'\n", + "\n", + "files = get_dataset(dataset, resolution=resolution)\n", + "\n", + "print(f'Number of files: {len(files)}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we launch a series of jobs to copy and transform each file in the set." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting 1 transformation jobs.\n", + "Starting job to copy dataset to Azure.\n", + "Starting job to combine references.\n", + "All jobs scheduled!\n" + ] + } + ], + "source": [ + "from pipeline.hpc_tools import process_h5_dataset\n", + "\n", + "comb_ref_file = f'/scratch/mheine/{dataset}/kerchunk_{resolution}_ref.json'\n", + "job_ids = process_h5_dataset(files, comb_ref_file=comb_ref_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After a run, use scan_err to identify any file transformation jobs that failed." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Timeouts: 1\n", + "Other errors: 0\n", + "Total Files: 1\n", + "The smallest file that timed out in WIND/conus/v1.1.0 was 1537 GB.\n" + ] + } + ], + "source": [ + "from pipeline.hpc_tools import scan_err\n", + "dataset = 'WIND/conus/v1.1.0'\n", + "resolution = 'hourly'\n", + "files, timeout_redos, other_redos = scan_err(dataset=dataset, resolution=resolution)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "process_h5_redos allows you to launch a partial job. `files` should be all files in the dataset, and `redos` should be a subset of them that you want to reprocess." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting 1 transformation jobs.\n", + "Starting job to copy dataset to Azure.\n", + "Starting job to combine references.\n", + "All jobs scheduled!\n" + ] + } + ], + "source": [ + "from pipeline.hpc_tools import process_h5_redos\n", + "\n", + "comb_ref_file = f'/scratch/mheine/{dataset}/kerchunk_{resolution}_ref.json'\n", + "job_ids = process_h5_redos(files, timeout_redos + other_redos, comb_ref_file=comb_ref_file)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/azure/pipeline/hpc_process_file.py b/azure/pipeline/hpc_process_file.py new file mode 100644 index 0000000..0aa948f --- /dev/null +++ b/azure/pipeline/hpc_process_file.py @@ -0,0 +1,33 @@ +import sys +import os +from etl_tools import transform_wtk_h5_file, transform_sup3rcc_h5_file, gen_ref +from hpc_tools import construct_paths +from time import time +import logging + +# Start timer +start_time = time() + +CONTAINER_NAME = 'oedi' +USER = os.getenv('USER') + +# Get input +args = sys.argv +if len(args) != 2: + raise Exception('Must provide exactly one file path.') +source_path = args[1] + +# Construct paths +file_name, job_name, job_dir, ref_file, az_path = construct_paths(source_path) +scratch_path = f'{job_dir}{file_name}' +if 'WIND' in source_path: + transform_wtk_h5_file(source_path, scratch_path) +elif 'sup3rcc' in source_path: + transform_sup3rcc_h5_file(source_path, scratch_path) +else: + raise NotImplementedError(f'The only Eagle datasets that have been implemented are WIND and sup3rcc.') +logging.info(f'{(time() - start_time) / 60:.2f} min: {file_name} transformed.') + +# Generate references +gen_ref(scratch_path, f'abfs://{CONTAINER_NAME}/{az_path}', ref_file=ref_file) +logging.info(f'{(time() - start_time) / 60:.2f} min: {job_name} references generated.') diff --git a/azure/pipeline/hpc_to_azure.py b/azure/pipeline/hpc_to_azure.py new file mode 100644 index 0000000..7d5a1a1 --- /dev/null +++ b/azure/pipeline/hpc_to_azure.py @@ -0,0 +1,17 @@ +import planetary_computer +import sys +import subprocess +from etl_tools import load_oedi_sas +import os + +args = sys.argv + +blob_address = 'https://nrel.blob.core.windows.net/oedi' +sas_token = load_oedi_sas() + +for arg in args[1:]: + source, dest = arg.split(':') + source = f"'{source}'" + dest = f"'{blob_address}/{dest}?{sas_token}'" + + os.system(f'azcopy copy {source} {dest} --overwrite ifSourceNewer') diff --git a/azure/pipeline/hpc_tools.py b/azure/pipeline/hpc_tools.py new file mode 100644 index 0000000..996c513 --- /dev/null +++ b/azure/pipeline/hpc_tools.py @@ -0,0 +1,470 @@ +import os +import h5py +import subprocess +import math +from glob import glob +import re +import logging + +def run_job(job_file): + job_submission = subprocess.run(['sbatch', job_file], capture_output=True) + output = job_submission.stdout.decode() + if 'Submitted batch job ' in output: + jobid = output.split()[3] + else: + jobid = 0 + logging.error(f'Job submission failure: {job_submission.stderr.decode()}') + return jobid + +def 
cancel_jobs(job_ids): + for job_id in job_ids: + subprocess.run(['scancel', job_id]) + +def construct_paths(file): + # Need username to access scratch + user = os.getenv('USER') + + file_name = file.split('/')[-1] + job_name = file_name.replace('.h5', '') + job_dir = file.replace('/datasets', f'/scratch/{user}').replace(file_name, '') + ref_file = f'{job_dir}{job_name}.json' + + if 'WIND' in file: + az_path = file.replace('/datasets/WIND', 'wtk') + elif 'sup3rcc' in file: + az_path = file.replace('/datasets/', '') + else: + raise NotImplementedError(f'The only Eagle datasets that have been implemented are WIND and sup3rcc.') + + return file_name, job_name, job_dir, ref_file, az_path + +def get_dep_str(dependency): + if not isinstance(dependency, (list, tuple)): + dependency = [dependency, ] + + return '#SBATCH --dependency=afterok:' + ':'.join([str(id) for id in dependency]) + +def get_dataset(dataset, resolution=None): + files = [] + if 'WIND' in dataset: + subsets = ['North_Atlantic', 'gulf_of_mexico'] + subsets2 = ['india'] + if any([subset in dataset for subset in subsets]): + if resolution == 'hourly': + files = glob(f'/datasets/{dataset}/yearly_hr/*.h5') + elif resolution == '5min': + files = glob(f'/datasets/{dataset}/yearly/*.h5') + elif any([subset in dataset for subset in subsets2]): + if resolution == '5min': + files = glob(f'/datasets/{dataset}/*.h5') + else: + files = [] + else: + if resolution == 'hourly': + files = glob(f'/datasets/{dataset}/*.h5') + elif resolution == '5min': + files = glob(f'/datasets/{dataset}/*/*.h5') + else: # 10min and 15min resolutions + files = glob(f'/datasets/{dataset}/*.h5') + return files + +def gen_hpc_single_job(file, job_dir, job_name, mem_GB=None, time_limit_hrs=4, debug=False): + + # Get bash path + bash_path = os.popen('which bash').read().replace('\n', '') + + # Construct paths + file_name, job_name, job_dir, ref_file, az_path = construct_paths(file) + + # Get user + user = os.getenv('USER') + + # Set parameters + nodes = 1 + ntasks = 1 + + # Create job file paths + job_file = f'{job_dir}{job_name}.sh' + output_file = f'{job_dir}{job_name}_out' + error_file = f'{job_dir}{job_name}_err' + + # Add debug partition if desired + if debug: + add_debug = '#SBATCH --partition=debug' + time_limit_hrs = 1 + else: + add_debug = '' + + if mem_GB: + add_mem = f'#SBATCH --mem={mem_GB}GB' + else: + add_mem = '' + + with open(job_file, 'w') as f: + # Write SBATCH inputs + f.write( +f"""#!{bash_path} +#SBATCH --job-name='{job_name}' +#SBATCH --nodes={nodes} +#SBATCH --ntasks={ntasks} +#SBATCH --time={time_limit_hrs:.0f}:00:00 +#SBATCH -o {output_file} +#SBATCH -e {error_file} +#SBATCH --export=ALL +#SBATCH --account=oedi +{add_mem} +{add_debug} + +#------------------ + +cd /scratch/$USER +module load conda +conda activate .env2 +srun python /home/{user}/oedi_azure/pipeline/hpc_process_file.py {file} +""" + ) + + return job_file + +def gen_hpc_combine_refs_job(comb_ref_file, ref_files, time_limit_hrs=4, dependency=None, debug=False, py_file='/home/mheine/oedi_azure/pipeline/hpc_gen_refs.py'): + bash_path = os.popen('which bash').read().replace('\n', '') + + comb_ref_file_name = comb_ref_file.split('/')[-1] + job_dir = comb_ref_file.replace(comb_ref_file_name, '') + job_name = comb_ref_file_name.replace('.json', '') + job_file = f'{job_dir}{job_name}.sh' + + # Add dependency if any + if dependency: + add_dependency = get_dep_str(dependency) + else: + add_dependency = '' + + # Add debug partition if desired + if debug: + add_debug = '#SBATCH 
--partition=debug' + time_limit_hrs = 1 + else: + add_debug = '' + + # Create job file + with open(job_file, 'w') as f: + # Write SBATCH inputs + f.write( +f"""#!{bash_path} +#SBATCH --job-name='{job_name}' +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --time={time_limit_hrs:.0f}:00:00 +#SBATCH -o {job_dir}{job_name}_out +#SBATCH -e {job_dir}{job_name}_err +#SBATCH --export=ALL +#SBATCH --account=oedi +{add_dependency} +{add_debug} + +#------------------ +cd /scratch/$USER +module load conda +conda activate .env2 +srun python {py_file} {comb_ref_file} {' '.join(ref_files)} + +""" + ) + return job_file + +def gen_hpc_to_azure_job(files, transformed_files, az_paths, dependency=None, transfer_speed=1500, debug=False, py_file='/home/mheine/oedi_azure/pipeline/hpc_to_azure.py'): + # Transfer speed in Mb/s + bash_path = os.popen('which bash').read().replace('\n', '') + + first_file_name = transformed_files[0].split('/')[-1] + job_dir = transformed_files[0].replace(first_file_name, '') + match = re.search(r'/\d\d\d\d/$', job_dir) + if match: + year = match.group(0) + job_dir = job_dir.replace(year, '/') + job_name = 'hpc_to_azure' + existing_job_files = glob(job_dir + 'hpc_to_azure*.sh') + if existing_job_files: + job_name += f'_{len(existing_job_files) + 1}' + job_file = f'{job_dir}{job_name}.sh' + + # Estimate time requirements + total_bytes = 0 + for file in files: + total_bytes += os.stat(file).st_size + + time_factor = 1.5 # Provide extra time in case things move a little slower than usual + time_required_hrs = math.ceil(time_factor * total_bytes * 8 * 10 ** -6 / transfer_speed / 60 / 60) + if time_required_hrs > 240: + logging.info('Warning: Transfer job is estimated to take longer than the maximum of 240 hrs.') + time_required_hrs = 240 + + # Create transfer args + # : + transfer_args = [f'{transformed_file}:{az_path}' for transformed_file, az_path in zip(transformed_files, az_paths)] + + # Add dependency if any + if dependency: + add_dependency = get_dep_str(dependency) + else: + add_dependency = '' + + # Add debug partition if desired + if debug: + add_debug = '#SBATCH --partition=debug' + time_required_hrs = 1 + else: + add_debug = '' + + # Create job file + with open(job_file, 'w') as f: + # Write SBATCH inputs + f.write( +f"""#!{bash_path} +#SBATCH --job-name='{job_name}' +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --time={time_required_hrs:.0f}:00:00 +#SBATCH -o {job_dir}{job_name}_out +#SBATCH -e {job_dir}{job_name}_err +#SBATCH --export=ALL +#SBATCH --account=oedi +{add_dependency} +{add_debug} + +#------------------ +cd /scratch/$USER +module load conda +conda activate .env2 +srun python {py_file} {' '.join(transfer_args)} + +""" + ) + return job_file + +def process_h5_dataset(files, comb_ref_file=None, time_limit_hrs=None, mem_factor=1.2, debug=False, skip_transformation=False, skip_transfer_to_azure=False): + # For each file in files, we generate a job script and submit to sbatch + # files should a be a list of absolute file paths to files in the /datasets directory. + + # Make lists to track jobs + job_ids = [] + ref_files = [] + transformed_files = [] + az_paths = [] + # Loop over files + logging.info(f'Starting {len(files)} transformation jobs.') + for file in files: + + # It was found that files as small as 415 GB timed out when only given 4 hours. + # In practice, there is a lot of variablity in the lengths of job runs. This may + # be due to network limitations when running many jobs concurrently. 
We're bumping + # up the time limit to 48 (the limit for the standard partition) for all files larger + # than 400 GB. + if not time_limit_hrs: + # Get file size to adjust time limit + file_size_GB = os.stat(file).st_size * 10 ** -9 + if file_size_GB < 400: + time_limit_hrs = 4 + else: + time_limit_hrs = 48 + + # Construct paths and create directory + file_name, job_name, job_dir, ref_file, az_path = construct_paths(file) + os.makedirs(job_dir, exist_ok=True) + ref_files.append(ref_file) + transformed_files.append(f'{job_dir}{file_name}') + az_paths.append(az_path) + + if not skip_transformation: + # Generate job file to transform and generate references for a single h5 file + job_file = gen_hpc_single_job(file, job_dir, job_name, time_limit_hrs=time_limit_hrs, debug=debug) + + # Run job file + job_id = run_job(job_file) + if job_id == 0: + cancel_jobs(job_ids) + raise Exception('Job submission failure') + else: + job_ids.append(job_id) + + # Generate job file to copy dataset to Azure + if not skip_transfer_to_azure: + logging.info('Starting job to copy dataset to Azure.') + copy_job_file = gen_hpc_to_azure_job(files, transformed_files, az_paths, dependency=job_ids, debug=debug) + copy_job_id = run_job(copy_job_file) + if copy_job_id == 0: + cancel_jobs(job_ids) + raise Exception('Copy job submission failure') + else: + job_ids.append(copy_job_id) + else: + copy_job_id = None + + # Generate job file to combine references + # NOTE THAT DEBUG IS CURRENTLY SET TO TRUE TO EXPEDITE JOBS WHILE ACCOUNT IN STANDBY + logging.info('Starting job to combine references.') + if comb_ref_file: + ref_job_file = gen_hpc_combine_refs_job(comb_ref_file, ref_files, dependency=copy_job_id, debug=True) + ref_job_id = run_job(ref_job_file) + if ref_job_id == 0: + cancel_jobs(job_ids) + raise Exception('Gen combined ref job submission failure') + else: + job_ids.append(ref_job_id) + + logging.info('All jobs scheduled!') + + comb_ref_file_name = comb_ref_file.split('/')[-1] + if 'hourly' in comb_ref_file_name: + job_id_file = comb_ref_file.replace(comb_ref_file_name, 'job_ids_hourly.txt') + elif '5min' in comb_ref_file_name: + job_id_file = comb_ref_file.replace(comb_ref_file_name, 'job_ids_5min.txt') + else: + job_id_file = comb_ref_file.replace(comb_ref_file_name, 'job_ids.txt') + with open(job_id_file, 'w') as f: + f.writelines([job_id + '\n' for job_id in job_ids ]) + + return job_ids + +def process_h5_redos(files, redos, comb_ref_file=None, time_limit_hrs=None, debug=False, skip_transfer_to_azure=False): + """ + Process an h5 datset where some of the transformations failed. + + Parameters + ---------- + files : list + Paths to source h5 files for entire dataset (must be in /datasets on Eagle) + redos: list + Paths to source h5 files that failed (must be in /datasets on Eagle) + comb_ref_file: str + Path to where the combined kerchunk reference file will be + stored. If None, then no combined reference will be generated. + time_limit_hrs: int + Override the default time limit for the file transformation tasks. + debug: bool + Submit all jobs to the debug partition + skip_transfer_to_azure: bool + If true, then no files will be transferred to Azure. + + Returns + ------- + job_ids : list + List of all job_ids submitted to sbatch. + """ + + # For each file in redos, we generate a job script and submit to sbatch. 
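+    # Note: we still loop over the full file list so that ref_files, transformed_files and az_paths
+    # cover the whole dataset; only the files listed in redos get new transformation jobs submitted.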
+ + # Make lists to track jobs + job_ids = [] + ref_files = [] + transformed_files = [] + az_paths = [] + # Loop over files + logging.info(f'Starting {len(redos)} transformation jobs.') + for file in files: + + # It was found that files as small as 415 GB timed out when only given 4 hours. + # In practice, there is a lot of variablity in the lengths of job runs. This may + # be due to network limitations when running many jobs concurrently. We're bumping + # up the time limit to 48 (the limit for the standard partition) for all files larger + # than 400 GB. + if not time_limit_hrs: + # Get file size to adjust time limit + file_size_GB = os.stat(file).st_size * 10 ** -9 + if file_size_GB < 400: + time_limit_hrs = 4 + else: + time_limit_hrs = 48 + + # Construct paths and create directory + file_name, job_name, job_dir, ref_file, az_path = construct_paths(file) + os.makedirs(job_dir, exist_ok=True) + ref_files.append(ref_file) + transformed_files.append(f'{job_dir}{file_name}') + az_paths.append(az_path) + + if file in redos: + # Generate job file to transform and generate references for a single h5 file + job_file = gen_hpc_single_job(file, job_dir, job_name, time_limit_hrs=time_limit_hrs, debug=debug) + + # Run job file + job_id = run_job(job_file) + if job_id == 0: + cancel_jobs(job_ids) + raise Exception('Job submission failure') + else: + job_ids.append(job_id) + + # Generate job file to copy dataset to Azure + if not skip_transfer_to_azure: + logging.info('Starting job to copy dataset to Azure.') + copy_job_file = gen_hpc_to_azure_job(files, transformed_files, az_paths, dependency=job_ids, debug=debug) + copy_job_id = run_job(copy_job_file) + if copy_job_id == 0: + cancel_jobs(job_ids) + raise Exception('Copy job submission failure') + else: + job_ids.append(copy_job_id) + else: + copy_job_id = None + + # Generate job file to combine references + if comb_ref_file: + logging.info('Starting job to combine references.') + ref_job_file = gen_hpc_combine_refs_job(comb_ref_file, ref_files, dependency=copy_job_id, debug=debug) + ref_job_id = run_job(ref_job_file) + if ref_job_id == 0: + cancel_jobs(job_ids) + raise Exception('Gen combined ref job submission failure') + else: + job_ids.append(ref_job_id) + + logging.info('All jobs scheduled!') + + comb_ref_file_name = comb_ref_file.split('/')[-1] + if 'hourly' in comb_ref_file_name: + job_id_file = comb_ref_file.replace(comb_ref_file_name, 'job_ids_hourly.txt') + elif '5min' in comb_ref_file_name: + job_id_file = comb_ref_file.replace(comb_ref_file_name, 'job_ids_5min.txt') + else: + job_id_file = comb_ref_file.replace(comb_ref_file_name, 'job_ids.txt') + with open(job_id_file, 'w') as f: + f.writelines([job_id + '\n' for job_id in job_ids ]) + + return job_ids + +def scan_err(dataset='WIND/Great_Lakes', resolution='5min'): + if resolution == 'hourly': + files = glob(f'/datasets/{dataset}/*.h5') + elif resolution == '5min': + files = glob(f'/datasets/{dataset}/*/*.h5') + + if len(files) == 0: + raise Exception('No output files found. 
Dataset/resolution does not exists or has not been processed.') + + timeouts = [] + other_errors = [] + timeout_redos = [] + other_redos = [] + for file in files: + err = file.replace('/datasets', '/scratch/mheine').replace('.h5', '_err') + with open(err) as f: + text = f.read() + if 'TIME LIMIT' in text: + timeouts.append(err) + timeout_redos.append(file) + elif len(text) > 0: + other_errors.append(err) + other_redos.append(file) + logging.info(f'Timeouts: {len(timeouts)}') + logging.info(f'Other errors: {len(other_errors)}') + logging.info(f'Total Files: {len(files)}') + + sizes = [] + for redo in timeout_redos: + sizes.append(os.stat(redo).st_size * 10 ** -9) + if len(sizes) > 0: + logging.info(f'The smallest file that timed out in {dataset} was {min(sizes):.0f} GB.') + + return files, timeout_redos, other_redos diff --git a/azure/pipeline/run_aws_pipeline.ipynb b/azure/pipeline/run_aws_pipeline.ipynb new file mode 100644 index 0000000..ce9cbf9 --- /dev/null +++ b/azure/pipeline/run_aws_pipeline.ipynb @@ -0,0 +1,117 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from aws_tools import *\n", + "\n", + "# Update state machine and job def\n", + "\n", + "create_aws_resources()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the s3 addresses for the dataset\n", + "\n", + "prefix = 'south_atlantic'\n", + "resolution = '5min'\n", + "staging_bucket = 'kerchunk-staging'\n", + "run_name = 'south_atlantic-5min-2'\n", + "\n", + "files = get_dataset('nrel-pds-wtk', prefix=prefix, resolution=resolution)\n", + "files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate the state machine input for this dataset\n", + "\n", + "s3_comb_ref_file = f'wtk/{prefix}/kerchunk_{resolution}_ref_s3.json'\n", + "az_comb_ref_file = f'wtk/{prefix}/kerchunk_{resolution}_ref.json'\n", + "create_state_machine_input(files, staging_bucket, s3_comb_ref_file, az_comb_ref_file, run_name=run_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the state machine\n", + "\n", + "run_state_machine('kerchunk-h5', run_name=run_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the state machine fully executed without error, then there should now be a set of transformed h5 files, s3 refs and az refs, as well as a combined s3 ref file in the staging bucket. Use the test_staging.ipynb notebook to verify that the transformation was successful by loading the combined s3 ref file.\n", + "\n", + "Once you are satisfied, continue to the next cell to copy the data to Azure and generate the combined az ref file.\n", + "\n", + "Make sure to update the .env file with AWS credentials!" 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "copy_s3_dataset_to_azure(files, staging_bucket, dry_run=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "comb_ref_file = f'wtk/{prefix}/kerchunk_{resolution}_ref.json'\n",
+    "create_combined_ref(files, staging_bucket, comb_ref_file=comb_ref_file, remote_protocol='abfs')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once these tasks have finished, you can open the wtk example notebook and verify that the dataset can now be loaded from Azure."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "oedi-azure-dev",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/azure/pipeline/transform_h5_container/Dockerfile b/azure/pipeline/transform_h5_container/Dockerfile
new file mode 100644
index 0000000..b536919
--- /dev/null
+++ b/azure/pipeline/transform_h5_container/Dockerfile
@@ -0,0 +1,25 @@
+FROM continuumio/miniconda3
+
+# make docker use bash instead of sh
+SHELL ["/bin/bash", "--login", "-c"]
+
+# create environment
+COPY ./pipeline/transform_h5_container/env.yml .
+RUN conda env create -f env.yml
+
+# install azcopy
+COPY ./pipeline/transform_h5_container/install_azcopy.sh .
+RUN sh install_azcopy.sh
+
+# copy all necessary files
+COPY ./.sas .
+COPY ./pipeline/transform_h5_container/* ./
+COPY ./pipeline/etl_tools.py .
+COPY ./pipeline/aws_tools.py .
+
+# make entrypoint script executable
+RUN chmod u+x entrypoint.sh
+
+# run the entrypoint script, which activates the environment and executes the command
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["python", "transform.py"]
diff --git a/azure/pipeline/transform_h5_container/entrypoint.sh b/azure/pipeline/transform_h5_container/entrypoint.sh
new file mode 100644
index 0000000..67ef690
--- /dev/null
+++ b/azure/pipeline/transform_h5_container/entrypoint.sh
@@ -0,0 +1,6 @@
+#!/bin/bash --login
+set -e
+
+# activate conda environment and let the following process take over
+conda activate oedi-azure-container
+exec "$@"
diff --git a/azure/pipeline/transform_h5_container/env.yml b/azure/pipeline/transform_h5_container/env.yml
new file mode 100644
index 0000000..5dbc52f
--- /dev/null
+++ b/azure/pipeline/transform_h5_container/env.yml
@@ -0,0 +1,16 @@
+name: oedi-azure-container
+channels:
+  - conda-forge
+  - defaults
+  - hcc
+dependencies:
+  - python=3.10.12
+  - h5py=3.9.0
+  - boto3
+  - cftime
+  - kerchunk
+  - planetary-computer
+  - s3fs=2023.6.0
+  - pandas
+  - adlfs
+  - xarray
diff --git a/azure/pipeline/transform_h5_container/gen_ref.py b/azure/pipeline/transform_h5_container/gen_ref.py
new file mode 100644
index 0000000..8ea40c4
--- /dev/null
+++ b/azure/pipeline/transform_h5_container/gen_ref.py
@@ -0,0 +1,54 @@
+import ujson
+from etl_tools import gen_ref_comb
+import xarray as xr
+import os
+import s3fs
+import h5py
+
+# TODO: Remove all az stuff. The az combined ref file gets created at a later step, after the data moves to Azure.
+
+# Azure container name
+CONTAINER_NAME = 'oedi'
+
+# Access S3
+s3 = s3fs.S3FileSystem()
+
+# Get input from container environment
+s3_comb_ref_file = os.getenv('s3_comb_ref_file')
+staging_bucket = os.getenv('staging_bucket')
+run_name = os.getenv('run_name')
+
+# Get s3 file list from input file on s3 (This list was too long to be an env variable.)
+with s3.open(f'{staging_bucket}/{run_name}.json') as f:
+    input_data = ujson.load(f)
+s3_source_files = input_data['s3_files']
+
+# Get paths to references and list of identical dims
+test_file = s3_source_files[0]
+if 'nrel-pds-wtk' in test_file:
+    s3_ref_paths = [f"{staging_bucket}/{f.replace('nrel-pds-wtk', 'wtk').replace('.h5', '_s3.json')}" for f in s3_source_files]
+    az_ref_paths = [f"{staging_bucket}/{f.replace('nrel-pds-wtk', 'wtk').replace('.h5', '.json')}" for f in s3_source_files]
+    test_file = test_file.replace('nrel-pds-wtk', 'wtk')
+    with s3.open(f'{staging_bucket}/{test_file}') as f:
+        h5 = h5py.File(f)
+        identical_dims = list(h5.attrs['identical_dims'])
+elif 'sup3rcc' in test_file:
+    identical_dims = ['country', 'county', 'eez', 'elevation', 'latitude', 'longitude', 'offshore', 'state', 'timezone']
+    raise NotImplementedError()
+else:
+    raise NotImplementedError(f'Dataset for {test_file} not implemented yet.')
+
+# Open all reference files
+s3_refs = []
+az_refs = []
+for s3_rp, az_rp in zip(s3_ref_paths, az_ref_paths):
+    with s3.open(s3_rp, 'rb') as f:
+        s3_refs.append(ujson.load(f))
+    with s3.open(az_rp, 'rb') as f:
+        az_refs.append(ujson.load(f))
+
+# Generate the combined reference files
+if s3_comb_ref_file:
+    local_s3_ref = 's3_ref.json'
+    gen_ref_comb(s3_refs, ref_file=local_s3_ref, identical_dims=identical_dims, remote_protocol='s3')
+    s3.put_file(local_s3_ref, f's3://{staging_bucket}/{s3_comb_ref_file}')
diff --git a/azure/pipeline/transform_h5_container/install_azcopy.sh b/azure/pipeline/transform_h5_container/install_azcopy.sh
new file mode 100644
index 0000000..56eb716
--- /dev/null
+++ b/azure/pipeline/transform_h5_container/install_azcopy.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Install AzCopy on Linux
+
+# Download and extract
+wget https://aka.ms/downloadazcopy-v10-linux
+tar -xvf downloadazcopy-v10-linux
+
+# Move AzCopy
+rm -f /usr/bin/azcopy
+cp ./azcopy_linux_amd64_*/azcopy /usr/bin/
+chmod 755 /usr/bin/azcopy
+
+# Clean the kitchen
+rm -f downloadazcopy-v10-linux
+rm -rf ./azcopy_linux_amd64_*/
diff --git a/azure/pipeline/transform_h5_container/transfer.py b/azure/pipeline/transform_h5_container/transfer.py
new file mode 100644
index 0000000..c43ae6f
--- /dev/null
+++ b/azure/pipeline/transform_h5_container/transfer.py
@@ -0,0 +1,5 @@
+import subprocess
+import sys
+
+args = sys.argv
+subprocess.run(['azcopy', '--version'])
diff --git a/azure/pipeline/transform_h5_container/transform.py b/azure/pipeline/transform_h5_container/transform.py
new file mode 100644
index 0000000..515f15f
--- /dev/null
+++ b/azure/pipeline/transform_h5_container/transform.py
@@ -0,0 +1,60 @@
+import os
+from etl_tools import transform_wtk_h5_file, transform_sup3rcc_h5_file, gen_ref
+from time import time
+import boto3
+import logging
+
+# Download h5 to local and then build out the rechunked copy
+
+# Start timer
+start_time = time()
+
+# Get input from container environment overrides
+container_name = 'oedi'
+staging_bucket = os.getenv('staging_bucket')
+source_path = os.getenv('s3_file')
+file_name = source_path.split('/')[-1]
+
+# Download file to local
+s3 = boto3.client('s3')
+Bucket = source_path.split('/')[0]
+Key = source_path.replace(f'{Bucket}/', '')
+local_path = f'/data/{source_path}'
+os.makedirs(local_path.replace(file_name, ''), exist_ok=True)
+s3.download_file(Bucket=Bucket, Key=Key, Filename=local_path)
+
+# Transform dataset
+logging.info(f'{(time() - start_time) / 60:.2f} min: {file_name} - Starting transformation.')
+if 'nrel-pds-wtk' in source_path:
+    #DATASET_NAME = 'wtk'
+    az_path = source_path.replace('nrel-pds-wtk/', 'wtk/')
+    scratch_path = f'/data/{az_path}'
+    os.makedirs(scratch_path.replace(file_name, ''), exist_ok=True)  # Need to create the dir if it doesn't exist
+    transform_wtk_h5_file(local_path, scratch_path, in_file_on_s3=False)
+elif 'sup3rcc' in source_path:
+    DATASET_NAME = 'sup3rcc'
+    az_path = source_path.replace('nrel-pds-sup3rcc/', 'sup3rcc/')
+    scratch_path = f'/data/{az_path}'
+    os.makedirs(scratch_path.replace(file_name, ''), exist_ok=True)  # Need to create the dir if it doesn't exist
+    transform_sup3rcc_h5_file(source_path, scratch_path)
+else:
+    raise NotImplementedError(f'Dataset for {source_path} not implemented yet.')
+logging.info(f'{(time() - start_time) / 60:.2f} min: {file_name} - Transformed.')
+
+ref_file = scratch_path.replace('.h5', '.json')
+ref_file_s3 = scratch_path.replace('.h5', '_s3.json')
+gen_ref(scratch_path, f'abfs://{container_name}/{az_path}', ref_file=ref_file)
+logging.info(f'{(time() - start_time) / 60:.2f} min: {file_name} - Azure reference generated.')
+
+s3_staging_path = f's3://{staging_bucket}/{az_path}'
+gen_ref(scratch_path, s3_staging_path, ref_file=ref_file_s3)
+logging.info(f'{(time() - start_time) / 60:.2f} min: {file_name} - S3 reference generated.')
+
+# Upload to staging
+s3 = boto3.client('s3')
+s3.upload_file(ref_file, staging_bucket, ref_file.replace('/data/', ''))
+logging.info(f'{(time() - start_time) / 60:.2f} min: {file_name} - Azure reference uploaded to staging.')
+s3.upload_file(ref_file_s3, staging_bucket, ref_file_s3.replace('/data/', ''))
+logging.info(f'{(time() - start_time) / 60:.2f} min: {file_name} - S3 reference uploaded to staging.')
+s3.upload_file(scratch_path, staging_bucket, az_path)
+logging.info(f'{(time() - start_time) / 60:.2f} min: {file_name} - h5 file uploaded to staging.')
diff --git a/azure/pipeline/update_trans_container.sh b/azure/pipeline/update_trans_container.sh
new file mode 100644
index 0000000..9f1eeea
--- /dev/null
+++ b/azure/pipeline/update_trans_container.sh
@@ -0,0 +1,4 @@
+docker build -t transform_h5_container -f ./pipeline/transform_h5_container/Dockerfile .
+docker tag transform_h5_container:latest 351672045885.dkr.ecr.us-west-2.amazonaws.com/transform_h5_container:latest
+aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 351672045885.dkr.ecr.us-west-2.amazonaws.com
+docker push 351672045885.dkr.ecr.us-west-2.amazonaws.com/transform_h5_container:latest
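
Note on verification: run_aws_pipeline.ipynb above checks the transformation by loading the combined s3 ref file (via test_staging.ipynb, which is not part of this patch). The snippet below is a minimal, illustrative sketch of that kind of check using fsspec and xarray; the bucket and reference path are assumptions that simply follow the `kerchunk-staging` / `wtk/{prefix}/kerchunk_{resolution}_ref_s3.json` convention used in the notebook, and AWS credentials are assumed to be available in the environment.

```python
import fsspec
import xarray as xr

# Combined kerchunk reference written by the pipeline (illustrative path following
# the staging-bucket convention used in run_aws_pipeline.ipynb).
ref_path = "s3://kerchunk-staging/wtk/south_atlantic/kerchunk_5min_ref_s3.json"

# A "reference" filesystem maps Zarr keys onto byte ranges of the original h5 files,
# so xarray can open the dataset without downloading or copying it.
fs = fsspec.filesystem(
    "reference",
    fo=ref_path,
    remote_protocol="s3",  # the chunks referenced in the JSON still live on S3
)

# Open the mapped store with the zarr engine; kerchunk reference stores are not consolidated.
ds = xr.open_dataset(
    fs.get_mapper(""),
    engine="zarr",
    backend_kwargs={"consolidated": False},
)
print(ds)
```

The combined az ref produced by create_combined_ref plays the same role once the data lands on Azure, with `remote_protocol='abfs'` and adlfs handling the Azure side.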