# Exploring HDF5 (`.h5`) Files

In [1]:
%pip install -q -r ../Docs/requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
try:
    import os
    import sys
    import glob
    import h5py
    import numpy as np
    import pandas as pd
    import torch
    import matplotlib.pyplot as plt
except Exception as e:
    print(f"Error : {e}")

In [3]:
# Print the PyTorch version
print(f"PyTorch version: {torch.__version__}")

# Check if running in Google Colab
if "google.colab" in str(get_ipython()):
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = 'cpu'
        print("GPU not available in Colab, consider enabling a GPU runtime.")
# Running on a local machine
else:
    if torch.backends.mps.is_available():
        device = 'mps'
        print(f"Is Apple MPS (Metal Performance Shader) built? {torch.backends.mps.is_built()}")
        print(f"Is Apple MPS available? {torch.backends.mps.is_available()}")
    elif torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

# TODO: Add support for AMD ROCm GPU if needed

# Print the device being used
print(f"Using device: {device}")

PyTorch version: 2.5.0.dev20240806
Is Apple MPS (Metal Performance Shader) built? True
Is Apple MPS available? True
Using device: mps


In [9]:
def get_h5_files(base_path):
    # Use glob to find all .h5 files in base_path and its subdirectories
    search_pattern = os.path.join(base_path, '**', '*.h5')
    h5_files = glob.glob(search_pattern, recursive=True)
    return h5_files

# Specify the base directory to search
directory = '../Datasets/NASA-Earthdata/PLOT1'  # Change this path to do a different search

# Get the list of .h5 files
files = get_h5_files(directory)

# Print the list of files
for file in files:
    print(file)

In [10]:
def explore_file_structure(file_path):
    with h5py.File(file_path, 'r') as file:
        def print_structure(group, indent=0):
            for key in group:
                item = group[key]
                print('  ' * indent + f'{key}/')
                if isinstance(item, h5py.Group):
                    print_structure(item, indent + 1)
                else:
                    print('  ' * (indent + 1) + f'{key} (dataset)')
        
        print(f'Exploring {file_path}:')
        print_structure(file)
        print('-' * 40)

# Apply the exploration to each file
for file in files:
    explore_file_structure(file)

In [21]:
def examine_datasets(file_path):
    with h5py.File(file_path, 'r') as file:
        def print_dataset_info(dataset):
            print(f'  Dataset: {dataset.name}')
            print(f'    Shape: {dataset.shape}')
            print(f'    Dtype: {dataset.dtype}')
        
        print(f'Examining {file_path}:')
        for key in file.keys():
            item = file[key]
            if isinstance(item, h5py.Dataset):
                print_dataset_info(item)
            elif isinstance(item, h5py.Group):
                for subkey in item.keys():
                    sub_item = item[subkey]
                    if isinstance(sub_item, h5py.Dataset):
                        print_dataset_info(sub_item)
        print('-' * 40)

# Apply the examination to each file
# for file in files:
#     examine_datasets(file)

explore_file_structure(file)

Exploring ../Datasets/NASA-Earthdata/PLOT1/ATL03_20210714222751_03251207_006_01.h5:
METADATA/
  AcquisitionInformation/
    lidar/
    lidarDocument/
    platform/
    platformDocument/
  DataQuality/
    CompletenessOmission/
    DomainConsistency/
  DatasetIdentification/
  Extent/
  Lineage/
    ANC01/
    ANC03/
    ANC04/
    ANC05/
    ANC06-01/
    ANC06-02/
    ANC06-03/
    ANC07/
    ANC08/
    ANC11/
    ANC12-01/
    ANC12-02/
    ANC19/
    ANC22/
    ANC23/
    ANC25-03/
    ANC26-03/
    ANC28/
    ANC29/
    ANC36-03/
    ANC38-03/
    ANC41/
    ATL02/
    Control/
  ProcessStep/
    Browse/
    Metadata/
    PGE/
    QA/
  ProductSpecificationDocument/
  QADatasetIdentification/
  SeriesIdentification/
ancillary_data/
  atlas_sdp_gps_epoch/
    atlas_sdp_gps_epoch (dataset)
  control/
    control (dataset)
  data_end_utc/
    data_end_utc (dataset)
  data_start_utc/
    data_start_utc (dataset)
  end_cycle/
    end_cycle (dataset)
  end_delta_time/
    end_delta_time 