In [2]:
import h5py

# Location of the HDF5 file inspected in this cell (raw string, so the path is
# taken literally); change it to point at your own copy of the file.
file_path = r'/Users/bene/Library/CloudStorage/OneDrive-ETHZurich/Space Geodesy 24/data/Japan/los_20110311.001.h5'

def explore_h5(file_path):
    """Print the group/dataset hierarchy of an HDF5 file with a short preview.

    Every group is listed by name. Every dataset is listed with its shape and
    dtype followed by sample values: datasets that are scalar, have a null
    dataspace (h5py reports ``shape is None`` for those) or hold fewer than
    five entries along their first axis are read in full, larger ones only
    show their first three entries.

    Args:
        file_path (str): Path to the HDF5 file; it is opened read-only.

    Returns:
        None. The report is written to stdout.
    """
    def print_structure(name, obj):
        # Callback for visititems(): it must return None so the walk continues.
        if isinstance(obj, h5py.Dataset):
            print(f'Dataset: {name}')
            print(f'  Shape: {obj.shape}')
            print(f'  Type: {obj.dtype}')
            shape = obj.shape
            # shape is None for null-dataspace datasets and () for scalars;
            # neither can be sliced, so both are read with the empty-tuple index.
            if shape is None or len(shape) == 0 or shape[0] < 5:
                print(f'  Sample data: {obj[()]}')
            else:
                print(f'  First few values: {obj[0:3]}')
        elif isinstance(obj, h5py.Group):
            print(f'Group: {name}')

    with h5py.File(file_path, 'r') as hdf:
        # Visit each object in the file
        hdf.visititems(print_structure)

# Print the structure of the file at `file_path` (set above; point it at your own file)
explore_h5(file_path)

Group: Data
Dataset: Data/Table Layout
  Shape: (123863032,)
  Type: [('year', '<i8'), ('month', '<i8'), ('day', '<i8'), ('hour', '<i8'), ('min', '<i8'), ('sec', '<i8'), ('recno', '<i8'), ('kindat', '<i8'), ('kinst', '<i8'), ('ut1_unix', '<f8'), ('ut2_unix', '<f8'), ('pierce_alt', '<f8'), ('gps_site', 'S4'), ('sat_id', '<i8'), ('gnss_type', 'S8'), ('gdlatr', '<f8'), ('gdlonr', '<f8'), ('los_tec', '<f8'), ('dlos_tec', '<f8'), ('tec', '<f8'), ('azm', '<f8'), ('elm', '<f8'), ('gdlat', '<f8'), ('glon', '<f8'), ('rec_bias', '<f8'), ('drec_bias', '<f8')]
  First few values: [(2011, 3, 11, 0, 0, 0, 0, 3505, 8000, 1.2998016e+09, 1.2998016e+09, 350., b'00na', 5, b'GPS     ', -12.46664047, 130.84399414, 27.149475, 0.685819, 15.936719, -72.487282, 31.548513, -11.046002, 126.410751, -275620.4375 , 1.308943)
 (2011, 3, 11, 0, 0, 0, 0, 3505, 8000, 1.2998016e+09, 1.2998016e+09, 350., b'00na', 7, b'GPS     ', -12.46664047, 130.84399414, 34.249969, 0.175487, 15.447015, 106.633766, 20.03315 , -14.390582

In [8]:
import h5py
import numpy as np
import pandas as pd

def extract_filtered_data(file_path, lat_range=(39, 41), lon_range=(140, 142),
                          max_entries=100, chunk_size=1_000_000):
    """
    Extract rows of the 'Data/Table Layout' table whose receiver lies in a lat/lon box.

    Only the leading ``max_entries`` rows of the table are scanned. They are
    read in slices of at most ``chunk_size`` rows and only the rows that pass
    the filter are kept, so peak memory is bounded by one chunk plus the
    matches instead of by ``max_entries`` full records.

    Args:
        file_path (str): Path to HDF5 file
        lat_range (tuple): Inclusive (min, max) bounds applied to 'gdlatr'
            (receiver latitude)
        lon_range (tuple): Inclusive (min, max) bounds applied to 'gdlonr'
            (receiver longitude)
        max_entries (int | None): Number of leading rows to scan; it is used
            as the stop of a slice, so ``None`` scans the whole table
        chunk_size (int): Maximum number of rows read from the file at once

    Returns:
        pandas.DataFrame: One row per matching record, with one column for each
        known attribute present in the table (byte strings decoded as UTF-8).
        A 'datetime' column is added when all of year/month/day/hour/min/sec
        are available and can be assembled; otherwise a warning is printed and
        the frame is returned without it.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Define all attributes to extract (this also fixes the column order)
    attributes = [
        'year', 'month', 'day', 'hour', 'min', 'sec', 'recno', 'kindat', 'kinst',
        'ut1_unix', 'ut2_unix', 'pierce_alt', 'gps_site', 'sat_id', 'gnss_type',
        'gdlatr', 'gdlonr', 'los_tec', 'dlos_tec', 'tec', 'azm', 'elm',
        'gdlat', 'glon', 'rec_bias', 'drec_bias'
    ]
    lat_min, lat_max = lat_range
    lon_min, lon_max = lon_range

    with h5py.File(file_path, 'r') as hdf:
        dataset = hdf['Data/Table Layout']

        # Resolve the row window exactly as dataset[:max_entries] would
        # (handles None and values larger than the table).
        first_row, last_row, _ = slice(max_entries).indices(dataset.shape[0])

        matches = []
        for chunk_start in range(first_row, last_row, chunk_size):
            chunk_stop = min(chunk_start + chunk_size, last_row)
            chunk = dataset[chunk_start:chunk_stop]

            lats = chunk['gdlatr']  # receiver latitude
            lons = chunk['gdlonr']  # receiver longitude
            mask = (lats >= lat_min) & (lats <= lat_max) & \
                   (lons >= lon_min) & (lons <= lon_max)

            # Boolean indexing copies the selected rows, so the chunk itself
            # can be released before the next one is read.
            if mask.any():
                matches.append(chunk[mask])

        if matches:
            records = np.concatenate(matches)
        else:
            # Nothing matched (or nothing was scanned): keep the table's dtype
            # so the resulting frame still has the expected columns.
            records = np.empty(0, dtype=dataset.dtype)

    # Build the columns in the order given by `attributes`, skipping fields
    # this particular table does not provide.
    columns = {}
    for attr in attributes:
        if attr not in records.dtype.names:
            continue
        values = records[attr]
        # Convert bytes to strings for string columns
        if values.dtype.kind == 'S':
            values = [raw.decode('utf-8') for raw in values]
        columns[attr] = values

    df = pd.DataFrame(columns)

    # Add datetime column for convenience. Fields may have been skipped above,
    # so check for them explicitly instead of letting the lookup raise KeyError.
    datetime_parts = ['year', 'month', 'day', 'hour', 'min', 'sec']
    missing_parts = [part for part in datetime_parts if part not in df.columns]
    if missing_parts:
        print(f"Warning: Could not create datetime column: missing fields {missing_parts}")
    else:
        try:
            # pd.to_datetime only recognises 'minute'/'second' as component
            # names, so rename on the temporary slice and leave df untouched.
            df['datetime'] = pd.to_datetime(
                df[datetime_parts].rename(columns={'min': 'minute', 'sec': 'second'})
            )
        except ValueError as e:
            # If datetime creation fails, we still want to return the DataFrame
            print(f"Warning: Could not create datetime column: {e}")

    return df

# Example usage: scan the first 50 million rows with the default lat/lon box
file_path = r'/Users/bene/Library/CloudStorage/OneDrive-ETHZurich/Space Geodesy 24/data/Japan/los_20110311.001.h5'
df = extract_filtered_data(file_path, max_entries=50_000_000)

# Display information about the resulting DataFrame
print("\nDataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\nFirst few rows:")
print(df.head())


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19975 entries, 0 to 19974
Data columns (total 27 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   year        19975 non-null  int64         
 1   month       19975 non-null  int64         
 2   day         19975 non-null  int64         
 3   hour        19975 non-null  int64         
 4   min         19975 non-null  int64         
 5   sec         19975 non-null  int64         
 6   recno       19975 non-null  int64         
 7   kindat      19975 non-null  int64         
 8   kinst       19975 non-null  int64         
 9   ut1_unix    19975 non-null  float64       
 10  ut2_unix    19975 non-null  float64       
 11  pierce_alt  19975 non-null  float64       
 12  gps_site    19975 non-null  object        
 13  sat_id      19975 non-null  int64         
 14  gnss_type   19975 non-null  object        
 15  gdlatr      19975 non-null  float64       
 16  gdlon

In [10]:
# Distinct 'gdlatr' (receiver latitude) values among the filtered rows
df['gdlatr'].unique()

array([40.89749908, 39.01977921, 39.13516998])