In [None]:
import h5py
import numpy as np
import pandas as pd

def extract_filtered_data(file_path: str, lat_range: tuple = (39, 41),
                          lon_range: tuple = (140, 142), max_entries: int = 100,
                          chunk_size: int = 1_000_000) -> pd.DataFrame:
    """
    Extract HDF5 table rows whose receiver position lies inside a lat/lon box.

    The table is read from the ``'Data/Table Layout'`` dataset. Only the rows
    selected by ``dataset[:max_entries]`` are examined, so the limit applies
    BEFORE filtering: the result may contain fewer than ``max_entries`` rows.
    Rows are read in blocks of ``chunk_size`` and only matching rows are kept,
    so peak memory is bounded by one block plus the matches rather than by the
    whole examined slice.

    Args:
        file_path (str): Path to HDF5 file
        lat_range (tuple): Inclusive range of the 'gdlatr' field (min, max)
        lon_range (tuple): Inclusive range of the 'gdlonr' field (min, max)
        max_entries (int): Maximum number of leading table rows to examine.
            Interpreted like a slice stop (``None`` means the whole table).
        chunk_size (int): Number of rows read from the file per block.

    Returns:
        pandas.DataFrame: One column per requested attribute that exists in
        the table (absent attributes are skipped), plus a 'datetime' column
        when it can be assembled from year/month/day/hour/min/sec.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or the table has no
            'gdlatr' / 'gdlonr' field to filter on.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Define all attributes to extract
    attributes = [
        'year', 'month', 'day', 'hour', 'min', 'sec', 'recno', 'kindat', 'kinst',
        'ut1_unix', 'ut2_unix', 'pierce_alt', 'gps_site', 'sat_id', 'gnss_type',
        'gdlatr', 'gdlonr', 'los_tec', 'dlos_tec', 'tec', 'azm', 'elm',
        'gdlat', 'glon', 'rec_bias', 'drec_bias'
    ]

    lat_min, lat_max = lat_range
    lon_min, lon_max = lon_range

    with h5py.File(file_path, 'r') as hdf:
        dataset = hdf['Data/Table Layout']
        record_dtype = dataset.dtype

        # Fail early with a clear message instead of numpy's
        # "no field of name ..." raised from deep inside the loop.
        field_names = record_dtype.names or ()
        for coord in ('gdlatr', 'gdlonr'):
            if coord not in field_names:
                raise ValueError(
                    f"Dataset has no '{coord}' field; cannot filter by coordinates"
                )

        # Resolve the stop index exactly as dataset[:max_entries] would
        # (handles None, negative values and limits beyond the table length).
        _, stop, _ = slice(None, max_entries).indices(dataset.shape[0])

        # Read block by block and keep only the rows inside the box.
        kept_blocks = []
        for start in range(0, stop, chunk_size):
            block = dataset[start:min(start + chunk_size, stop)]
            lats = block['gdlatr']  # receiver latitude
            lons = block['gdlonr']  # receiver longitude
            inside = (lats >= lat_min) & (lats <= lat_max) & \
                     (lons >= lon_min) & (lons <= lon_max)
            # Boolean indexing copies, so the full block can be released.
            kept_blocks.append(block[inside])

    if kept_blocks:
        data = np.concatenate(kept_blocks)
    else:
        data = np.empty(0, dtype=record_dtype)

    # Collect the requested attributes that actually exist in the table.
    filtered_data = {}
    for attr in attributes:
        if attr not in field_names:
            continue
        values = data[attr]

        # Convert bytes to strings for string columns
        if values.dtype.kind == 'S':
            values = [val.decode('utf-8') for val in values]

        filtered_data[attr] = values

    df = pd.DataFrame(filtered_data)

    # pandas assembles datetimes from columns named year/month/day/hour/
    # minute/second, so map the table's 'min'/'sec' names on a temporary
    # frame instead of renaming the result columns back and forth.
    datetime_parts = {
        'year': 'year', 'month': 'month', 'day': 'day',
        'hour': 'hour', 'min': 'minute', 'sec': 'second',
    }
    missing = [name for name in datetime_parts if name not in df.columns]
    if missing:
        # Selecting absent columns would raise KeyError; the DataFrame is
        # still useful without the convenience column, so only warn.
        print(f"Warning: Could not create datetime column: missing columns {missing}")
    else:
        try:
            df['datetime'] = pd.to_datetime(
                df[list(datetime_parts)].rename(columns=datetime_parts)
            )
        except ValueError as e:
            # If datetime creation fails, we still want to return the DataFrame
            print(f"Warning: Could not create datetime column: {e}")

    return df

# Example usage: examine up to the first 125,000,000 table rows of the file
# and keep those inside the default latitude/longitude box.
file_path = r'/Users/bene/Library/CloudStorage/OneDrive-ETHZurich/Space Geodesy 24/data/Japan/los_20110311.001.h5'
df = extract_filtered_data(file_path, max_entries=125_000_000)

# Example usage with every argument spelled out:
# df = extract_filtered_data(file_path, lat_range=(39, 41), lon_range=(140, 142), max_entries=100)

# Display information about the resulting DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataFrame Info:")
df.info()

print("\nFirst few rows:")
print(df.head())