In [4]:
import os

import h5py
import numpy as np
import pandas as pd

def _filter_chunk(data, attributes, lat_range, lon_range, hour_range):
    """Return the rows of one chunk that fall inside the ranges, or None if none do."""
    lats = data['gdlatr']
    lons = data['gdlonr']
    hours = data['hour']

    # All bounds are inclusive on both ends.
    mask = (
        (lats >= lat_range[0]) & (lats <= lat_range[1])
        & (lons >= lon_range[0]) & (lons <= lon_range[1])
        & (hours >= hour_range[0]) & (hours <= hour_range[1])
    )
    if not np.any(mask):
        return None

    available = data.dtype.names
    columns = {}
    for attr in attributes:
        # Attributes absent from this file's table are skipped silently.
        if attr not in available:
            continue
        values = data[attr][mask]
        # Fixed-width byte strings are decoded so the DataFrame holds str values.
        if values.dtype.kind == 'S':
            values = [val.decode('utf-8') for val in values]
        columns[attr] = values

    return pd.DataFrame(columns) if columns else None


def _add_datetime_column(df):
    """Add a 'datetime' column assembled from the year/month/day/hour/min/sec columns."""
    parts = ['year', 'month', 'day', 'hour', 'min', 'sec']
    try:
        # pd.to_datetime expects 'minute'/'second' as column names, so rename on
        # a temporary copy instead of renaming the result frame back and forth.
        df['datetime'] = pd.to_datetime(
            df[parts].rename(columns={'min': 'minute', 'sec': 'second'})
        )
    except (KeyError, ValueError) as e:
        # KeyError: a time column is missing; ValueError: values cannot be assembled.
        print(f"Warning: Could not create datetime column: {e}")
    return df


def extract_filtered_data(file_path, start_idx=0, chunk_size=10000, lat_range=(39, 41),
                        lon_range=(140, 142), hour_range=(0, 23)):
    """
    Extract and filter HDF5 data starting from a specific index and processing in chunks.

    Rows of the 'Data/Table Layout' dataset are read ``chunk_size`` at a time and
    kept when their 'gdlatr', 'gdlonr' and 'hour' fields lie inside the given
    ranges (bounds inclusive). Progress is printed for every chunk.

    Args:
        file_path (str): Path to HDF5 file
        start_idx (int): Starting index in the file (must be >= 0)
        chunk_size (int): Number of entries to process at once (must be > 0)
        lat_range (tuple): Range of latitude (min, max), matched against 'gdlatr'
        lon_range (tuple): Range of longitude (min, max), matched against 'gdlonr'
        hour_range (tuple): Range of hours to include (min, max)

    Returns:
        pandas.DataFrame: DataFrame containing the matching rows with every known
        attribute present in the file, plus a 'datetime' column when it can be
        built. An empty DataFrame is returned when nothing matches.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``start_idx`` is negative.
    """
    # A non-positive chunk size would never advance through the dataset.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    # A negative start would be interpreted relative to the end of the dataset
    # for the first slice only, silently producing the wrong rows.
    if start_idx < 0:
        raise ValueError(f"start_idx must be non-negative, got {start_idx}")

    attributes = [
        'year', 'month', 'day', 'hour', 'min', 'sec', 'recno', 'kindat', 'kinst',
        'ut1_unix', 'ut2_unix', 'pierce_alt', 'gps_site', 'sat_id', 'gnss_type',
        'gdlatr', 'gdlonr', 'los_tec', 'dlos_tec', 'tec', 'azm', 'elm',
        'gdlat', 'glon', 'rec_bias', 'drec_bias'
    ]

    # One DataFrame per chunk that produced at least one match
    chunk_dfs = []

    with h5py.File(file_path, 'r') as hdf:
        dataset = hdf['Data/Table Layout']
        total_size = len(dataset)

        print(f"Total dataset size: {total_size}")
        print(f"Starting from index: {start_idx}")

        for current_idx in range(start_idx, total_size, chunk_size):
            end_idx = min(current_idx + chunk_size, total_size)
            print(f"Processing chunk from {current_idx} to {end_idx}")

            # Only this slice is loaded into memory
            data = dataset[current_idx:end_idx]

            chunk_df = _filter_chunk(data, attributes, lat_range, lon_range, hour_range)
            if chunk_df is not None:
                chunk_dfs.append(chunk_df)

            print(f"Processed {end_idx}/{total_size} entries")

    if not chunk_dfs:
        print("No data found matching the specified criteria")
        return pd.DataFrame()

    df = pd.concat(chunk_dfs, ignore_index=True)
    return _add_datetime_column(df)

# Path to the HDF5 file
file_path = r'/mnt/c/Users/fadri_oudurqw/Downloads/los_20150530.001_Jap_EQ.h5'

# Start from index 4,000,000 and process in chunks of 10,000
df = extract_filtered_data(
    file_path,
    start_idx=4_000_000,  # Skip the first 4 million entries
    chunk_size=10_000,    # Process 10,000 entries at a time
    lat_range=(30, 45),
    lon_range=(130, 150),
    hour_range=(11, 12)   # Bounds are inclusive: keeps hours 11 and 12
)

# Export the DataFrame to a CSV file, creating the output directory if needed
output_path = 'data/data_csvs/Japan_EQ_30Mai2015_11_13.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

print("\nDataFrame Info:")
# info() writes its report to stdout and returns None, so it is not wrapped
# in print() (that would append a stray "None" line).
df.info()
print("\nFirst few rows:")
print(df.head())

Total dataset size: 89103966
Starting from index: 4000000
Processing chunk from 4000000 to 4010000
Processed 4010000/89103966 entries
Processing chunk from 4010000 to 4020000
Processed 4020000/89103966 entries
Processing chunk from 4020000 to 4030000
Processed 4030000/89103966 entries
Processing chunk from 4030000 to 4040000
Processed 4040000/89103966 entries
Processing chunk from 4040000 to 4050000
Processed 4050000/89103966 entries
Processing chunk from 4050000 to 4060000
Processed 4060000/89103966 entries
Processing chunk from 4060000 to 4070000
Processed 4070000/89103966 entries
Processing chunk from 4070000 to 4080000
Processed 4080000/89103966 entries
Processing chunk from 4080000 to 4090000
Processed 4090000/89103966 entries
Processing chunk from 4090000 to 4100000
Processed 4100000/89103966 entries
Processing chunk from 4100000 to 4110000
Processed 4110000/89103966 entries
Processing chunk from 4110000 to 4120000
Processed 4120000/89103966 entries
Processing chunk from 4120000 

In [None]:
# import h5py

# def explore_h5(file_path):
#     """
#     Explore the structure of an HDF5 file and print all groups and datasets with their paths.
    
#     Parameters:
#     -----------
#     file_path : str
#         Path to the HDF5 file
#     """
#     def print_structure(name, obj):
#         """Callback function to print the structure"""
#         indent = '    ' * name.count('/')
#         if isinstance(obj, h5py.Dataset):
#             print(f"{indent}Dataset: {name}")
#             print(f"{indent}    Shape: {obj.shape}")
#             print(f"{indent}    Type: {obj.dtype}")
#         elif isinstance(obj, h5py.Group):
#             print(f"{indent}Group: {name}")
    
#     try:
#         with h5py.File(file_path, 'r') as f:
#             print(f"File: {file_path}")
#             print("\nStructure:")
#             # Recursively visit all groups and datasets
#             f.visititems(print_structure)
            
#     except Exception as e:
#         print(f"Error occurred: {str(e)}")

# # Example usage:
# explore_h5('/mnt/c/Users/fadri_oudurqw/Downloads/los_20160326.001.h5')

# def list_datasets(file_path):
#     """
#     List all dataset paths in an HDF5 file.
    
#     Parameters:
#     -----------
#     file_path : str
#         Path to the HDF5 file
        
#     Returns:
#     --------
#     list
#         List of dataset paths
#     """
#     dataset_paths = []
    
#     def collect_datasets(name, obj):
#         if isinstance(obj, h5py.Dataset):
#             dataset_paths.append(name)
    
#     with h5py.File(file_path, 'r') as f:
#         f.visititems(collect_datasets)
    
#     return dataset_paths

# # Example usage:
# # paths = list_datasets('your_file.h5')
# # print("\nDataset paths:", *paths, sep='\n')

File: /mnt/c/Users/fadri_oudurqw/Downloads/los_20160326.001.h5

Structure:
Group: Data
    Dataset: Data/Table Layout
        Shape: (111787628,)
        Type: [('year', '<i8'), ('month', '<i8'), ('day', '<i8'), ('hour', '<i8'), ('min', '<i8'), ('sec', '<i8'), ('recno', '<i8'), ('kindat', '<i8'), ('kinst', '<i8'), ('ut1_unix', '<f8'), ('ut2_unix', '<f8'), ('pierce_alt', '<f8'), ('gps_site', 'S4'), ('sat_id', '<i8'), ('gdlatr', '<f8'), ('gdlonr', '<f8'), ('los_tec', '<f8'), ('dlos_tec', '<f8'), ('tec', '<f8'), ('azm', '<f8'), ('elm', '<f8'), ('gdlat', '<f8'), ('glon', '<f8'), ('rec_bias', '<f8'), ('drec_bias', '<f8')]
Group: Metadata
    Dataset: Metadata/Data Parameters
        Shape: (25,)
        Type: [('mnemonic', 'S10'), ('description', 'S50'), ('isError', '<i8'), ('units', 'S7'), ('category', 'S36')]
    Dataset: Metadata/Experiment Parameters
        Shape: (14,)
        Type: [('name', 'S20'), ('value', 'S46')]
    Dataset: Metadata/Independent Spatial Parameters
        Shape:

In [20]:
import h5py
import pandas as pd

def extract_h5_measurements(file_path, dataset_path, n_measurements=500,
                            lat_range=(39, 41), lon_range=(140, 142)):
    """
    Read the first n rows of an HDF5 dataset and keep those inside a lat/lon box.

    Only the first ``n_measurements`` rows are read; the returned frame holds the
    subset of those rows whose 'gdlatr' and 'gdlonr' fields fall inside the given
    ranges, so it usually has fewer than ``n_measurements`` rows. Byte-string
    fields are not decoded here.

    Parameters:
    -----------
    file_path : str
        Path to the HDF5 file
    dataset_path : str
        Path to the dataset within the HDF5 file
    n_measurements : int, optional
        Number of leading rows to read before filtering (default: 500)
    lat_range : tuple, optional
        Inclusive (min, max) bounds applied to 'gdlatr' (default: (39, 41))
    lon_range : tuple, optional
        Inclusive (min, max) bounds applied to 'gdlonr' (default: (140, 142))

    Returns:
    --------
    pandas.DataFrame or None
        DataFrame containing the rows that passed the filter. ``None`` is
        returned when the dataset is missing or any other error occurs; a
        message describing the problem is printed in that case.
    """
    try:
        with h5py.File(file_path, 'r') as f:
            # Get the dataset
            dataset = f[dataset_path]

            # Slicing copies the rows into memory, so the file can be closed
            # before filtering.
            data = dataset[:n_measurements]

        # Keep rows whose coordinates fall inside the box
        in_region = (
            (data['gdlonr'] >= lon_range[0]) & (data['gdlonr'] <= lon_range[1])
            & (data['gdlatr'] >= lat_range[0]) & (data['gdlatr'] <= lat_range[1])
        )
        return pd.DataFrame(data[in_region])

    except KeyError:
        print(f"Dataset {dataset_path} not found in file")
    except Exception as e:
        # Best-effort: report the failure instead of raising.
        print(f"Error occurred: {str(e)}")

    # Reached only after one of the handlers above printed its message.
    return None
        
# Example usage: read the first 10 million rows of the table and keep the
# ones inside the function's default lat/lon box.
df = extract_h5_measurements(
    '/mnt/c/Users/fadri_oudurqw/Downloads/los_20160326.001.h5',
    'Data/Table Layout',
    n_measurements=10_000_000,  # same value as the original 100_000_00
)
df

Unnamed: 0,year,month,day,hour,min,sec,recno,kindat,kinst,ut1_unix,...,gdlonr,los_tec,dlos_tec,tec,azm,elm,gdlat,glon,rec_bias,drec_bias
0,2016,3,26,0,0,0,0,3505,8000,1.458950e+09,...,141.132828,23.154556,0.374467,12.409058,134.274796,27.289396,35.299515,145.801300,-13.615938,1.300858
1,2016,3,26,0,0,0,0,3505,8000,1.458950e+09,...,141.132828,16.347767,0.373366,11.964964,57.927349,44.170170,40.664635,144.457001,-13.615938,1.300858
2,2016,3,26,0,0,0,0,3505,8000,1.458950e+09,...,141.132828,4.626015,0.202011,4.393531,-11.427274,70.761436,40.141891,140.865005,-13.615938,1.300858
3,2016,3,26,0,0,0,0,3505,8000,1.458950e+09,...,141.132828,18.365589,2.093730,11.724884,-50.269512,35.884354,41.573429,137.101181,-13.615938,1.300858
4,2016,3,26,0,0,0,0,3505,8000,1.458950e+09,...,141.132828,13.004891,0.908312,12.683059,160.805191,76.542198,38.450500,141.431488,-13.615938,1.300858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1753,2016,3,26,2,4,30,249,3505,8000,1.458958e+09,...,141.132828,16.955704,0.202011,12.299959,102.312950,43.569721,38.411732,144.929672,-13.615938,1.300858
1754,2016,3,26,2,4,30,249,3505,8000,1.458958e+09,...,141.132828,21.247169,2.093730,18.908945,-127.848763,61.310795,38.115734,139.506317,-13.615938,1.300858
1755,2016,3,26,2,4,30,249,3505,8000,1.458958e+09,...,141.132828,53.092110,0.908312,22.182133,165.859543,16.990219,31.303854,143.416473,-13.615938,1.300858
1756,2016,3,26,2,4,30,249,3505,8000,1.458958e+09,...,141.132828,17.443150,0.488239,15.599589,37.868587,61.900990,40.368431,142.403595,-13.615938,1.300858
