<a href="https://colab.research.google.com/github/m-wessler/nbm-verification/blob/main/nbm_textfile_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import re
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import glob
from tqdm.notebook import tqdm  # Progress bar for notebooks

# Supported file types
supported_file_types = ['nbp', 'nbe', 'nbs', 'nbx']

# Ask the user to choose the file type (normalised so " NBX " is accepted as "nbx")
file_type = input(f"Choose the file type to process ({', '.join(supported_file_types)}): ").strip().lower()

if file_type not in supported_file_types:
    raise ValueError(f"Unsupported file type: {file_type}. Supported types are: {', '.join(supported_file_types)}")

# Glob for all matching *.txt files in the current working directory.
# glob returns files in arbitrary (filesystem) order; sorting makes the listing,
# the processing order, and therefore the variables left behind by the last
# processed file reproducible from run to run.
file_pattern = f"*{file_type}*.txt"
matching_files = sorted(glob.glob(file_pattern))

if not matching_files:
    # Raise instead of calling exit(): inside a Jupyter/Colab kernel exit() requests a
    # kernel shutdown rather than simply stopping this cell, and raising matches the
    # error handling used for an unsupported file type above.
    raise FileNotFoundError(f"No files found matching the pattern: {file_pattern}")

print(f"Found {len(matching_files)} file(s) matching the pattern: {file_pattern}")
for idx, matched_path in enumerate(matching_files, 1):
    print(f"{idx}. {matched_path}")

# Process each matching file
#
# For every file in `matching_files` this cell:
#   1. splits the text into blocks separated by blank lines,
#   2. keeps only blocks whose first line ends in an init time such as "5/14/2024  1300 UTC",
#   3. parses the fixed-width data rows of each block into a DataFrame,
#   4. concatenates all blocks, indexes them by [init_time, valid_time, site_id],
#      and writes the result to a CSV file.
# The combined frame of the last successfully processed file is left in `df`.

# Expected shape and strptime format of the init time at the end of a block's first line
INIT_TIME_PATTERN = r"^\d{1,2}/\d{1,2}/\d{4} \d{4} UTC$"
INIT_TIME_FORMAT = "%m/%d/%Y %H%M %Z"

# A data row containing any of these substrings is treated as entirely missing
BAD_DATA_CODES = ("-459", "-99")


def _split_into_blocks(lines):
    """Group consecutive non-blank lines into blocks; blank lines act as separators."""
    blocks = []
    current_block = []
    for line in lines:
        if line.strip():
            current_block.append(line)
        elif current_block:  # Blank line closes the block in progress
            blocks.append(current_block)
            current_block = []

    # Add the last block if the file did not end with a blank line
    if current_block:
        blocks.append(current_block)
    return blocks


def _parse_block_header(metadata_line):
    """Return (site_id, init_time) parsed from a block's first line.

    Prints the reason and returns None when the line does not carry a usable init time.
    """
    parts = re.split(r"\s{2,}", metadata_line.strip())  # Fields are separated by 2+ spaces
    if len(parts) < 2:
        print(f"Discarding block due to insufficient metadata: {metadata_line}")
        return None

    site_id = parts[0].split()[0]      # First token of the first field
    init_time = " ".join(parts[-2:])   # Date and time are the last two fields
    if not re.match(INIT_TIME_PATTERN, init_time):
        print(f"Discarding block due to invalid init time: {metadata_line}")
        return None
    return site_id, init_time


def _parse_nbx_rows(data_rows):
    """Parse fixed-width 'nbx' data rows into {variable_name: [values...]}.

    The variable name is read from characters 1-3 of the row; values are 3 characters
    wide, start at index 5 and repeat every 4 characters, and the last 6 characters of
    the row are ignored. Blank fields become NaN. A row containing a bad-data code is
    replaced by NaN for every field.
    """
    block_data = {}
    for row in data_rows:
        variable_name = row[1:4].strip()
        field_starts = range(5, len(row) - 6, 4)

        if any(bad_code in row for bad_code in BAD_DATA_CODES):
            # Back-fill with as many NaNs as this row has fields. A fixed count
            # (previously 23) adds phantom forecast rows with no FHR/valid_time
            # whenever the bulletin has fewer columns than that.
            parsed_data = [np.nan] * len(field_starts)
        else:
            parsed_data = [row[i:i + 3].strip() or np.nan for i in field_starts]

        block_data[variable_name] = parsed_data
    return block_data


# Only the 'nbx' layout has parsing logic. Fail early with a clear message instead of
# building empty frames and crashing later with a KeyError on the missing 'valid_time'.
if file_type != 'nbx':
    raise NotImplementedError(
        f"Parsing is not implemented for file type '{file_type}'; only 'nbx' is currently handled."
    )

for input_file_path in matching_files:
    print(f"\nProcessing file: {input_file_path}")

    # Base name up to the first dot, used as the output filename stem
    input_file_name = os.path.basename(input_file_path).split('.')[0]

    # Read the file
    with open(input_file_path, "r") as file:
        lines = file.read().splitlines()

    # Split the content into blocks based on blank rows
    blocks = _split_into_blocks(lines)

    # Keep only blocks whose first row carries a valid init time
    valid_blocks = []
    for block in blocks:
        header = _parse_block_header(block[0])
        if header is not None:
            site_id, init_time = header
            valid_blocks.append((site_id, init_time, block))

    if not valid_blocks:
        # pd.concat raises on an empty list, so skip files with nothing to parse
        print(f"No valid blocks found in {input_file_path}; skipping file.")
        continue

    # The first valid block's init time (yyyymmddhh) names the output file
    output_init_time = datetime.strptime(valid_blocks[0][1], INIT_TIME_FORMAT).strftime("%Y%m%d%H")

    # Parse data rows for each valid block with a progress bar
    all_dataframes = []
    for site_id, init_time, block in tqdm(valid_blocks, desc="Processing blocks", unit="block"):
        init_time_dt = datetime.strptime(init_time, INIT_TIME_FORMAT)

        # Skip the first three rows (metadata + headers)
        block_data = _parse_nbx_rows(block[3:])

        # One row per variable, transposed so that variables become columns
        block_df = pd.DataFrame.from_dict(block_data, orient="index").T

        # Add site_id and init_time columns
        block_df["site_id"] = site_id
        block_df["init_time"] = init_time_dt

        # valid_time = init_time + FHR hours; an unparsable FHR yields NaT
        if "FHR" in block_df.columns:
            block_df["FHR"] = pd.to_numeric(block_df["FHR"], errors="coerce")
            block_df["valid_time"] = pd.to_timedelta(block_df["FHR"], unit="h") + init_time_dt

        all_dataframes.append(block_df)

    # Combine all block DataFrames into a single DataFrame, aligning variables
    combined = pd.concat(all_dataframes, ignore_index=True)

    if "valid_time" not in combined.columns:
        # No block had an FHR row, so the index below cannot be built
        print(f"No FHR data found in {input_file_path}; skipping file.")
        continue

    # Set MultiIndex [init_time, valid_time, site_id]
    df = combined.set_index(["init_time", "valid_time", "site_id"])

    # Write to /content (the Colab working directory) when it exists; otherwise fall
    # back to the current working directory so the cell also runs outside Colab.
    output_file_name = f"{input_file_name}_{output_init_time}.csv"
    output_dir = "/content" if os.path.isdir("/content") else os.getcwd()
    output_file_path = os.path.join(output_dir, output_file_name)

    # Save the DataFrame to a CSV file
    df.to_csv(output_file_path)
    print(f"Data saved to {output_file_path}")

Choose the file type to process (nbp, nbe, nbs, nbx): nbx
Found 1 file(s) matching the pattern: *nbx*.txt
1. blend_nbxtx.t13z.txt

Processing file: blend_nbxtx.t13z.txt
Discarding block due to insufficient metadata: 1


Processing blocks:   0%|          | 0/9589 [00:00<?, ?block/s]

Data saved to /content/blend_nbxtx_2024051413.csv


In [32]:
# Cross-section of every init/valid time for station KSLC (the site_id level is dropped)
df.xs("KSLC", level="site_id")

Unnamed: 0_level_0,Unnamed: 1_level_0,FHR,TXN,XND,TMP,TSD,DPT,DSD,SKY,SSD,WDR,...,PZR,PSN,PPL,PRA,S12,SLV,I12,S24,SOL,SWH
init_time,valid_time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2024-05-14 13:00:00,2024-05-23 00:00:00,203.0,70.0,4.0,65.0,5.0,32.0,4.0,46.0,25.0,31.0,...,0.0,0.0,0.0,12.0,0.0,90.0,0.0,,60.0,
2024-05-14 13:00:00,2024-05-23 12:00:00,215.0,49.0,2.0,50.0,3.0,38.0,3.0,51.0,30.0,13.0,...,0.0,0.0,0.0,16.0,0.0,83.0,0.0,0.0,8.0,
2024-05-14 13:00:00,2024-05-24 00:00:00,227.0,69.0,3.0,67.0,5.0,31.0,4.0,36.0,21.0,31.0,...,0.0,0.0,0.0,25.0,0.0,89.0,0.0,,59.0,
2024-05-14 13:00:00,2024-05-24 12:00:00,239.0,49.0,2.0,50.0,3.0,36.0,3.0,45.0,6.0,12.0,...,0.0,0.0,0.0,20.0,0.0,80.0,0.0,0.0,10.0,
2024-05-14 13:00:00,2024-05-25 00:00:00,251.0,70.0,4.0,67.0,4.0,32.0,3.0,38.0,6.0,32.0,...,0.0,0.0,0.0,14.0,0.0,88.0,0.0,,60.0,
2024-05-14 13:00:00,2024-05-25 12:00:00,263.0,49.0,3.0,52.0,3.0,36.0,2.0,38.0,12.0,12.0,...,0.0,0.0,0.0,13.0,0.0,82.0,0.0,0.0,9.0,
2024-05-14 13:00:00,2024-05-26 00:00:00,275.0,,,,,,,,,,...,,,,,,,,,,
2024-05-14 13:00:00,2024-05-26 12:00:00,287.0,,,,,,,,,,...,,,,,,,,,,
2024-05-14 13:00:00,2024-05-27 00:00:00,299.0,,,,,,,,,,...,,,,,,,,,,
2024-05-14 13:00:00,2024-05-27 12:00:00,311.0,,,,,,,,,,...,,,,,,,,,,
