<a href="https://colab.research.google.com/github/m-wessler/nbm-verification/blob/main/NBM_Text_Product_Parser_MWE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
import numpy as np
import pandas as pd

from glob import glob
from tqdm import tqdm
from datetime import datetime, timedelta

from functools import partial
from multiprocessing import Pool

In [2]:
## Global variables ##
textfile_dir = './' #'/nas/stid/data/nbm/v4p2_text/*/'
output_dir = './' #'/nas/stid/data/nbm42_csv/'

# Designed for 01/13Z files
# May need to modify if using 00/06/12/18
forecast_periods = {'NBE':15, 'NBS':23, 'NBH':25}
fixed_columns = {'NBE':59, 'NBS':68, 'NBH':74}
column_width = {'NBE':3, 'NBS':2, 'NBH':2}
first_fhr = {'NBE':23, 'NBS':5, 'NBH':1}
fhr_step = {'NBE':12, 'NBS':3, 'NBH':1}
## ## ## ## ## ## ## ##

In [3]:
def nbm_textfile_parser(filename, product_set):

    with open(filename, 'r') as rfp:
        file_data = np.array(rfp.readlines())

    agg_df = pd.DataFrame()
    init_datetime = None

    starts = np.where(np.char.find(file_data, product_set.upper()) != -1)[0]
    ends = np.append((starts-1)[1:], len(file_data)-1)

    for start, end in zip(starts, ends):

    # Don't use TQDM if multiprocessing!
    # for start, end in tqdm(zip(starts, ends),
    #                     total=len(starts), desc=f'Parsing {filename}'):

        site_block = file_data[start:end]

        # ONLY PROCESS DATETIME ON FIRST ENTRY, consistent throughout file
        if init_datetime == None:

            if product_set == 'NBH':
                site_id, prod_id, init_date, init_hour, timezone = [
                    site_block[0].split(' ')[i] for i in [1, 4, 9, 11, 12]]
            else:
                site_id, prod_id, init_date, init_hour, timezone = [
                    site_block[0].split(' ')[i] for i in [1, 5, 10, 12, 13]]

            init_datetime = datetime.strptime(
                f'{init_date} {init_hour}', '%m/%d/%Y %H%M')

            print(f'Processing {init_datetime}')

            fhr0_datetime = (init_datetime + timedelta(
                hours=first_fhr[prod_id]))

        else:
            site_id = site_block[0].split(' ')[1]

        datetime_index = [fhr0_datetime + timedelta(hours=fhr_step[prod_id]*i)
            for i in range(0, forecast_periods[prod_id])]

        site_matrix = {'datetime':datetime_index,
                    'site_id':[site_id]*forecast_periods[prod_id]}

        for line in site_block[3:]:

            var_id = line[:4]

            data_start = 4+column_width[prod_id]
            data_end = 4+column_width[prod_id]+fixed_columns[prod_id]

            var_raw = line[data_start:data_end].rstrip()

            var_data = [var_raw[i:i+column_width[prod_id]]
                        for i in range(0, fixed_columns[prod_id],
                                       column_width[prod_id]+1)]

            site_matrix[var_id] = var_data

        agg_df = pd.concat([agg_df,
            pd.DataFrame(site_matrix).set_index(['datetime', 'site_id'])])

        # Limit scope for testing purposes
        if start > 2000:
            break

    agg_df.sort_index(inplace=True)

    init_datetime_str = datetime.strftime(init_datetime, "%Y%m%d")

    output_csv_file = f'blend_{prod_id}tx.{init_datetime_str}.t13z.csv'
    agg_df.to_csv(output_dir + output_csv_file)

    return output_csv_file

In [5]:
multiprocess = True
product_set_selection = 'NBS' # sys.argv[1] # command line input selection

file_paths = glob(f'{textfile_dir}*{product_set_selection.lower()}tx*')

parser_parallel = partial(nbm_textfile_parser, product_set=product_set_selection)

if multiprocess:
    with Pool(10) as p:
        output_files = p.map(parser_parallel, file_paths, chunksize=1)
        p.close()
        p.join()
else:
    output_files = [parser_parallel(fp) for fp in file_paths]

print(output_files)

Processing 2024-05-15 01:00:00
['blend_NBStx.20240515.t13z.csv']


In [6]:
# Test output
pd.read_csv(output_files[0])

Unnamed: 0,datetime,site_id,FHR,TXN,XND,TMP,TSD,DPT,DSD,SKY,...,IFC,LCB,VIS,IFV,MHT,TWD,TWS,HID,SOL,SWH
0,2024-05-15 06:00:00,086092,5,,,80.0,1.0,74.0,1.0,27.0,...,0.0,40.0,0.0,0.0,5.0,18.0,16.0,,0.0,3.0
1,2024-05-15 06:00:00,188557,5,,,63.0,1.0,61.0,1.0,94.0,...,22.0,22.0,50.0,3.0,13.0,16.0,12.0,,0.0,0.0
2,2024-05-15 06:00:00,220792,5,,,76.0,2.0,72.0,2.0,11.0,...,0.0,50.0,0.0,2.0,6.0,27.0,10.0,,1.0,1.0
3,2024-05-15 06:00:00,2A1,5,,,60.0,2.0,59.0,2.0,73.0,...,19.0,22.0,50.0,10.0,5.0,19.0,7.0,,0.0,
4,2024-05-15 06:00:00,2M2,5,,,62.0,2.0,62.0,2.0,51.0,...,24.0,13.0,0.0,5.0,4.0,26.0,9.0,,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1214,2024-05-18 00:00:00,42012,71,82,0,80.0,1.0,78.0,1.0,76.0,...,3.0,0.0,0.0,2.0,11.0,18.0,17.0,3,62.0,4.0
1215,2024-05-18 00:00:00,42013,71,84,1,82.0,1.0,80.0,1.0,23.0,...,0.0,0.0,0.0,0.0,15.0,19.0,9.0,5,58.0,2.0
1216,2024-05-18 00:00:00,42014,71,85,1,83.0,1.0,78.0,1.0,12.0,...,0.0,4.0,0.0,0.0,12.0,10.0,6.0,5,57.0,1.0
1217,2024-05-18 00:00:00,42019,71,82,1,81.0,1.0,78.0,1.0,54.0,...,15.0,50.0,0.0,10.0,12.0,16.0,10.0,4,65.0,4.0
