# Build Machine Learning Dataset


## Setup

In [None]:
import os.path as osp
from datetime import datetime, timezone
from dateutil.relativedelta import relativedelta
import synoptic
import json
import sys
import numpy as np
import polars as pl
import pandas as pd
sys.path.append('../src')
from utils import Dict, read_yml, read_pkl, str2time, print_dict_summary, time_range
from data_funcs import rename_dict
import ingest.retrieve_raws_api as rr
import ingest.retrieve_raws_stash as rrs

In [None]:
# raws_meta = read_yml("../etc/variable_metadata/raws_metadata.yaml")

# Read the training-data configuration from JSON and wrap it in the project
# `Dict` class (later cells read fields as attributes, e.g. config.start_time),
# then print a summary of what was loaded.
with open("../etc/training_data_config.json", "r") as config_file:
    config = Dict(json.load(config_file))
print_dict_summary(config)

In [None]:
# Look up the RAWS stash location from the metadata held by the stash
# retrieval module, print it, and show whether that path exists on this
# machine (the trailing expression is the cell's displayed output).
raws_stash_path = rrs.raws_meta["raws_stash_path"]
print(raws_stash_path)
osp.exists(raws_stash_path)

In [None]:
# Build `times` from the configured start and end of the training period.
# NOTE(review): presumably a sequence of timestamps; the effect of
# `start_offset=True` is defined in utils.time_range and is not visible
# here -- confirm.
times = time_range(config.start_time, config.end_time, start_offset=True)

In [None]:
# Build the RAWS observation dictionary for the configured settings.
# Later cells iterate it by station ID and read a "times" entry per station.
raws_dict = rrs.build_raws_dict(config)

In [None]:
# Print a summary of the RAWS dictionary as a quick sanity check.
print_dict_summary(raws_dict)

In [None]:
# Import the HRRR retrieval module and force a reload before aliasing it as
# `ih`, so the alias refers to the re-executed module.
# NOTE(review): presumably here so edits to the module source are picked up
# without restarting the kernel -- can be dropped once the module is stable.
import importlib
import ingest.retrieve_hrrr_api
importlib.reload(ingest.retrieve_hrrr_api)
import ingest.retrieve_hrrr_api as ih

In [None]:
# Retrieve the HRRR data described by the configuration.
# NOTE(review): presumably returns an xarray Dataset and may download data,
# so this cell can be slow -- confirm against ingest.retrieve_hrrr_api.
hrrr_ds = ih.retrieve_hrrr(config)

In [None]:
# Reduce the HRRR data to the RAWS stations in `raws_dict`, then pass the
# result through the module's renaming step in a single expression.
hrrr_pts = ih.rename_ds(ih.subset_hrrr2raws(hrrr_ds, raws_dict))

In [None]:
# Display the station-subset HRRR data for visual inspection.
hrrr_pts

In [None]:
# Check same STIDs: the HRRR point subset and the RAWS dictionary must list
# the same station IDs in the same order. np.array_equal compares shapes
# before values, so a differing number of stations gives a plain False
# instead of a broadcasting error or an accidental broadcast match.
np.array_equal(hrrr_pts.point_stid.to_numpy(), np.array(list(raws_dict)))

In [None]:
# Attach the HRRR predictors for each station to its entry in raws_dict.
# valid_time does not depend on the station, so it is converted to NumPy once
# here instead of on every loop iteration.
hrrr_valid_times = hrrr_pts.valid_time.to_numpy()

for st in raws_dict:

    # Confirm times match. For HRRR data it should be the valid_time, which accounts for forecast hour.
    # tzinfo is dropped (not converted) so the datetimes can be cast to datetime64.
    # NOTE(review): this presumes RAWS times are already in the same zone as valid_time (presumably UTC) -- confirm.
    raws_timesi = np.array(
        [dt.replace(tzinfo=None) for dt in raws_dict[st]["times"]],
        dtype="datetime64",
    )
    assert np.all(raws_timesi == hrrr_valid_times), "Times in RAWS dict don't match HRRR data valid_time"

    # Extract dataframe of predictors for this station, dropping the 'point'
    # index level, and save it in the station's HRRR subdictionary
    df = hrrr_pts.where(hrrr_pts.point_stid == st, drop=True).to_dataframe()
    df.reset_index(level="point", drop=True, inplace=True)
    raws_dict[st]["HRRR"] = df