In [None]:
import logging
import os.path as osp
import pickle
import subprocess
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import numpy as np
import yaml

from utils import time_intp, str2time, filter_nan_values, read_pkl, read_yml

## Setup

In [None]:
def retrieve_url(url, dest_path, force_download=False):
    """Download `url` to `dest_path` with wget unless the file already exists.

    Parameters
    ----------
    url : str
        Address of the file to download.
    dest_path : str
        Local path the download is written to.
    force_download : bool, optional
        If True, download even when `dest_path` already exists. Default False.

    Raises
    ------
    AssertionError
        If wget exits with a non-zero status, or `dest_path` does not exist
        after the download attempt.
    FileNotFoundError
        If the `wget` executable cannot be found.
    """
    if osp.exists(dest_path) and not force_download:
        print(f"Target data already exists at {dest_path}")
        return

    target_extension = osp.splitext(dest_path)[1]
    url_extension = osp.splitext(urlparse(url).path)[1]
    if target_extension != url_extension:
        print("Warning: file extension from url does not match destination file extension")

    # Pass an argument list rather than a shell string: spaces or shell
    # metacharacters in the URL or the path are then handed to wget verbatim
    # instead of being interpreted by a shell.
    status = subprocess.call(["wget", "-O", dest_path, url])
    # `wget -O` creates the output file even when the transfer fails, so the
    # existence check alone cannot detect a failed download.
    assert status == 0, f"wget exited with status {status} for {url}"
    assert osp.exists(dest_path)
    print(f"Successfully downloaded {url} to {dest_path}")

In [None]:
# Download the pickle into data/; retrieve_url skips the download when the
# destination file already exists.
# NOTE(review): no cell shown here reads this file — `dat` is loaded from
# data/test_CA_202401.pkl instead. Confirm which dataset is intended.
retrieve_url("https://demo.openwfm.org/web/data/fmda/dicts/fmda_nw_202401-05_f05.pkl", "data/fmda_nw_202401-05_f05.pkl")

In [None]:
# Load the settings from params_data.yaml and display them. The functions
# defined below read the keys min_fm, max_fm, min_rain, max_rain, min_wind,
# max_wind and max_pct_na from this object.
data_params = read_yml("params_data.yaml")
data_params

In [None]:
# Load the case dictionary used throughout the rest of the notebook.
# NOTE(review): read_pkl presumably unpickles the file — only load pickles
# from a trusted source.
dat = read_pkl("data/test_CA_202401.pkl")

In [None]:
# Look at the RAWS 'fm' values of a single case.
dat['NV040_202401']['RAWS']['fm']

## Filters

In [None]:
def filter_fmc(x, data_params=data_params):
    """Blank out-of-range values of `x` with NaN, in place.

    Values at or below ``data_params['min_fm']`` and values at or above
    ``data_params['max_fm']`` are overwritten with ``np.nan``.

    Parameters
    ----------
    x : numpy.ndarray
        Array to filter. It is modified in place (assumes a float dtype so
        NaN can be stored — TODO confirm).
    data_params : dict, optional
        Must provide 'min_fm' and 'max_fm'. Defaults to the notebook-level
        `data_params` object as it was bound when this cell was run.

    Returns
    -------
    tuple
        ``(x, flags)`` where `x` is the same, now filtered, array and `flags`
        has the shape of `x` with 1 where a value was blanked and 0 elsewhere.
    """
    flagged = np.zeros_like(x)

    # Lower bound first; the bound itself counts as out of range.
    too_low = x <= data_params['min_fm']
    x[too_low] = np.nan
    flagged[too_low] = 1

    # Upper bound, evaluated after the low values were already blanked.
    too_high = x >= data_params['max_fm']
    x[too_high] = np.nan
    flagged[too_high] = 1

    return x, flagged

In [None]:
def filter_rain(r, data_params=data_params):
    """Blank out-of-range values of `r` with NaN, in place.

    Values strictly below ``data_params['min_rain']`` and values at or above
    ``data_params['max_rain']`` are overwritten with ``np.nan``.

    Parameters
    ----------
    r : numpy.ndarray
        Array to filter. It is modified in place (assumes a float dtype so
        NaN can be stored — TODO confirm).
    data_params : dict, optional
        Must provide 'min_rain' and 'max_rain'. Defaults to the notebook-level
        `data_params` object as it was bound when this cell was run.

    Returns
    -------
    tuple
        ``(r, flags)`` where `r` is the same, now filtered, array and `flags`
        has the shape of `r` with 1 where a value was blanked and 0 elsewhere.
    """
    flagged = np.zeros_like(r)

    # Lower bound first; a value equal to the minimum is kept.
    too_low = r < data_params['min_rain']
    r[too_low] = np.nan
    flagged[too_low] = 1

    # Upper bound, evaluated after the low values were already blanked.
    too_high = r >= data_params['max_rain']
    r[too_high] = np.nan
    flagged[too_high] = 1

    return r, flagged

In [None]:
def filter_wind(w, data_params=data_params):
    """Blank out-of-range values of `w` with NaN, in place.

    Values strictly below ``data_params['min_wind']`` and values at or above
    ``data_params['max_wind']`` are overwritten with ``np.nan``.

    Parameters
    ----------
    w : numpy.ndarray
        Array to filter. It is modified in place (assumes a float dtype so
        NaN can be stored — TODO confirm).
    data_params : dict, optional
        Must provide 'min_wind' and 'max_wind'. Defaults to the notebook-level
        `data_params` object as it was bound when this cell was run.

    Returns
    -------
    tuple
        ``(w, flags)`` where `w` is the same, now filtered, array and `flags`
        has the shape of `w` with 1 where a value was blanked and 0 elsewhere.
    """
    flagged = np.zeros_like(w)

    # Lower bound first; a value equal to the minimum is kept.
    too_low = w < data_params['min_wind']
    w[too_low] = np.nan
    flagged[too_low] = 1

    # Upper bound, evaluated after the low values were already blanked.
    too_high = w >= data_params['max_wind']
    w[too_high] = np.nan
    flagged[too_high] = 1

    return w, flagged

In [None]:
# Useful cases:
    # NV040_202401: more RAWS observations than HRRR; interpolation should shorten the series
    # NV026_202401: RAWS observations every 10 minutes; interpolation should shorten the series
    # CGVC1_202401: missing only a few observations; interpolation should lengthen the series
    # YNWC1_202401: only 2 observations; should be filtered out entirely

In [None]:
def time_intp(t1, v1, t2):
    """Interpolate the values `v1`, observed at times `t1`, onto the times `t2`.

    NOTE: this definition shadows the `time_intp` imported from `utils` in the
    first cell of the notebook.

    Parameters
    ----------
    t1 : numpy.ndarray
        1D array of objects that provide a ``.timestamp()`` method.
    v1 : numpy.ndarray
        1D array of values, same length as `t1`.
    t2 : numpy.ndarray
        1D array of target times; elements must provide ``.timestamp()``.

    Returns
    -------
    numpy.ndarray or None
        The values linearly interpolated at `t2`, or None (after logging an
        error) when an input fails one of the checks below.
    """
    # Check if t1 v1 t2 are 1D arrays
    if t1.ndim != 1:
        logging.error("Error: t1 is not a 1D array. Dimension: %s", t1.ndim)
        return None
    if v1.ndim != 1:
        logging.error("Error: v1 is not a 1D array. Dimension: %s", v1.ndim)
        return None
    if t2.ndim != 1:
        logging.error("Error: t2 is not a 1D array. Dimension: %s", t2.ndim)
        return None
    # Check if t1 and v1 have the same length
    if len(t1) != len(v1):
        logging.error("Error: t1 and v1 have different lengths: %s %s", len(t1), len(v1))
        return None

    # filter_nan_values comes from utils; presumably it drops the (time, value)
    # pairs whose value is NaN — confirm against utils.
    t1_no_nan, v1_no_nan = filter_nan_values(t1, v1)
    if len(t1_no_nan) == 0:
        # np.interp raises on empty sample points; report it like the other checks.
        logging.error("Error: no values left to interpolate from after removing NaN")
        return None

    # Convert datetime objects to timestamps
    t1_stamps = np.array([t.timestamp() for t in t1_no_nan])
    t2_stamps = np.array([t.timestamp() for t in t2])

    # np.interp expects the sample points (t1_stamps) to be increasing.
    return np.interp(t2_stamps, t1_stamps, v1_no_nan)

In [None]:
# Number of cases before filter_nan_cases is applied.
len(dat.keys())

In [None]:
def filter_nan_cases(d, data_params=data_params):
    """Remove cases from `d` whose RAWS series is too short relative to HRRR.

    For every case the length of ``['RAWS']['fm']`` (n_fm) is compared with
    the length of ``['HRRR']['time']`` (n_obs). Cases where
    ``1 - n_fm / n_obs`` exceeds ``data_params['max_pct_na']`` are removed,
    to avoid over-interpolating.

    Parameters
    ----------
    d : dict
        Case dictionary; it is modified in place.
    data_params : dict, optional
        Must provide 'max_pct_na'. Defaults to the notebook-level
        `data_params` object as it was bound when this cell was run.

    Returns
    -------
    None
    """
    to_remove = []  # accumulate keys; popping while iterating would raise
    # Iterate the dictionary that was passed in, not the notebook-level `dat`.
    for k, case in d.items():
        n_fm = case['RAWS']['fm'].shape[0]
        n_obs = case['HRRR']['time'].shape[0]
        # Negative when there are more RAWS values than HRRR times.
        pct_na = (1 - n_fm / n_obs)
        if pct_na > data_params['max_pct_na']:
            print(f"Removing key {k} due to extensive missing data. Percent NA relative to HRRR: {np.round(pct_na, 3)}")
            to_remove.append(k)
    for k in to_remove:
        d.pop(k, None)

In [None]:
# Drop the cases that exceed data_params['max_pct_na']; `dat` is modified in place.
filter_nan_cases(dat)

In [None]:
# Number of cases left after filter_nan_cases.
len(dat.keys())

In [None]:
# Select NV026_202401, noted in the useful-cases comment as having 10-minute
# RAWS observations.
d = dat['NV026_202401']

In [None]:
# Shape of the RAWS 'fm' array for the selected case.
d['RAWS']['fm'].shape

In [None]:
# Shape of the RAWS 'time_raws' array for the selected case.
d['RAWS']['time_raws'].shape

In [None]:
# Shape of the HRRR f01 'Ed' array for the selected case.
d['HRRR']['f01']['Ed'].shape

In [None]:
# Shape of the HRRR 'time' array for the selected case.
d['HRRR']['time'].shape

In [None]:
# NOTE: str2time is already imported in the first cell; this import is redundant.
from utils import str2time
# Switch to NV040_202401, noted in the useful-cases comment as having more
# RAWS observations than HRRR.
d = dat['NV040_202401']
time_raws=str2time(d['RAWS']['time_raws']) 
time_hrrr=str2time(d['HRRR']['time'])

# Convert both time axes to numbers through each element's .timestamp()
# so they can be passed to np.interp.
t1 = np.array([t.timestamp() for t in time_raws])
t2 = np.array([t.timestamp() for t in time_hrrr])

In [None]:
# Interpolate the RAWS timestamps onto the HRRR timestamps, using the RAWS
# timestamps themselves as the values being interpolated.
tnew = np.interp(t2, t1, t1)

In [None]:
# Fraction of the interpolated times that are exactly equal to some RAWS timestamp.
np.isin(tnew, t1).mean()

In [None]:
# Select case LIB03_202401 for the shape checks in the following cells.
d = dat['LIB03_202401']

In [None]:
# Shape of the RAWS 'fm' array for the selected case.
d['RAWS']['fm'].shape

In [None]:
# Shape of the RAWS 'time_raws' array for the selected case.
d['RAWS']['time_raws'].shape

In [None]:
# Shape of the HRRR f01 'Ed' array for the selected case.
d['HRRR']['f01']['Ed'].shape

In [None]:
# Shape of the HRRR 'time' array for the selected case.
d['HRRR']['time'].shape

In [None]:
# Collect the pieces of case NV040_202401 used to try out the filters below:
# both time axes converted with str2time, the RAWS 'fm' array and the
# HRRR f01 'rain' array. `fm` and `rain` refer to the arrays stored in `dat`.
time_raws = str2time(dat['NV040_202401']['RAWS']['time_raws'])
time_hrrr = str2time(dat['NV040_202401']['HRRR']['time'])
fm = dat['NV040_202401']['RAWS']['fm']
rain = dat['NV040_202401']['HRRR']['f01']['rain']

In [None]:
# Check that the RAWS time axis and the 'fm' array have the same length.
len(time_raws) == len(fm)

In [None]:
# Try the fm filter. filter_fmc overwrites values inside `fm` in place, and
# `fm` is the array stored in `dat`, so `dat` is changed by this cell.
filter_fmc(fm)

In [None]:
# Try the rain filter on the HRRR f01 'rain' array selected above; the array
# is modified in place.
filter_rain(rain)

In [None]:
# Try the wind filter on the HRRR f01 'wind' array of the same case; the array
# is modified in place.
filter_wind(dat['NV040_202401']['HRRR']['f01']['wind'])

In [None]:
# Look at the RAWS 'wind' values of the same case.
dat['NV040_202401']['RAWS']['wind']

In [None]:
# List the keys available under 'RAWS' for case CNFC1_202401.
dat['CNFC1_202401']['RAWS'].keys()

In [None]:
# Look at the RAWS 'solar' values of case CNFC1_202401.
dat['CNFC1_202401']['RAWS']['solar']

In [None]:
# Look at the HRRR f01 'wind' values of case CNFC1_202401.
dat['CNFC1_202401']['HRRR']['f01']['wind']

In [None]:
# Look at the HRRR f01 'soilm' values of case CNFC1_202401.
dat['CNFC1_202401']['HRRR']['f01']['soilm']

In [None]:
# For every case, print the shape of the HRRR f01 'Ew' array next to the shape
# of the RAWS 'fm' array so the two lengths can be compared.
for k in dat:
    print("~"*50)
    print(k)
    print(f"HRRR Shape: {dat[k]['HRRR']['f01']['Ew'].shape}")
    print(f"RAWS Shape: {dat[k]['RAWS']['fm'].shape}")

In [None]:
# Look at the RAWS 'soil_moisture' values of case LIB03_202401.
dat['LIB03_202401']['RAWS']['soil_moisture']

In [None]:
# Look at the HRRR f01 'soilm' values of case LIB03_202401.
dat['LIB03_202401']['HRRR']['f01']['soilm']

In [None]:
# Apply the range filters to the RAWS variables of every case and report how
# much each filter removed. The filter functions overwrite the arrays stored
# in `dat` in place. The printed value is the mean of the 0/1 flag vector,
# i.e. a fraction between 0 and 1.
for k in dat:
    print("~"*50)
    print(k)
    # Filter RAWS
    fm, filter_vec = filter_fmc(dat[k]["RAWS"]['fm'])
    print(f"Percent FMC Observations Filtered: {np.mean(filter_vec)}")
    if 'rain' in dat[k]["RAWS"]:
        rain, filter_vec = filter_rain(dat[k]["RAWS"]['rain'])
        print(f"Percent Rain Observations Filtered: {np.mean(filter_vec)}")
    if 'wind' in dat[k]["RAWS"]:
        # Wind must go through the wind filter (min_wind / max_wind), not the rain one.
        wind, filter_vec = filter_wind(dat[k]["RAWS"]['wind'])
        print(f"Percent Wind Observations Filtered: {np.mean(filter_vec)}")
    # Filter HRRR unnecessary?

In [None]:
def foo():
    """Sketch of the intended pipeline: read the pickle, format it, filter it.

    NOTE(review): this is still a placeholder. `read_pkl` is called without a
    path here although the notebook otherwise calls it with one, and
    `filter_data` is not defined in any cell shown — both need to be resolved
    before this function can run.

    Returns
    -------
    The object returned by `filter_data`.
    """
    d = read_pkl()
    # format_fmda_data requires the dictionary to format as its argument.
    dat = format_fmda_data(d)
    dat = filter_data(dat)
    return dat

In [None]:
def format_fmda_data(d):
    """Return the fmda dictionary unchanged (placeholder for formatting).

    Parameters
    ----------
    d : dict
        fmda dictionary, output of process in wrfxpy.

    Returns
    -------
    dict
        The same object that was passed in.
    """
    formatted = d
    return formatted