In [None]:
import os
import os.path as osp
import subprocess
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from osgeo import gdal

# Rain Data Exploration

The purpose of this notebook is to explore the rain data from RAWS and HRRR retrieval processes.

* RAWS Rain Data:
    * [RAWS Definitions](https://www.weather.gov/media/wrh/mesowest/MesoWest_Data_Variables_Definitions.pdf)
    * They list hourly rainfall as parameters, but in practice stations only have wide data availability for accumulated precipitation
    * This data is potentially flawed: once the rain-collecting bucket is full, the station will register zero new rainfall.
* HRRR Rain Data:
    * [HRRR Definitions](https://www.nco.ncep.noaa.gov/pmb/products/hrrr/hrrr.t00z.wrfprsf00.grib2.shtml)
    * Both rate and total ($kg\; m^{-2}s^{-1}$ and $kg\; m^{-2}$, respectively)

## Read FMDA Data Dictionary

In [None]:
# NOTE: the disabled download below targets co_202306.pkl, which is NOT the file loaded
# by this cell; test_CA_202401.pkl is read via a relative path and is not downloaded here.
# if not osp.exists("co_202306.pkl"):
#     print("Retrieving FMDA data")
#     subprocess.call("wget -P . https://demo.openwfm.org/web/data/fmda/dicts/co_202306.pkl", shell=True)
#     assert osp.exists("co_202306.pkl")
#     print("Downloaded https://demo.openwfm.org/web/data/fmda/dicts/co_202306.pkl as co_202306.pkl")

# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
fmda_pickle_path = "test_CA_202401.pkl"
dat = pd.read_pickle(fmda_pickle_path)

In [None]:
# Show the top-level keys of the loaded object; later cells index it with keys
# such as "FCHC1_202401" and "LIB31_202401".
dat.keys()

## Get Rain Data from a station

In [None]:
# Pick one entry of the data dictionary to explore; `x` is reused by the cells below.
station_key = "FCHC1_202401"
x = dat[station_key]
print(x.keys())
print("Station location info:", x["loc"], sep="\n")

In [None]:
print(x["RAWS"].keys())
# Work on a copy: assigning into x["RAWS"]["rain"] directly would also modify `dat`,
# silently changing this station's input to the per-station means computed later.
rain_raws = np.array(x["RAWS"]["rain"])
# Assume 0 at time zero; the first value is NA because rain is the first difference of accumulated precipitation.
rain_raws[0] = 0
precipa_raws = x["RAWS"]["precip_accum"]

In [None]:
# Inspect the HRRR sub-dictionary, then pull rain and accumulated precipitation
# for the two forecast entries used below ("f00" and "f01").
hrrr = x["HRRR"]
print(hrrr.keys())
print(hrrr["f01"].keys())

rain0, rain1 = (hrrr[fhour]["rain"] for fhour in ("f00", "f01"))
precip0, precip1 = (hrrr[fhour]["precip_accum"] for fhour in ("f00", "f01"))

### Check Time lines up

In [None]:
# Compare the RAWS and HRRR time entries.
# NOTE(review): if both are NumPy arrays the comparison is elementwise and the mean is the
# fraction of matching timestamps (1.0 == fully aligned) — confirm the container type.
raws_times = x["RAWS"]["time"]
hrrr_times = x["HRRR"]["time"]
np.mean(raws_times == hrrr_times)

In [None]:
# Largest gap between the queried time (HRRR "time") and the time actually returned by RAWS.
# The absolute value is taken so that RAWS observations earlier than the queried time count too;
# a plain max of the signed difference would under-report (or hide) those gaps.
TIME_FMT = '%Y-%m-%dT%H:%M:%SZ'
raws_returned = np.array([datetime.strptime(val, TIME_FMT) for val in x["RAWS"]["time_raws"]])
queried = np.array([datetime.strptime(val, TIME_FMT) for val in x["HRRR"]["time"]])
np.max(np.abs(raws_returned - queried))

## HRRR Rain Data

All zero at analysis hour f00. Rain at f01 nonzero, but we need to confirm it is reasonable.

In [None]:
# Distinct rain values at f00, followed by the largest rain value at f01
f00_distinct = np.unique(rain0)
print(f00_distinct)
f01_peak = np.max(rain1)
print(f01_peak)

## Plot Side by Side

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

ax1.plot(rain_raws, label="RAWS")
ax2.plot(rain1, label="HRRR")

# plt.legend()/plt.title() only act on the current axes (the right-hand panel),
# so add the legend to each panel explicitly and put the title on the figure.
for ax in (ax1, ax2):
    ax.legend()
    ax.set_xlabel("Observation index")
fig.suptitle("Hourly Rain Accumulated Data");

## Reconcile Units

* [HRRR](https://www.nco.ncep.noaa.gov/pmb/products/hrrr/hrrr.t00z.wrfprsf00.grib2.shtml): units for precipitation rate is [kg/m^2/s]
* RAWS units for precipitation are $mm\cdot h^{-1}$

We convert HRRR to $mm\cdot h^{-1}$. We utilize the fact that water has a density of roughly 1 g per cubic cm. So 1 kg of water, or 1000 g of water (about 1000 $cm^3$), spread over 1 $m^2$, or 10,000 $cm^2$, would cover that square meter to a depth of 0.1 cm, or 1 mm. So 1 $kg\cdot m^{-2}s^{-1}$ is equal to 1 $mm\cdot s^{-1}$, so we just need to multiply by 3600 seconds per hour to get units of $mm\cdot h^{-1}$.

In [None]:
# Look up the "precip_accum" entry in Synoptic's variable table to confirm the RAWS units
from synoptic.services import variables
synoptic_variables = variables(verbose="HIDE")
synoptic_variables.loc["precip_accum"]

In [None]:
# Means of the RAWS and HRRR f01 rain series as stored (no unit conversion yet), to compare scale
for series in (rain_raws, rain1):
    print(np.mean(series))

In [None]:
# Repeat the comparison with the HRRR series multiplied by 3600 (seconds per hour)
SECONDS_PER_HOUR = 3600

raws_mean = np.mean(rain_raws)
print(raws_mean)
hrrr_mean_hourly = np.mean(rain1 * SECONDS_PER_HOUR)
print(hrrr_mean_hourly)

There is a difference between these two, since one comes from a computational model and the other from ground observations, but they are the same order of magnitude. To confirm this, we will calculate the mean for each station in the data dictionary.

In [None]:
# Per-station mean rain for RAWS (NaN-aware) and for HRRR f01 scaled by 3600 s/h.
# The lists are grown with append rather than preallocated because entries
# without a RAWS "rain" key are skipped.
SECONDS_PER_HOUR = 3600
raws_means = []
hrrr_means = []

for station in dat.values():
    if "rain" in station["RAWS"]:
        raws_means.append(np.nanmean(station["RAWS"]["rain"]))
        hrrr_means.append(np.mean(station["HRRR"]["f01"]["rain"] * SECONDS_PER_HOUR))

# Print summary: mean of the per-station means, RAWS first, then HRRR
print(np.mean(raws_means))
print(np.mean(hrrr_means))

Again, these are the same order of magnitude.

## Missing Data

Synoptic simply excludes missing values from return, instead of filling with NA. Should we:
* Fill with NA at the point of data retrieval?
* Handle this later in the process?

In [None]:
# Length of the HRRR time entry for the selected station
n_queried = len(x['HRRR']['time'])
print(f"Number of queried observations: {n_queried}")

In [None]:
# For every entry in the data dictionary, report how many RAWS timestamps were returned
separator = "~" * 50
for k in dat:
    print(separator)
    print(k)
    n_raws = len(dat[k]['RAWS']['time_raws'])
    print(f"Number of RAWS observations: {n_raws}")

## Some Stations Return Data Every 10 min

In [None]:
# First ten returned RAWS timestamps for LIB31
lib31_raws = dat["LIB31_202401"]["RAWS"]
lib31_raws["time_raws"][:10]

In [None]:
# First ten "fm" values for LIB31
lib31_raws = dat["LIB31_202401"]["RAWS"]
lib31_raws["fm"][:10]