In [None]:
import os
from functools import partial
from glob import glob
from multiprocessing import get_context
from os.path import isfile
from subprocess import PIPE, Popen, call

import numpy as np
import pandas as pd
import xarray as xr

# Set OMP_NUM_THREADS=1 in this process's environment; subprocesses launched
# later inherit it.
# NOTE(review): presumably meant to keep each worker/child script from spawning
# its own OpenMP threads -- confirm. This runs after `import numpy`, so any
# library that reads the variable at import time may not see it in this
# kernel -- TODO confirm whether it should move above the imports.
os.environ['OMP_NUM_THREADS'] = '1'

In [None]:
# --- Data locations (observations and model archives) ---
obs_path = '/uufs/chpc.utah.edu/common/home/steenburgh-group10/mewessler/observations/'
era5_path = '/uufs/chpc.utah.edu/common/home/steenburgh-group10/mewessler/era5/'
gfs_path = '/uufs/chpc.utah.edu/common/home/steenburgh-group10/mewessler/gfs/'

# --- Code locations (helper scripts invoked by the cells below) ---
script_dir = '/uufs/chpc.utah.edu/common/home/u1070830/code/snow-liquid-ratio/'
era5_script_dir = '/uufs/chpc.utah.edu/common/home/u1070830/code/model-tools/era5/'

## Start with the metadata file as an efficient way to keep data organized (instead of a config file)

In [None]:
# Read the dataset metadata spreadsheet and index it by the 'code' column.
metadata = (
    pd.read_excel(obs_path + 'Dataset_Metadata.xlsx')
    .set_index('code')
)

# Show every column except the last one as the cell output.
metadata.iloc[:, :-1]

## Loop over each station:
We don't want to re-run on data already processed, so check each step first

- Ensure the observation data exists and has been ingested into a cleaned `.pd` file

In [None]:
site_list = metadata.index.values.astype('str')
print('Sites to process:', site_list)

ingest_script = script_dir + 'data-tools/generic_data_ingest.py'


def _clean_file_exists(site):
    """Return True if at least one cleaned observation file matches `site`."""
    return len(glob(obs_path + 'clean/%s_*.pd'%site)) > 0


for site in site_list:

    # Skip sites that already have a cleaned file so re-runs are cheap
    if _clean_file_exists(site):
        print('File exists, skipping: %s'%site)
        continue

    try:
        run_cmd = ('python ' + ingest_script + ' %s'%site)
        # call() returns the child's exit status; it does not raise when the
        # child script itself fails
        return_code = call(run_cmd, shell=True)

    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt can still stop the loop
        print('Ingest %s failed to run (%s)'%(site, exc))
        continue

    # Check to see if file was written before declaring success -
    # call won't catch failures in the child script
    if _clean_file_exists(site):
        print('Ingest %s success'%site)

    else:
        print('Ingest %s failed to write (exit code %s)'%(site, return_code))

        # Fail cases (log these within child script):
        #     No header info for site/file
        #     No file found for site
        #     Header mismatch/read error
        #     Data mismatch error (automated - manual too large)

- Ensure the ERA5 profile has been created

In [None]:
# for site in site_list:
def profiles(site, metadata, max_attempts=3):
    """Make sure the ERA5 profile file for the grid point nearest `site` exists.

    The nearest grid point is located in the sample ERA5 file, the profile
    filename is derived from that point's coordinates, and the extraction
    script is run (up to `max_attempts` times) while the file is missing.

    Parameters
    ----------
    site : str
        Index label of the site in `metadata`.
    metadata : pandas.DataFrame
        Table indexed by site code that provides 'lat' and 'lon' columns.
    max_attempts : int, optional
        Maximum number of times to run the extraction script before giving
        up. Bounds what was previously an unbounded retry loop.

    Returns
    -------
    None
        Progress and failures are reported with print().
    """
    site_lat, site_lon = metadata.loc[site, ['lat', 'lon']].values

    # We need to first determine the output filename (All ERA5 profiles are xx.xxN, xxx.xxW)
    # Open the sample file in a context manager so the handle is released
    with xr.open_dataset(era5_script_dir + 'era5_sample.nc') as sample:
        # Summed absolute lat/lon differences to every grid point.
        # NOTE(review): the -360 shift assumes grid longitudes are 0..360 and
        # sites are given in negative (west) degrees -- confirm.
        a = abs(sample['latitude']-site_lat)+abs(sample['longitude']-360-site_lon)
        yi, xi = np.unravel_index(np.nanargmin(a.values), a.shape)

        # Pull plain floats out before the dataset is closed
        lat = float(sample['latitude'].isel(latitude=yi))
        lon = float(sample['longitude'].isel(longitude=xi)) - 360

    era5_prof_file = 'era5prof_%.2fN_%.2fW.nc'%(lat, abs(lon))
    era5_prof_path = era5_path + '/profiles/' + era5_prof_file

    print('\nSite: %s %.3f %.3f\n%s\n'%(site, site_lat, site_lon, era5_prof_file))

    run_cmd = era5_script_dir + 'extract_agg_profile.sh %.2f %.2f 1980 2020'%(lat, lon)

    attempt = 0
    while not isfile(era5_prof_path):
        if attempt >= max_attempts:
            # Give up instead of looping forever when the script keeps failing
            print('%s Failed after %d attempts'%(era5_prof_file, attempt))
            return

        attempt += 1
        P = Popen(run_cmd, shell=True, stdout=PIPE, stderr=PIPE)
        output, err = P.communicate()

        if P.returncode != 0:
            # Surface the script's stderr rather than discarding it
            print('%s attempt %d exited with code %d\n%s'%(
                era5_prof_file, attempt, P.returncode,
                err.decode(errors='replace')))

    print('%s Complete'%era5_prof_file)

# 10 process workers seems OK for now...
worker_cap = 10

# Never start more workers than there are sites, and never fewer than one
# (Pool rejects a worker count below 1, which an empty site list would give)
n_workers = max(1, min(len(site_list), worker_cap))

# Bind the metadata table so the pool only has to map over site codes
profiles_mp = partial(profiles, metadata=metadata)

# The 'fork' start method is only available on Unix-like systems
with get_context('fork').Pool(n_workers) as p:
    p.map(profiles_mp, site_list)
    p.close()
    p.join()

- Pair the observations and ERA5 profile

In [None]:
# TODO: the observation/ERA5 pairing step is not implemented in this cell yet.
# Pass cases:
#     Produced new site pair
#     Site pair found

- Extract a GFS profile for verification

In [None]:
# TODO: the GFS profile extraction step is not implemented in this cell yet.
# Pass cases:
#     Produced new GFS profile
#     GFS profile found

In [None]:
# Final status message. It is printed unconditionally, so it does not by itself
# confirm that every earlier step succeeded for every site.
print('Data Pre-Processing Completed...')