In [1]:
import os

import numpy as np
import xarray as xr
import pandas as pd

from glob import glob
from os.path import isfile
from functools import partial
from subprocess import PIPE, Popen, call
from multiprocessing import get_context

os.environ['OMP_NUM_THREADS'] = '1'

In [2]:
# Hard-coded locations used throughout the notebook (all end with a trailing '/').

# Root of the snow-liquid-ratio code; the ingest script path is built from it.
script_dir = '/uufs/chpc.utah.edu/common/home/u1070830/code/snow-liquid-ratio/'
# Holds 'era5_sample.nc' and 'extract_agg_profile.sh', both used by the ERA5 profile step.
era5_script_dir = '/uufs/chpc.utah.edu/common/home/u1070830/code/model-tools/era5/'
# Holds 'Dataset_Metadata.xlsx' and the 'clean/' directory of ingested '<site>_*.pd' files.
obs_path = '/uufs/chpc.utah.edu/common/home/steenburgh-group10/mewessler/observations/'
# ERA5 data root; extracted profiles are looked up under its 'profiles/' subdirectory.
era5_path = '/uufs/chpc.utah.edu/common/home/steenburgh-group10/mewessler/era5/'
# GFS data root; not referenced by any cell visible in this notebook yet.
gfs_path = '/uufs/chpc.utah.edu/common/home/steenburgh-group10/mewessler/gfs/'

## Start with the metadata file as an efficient way to keep data organized (instead of a config file)

In [8]:
# Load the station metadata sheet and key it by station code in one chain.
# Temp fix: the 'CLN' code is swapped for 'CLNX' so CLNX data is used instead of CLN.
metadata = (
    pd.read_excel(obs_path + 'Dataset_Metadata.xlsx')
    .assign(code=lambda sheet: sheet['code'].replace('CLN', 'CLNX'))
    .set_index('code')
)

# Display every column except the last one
metadata.iloc[:, :-1]

Unnamed: 0_level_0,name,owner,city,state,lat,lon,elevation_m,start,end,interval,n_events,auto_manual,snow_type,swe_type
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
CLNX,Alta Collins,Alta Ski Lifts,Alta,UT,40.5763,-111.6383,2945,,,12/24,,Manual,Snow Board (HSN),
AGD,Alta Guard,Utah Dept. of Transportation,Alta,UT,40.5905,-111.638,2682,,,6/12/24,,Manual,Snow Board (HSN),
ALTA,Alta CoOp,National Weather Service,Alta,UT,40.59058,-111.63703,2655,,,12/24,,Manual,,
BCC,Big Cottonwood,Utah Dept. of Transportation,Brighton,UT,40.6503,-111.6499,2224,,,12/24,,Manual,,
PVC,Provo Canyon,Utah Dept. of Transportation,Provo,UT,40.3341,-111.6133,1560,,,12/24,,Manual,,
SLB,Silver Lake Brighton,National Weather Service,Brighton,UT,40.6008,-111.5842,2664,,,24,,Manual,,
BSNFDC,Devil's Creek,Burlington Northern Santa Fe Corp,,MT,48.277725,-113.432578,1415,,,,,Manual,,
BSNFEX,Essex,Burlington Northern Santa Fe Corp,Essex,MT,48.28176,-113.60657,1173,,,,,Manual,,
BSNFJE,Java East,Burlington Northern Santa Fe Corp,,MT,48.23587,-113.56453,1232,,,,,Manual,,


## Loop over each station:
We don't want to re-run on data already processed, so check each step first

- Ensure the observation data exists and has been ingested into a cleaned `.pd` file

In [9]:
site_list = metadata.index.values.astype('str')
print('Sites to process:', site_list)

ingest_script = script_dir + 'data-tools/generic_data_ingest.py'


def _clean_obs_exists(site):
    """Return True when at least one cleaned '<site>_*.pd' file exists under obs_path/clean/."""
    return len(glob(obs_path + 'clean/%s_*.pd'%site)) > 0


for site in site_list:

    # Skip stations that already have a cleaned observation file
    if _clean_obs_exists(site):
        print('File exists, skipping: %s'%site)
        continue

    try:
        # Pass an argument list (no shell) so the site code is never parsed by a shell.
        # call() returns the child's exit status; it does not raise when the child fails.
        exit_status = call(['python', ingest_script, str(site)])

    except OSError as exc:
        # Only raised when the interpreter itself could not be launched.
        # Deliberately narrow: KeyboardInterrupt etc. must still propagate.
        print('Ingest %s failed to run'%site, '(%s)'%exc)
        continue

    # The written file is the success criterion, since the child script may
    # fail without producing output.
    if _clean_obs_exists(site):
        print('Ingest %s success'%site)

    else:
        print('Ingest %s failed to write'%site, '(exit status %s)'%exit_status)

        # Fail cases (log these within child script):
        #     No header info for site/file
        #     No file found for site
        #     Header mismatch/read error
        #     Data mismatch error (automated - manual too large)

Sites to process: ['CLNX' 'AGD' 'ALTA' 'BCC' 'PVC' 'SLB' 'BSNFDC' 'BSNFEX' 'BSNFJE']
File exists, skipping: CLNX
File exists, skipping: AGD
File exists, skipping: ALTA
File exists, skipping: BCC
File exists, skipping: PVC
File exists, skipping: SLB
File exists, skipping: BSNFDC
File exists, skipping: BSNFEX
File exists, skipping: BSNFJE


- Ensure the era5 profile has been created

In [10]:
def profiles(site, metadata, max_attempts=3):
    """Make sure the ERA5 profile file for the grid point nearest to a site exists.

    The nearest grid point is found in 'era5_sample.nc', which fixes the profile
    file name. If that file is missing under era5_path/profiles/, the extraction
    script is run until the file appears or the attempt limit is reached.

    Parameters
    ----------
    site : str
        Station code; must be a label in the index of `metadata`.
    metadata : pandas.DataFrame
        Station table with 'lat' and 'lon' columns, indexed by station code.
    max_attempts : int, optional
        Maximum number of times the extraction script is launched before giving up.

    Raises
    ------
    RuntimeError
        If the profile file still does not exist after `max_attempts` runs.

    Notes
    -----
    Several sites can resolve to the same profile file, so parallel workers may
    launch the same extraction at once. Whether the script tolerates that is not
    visible from here (TODO confirm).
    """
    site_lat, site_lon = metadata.loc[site, ['lat', 'lon']].values

    # We need to first determine the output filename (All ERA5 profiles are xx.xxN, xxx.xxW)
    # The context manager closes the sample file as soon as the grid point is known.
    with xr.open_dataset(era5_script_dir + 'era5_sample.nc') as sample:
        # |dlat| + |dlon| on the grid; sample longitudes are shifted by -360
        # before being compared with the site longitude.
        a = abs(sample['latitude']-site_lat)+abs(sample['longitude']-360-site_lon)
        yi, xi = np.unravel_index(a.values.argmin(), a.shape)

        nearest = sample.isel(latitude=yi, longitude=xi)
        lat = float(nearest['latitude'])
        lon = float(nearest['longitude']) - 360

    era5_prof_file = 'era5prof_%.2fN_%.2fW.nc'%(lat, abs(lon))
    era5_prof_full = era5_path + '/profiles/' + era5_prof_file

    print('\nSite: %s %.3f %.3f\n%s\n'%(site, site_lat, site_lon, era5_prof_file))

    # 1980 and 2020 are passed straight through to the script (presumably the
    # start/end years -- confirm against extract_agg_profile.sh).
    run_cmd = era5_script_dir + 'extract_agg_profile.sh %.2f %.2f 1980 2020'%(lat, lon)

    iter_count = 0
    err = b''
    while not isfile(era5_prof_full):
        if iter_count >= max_attempts:
            # Bounded retry: surface the script's stderr instead of looping forever
            raise RuntimeError(
                'ERA5 profile %s was not created after %d attempt(s): %s'%(
                    era5_prof_file, iter_count,
                    err.decode(errors='replace').strip()))
        iter_count += 1

        P = Popen(run_cmd, shell=True, stdout=PIPE, stderr=PIPE)
        _, err = P.communicate()

    print('%s Complete'%era5_prof_file)

# 10 process workers seems OK for now...
# Never start more workers than there are sites, and always at least one
# (Pool rejects a process count of 0).
worker_cap = 10
n_workers = max(1, min(len(site_list), worker_cap))

# Bind the shared metadata table so the pool only has to map over site codes
profiles_mp = partial(profiles, metadata=metadata)
with get_context('fork').Pool(n_workers) as p:
    p.map(profiles_mp, site_list)
    # map() has already returned; close/join lets the workers exit cleanly
    # before the context manager terminates the pool.
    p.close()
    p.join()


Site: BSNFEX 48.282 -113.607
era5prof_48.25N_113.50W.nc

Site: BCC 40.650 -111.650
era5prof_40.75N_111.75W.nc

Site: AGD 40.590 -111.638
era5prof_40.50N_111.75W.nc

Site: BSNFDC 48.278 -113.433
era5prof_48.25N_113.50W.nc

Site: ALTA 40.591 -111.637
era5prof_40.50N_111.75W.nc

Site: SLB 40.601 -111.584
era5prof_40.50N_111.50W.nc

Site: CLNX 40.576 -111.638
era5prof_40.50N_111.75W.nc

Site: PVC 40.334 -111.613
era5prof_40.25N_111.50W.nc

Site: BSNFJE 48.236 -113.565
era5prof_48.25N_113.50W.nc









era5prof_40.50N_111.50W.nc Complete
era5prof_40.25N_111.50W.nc Complete
era5prof_48.25N_113.50W.nc Complete
era5prof_40.50N_111.75W.nc Complete
era5prof_40.75N_111.75W.nc Complete
era5prof_48.25N_113.50W.nc Complete
era5prof_40.50N_111.75W.nc Complete
era5prof_40.50N_111.75W.nc Complete
era5prof_48.25N_113.50W.nc Complete


- Pair the observations and ERA5 profile

In [None]:
# Pass cases:
#     Produced new site pair
#     Site pair found

- Extract a GFS profile for verification

In [None]:
# Pass cases:
#     Produced new GFS profile
#     GFS profile found

In [None]:
# Final status message.
# NOTE(review): the observation/ERA5 pairing and GFS extraction cells above contain
# only comments, so this prints even though those steps are not implemented yet.
print('Data Pre-Processing Completed...')