In [2]:
%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

import geopandas as gpd
import pandas as pd
import glob
import zipfile
import rasterio
import os
import xarray as xr
import numpy as np
from numpy.ma import masked
import dask.dataframe as dd




# gpd.version.version

Populating the interactive namespace from numpy and matplotlib


In [3]:
def map_gridmet(df, wght_id, wghts, data):
    """
    Fill ``df['tmax']`` in place with the weighted Gridmet value of every row.

    For each row the weight rows whose group key equals ``row[wght_id]`` are
    fetched from ``wghts`` and handed to ``np_get_wval``; the result is shifted
    by the Kelvin offset and stored in the ``tmax`` column.
    NOTE(review): assumes ``data`` holds temperatures in Kelvin - confirm
    against the dataset's units attribute.

    :param df: pandas DataFrame with a ``tmax`` column and a ``wght_id`` column
    :param wght_id: name of the column of ``df`` holding the id used as group key
    :param wghts: pandas GroupBy of the weights table (one group per id)
    :param data: flat array of gridded values, indexed by the ``grid_ids`` weights column
    :return: None; ``df`` is modified in place
    """
    kelvin_offset = 273.15  # the previous 273.5 was a typo for 0 degC in Kelvin
    for index, row in df.iterrows():
        # Use the wght_id parameter; the original read the global `wghts_id`.
        hru_id = row[wght_id]
        try:
            weight_id_rows = wghts.get_group(hru_id)
        except KeyError:
            # No weights for this id: store the netCDF fill value. Only the
            # missing-group case is handled so real errors are no longer hidden.
            df.at[index, 'tmax'] = netCDF4.default_fillvals['f8']
            continue
        # df.at[...] writes to the frame itself; the chained df.tmax.at[...]
        # form can silently write to a temporary copy.
        df.at[index, 'tmax'] = np.nan_to_num(
            np_get_wval(data, weight_id_rows, hru_id) - kelvin_offset)
            
def map_gridmet_dask(df, wght_id, wghts, data):
    """
    Partition-wise variant of the Gridmet mapping for ``map_partitions``.

    Fills the ``tmax`` column of ``df`` row by row with the weighted value
    returned by ``np_get_wval`` minus the Kelvin offset, and returns the frame
    so that dask receives a result for the partition.
    NOTE(review): assumes ``data`` holds temperatures in Kelvin - confirm
    against the dataset's units attribute.

    :param df: pandas DataFrame (one partition) with ``tmax`` and ``wght_id`` columns
    :param wght_id: name of the column of ``df`` holding the id used as group key
    :param wghts: pandas GroupBy of the weights table (one group per id)
    :param data: flat array of gridded values, indexed by the ``grid_ids`` weights column
    :return: ``df`` with ``tmax`` filled (the same object, also modified in place)
    """
    kelvin_offset = 273.15  # the previous 273.5 was a typo for 0 degC in Kelvin
    for index, row in df.iterrows():
        # Use the wght_id parameter; the original read the global `wghts_id`.
        hru_id = row[wght_id]
        try:
            weight_id_rows = wghts.get_group(hru_id)
            tmp = np.nan_to_num(np_get_wval(data, weight_id_rows, hru_id) - kelvin_offset)
        except KeyError:
            # No weights for this id: fall back to the netCDF fill value.
            tmp = netCDF4.default_fillvals['f8']
        df.at[index, 'tmax'] = tmp
    # The original returned None, which left map_partitions with no result.
    return df
            
import netCDF4
def np_get_wval(ndata, wghts, hru_id):
    """
    Weighted average of the grid values mapped to one HRU.

    The values of ``ndata`` at the positions listed in ``wghts['grid_ids']`` are
    selected, NaN entries are masked out, and the remaining values are averaged
    using ``wghts['w']`` as weights. When every selected value is NaN the
    average is fully masked and the netCDF ``f8`` fill value is returned instead.

    :param ndata: flat float array of data values
    :param wghts: table with a ``grid_ids`` column (positions into ``ndata``)
                  and a ``w`` column (weights)
    :param hru_id: hru id number (not used in the computation)
    :return: masked-aware weighted average, or ``netCDF4.default_fillvals['f8']``
             when all selected values are NaN
    """
    grid_positions = wghts['grid_ids'].values.astype(int)
    selected = ndata[grid_positions]
    mdata = np.ma.masked_array(selected, np.isnan(selected))
    weighted_mean = np.ma.average(mdata, weights=wghts['w'])
    if weighted_mean is not masked:
        return weighted_mean
    return netCDF4.default_fillvals['f8']


In [4]:
# Show the current working directory; the relative data paths used in later
# cells (e.g. '../Data_v1_1') are resolved against it.
print(os.getcwd())

B:\GitRepos\onhm-fetcher-parser\notebooks


# Open Gridmet max temperature with geopandas and plot

In [5]:
print(os.getcwd())
from pathlib import Path
# folder = Path(r'../Data') # alternative data folder
# The relative path is resolved against the working directory printed above
# (the recorded run used onhm-fetcher-parser/notebooks).
folder = Path(r'../Data_v1_1')
print(folder)
# shapefiles = folder.glob("*_0[1-2].shp")
# Sort the matches: glob order depends on the filesystem, and the row order of
# the concatenated frame should be reproducible between runs.
shapefiles = sorted(folder.glob("*2e*.shp"))
if not shapefiles:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f'No shapefile matching "*2e*.shp" found in {folder.resolve()}')
# ignore_index=True gives the same 0..n-1 index as the former
# reset_index(drop=True, inplace=True) without mutating in place.
gdf = pd.concat(
    [gpd.read_file(shp) for shp in shapefiles],
    ignore_index=True,
).pipe(gpd.GeoDataFrame)
# gdf.plot()
print(gdf)

B:\GitRepos\onhm-fetcher-parser\notebooks
..\Data_v1_1
               LAYER            GM_TYPE  OBJECTID  nhru_v11  hru_segme1  \
0       NHM\nhru_v11  Unknown Area Type         1     76127       40038   
1       NHM\nhru_v11  Unknown Area Type         2     76147       40038   
2       NHM\nhru_v11  Unknown Area Type         3     76170       40021   
3       NHM\nhru_v11  Unknown Area Type         3     76170       40021   
4       NHM\nhru_v11  Unknown Area Type         3     76170       40021   
...              ...                ...       ...       ...         ...   
139802  NHM\nhru_v11  Unknown Area Type    114954     57964       31028   
139803  NHM\nhru_v11  Unknown Area Type    114955     64080       28886   
139804  NHM\nhru_v11  Unknown Area Type    114956     64150       28866   
139805  NHM\nhru_v11  Unknown Area Type    114957     65633       31412   
139806  NHM\nhru_v11  Unknown Area Type    114958     18843       10081   

        Shape_Leng Shape_Area  nhm_id  hru_i

Copy the onhm HRU ID (here `nhru_v11`) into a new, simple dataframe.

# Open Gridmet data (as netcdf file) and print out some metadata
This first bit of code follows examples from the following link: https://climate.northwestknowledge.net/MACA/OPENDAP.php
First we open the data set and inspect the metadata.

In [6]:
import requests
from requests.exceptions import HTTPError
import json
# delete existing file if it exists
gmfile = Path('../Data_v1_1/test_gm4.nc')
exists = gmfile.exists()
if exists:
    os.remove(gmfile)
    print('removed existing file')

# NetCDF Subset Service request for a single day of daily maximum temperature
url2 = 'http://thredds.northwestknowledge.net:8080/thredds/ncss/agg_met_tmmx_1979_CurrentYear_CONUS.nc'
payload2={'var': 'daily_maximum_temperature',
        'disableLLSubset': 'on',
        'disableProjSubset': 'on',
        'horizStride': '1',
        'time_start': '2018-12-31T00:00:00Z',
        'time_end': '2018-12-31T00:00:00Z',
        'timeStride': '1',
        'accept': 'netcdf'}

try:
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    myfile = requests.get(url2, params=payload2, timeout=300)
    myfile.raise_for_status()
except HTTPError as http_err:
    print(f'HTTP error occurred: {http_err}')  # Python 3.6
    # Re-raise: continuing would write an error page (or nothing) to gmfile and
    # fail later in xr.open_dataset with a misleading message.
    raise
except Exception as err:
    print(f'Other error occurred: {err}')  # Python 3.6
    raise
else:
    print('Success!')
    print(myfile.url)

# The with-block closes the file; no explicit close() is needed.
with open(gmfile, 'wb') as fh:
    fh.write(myfile.content)

ds = xr.open_dataset(gmfile)
# print(ds)

print('\n The meta data is: \n', json.dumps(ds.attrs, indent=4))
lathandle=ds['lat']
lonhandle=ds['lon']
timehandle=ds['day']

datahandle=ds['daily_maximum_temperature'] # for aggregated download

#collect data to describe geotransform
lonmin = float(ds.attrs['geospatial_lon_min'])
latmax = float(ds.attrs['geospatial_lat_max'])
lonres = float(ds.attrs['geospatial_lon_resolution'])
# Read the latitude resolution from its own attribute (was copy-pasted from lon).
latres = float(ds.attrs['geospatial_lat_resolution'])

#Print some information on the data

print('\n Data attributes, sizes, and coords \n') 
print('\n Data sizes are: \n', datahandle.sizes)
print('\n Data coords are: \n', datahandle.coords)

ts = datahandle.sizes
print(type(ts))
print(ts['day'])
dayshape = ts['day']
Lonshape = ts['lon']
Latshape = ts['lat']
print(dayshape, Lonshape, Latshape)



removed existing file
Success!
http://thredds.northwestknowledge.net:8080/thredds/ncss/agg_met_tmmx_1979_CurrentYear_CONUS.nc?var=daily_maximum_temperature&disableLLSubset=on&disableProjSubset=on&horizStride=1&time_start=2018-12-31T00%3A00%3A00Z&time_end=2018-12-31T00%3A00%3A00Z&timeStride=1&accept=netcdf

 The meta data is: 
 {
    "geospatial_bounds_crs": "EPSG:4326",
    "Conventions": "CF-1.0",
    "geospatial_bounds": "POLYGON((-124.7666666333333 49.400000000000000, -124.7666666333333 25.066666666666666, -67.058333300000015 25.066666666666666, -67.058333300000015 49.400000000000000, -124.7666666333333 49.400000000000000))",
    "geospatial_lat_min": 25.066666666666666,
    "geospatial_lat_max": 49.400000000000006,
    "geospatial_lon_min": -124.76666663333334,
    "geospatial_lon_max": -67.05833330000002,
    "geospatial_lon_resolution": "0.041666666666666",
    "geospatial_lat_resolution": "0.041666666666666",
    "geospatial_lat_units": "decimal_degrees north",
    "geospatial_l

In [7]:
# Two-column working table: the HRU id column taken from the shapefile frame
# plus a tmax column initialised to 0.0 that the mapping functions fill in.
dfmap = pd.DataFrame(gdf.filter(items=['nhru_v11'])).assign(tmax=0.0)
print(type(dfmap))
print(dfmap)
# Plain numpy array of the HRU ids, echoed as the cell output.
nhm_id = dfmap['nhru_v11'].values
nhm_id

<class 'pandas.core.frame.DataFrame'>
        nhru_v11  tmax
0          76127   0.0
1          76147   0.0
2          76170   0.0
3          76170   0.0
4          76170   0.0
...          ...   ...
139802     57964   0.0
139803     64080   0.0
139804     64150   0.0
139805     65633   0.0
139806     18843   0.0

[139807 rows x 2 columns]


array([76127, 76147, 76170, ..., 64150, 65633, 18843], dtype=int64)

In [8]:
# Weights table; np_get_wval reads its 'grid_ids' and 'w' columns.
wght_UofI = pd.read_csv('../Data_v1_1/tmp_Gridmet_weights_hru_v1_1e.csv')
# The second CSV column is used as the grouping key and is also looked up in dfmap.
wghts_id = wght_UofI.columns[1]
wghts = wght_UofI.groupby(by=wghts_id)
# Flatten the last available day of the grid into a 1-D array so that it can be
# indexed by the 'grid_ids' column of the weights table.
ndata = datahandle.values[dayshape - 1].flatten(order='K')

In [None]:
# Take an independent, writable copy of the tmax values: `.values` can alias the
# DataFrame's own buffer (so writes would silently change dfmap) and is read-only
# under pandas Copy-on-Write, which would make the in-place writes of
# map_gridmet2 fail.
val = dfmap['tmax'].to_numpy(copy=True)
# The id array is only read, so no copy is needed.
index = dfmap[wghts_id].to_numpy()

In [None]:
# Number of entries in the tmax value array (the loop bound used by map_gridmet2).
val.shape[0]

In [None]:
%%timeit -r 1
# Baseline timing: map_gridmet walks dfmap row by row with iterrows().
map_gridmet(dfmap, wghts_id, wghts, ndata)

In [None]:
# Line plot of the tmax column of dfmap.
dfmap['tmax'].plot()

In [None]:
def map_gridmet2(val, index, wght_id, wghts, data):
    """
    Array-based variant of the Gridmet mapping: fills ``val`` in place.

    For every position ``i`` the weight rows of group ``index[i]`` are fetched
    from ``wghts`` and handed to ``np_get_wval``; the result minus the Kelvin
    offset is written to ``val[i]``.
    NOTE(review): assumes ``data`` holds temperatures in Kelvin - confirm
    against the dataset's units attribute.

    :param val: writable 1-D float array receiving the results
    :param index: 1-D array of ids, same length as ``val``
    :param wght_id: unused; kept so the signature matches the other mappers
    :param wghts: pandas GroupBy of the weights table (one group per id)
    :param data: flat array of gridded values, indexed by the ``grid_ids`` weights column
    :return: None; ``val`` is modified in place
    """
    kelvin_offset = 273.15  # the previous 273.5 was a typo for 0 degC in Kelvin
    for i in range(val.shape[0]):
        try:
            weight_id_rows = wghts.get_group(index[i])
        except KeyError:
            # No weights for this id: store the netCDF fill value. Only the
            # missing-group case is handled so real errors are no longer hidden.
            val[i] = netCDF4.default_fillvals['f8']
            continue
        val[i] = np.nan_to_num(np_get_wval(data, weight_id_rows, index[i]) - kelvin_offset)


In [None]:
%%timeit -r 1
# Timing of the array-based variant, which loops over positions of `val` instead of DataFrame rows.
map_gridmet2(val, index, wghts_id, wghts, ndata)

In [None]:
# Range check of the mapped values: minimum on the first line, maximum on the second.
print(val.min(), val.max(), sep='\n')

In [None]:
import numba
# Wrap map_gridmet2 with numba's jit.
# NOTE(review): map_gridmet2 takes a pandas GroupBy and calls np_get_wval, which
# numba presumably cannot compile in nopython mode - confirm before enabling the
# call in the next cell.
fast_func = numba.jit(map_gridmet2)

In [None]:
# %%timeit -r 1
# fast_func(val, index, wghts_id, wghts, ndata)

In [None]:
def map_gridmet3(val, index, wght_id, wghts, data):
    """
    Array-based Gridmet mapping with the weighted average inlined.

    For every position ``i`` the values of ``data`` at the ``grid_ids`` of group
    ``index[i]`` are selected, NaNs are masked out, and the average weighted by
    the ``w`` column minus the Kelvin offset is written to ``val[i]``. Ids
    without weights, and ids whose selected values are all NaN, receive the
    netCDF ``f8`` fill value.
    NOTE(review): assumes ``data`` holds temperatures in Kelvin - confirm
    against the dataset's units attribute.

    :param val: writable 1-D float array receiving the results
    :param index: 1-D array of ids, same length as ``val``
    :param wght_id: unused; kept so the signature matches the other mappers
    :param wghts: pandas GroupBy of the weights table (one group per id)
    :param data: flat array of gridded values, indexed by ``grid_ids``
    :return: None; ``val`` is modified in place
    """
    kelvin_offset = 273.15  # the previous 273.5 was a typo for 0 degC in Kelvin
    for i in range(val.shape[0]):
        try:
            weight_id_rows = wghts.get_group(index[i])
        except KeyError:
            # No weights for this id.
            val[i] = netCDF4.default_fillvals['f8']
            continue
        cell_values = data[weight_id_rows['grid_ids'].values.astype(int)]
        mdata = np.ma.masked_array(cell_values, np.isnan(cell_values))
        # Fixed: the original referenced the misspelled `wight_id_rows`.
        tmp = np.ma.average(mdata, weights=weight_id_rows['w'])
        # Fixed: results go to val[i]; the original wrote to an unrelated global
        # `df` at the whole `index` array, hidden by a bare except.
        if tmp is masked:
            val[i] = netCDF4.default_fillvals['f8']
        else:
            val[i] = tmp - kelvin_offset
            


In [None]:
import numba
# Rebind fast_func to a numba-jit wrapper of map_gridmet3 (replaces the earlier
# map_gridmet2 wrapper of the same name).
# NOTE(review): map_gridmet3 receives a pandas GroupBy, which numba presumably
# cannot type in nopython mode - confirm that the timed call below really runs.
fast_func = numba.jit(map_gridmet3)

In [None]:
%%timeit -r 1
# Timing of the numba-wrapped map_gridmet3, called with the same arguments as map_gridmet2.
fast_func(val, index, wghts_id, wghts, ndata)

In [None]:
from dask.distributed import Client
# Create a dask.distributed Client requesting 4 workers for the dask experiment in the following cells.
client = Client(n_workers = 4)

In [None]:
# Reset tmax to 0.0 before building the dask frame.
dfmap['tmax'] = 0.0
# Split dfmap into 3 partitions. Note that `df` now names a dask DataFrame.
df = dd.from_pandas(dfmap, npartitions=3)
print(type(df))
# for index, i in df.iterrows():
#     if index % 10000 == 0:
#         print(i)

In [None]:
# Show the empty metadata frame of the dask DataFrame, then its column dtypes.
print(df._meta, df._meta.dtypes, sep='\n')

In [None]:
%%timeit -r 1
res = df.map_partitions(map_gridmet_dask, wghts_id, unique_hru_ids, ndata,
                        meta=(pd.DataFrame({'nhru_v11':np.dtype(np.int64), 'tmax':np.dtype(np.float64)}, index=[]))).compute()

In [None]:
# Line plot of the tmax column of dfmap.
dfmap['tmax'].plot()

In [None]:
# Display the dask DataFrame that was built from dfmap.
df