# Imports

In [4]:
import wget  # Downloading Data
from osgeo import gdal  # GDAL
import rasterio as rio  # Modifying Raster datasets
import geopandas as gpd
import rasterstats as rs

import pandas as pd
import numpy as np
import calendar
import sys, os
import glob
import matplotlib.pyplot as plt
from tqdm import tqdm
from multiprocessing import Pool

In [None]:
!pip install wget rasterstats --user

In [5]:
# NARR variables to process. Each name doubles as the working directory
# name and as the file-name prefix of the NetCDF files that are downloaded.
extract_vars = [
    'apcp',
    'rhum.2m',
    'air.sfc',
]

In [None]:
# Ensure every variable has its own base directory under the current
# working directory.
for var in extract_vars:
    base_dir = './' + var
    if os.path.exists(base_dir):
        continue  # already present, nothing to create
    os.makedirs(base_dir)

## Download

In [None]:
# Download one NetCDF file per variable and year from the NARR
# Dailies/monolevel FTP directory into ./<var>/netcdf/.
for var in extract_vars:
    out_dir = './' + var + '/netcdf/'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Example: ftp://ftp.cdc.noaa.gov/Datasets/NARR/Dailies/monolevel/air.sfc.2000.nc
    for year in range(2000, 2019):  # 2000 through 2018 inclusive
        dl_url = 'ftp://ftp.cdc.noaa.gov/Datasets/NARR/Dailies/monolevel/' + var + '.' + str(year) + '.nc'
        filename = out_dir + var + '.' + str(year) + '.nc'

        # Skip files that are already on disk so that re-running the cell
        # neither re-downloads them nor leaves extra copies in the folder
        # (the conversion step later globs every *.nc file found here).
        if os.path.exists(filename):
            continue

        wget.download(dl_url, filename)

In [None]:
# Download the NARR land mask, unless a previous run already fetched it
# (avoids a redundant transfer and a duplicate copy on re-run).
if not os.path.exists('./land.nc'):
    wget.download('ftp://ftp.cdc.noaa.gov/Datasets/NARR/time_invariant/land.nc', './land.nc')

## Convert to GeoTIFF, Warp to EPSG 4326

In [None]:
# Maps each downloaded variable name to the name of the subdataset inside
# its NetCDF file; used to build the 'NETCDF:<file>:<subdataset>' source
# string handed to GDAL.
netcdf_subdatasets = {
    'apcp': 'apcp',
    'rhum.2m': 'rhum',
    'air.sfc': 'air',
}

In [None]:
# Convert every downloaded NetCDF file to a GeoTIFF reprojected to EPSG:4326.
# The options are identical for every file, so build them once up front:
# target spatial reference = EPSG:4326, output format = GeoTIFF.
warp_options = gdal.WarpOptions(options='-t_srs \"EPSG:4326\" -of GTiff')

for var in extract_vars:
    out_dir = './' + var + '/geotiff/'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    for nc_path in tqdm(glob.glob('./' + var + '/netcdf/*.nc')):  # Do this for all netcdf files
        # File name without directories, e.g. 'air.sfc.2000.nc'. Using
        # basename avoids depending on how many path components precede it.
        name = os.path.basename(nc_path)

        print('Processing:', name)
        # Output keeps the full input name and appends '.geotiff'
        # ('<var>.<year>.nc.geotiff'); the masking step parses this pattern.
        ds = gdal.Warp(
            srcDSOrSrcDSTab='NETCDF:' + nc_path + ':' + netcdf_subdatasets[var],
            destNameOrDestDS=out_dir + name + '.geotiff',
            options=warp_options,
        )
        if ds is None:
            # Without GDAL exceptions enabled a failed warp returns None;
            # report it instead of silently continuing.
            print('WARNING: gdal.Warp failed for', nc_path)
        ds = None  # Flush the file cache


## Warp Land Mask

In [None]:
# Reproject the land mask with the same settings as the data files:
# target spatial reference EPSG:4326, GeoTIFF output.
warp_options = gdal.WarpOptions(options='-t_srs \"EPSG:4326\" -of GTiff')
ds = gdal.Warp(
    './land.geotiff',         # destination file
    'NETCDF:./land.nc:land',  # 'land' subdataset of the downloaded mask
    options=warp_options,
)
ds = None  # release the dataset so the file cache is flushed

## Apply Land Mask
#### Import the gdal_calc script

In [None]:
# Directory that contains the gdal_calc script; adjust for your environment.
gdal_path = '/opt/conda/bin/'
sys.path.insert(0, gdal_path)  # make the script importable as a module
try:
    import gdal_calc
except ImportError:
    # Only a failed import is handled here; any other error (or a keyboard
    # interrupt) is allowed to propagate instead of being swallowed.
    try:
        # Newer GDAL Python packages ship the script as a module instead.
        from osgeo_utils import gdal_calc
    except ImportError:
        print('gdal_calc not found, please specify the path to this file in the cell above')

In [None]:
# Multiply every band of every yearly GeoTIFF by the land mask and write
# the result as one file per band:
#   ./<var>/masked_daily_geotiff/<year>/<band>.geotiff
mask_file = './land.geotiff'

for var in extract_vars:
    out_dir = './' + var + '/masked_daily_geotiff/'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    for file in tqdm(glob.glob('./' + var + '/geotiff/*.geotiff')):
        band_count = gdal.Open(file).RasterCount  # Get number of days (bands) in that year's file

        # Converted files are named '<var>.<year>.nc.geotiff'. Drop the
        # '<var>.' prefix and take the next dot-separated field; this works
        # for any variable name, with or without dots in it.
        year = os.path.basename(file)[len(var) + 1:].split('.')[0]

        sub_out_dir = out_dir + year + '/'
        if not os.path.exists(sub_out_dir):
            os.makedirs(sub_out_dir)

        for band in range(1, band_count + 1):  # GDAL bands are numbered from 1
            outfile = sub_out_dir + str(band) + '.geotiff'
            if not os.path.exists(outfile):  # skip bands finished in an earlier run
                # NOTE(review): NoDataValue=0 flags every zero in the product
                # as NoData, which presumably also covers genuine zero values
                # in the data, not only masked-out cells - confirm intended.
                gdal_calc.Calc('A*B', A=file, B=mask_file, A_band=band, outfile=outfile, format='GTiff', NoDataValue=0)

## Zonal Stats

In [None]:
# Load the US county shapefile and build a copy reprojected to WGS84
# longitude/latitude for the zonal statistics step.
county_shapefile = './tl_2017_us_county/tl_2017_us_county.shp'
lonlat_crs = '+proj=longlat +datum=WGS84 +no_defs'

shape_frame = gpd.read_file(county_shapefile)
shape = shape_frame.to_crs(lonlat_crs)

# Store the county identifier as text on the reprojected frame.
geoid_text = shape['GEOID'].astype(str)
shape['GEOID'] = geoid_text

In [None]:
# Compute per-county zonal statistics for every variable, year and day, and
# pickle one DataFrame (indexed by GEOID) per daily raster.
#
# The variable loop is explicit: without it the cell silently used whatever
# value `var` was left holding by a previously executed cell, so only a
# single variable was ever processed.
#
# NOTE(review): output_columns (the statistics requested from rasterstats)
# is not defined in the visible cells - make sure it is set before running.
for var in extract_vars:
    for year in range(2000, 2019):  # 2000 through 2018 inclusive
        year_dir = './' + var + '/data_NEW_NEW/' + str(year) + '/'
        if not os.path.exists(year_dir):
            print('Making', year_dir)
            os.makedirs(year_dir)

        year_files = sorted(glob.glob('./' + var + '/masked_daily_geotiff/' + str(year) + '/*.geotiff'))
        print('PROCESSING', len(year_files), 'FILES')

        for file in year_files:
            # File stem = the number of the day (1 for January 1, ..., 365 for December 31)
            name = os.path.splitext(os.path.basename(file))[0]
            stats = rs.zonal_stats(shape, file, stats=output_columns, all_touched=True)
            frame = pd.DataFrame.from_dict(stats).set_index(shape_frame['GEOID'])
            frame.to_pickle(year_dir + name + '.pkl')
            print(year_dir + name + '.pkl')