# SWMM Hourly and Daily Precipitation Comparison

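In this notebook we download the 3-hourly and daily precipitation datasets from NARR and process both into precipitation series for our block groups. The simulations on both datasets, and the comparison of their outputs, are carried out afterwards in the Data_Analysis notebook.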

## Imports

In [None]:
# General Usage
import os
import glob
import numpy as np
import pandas as pd
import sys
import wget  # Downloading Data

# Raster Data
from osgeo import gdal  # GDAL
import rasterio as rio  # Modifying Raster datasets
import rasterstats as rs
import geopandas as gpd

# Plotting
import matplotlib.pyplot as plt
import calendar

# Multi-processing
from multiprocessing import Pool
from functools import partial
 
    
# gdal_calc.py, custom path
# The directory holding gdal_calc.py is put at the front of sys.path so that
# the script can be imported as a module.
# NOTE(review): this is a machine-specific absolute path; adjust it for your environment.
path_to_gdal_calc = '/home/matas/anaconda3/envs/swmm/bin/'
sys.path.insert(0, path_to_gdal_calc)
try:
    import gdal_calc
except ImportError:
    # Only a failed import is reported here; any other error is allowed to
    # surface instead of being silently swallowed.
    print('gdal_calc not found, please specify the path to this file in path_to_gdal_calc above')

## 1) Creating the Land Mask

This land mask will be used to remove the ocean data from both the daily and hourly datasets.
#### Download

In [None]:
# Download the NARR time-invariant land mask unless a local copy already exists.
if not os.path.exists('./land.nc'):
    print('File not found, attempting download.')
    download_url = 'ftp://ftp.cdc.noaa.gov/Datasets/NARR/time_invariant/land.nc'
    wget.download(download_url, './land.nc')

mask_file = glob.glob('./land.nc')
if len(mask_file) != 1:
    # Stop with an explicit error rather than an IndexError on the empty list.
    print('Failed to find land mask')
    raise FileNotFoundError('Land mask ./land.nc was not found and could not be downloaded')
print('Successfully found land mask')
mask_file = mask_file[0]

#### Projection

We use the EPSG 5070 projection, which matches our block group shapefile.

In [None]:
# Target projection (PROJ string), NetCDF variable to read, and output basename.
proj = '+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +towgs84=1,1,-1,0,0,0,0 +units=m +no_defs'
subdataset = 'land'
output_file = './land_mask'

# Reproject the chosen NetCDF variable and write it out as a GeoTIFF.
warp_options = gdal.WarpOptions(options='-t_srs \"' + proj + '\" -of GTiff')
warp_source = ':'.join(['NETCDF', mask_file, subdataset])
warp_target = output_file + '.geotiff'
ds = gdal.Warp(srcDSOrSrcDSTab=warp_source, destNameOrDestDS=warp_target, options=warp_options)
ds = None  # THIS IS VERY IMPORTANT: dropping the reference flushes the file cache
print('Successfully Warped Land Mask (' + warp_target + ')')

## 2) Land Mask Extension

We will extend the land mask by one pixel in each direction: every no-data pixel that has at least one neighbouring land pixel is filled in. This is necessary because the low resolution of the land mask would otherwise exclude some block groups.

In [None]:
# Read the warped land mask. The context manager closes the file once the
# band and the profile have been copied out.
with rio.open(output_file + '.geotiff') as raster:
    data = raster.read(1)
    profile = raster.profile

rows = data.shape[0]
cols = data.shape[1]
nodata = 0

data = np.ma.masked_equal(data, nodata)
# getmaskarray always returns a full boolean array, even when nothing is masked.
is_nodata = np.ma.getmaskarray(data)
output_data = np.array(np.ma.getdata(data))

# Pad by one pixel so that neighbours outside the raster count as "no data".
# Plain indexing would wrap an index of -1 around to the opposite edge, and
# catching IndexError would abandon the remaining in-bounds neighbours.
padded_values = np.pad(output_data, 1, mode='constant', constant_values=nodata)
padded_valid = np.pad(~is_nodata, 1, mode='constant', constant_values=False)

# Visit the 3x3 neighbourhood offset by offset; every no-data pixel takes the
# value of a valid neighbour (later offsets overwrite earlier ones).
for x in range(-1, 2):
    for y in range(-1, 2):
        neighbour_valid = padded_valid[1 + x:1 + x + rows, 1 + y:1 + y + cols]
        neighbour_values = padded_values[1 + x:1 + x + rows, 1 + y:1 + y + cols]
        fill = is_nodata & neighbour_valid
        output_data[fill] = neighbour_values[fill]

with rio.open(output_file + '_extended.geotiff', 'w', **profile) as dst:
    dst.write(output_data, 1)
    print('Created extended land mask at', output_file + '_extended.geotiff')

# Hourly Data

## 1) Downloading the 3-Hour NetCDF Data for 2014

In [None]:
# Folder that receives the 3-hourly NARR download; created on first run.
download_dir = './hourly/'
hourly_dir_present = os.path.exists(download_dir)
if not hourly_dir_present:
    os.makedirs(download_dir)

In [None]:
overwrite = False  # set to True to download again even if a local copy exists

name = 'apcp.2014.nc'
url = 'ftp://ftp.cdc.noaa.gov/Datasets/NARR/monolevel/'

# Download only when the file is missing or a refresh was requested.
local_copy = download_dir + name
if overwrite or not os.path.exists(local_copy):
    print('Downloading', local_copy)
    wget.download(url + name, local_copy)
else:
    print('File already exists, not downloading')

## 2) Converting NetCDF to GeoTIFF

**Note that this process will take a few minutes, as the NetCDF file has a lot of bands.**

In [None]:
# Locate the downloaded NetCDF; halt the cell if it is missing.
netcdf = glob.glob('./hourly/apcp.2014.nc')
if netcdf:
    print('Found NETCDF file')
else:
    print('NETCDF File not found')
    raise KeyboardInterrupt
netcdf = netcdf[0]

In [None]:
overwrite = False  # Overwrite existing GeoTIFF or not

proj = '+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +towgs84=1,1,-1,0,0,0,0 +units=m +no_defs'

# Warp settings: reproject to `proj` and emit a GeoTIFF
warp_options = gdal.WarpOptions(options='-t_srs \"' + proj + '\" -of GTiff')
subdataset = 'apcp'

outfile = './hourly/apcp.hourly.2014.geotiff'

# Skip the warp when the output is already on disk and no overwrite was requested.
if os.path.exists(outfile) and not overwrite:
    print('File already exists, skipped.')
else:
    warp_source = ':'.join(['NETCDF', netcdf, subdataset])
    ds = gdal.Warp(srcDSOrSrcDSTab=warp_source, destNameOrDestDS=outfile, options=warp_options)
    ds = None # Flush the file cache
    print('Warped file ./hourly/apcp.hourly.2014.geotiff')

## 3) Masking the Data

#### Select the Warped GeoTIFF and Land Mask

In [None]:
# Look up the warped precipitation GeoTIFF and the extended land mask.
netcdf = glob.glob('./hourly/apcp.hourly.2014.geotiff')
if netcdf:
    print('Found NETCDF file')
else:
    print('NETCDF File not found')

mask_file = glob.glob('./land_mask_extended.geotiff')
if mask_file:
    print('Found Mask file')
else:
    print('Mask file not found')

# Both inputs are required; halt the cell when either lookup came back empty.
if not (mask_file and netcdf):
    print('Error! One or more required files not found.')
    raise KeyboardInterrupt

netcdf, mask_file = netcdf[0], mask_file[0]

#### Mask Each Band

Note that this will take a while as there are 2920 bands!

We will use multiprocessing to speed up the masking process.

In [None]:
overwrite = False

# Destination folder for the per-band masked GeoTIFFs.
output_directory = './hourly/masked_geotiffs/'
masked_dir_present = os.path.exists(output_directory)
if not masked_dir_present:
    print('Making', output_directory)
    os.mkdir(output_directory)

#### Multiprocessing Function

In [None]:
def multithread_mask(band, netcdf, mask, overwrite=False, output_dir=None):
    """Write one masked band of a multi-band raster to its own GeoTIFF.

    The output is the product ``A*B`` computed by ``gdal_calc.Calc``, where
    A is band ``band`` of ``netcdf`` and B is ``mask``. It is written to
    ``<output_dir><band>.geotiff`` with a NoData value of -9999.0.

    Parameters
    ----------
    band : int
        Band number of ``netcdf`` to mask.
    netcdf : str
        Path of the multi-band raster to read.
    mask : str
        Path of the mask raster.
    overwrite : bool, optional
        When False (the default), an existing output file is left untouched.
    output_dir : str, optional
        Directory prefix for the output file, including the trailing
        separator. Defaults to the notebook-level ``output_directory``.

    Returns
    -------
    None
    """
    if output_dir is None:
        # Fall back to the notebook global, as the original implementation did.
        output_dir = output_directory
    outfile = output_dir + str(band) + '.geotiff'
    if overwrite or not os.path.exists(outfile):
        gdal_calc.Calc('A*B', A=netcdf, B=mask, A_band=band, outfile=outfile, format='GTiff', NoDataValue=-9999.0)
    return

In [None]:
# Count the bands, then release the dataset handle before starting the workers.
raster = gdal.Open(netcdf)
if raster is None:
    raise RuntimeError('Could not open ' + netcdf)
band_count = raster.RasterCount
raster = None

band_list = list(range(1, band_count + 1))

# The context manager shuts the worker processes down once map() has returned.
# Using a statement instead of a bare expression also keeps the cell from
# displaying one None per band.
with Pool() as pool:
    pool.map(partial(multithread_mask, netcdf=netcdf, mask=mask_file), band_list)

## 4) Extracting the Data

The shape file california_chicago.gpkg is a modified version of the SelectBG_all_land_BGID_final shapefile and contains only two block groups. It can be found on the [GitHub repository](https://github.com/ncsa/CPRHD_WNV_USA_SWMM/tree/master/jupyter_notebooks/SWMM_Precipitation) in this Jupyter Notebook's folder.

In [None]:
# Same shape file path that the extraction cell below reads.
shape_file = './modified_shapefile/california_chicago.gpkg'
geotiffs = glob.glob('./hourly/masked_geotiffs/*.geotiff')
# Fail when the shape file is missing OR the band count is wrong. Testing the
# truthiness of the path string could never detect a missing file.
if not os.path.exists(shape_file) or len(geotiffs) != 2920:
    print('Failed to find one or more files!')
    raise KeyboardInterrupt
else:
    print('Found all files successfully')

geotiffs.sort()

In [None]:
proj = '+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +towgs84=1,1,-1,0,0,0,0 +units=m +no_defs'
shape_file = './modified_shapefile/california_chicago.gpkg'
shape_frame = gpd.read_file(shape_file)
shape_frame['GEOID10'] = shape_frame['GEOID10'].astype(str)

if not geotiffs:
    raise FileNotFoundError('No masked GeoTIFFs available for zonal statistics')

output_columns = ['mean']

# Collect one column of zonal means per GeoTIFF and build the frame once at the
# end. Joining thousands of single-column frames one by one is quadratic.
band_means = {}
for file in geotiffs:
    # The column name is the file name without directory or extension (the band number).
    name = os.path.splitext(os.path.basename(file))[0]
    stats = rs.zonal_stats(shape_frame, file, stats=output_columns, all_touched=True)
    band_means[name] = [zone['mean'] for zone in stats]

means = pd.DataFrame(band_means)
# Attach the identifying columns by index and put them first.
frame = means.join(shape_frame[['STATE', 'GEOID10']])
frame = frame[['STATE', 'GEOID10'] + list(means.columns)]

frame.to_pickle('./hourly/hourly_data.pkl')

## 5) Unit Conversion and Sorting

We will convert NARR's units (mm) to the units we use in SWMM (inches).

We will also sort the columns in increasing order (1, 2, 3, ..., 2919, 2920).

In [None]:
# Reload the zonal means written by the extraction step.
hourly_pickle = './hourly/hourly_data.pkl'
frame = pd.read_pickle(hourly_pickle)

In [None]:
# The first row is treated as California and the second as Chicago.
california, chicago = frame.iloc[0], frame.iloc[1]

# Skip the first two entries (STATE, GEOID10) and convert the rest from mm to inches
chicago = chicago.iloc[2:].apply(lambda mm: mm / 25.40)
california = california.iloc[2:].apply(lambda mm: mm / 25.40)

# Band labels are strings; cast them to int so the sort is numeric
chicago.index = chicago.index.astype(int)
chicago = chicago.sort_index().rename('chicago')

california.index = california.index.astype(int)
california = california.sort_index().rename('california')

#### Save the Data

In [None]:
# Folder for the finished hourly series; created on first run.
output_directory = './hourly/precipitation_data/'
precip_dir_present = os.path.exists(output_directory)
if not precip_dir_present:
    print('Making', output_directory)
    os.mkdir(output_directory)

# Store each series as its own pickle.
for series, file_name in ((chicago, 'chicago.pkl'), (california, 'california.pkl')):
    series.to_pickle(output_directory + file_name)

# Daily Data Processing

We will repeat the above process with the daily NetCDF for 2014.

## 1) Download Data

In [None]:
# Folder that receives the daily NARR download; created on first run.
download_dir = './daily/'
daily_dir_present = os.path.exists(download_dir)
if not daily_dir_present:
    print('Making', download_dir)
    os.mkdir(download_dir)

In [None]:
download_url = 'ftp://ftp.cdc.noaa.gov/Datasets/NARR/Dailies/monolevel/'
name = 'apcp.2014.nc'

# Fetch the daily NetCDF only when no local copy exists.
local_copy = download_dir + name
already_downloaded = os.path.exists(local_copy)
if not already_downloaded:
    print('NetCDF file not found, attempting download')
    wget.download(download_url + name, local_copy)
    print('Download completed')

## 2) Convert the NetCDF to GeoTIFF

In [None]:
# Locate the downloaded daily NetCDF; halt the cell if it is missing.
netcdf = glob.glob('./daily/apcp.2014.nc')
if netcdf:
    print('Found NETCDF file')
else:
    print('NETCDF File not found')
    raise KeyboardInterrupt
netcdf = netcdf[0]

In [None]:
overwrite = False  # Overwrite existing GeoTIFF or not

proj = '+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +towgs84=1,1,-1,0,0,0,0 +units=m +no_defs'

# The warp options specify our target projection and output type (GeoTIFF)
warp_options = gdal.WarpOptions(options='-t_srs \"' + proj + '\" -of GTiff')
subdataset = 'apcp'
# Warp only when the output is missing or an overwrite was requested.
if not os.path.exists('./daily/apcp.daily.2014.geotiff') or overwrite:
    ds = gdal.Warp(srcDSOrSrcDSTab='NETCDF:' + netcdf + ':' + subdataset, destNameOrDestDS='./daily/apcp.daily.2014.geotiff', options=warp_options)
    ds = None # Flush the file cache
    # The message now names the file that was actually written (was "daliy").
    print('Warped file ./daily/apcp.daily.2014.geotiff')

## 3) Masking each GeoTIFF

In [None]:
# The warp step writes the daily GeoTIFF into ./daily/, so look for it there.
netcdf = glob.glob('./daily/apcp.daily.2014.geotiff')
print('Found NETCDF file' if netcdf else 'NETCDF File not found')
if not netcdf:
    raise KeyboardInterrupt
netcdf = netcdf[0]

# The extended land mask is shared with the hourly workflow.
mask_file = glob.glob('./land_mask_extended.geotiff')
print('Found mask file' if mask_file else 'Mask file not found')
if not mask_file:
    raise KeyboardInterrupt
mask_file = mask_file[0]

In [None]:
overwrite = False

# Destination folder for the per-band masked GeoTIFFs.
output_directory = './daily/masked_geotiffs/'
daily_masked_dir_present = os.path.exists(output_directory)
if not daily_masked_dir_present:
    print('Making', output_directory)
    os.mkdir(output_directory)

raster = gdal.Open(netcdf)
band_count = raster.RasterCount
print(netcdf)

# Multiply every band by the land mask, skipping outputs that already exist.
for band in range(1, band_count + 1):
    outfile = output_directory + str(band) + '.geotiff'
    if os.path.exists(outfile) and not overwrite:
        continue
    gdal_calc.Calc('A*B', A=netcdf, B=mask_file, A_band=band, outfile=outfile, format='GTiff', NoDataValue=-9999.0)

## 4) Zonal Stats Data Extraction

In [None]:
# The masking step writes into ./daily/masked_geotiffs/, so search that folder.
geotiffs = glob.glob('./daily/masked_geotiffs/*.geotiff')
if len(geotiffs) != 365:
    print('Failed to find all GeoTIFFs.')
    raise KeyboardInterrupt
else:
    print('Found all GeoTIFF files')

# glob returns files in arbitrary order; sort for a deterministic sequence.
geotiffs.sort()

In [None]:
# Make sure the block-group shape file is available before extracting.
shape_file = './modified_shapefile/california_chicago.gpkg'
if os.path.exists(shape_file):
    print('Found shape file')
else:
    print('Failed to find shape file')
    raise KeyboardInterrupt

In [None]:
# Folder for the finished daily series; created on first run.
output_directory = './daily/precipitation_data/'
daily_precip_dir_present = os.path.exists(output_directory)
if not daily_precip_dir_present:
    print('Making', output_directory)
    os.mkdir(output_directory)

In [None]:
shape_frame = gpd.read_file(shape_file)
shape_frame['GEOID10'] = shape_frame['GEOID10'].astype(str)

if not geotiffs:
    raise FileNotFoundError('No masked GeoTIFFs available for zonal statistics')

output_columns = ['mean']

# Collect one column of zonal means per GeoTIFF and build the frame once at the
# end. Joining hundreds of single-column frames one by one is quadratic.
band_means = {}
for file in geotiffs:
    # The column name is the file name without directory or extension (the band number).
    name = os.path.splitext(os.path.basename(file))[0]
    stats = rs.zonal_stats(shape_frame, file, stats=output_columns, all_touched=True)
    band_means[name] = [zone['mean'] for zone in stats]

means = pd.DataFrame(band_means)
# Attach the identifying columns by index and put them first.
frame = means.join(shape_frame[['STATE', 'GEOID10']])
frame = frame[['STATE', 'GEOID10'] + list(means.columns)]
frame.to_pickle('./daily/daily_data.pkl')

## 5) Unit Conversion and Sorting

In [None]:
daily_data = pd.read_pickle('./daily/daily_data.pkl')

# The first row is treated as California and the second as Chicago.
california, chicago = daily_data.iloc[0], daily_data.iloc[1]

# Skip the first two entries (STATE, GEOID10) and convert the rest from mm to inches
chicago = chicago.iloc[2:].apply(lambda mm: mm / 25.40)
california = california.iloc[2:].apply(lambda mm: mm / 25.40)

# Band labels are strings; cast them to int so the sort is numeric
chicago.index = chicago.index.astype(int)
chicago = chicago.sort_index().rename('chicago')

california.index = california.index.astype(int)
california = california.sort_index().rename('california')

In [None]:
# Make sure the daily output folder exists before writing.
output_directory = './daily/precipitation_data/'
daily_save_dir_present = os.path.exists(output_directory)
if not daily_save_dir_present:
    print('Making', output_directory)
    os.mkdir(output_directory)

# Store each series as its own pickle.
for series, file_name in ((chicago, 'chicago.pkl'), (california, 'california.pkl')):
    series.to_pickle(output_directory + file_name)

Now that the data has been processed, we can analyze it and use it to run simulations. This can be found in the Data_Analysis notebook.