# Convolutional Neural Network: Preprocess NCEP-NCAR-R1 Reanalysis
The preprocessing in this notebook is based on the workflow in [0a_read_reanalysis.ipynb](https://github.com/fdavenport/GRL2021/blob/main/notebooks/0a_read_reanalysis.ipynb) from Davenport and Diffenbaugh (2021).
<br><br>
**Preprocessing steps**: 
1) Clip to study region
2) [HGT only] Detrend the data
3) Compute daily standardized anomalies
4) Merge the variables, split into training/validation/testing sets, and write to netCDF

In [1]:
import sys
from datetime import datetime
from glob import glob
from pathlib import Path

import boto3
import numpy as np
import pandas as pd
import s3fs
import xarray as xr

# Import helper functions 
sys.path.insert(0, '../../utils')
from preprocessing_utils import (
    get_features_geom,
    convert_lon_360_to_180, 
    clip_to_geom, 
    calc_anomalies, 
) 
from misc_utils import format_nbytes
import parameters as param

## Get boundary geometry 
The geometry is used to clip the data to the study region.

In [2]:
# Boundary geometry; passed to clip_to_geom() in the cells below
geom = get_features_geom()

## Sea Level Pressure data 

In [3]:
# Locate the daily-mean files for this variable
var = "slp" # Variable name
filepaths_wildcard = "../../data/{0}_daily_means/{1}*.nc".format(var, var)
filepaths_all = glob(filepaths_wildcard)

# Open all files as a single dataset and keep only the configured time period
ds = xr.open_mfdataset(filepaths_all)
ds = ds.sel(time=param.time_period)
global_attrs = ds.attrs # Global attributes of the opened dataset

# Preprocessing chain, applied in order:
#   1. drop the "nbnds" dimension
#   2. convert the lon range from 0:360 to -180:180
#   3. clip to the boundary geometry
#   4. calculate daily standardized anomalies
ds = (
    ds.drop_dims("nbnds")
    .pipe(convert_lon_360_to_180)
    .pipe(clip_to_geom, geom)
    .pipe(calc_anomalies, var)
)

  return self.array[key]
  return self.array[key]


Format the output data

In [4]:
# Pull out the anomaly field and attach descriptive metadata
# NOTE(review): if calc_anomalies divides by the standard deviation, the
# anomalies are dimensionless and "Pa" would be misleading -- confirm.
slp_output_da = ds["{0}_anom".format(var)]
slp_output_da.attrs = dict(
    long_name="mean daily sea level pressure anomalies",
    units="Pa",
)

## Geopotential Heights at 500 hPa

In [5]:
# Open dataset, restricted to the configured time period
var = "hgt"
filepaths_wildcard = "../../data/{0}_daily_means/{1}*.nc".format(var,var)
filepaths_all = glob(filepaths_wildcard)
ds = xr.open_mfdataset(filepaths_all).sel(time=param.time_period)
global_attrs = ds.attrs

# Clean it up a bit: keep only the 500 hPa level and drop what is not needed.
# The time period was already selected when the dataset was opened, so it is
# not selected a second time here.
level = 500
ds = ds.drop_dims("nbnds")
# drop_vars replaces the deprecated Dataset.drop and matches its use further down
ds = ds.sel(level=level).drop_vars("level")

# Convert lon range from 0:360 to -180:180 
ds = convert_lon_360_to_180(ds)

# Clip to geometry 
ds = clip_to_geom(ds, geom)

# Annual mean 500-hPa GPH at every grid cell (averaged over time only, which
# removes seasonal variability), flattened to a dataframe indexed by year and
# the spatial dimensions
domain_mean_df = ds[var].groupby('time.year').mean(dim = "time").to_dataframe(name = var)

# Calculate linear trend in 500-hPa GPH.
# A single line is fitted through all grid-cell annual means. Every year
# contributes the same set of grid cells, so (provided there are no missing
# values) the slope equals the trend of the unweighted domain average.
trend = np.polyfit(domain_mean_df.index.get_level_values('year'), domain_mean_df[var], 1)
print("Slope of trend:", trend[0], "m per year")

# Calculate detrended hgt: subtract the fitted trend accumulated since the
# first year of the record (constant within each calendar year).
# Assumes param.time_start is a string beginning with a four-digit year --
# confirm in the parameters module.
ds['change'] = (ds.time.dt.year - int(param.time_start[:4]))*trend[0]
ds[var+'_detrended'] = ds[var] - ds['change']
ds = ds.drop_vars('change')  # Temporary helper variable, not part of the output

# Calculate daily standardized anomalies
ds = calc_anomalies(ds, var+'_detrended') 

Slope of trend: 0.4554078383880419 m per year


  return self.array[key]
  return self.array[key]


Format the data

In [6]:
# Pull out the detrended anomaly field and attach descriptive metadata
# NOTE(review): if calc_anomalies divides by the standard deviation, the
# anomalies are dimensionless and "m" would be misleading -- confirm.
hgt_output_da = ds["{0}_detrended_anom".format(var)]
hgt_output_da.attrs = dict(
    long_name="mean detrended daily geopotential height anomalies",
    units="m",
    level=level,
)

## Combine datasets and write to netcdf

In [7]:
# Combine the two anomaly fields into one dataset
output_ds = xr.merge([hgt_output_da, slp_output_da])

# Carry over the source attributes, amending the title and history to record
# this processing step. global_attrs is the dict captured in the most recently
# run "Open dataset" cell (the hgt files when the notebook is run top to bottom).
processing_date = datetime.today().strftime('%Y/%m/%d')
output_ds.attrs = {
    **global_attrs,
    "title": global_attrs["title"] + " modified to produce daily anomalies",
    "history": global_attrs["history"] + "\nDaily detrended anomalies produced " + processing_date,
}

# Rich display of the merged dataset
display(output_ds)

# Report the in-memory size of the merged dataset
nbytes = format_nbytes(output_ds.nbytes)
print("Size of output dataset: {0}".format(nbytes))

Unnamed: 0,Array,Chunk
Bytes,60.63 MiB,3.98 kiB
Shape,"(15583, 15, 34)","(1, 15, 34)"
Dask graph,15583 chunks in 3761 graph layers,15583 chunks in 3761 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 60.63 MiB 3.98 kiB Shape (15583, 15, 34) (1, 15, 34) Dask graph 15583 chunks in 3761 graph layers Data type float64 numpy.ndarray",34  15  15583,

Unnamed: 0,Array,Chunk
Bytes,60.63 MiB,3.98 kiB
Shape,"(15583, 15, 34)","(1, 15, 34)"
Dask graph,15583 chunks in 3761 graph layers,15583 chunks in 3761 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.32 MiB,1.99 kiB
Shape,"(15583, 15, 34)","(1, 15, 34)"
Dask graph,15583 chunks in 3759 graph layers,15583 chunks in 3759 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 30.32 MiB 1.99 kiB Shape (15583, 15, 34) (1, 15, 34) Dask graph 15583 chunks in 3759 graph layers Data type float32 numpy.ndarray",34  15  15583,

Unnamed: 0,Array,Chunk
Bytes,30.32 MiB,1.99 kiB
Shape,"(15583, 15, 34)","(1, 15, 34)"
Dask graph,15583 chunks in 3759 graph layers,15583 chunks in 3759 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


Size of output dataset: 91.07 MB


In [9]:
# Split into training-validation-testing by selecting each configured time period
training, validation, testing = (
    output_ds.sel(time=period)
    for period in (
        param.training_period,
        param.validation_period,
        param.testing_period,
    )
)

In [10]:
# Output to netcdf: one file per split, written to <data_dir>/<split>/<split>_features.nc
data_dir = "../../data/input_data_preprocessed/"

for split_name, split_ds in (
    ("training", training),
    ("validation", validation),
    ("testing", testing),
):
    out_path = data_dir + "{0}/{0}_features.nc".format(split_name)
    # to_netcdf does not create missing directories; make sure the target
    # folder exists so the notebook also runs on a fresh checkout
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    split_ds.to_netcdf(out_path)