# Preprocesses NCEP-NCAR-R1
The preprocessing in this notebook is based on the workflow in [0a_read_reanalysis.ipynb](https://github.com/fdavenport/GRL2021/blob/main/notebooks/0a_read_reanalysis.ipynb) from Davenport and Diffenbaugh (2021).
<br><br>
**Preprocessing steps**: 
1) Clip to study region
2) Compute area average
3) [HGT only] Detrend the data
4) Compute daily standardized anomalies
5) Convert to dataframe 

In [None]:
import xarray as xr 
import numpy as np 
import pandas as pd
from glob import glob
import sys 
from datetime import datetime
import boto3
import s3fs

# Import helper functions 
sys.path.insert(0, '../utils')
from preprocessing_utils import (
    get_state_geom,
    convert_lon_360_to_180, 
    clip_to_geom, 
    calc_anomalies, 
) 
from misc_utils import format_nbytes
import parameters as param

## Get boundary geometry 
The state boundary geometry is used to clip the gridded data to the study region.

In [None]:
# Name of the state that defines the study region
state = "Colorado"
# Boundary geometry for that state; passed to clip_to_geom() in the cells below
geom = get_state_geom(state=state)

## Sea Level Pressure data 

In [None]:
# Locate every daily-mean file for this variable
var = "slp"  # Variable name
filepaths_wildcard = "../data/{0}_daily_means/{1}*.nc".format(var, var)
filepaths_all = glob(filepaths_wildcard)

# Open all files as one dataset and restrict it to the study time period
ds = xr.open_mfdataset(filepaths_all)
ds = ds.sel(time=param.time_period)

# Keep the source file's global attributes so they can be re-attached to the output
global_attrs = ds.attrs

# The "nbnds" dimension (and any variables that use it) is not needed downstream
ds = ds.drop_dims("nbnds")

# Convert lon range from 0:360 to -180:180, then clip to the boundary geometry
ds = clip_to_geom(convert_lon_360_to_180(ds), geom)

# Collapse the clipped grid to a single regional-mean time series
ds = ds.mean(dim=["lat", "lon"])

# Calculate daily standardized anomalies of the regional mean
ds = calc_anomalies(ds, var)

Format the output data

In [None]:
# Pull out the anomaly variable and replace its attributes with descriptive ones
# NOTE(review): the notebook describes these as standardized anomalies, which
# would be dimensionless -- confirm that "Pa" is the intended unit here.
slp_output_da = ds[var + "_anom"]
slp_output_da.attrs = dict(
    long_name="mean daily sea level pressure anomalies",
    units="Pa",
)

## Geopotential height at 500 hPa

In [None]:
# Open dataset and restrict it to the study time period
var = "hgt"  # Variable name
filepaths_wildcard = "../data/{0}_daily_means/{1}*.nc".format(var, var)
filepaths_all = glob(filepaths_wildcard)
ds = xr.open_mfdataset(filepaths_all).sel(time=param.time_period)

# Keep the source file's global attributes so they can be re-attached to the output
global_attrs = ds.attrs

# Clean it up a bit: drop the unused "nbnds" dimension and keep a single
# pressure level. The time period was already selected when opening the data,
# so it is not selected a second time here.
level = 500
ds = ds.drop_dims("nbnds")
ds = ds.sel(level=level).drop_vars("level")

# Convert lon range from 0:360 to -180:180
ds = convert_lon_360_to_180(ds)

# Clip to geometry
ds = clip_to_geom(ds, geom)

# Average over entire region
ds = ds.mean(dim=["lat", "lon"])

# Calculate annual domain average 500-hPa GPH to remove seasonal variability
domain_mean_df = ds[var].groupby("time.year").mean(dim="time").to_dataframe(name=var)

# Calculate linear trend in 500-hPa GPH
# (degree-1 fit: trend[0] is the slope, trend[1] the intercept)
trend = np.polyfit(domain_mean_df.index.get_level_values("year"), domain_mean_df[var], 1)
print("Slope of trend:", trend[0], "m per year")

# Calculate detrended hgt: subtract the fitted change accumulated since the
# start year (the first four characters of param.time_start)
years_elapsed = ds.time.dt.year - int(param.time_start[:4])
ds[var + "_detrended"] = ds[var] - years_elapsed * trend[0]

# Calculate daily standardized anomalies of the detrended series
ds = calc_anomalies(ds, var + "_detrended")

Format the output data

In [None]:
# Pull out the detrended anomaly variable and replace its attributes with
# descriptive ones, recording the pressure level that was selected
hgt_output_da = ds[var + "_detrended_anom"]
hgt_output_da.attrs = dict(
    long_name="mean detrended daily geopotential height anomalies",
    units="m",
    level=level,
)

## Combine datasets and write to csv

In [None]:
# Combine the two anomaly series into a single dataset
output_ds = xr.merge([hgt_output_da, slp_output_da])

# Re-attach the source global attributes, extending the title and history to
# record the processing done here.
# NOTE(review): global_attrs is reassigned in each variable's cell, so these
# are the attributes of whichever dataset was opened last.
output_ds.attrs = {
    **global_attrs,
    "title": global_attrs["title"] + " modified to produce daily anomalies",
    "history": global_attrs["history"]
    + "\nDaily detrended anomalies produced "
    + datetime.today().strftime('%Y/%m/%d'),
}

# Show the merged dataset in the notebook
display(output_ds)

# Report the in-memory size of the merged dataset
nbytes = format_nbytes(output_ds.nbytes)
print("Size of output dataset: {0}".format(nbytes))

In [None]:
# Split into training-validation-testing
training = output_ds.sel(time=param.training_period)
validation = output_ds.sel(time=param.validation_period)
testing = output_ds.sel(time=param.testing_period)

# Convert to pandas 
training_df = training.to_dataframe()
validation_df = validation.to_dataframe()
testing_df = testing.to_dataframe()

In [None]:
# Output as csv
data_dir = "../data/input_data_preprocessed/"
training_df.to_csv(data_dir+"training/training_features.csv") 
validation_df.to_csv(data_dir+"validation/validation_features.csv") 
testing_df.to_csv(data_dir+"testing/testing_features.csv") 