# Preprocess IMERG precipitation data
1) Clip to Colorado (CO)
2) Compute the average over the region
3) Assign each timestep to a class: extreme precipitation (above the 95th percentile) or not

In [None]:
import xarray as xr 
from glob import glob
import sys 
from datetime import datetime
import boto3
import s3fs

# Import helper functions 
sys.path.insert(0, '../utils')
from preprocessing_utils import (
    get_state_geom, 
    convert_lon_360_to_180, 
    clip_to_geom, 
    calc_anomalies
) 
import parameters as param

In [None]:
# # Get all the filepaths for the specified time range 
# filepaths_all = []
# for year in range(int(param.time_start), int(param.time_end)): 
#     filepaths_wildcard = "../data/precip_daily/*3IMERG.{0}*.nc4".format(year)
#     filepaths_all += glob(filepaths_wildcard)
# ds = xr.open_mfdataset(filepaths_all).sel(time=param.time_period)

In [None]:
# Load the daily IMERG precipitation files and reduce them to a single
# state-wide mean time series.
var = "precipitation"
state = "Colorado"

# Collect every IMERG file in the data directory, open them as one dataset,
# and restrict it to the configured time period
filepaths_wildcard = "../data/precip_daily/*3IMERG*.nc4"
filepaths_all = glob(filepaths_wildcard)
ds = xr.open_mfdataset(filepaths_all)
ds = ds.sel(time=param.time_period)

# Hold on to the source metadata before narrowing to a single variable
global_attrs = ds.attrs
var_attrs = ds[var].attrs

# Keep only the precipitation variable, convert the time axis to the
# standard calendar (the original note says the raw files are Julian), and
# order the dimensions to match the reanalysis data
ds = (
    ds[var]
    .to_dataset()
    .convert_calendar("standard")
    .transpose("time", "lat", "lon")
)

# Clip to the state boundary, then average over the clipped region
geom = get_state_geom(state)
ds = clip_to_geom(ds, geom).mean(dim=["lat", "lon"])

In [None]:
# Read data into memory
# `compute()` returns a new, evaluated dataset; rebinding `ds` means the
# cells below operate on in-memory values instead of the lazily opened files.
ds = ds.compute() 

In [None]:
# Compute the 95th percentile of the precipitation series as a plain float
perc_95 = ds[var].quantile(0.95).item()
print(f"95th percentile precip over {state}: {perc_95}")

# Label each timestep: 1 where precipitation is strictly greater than the
# 95th percentile threshold, 0 otherwise
extremes_var = "precip_classes"
ds[extremes_var] = xr.where(ds[var] > perc_95, 1, 0)

# Metadata stored on the class variable (spelling of "exceeds" corrected)
ds[extremes_var].attrs = {
    "description": "95th percentile precipitation",
    "classes": "Class 0: precipitation below threshold \nClass 1: precipitation exceeds threshold",
}

In [None]:
# Build the output dataset: the class labels only, carrying the source
# files' global attributes plus notes describing how the data were modified
output_da = ds[extremes_var]
output_ds = output_da.to_dataset()
output_ds.attrs = global_attrs

# Date stamp appended to the history attribute
run_date = datetime.today().strftime('%Y/%m/%d')

output_ds.attrs["region"] = (
    "Data has been spatially averaged across the state of " + state
)
output_ds.attrs["title"] = (
    global_attrs["title"] + " modified to compute extreme precip classes"
)
output_ds.attrs["history"] = (
    global_attrs["history"] + "\nExtreme precip classes produced " + run_date
)

In [None]:
# Take a gander at the data
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in, so this cell expects to run inside a notebook.
display(output_ds)

In [None]:
# Turn the dataset into a pandas table, then move the index into ordinary
# columns so it is written out alongside the class labels
output_df = output_ds.to_dataframe()
output_df = output_df.reset_index()

# Preview the first few rows
output_df.head()

In [None]:
# Write the class table to the preprocessed-data directory as a CSV,
# leaving out the DataFrame's integer row index
filename = "precip_classes.csv"
output_path = f"../data/input_data_preprocessed/{filename}"
output_df.to_csv(output_path, index=False)

In [None]:
# # Confirm that you're connected to the right S3 bucket
# s3 = boto3.resource(service_name='s3')
# for bucket in s3.buckets.all():
#     # What is printed here should match the variable "bucket" below
#     print("Bucket in S3: " + bucket.name)

# # S3 paths and such 
# bucket = "ml-extreme-precip" # Name of bucket 
# folder = "IMERG" # Name of folder in bucket
# s3_path = "s3://{0}/{1}/".format(bucket, folder) 

# # Name to give file 
# # DO NOT include file extension (this will be .zarr)
# filename = output_da.name

# # Path to zarr store in AWS bucket
# filepath_zarr = "{}{}.zarr/".format(s3_path, filename)
# print("zarr store will be written to path: {}".format(filepath_zarr))

In [None]:
# # Write zarr to bucket 

# # Initialize the S3 file system
# s3 = s3fs.S3FileSystem()
# store = s3fs.S3Map(root=filepath_zarr, s3=s3, check=False)

# # Save to zarr
# output_ds.to_zarr(
#     store=store, 
#     consolidated=True, 
#     mode="w" # Overwrite any existing files 
# )

In [None]:
# # Now try opening the file from AWS! :D 
# xr.open_zarr(filepath_zarr)