# Random Forest: Preprocess CHIRPS precipitation data 
1) Clip to the state of Colorado (CO)
2) Compute the spatial average over the region
3) Assign each timestep to a class -- extreme precip (regional mean above the 95th percentile) or not

In [None]:
import xarray as xr 
import numpy as np 
import pandas as pd
from glob import glob
import sys 
from datetime import datetime
import boto3
import s3fs

# Import helper functions 
sys.path.insert(0, '../../utils')
from preprocessing_utils import (
    get_state_geom,
    convert_lon_360_to_180, 
    clip_to_geom, 
    calc_anomalies, 
) 
from misc_utils import format_nbytes
import parameters as param

In [None]:
# Region of interest; the name is also used in the printed summary and in the output metadata
state = "Colorado"
# Geometry used to clip the gridded data -- presumably the state boundary; confirm in preprocessing_utils
geom = get_state_geom(state=state)

In [None]:
# Name of the precipitation variable inside the CHIRPS files
var = "precip"

# Gather every CHIRPS file that matches the naming pattern
filepaths_wildcard = "../data/chirps_precip/*chirps*.days_p25.nc"
filepaths_all = glob(filepaths_wildcard)

# Open all files as a single dataset, then keep only the configured time period
ds = xr.open_mfdataset(filepaths_all)
ds = ds.sel(time=param.time_period)

# Hold on to the source metadata so it can be carried over to the output
global_attrs, var_attrs = ds.attrs, ds[var].attrs

# Restrict the grid to the region geometry
ds = clip_to_geom(ds, geom, lon_name="longitude", lat_name="latitude")

In [None]:
# Collapse both spatial dimensions to get one regional-mean value per timestep,
# and read the result into memory right away.
# NOTE(review): this is an unweighted mean over grid cells (no area weighting) -- confirm that is intended
ds_mean = ds.mean(dim=["latitude","longitude"]).compute()

# Threshold for "extreme": 95th percentile of the regional-mean series, as a plain scalar
perc_95 = ds_mean[var].quantile(q=0.95).item()
print("95th percentile precip over {0}: {1}".format(state, perc_95))

In [None]:
# Assign classes based on exceedance of 95th percentile:
# class 1 where the regional-mean precip is strictly greater than the threshold, class 0 otherwise.
# NOTE(review): a NaN regional mean compares False and would silently be labelled 0 --
# confirm there are no missing timesteps in the input.
extremes_var = "precip_classes"
ds[extremes_var] = xr.where(ds_mean[var] > perc_95, 1, 0)

# Describe the labels in the variable metadata (typo "exeeds" corrected to "exceeds")
ds[extremes_var].attrs = {
    "description":"95th percentile precipitation",
    "classes": "Class 0: precipitation below threshold \nClass 1: precipitation exceeds threshold",
    "95th percentile":"{} mm/day".format(round(perc_95,3)),
}

In [None]:
# Build the output dataset: it holds only the class labels
output_ds = ds[extremes_var].to_dataset()

# Start from the source file's global metadata, then record how the data were
# modified (spatial averaging, label creation and the date it was done)
output_ds.attrs = {
    **global_attrs,
    "region": "Data has been spatially averaged across the state of "+state,
    "title": global_attrs["title"] + " modified to compute extreme precip classes",
    "history": global_attrs["history"] + "\nExtreme precip classes produced " + datetime.today().strftime('%Y/%m/%d'),
}

In [None]:
# Inspect the output dataset (variables and attributes) in the notebook before exporting it
display(output_ds)

In [None]:
# Sanity check: the labels are 0/1, so their mean is the fraction of timesteps in class 1.
# Mean value should be close to 0.05, because class 1 means exceeding the 95th percentile
mean_val = output_ds[extremes_var].mean().item()
print(mean_val)

In [None]:
# Convert to dataframe 
output_df = output_ds[extremes_var].to_dataframe()
output_df.head()

In [None]:
# Split into training-validation-testing
training = output_df.loc[param.training_time_start:param.training_time_end]
validation = output_df.loc[param.validation_time_start:param.validation_time_end]
testing = output_df.loc[param.testing_time_start:param.testing_time_end]

In [None]:
# Output as csv: one labels file per subset, each in its own subdirectory
import os

data_dir = "../data/input_data_preprocessed/"
for split_name, split_df in (
    ("training", training),
    ("validation", validation),
    ("testing", testing),
):
    split_dir = data_dir + split_name + "/"
    # to_csv fails if the target directory is missing, so create it first
    os.makedirs(split_dir, exist_ok=True)
    split_df.to_csv(split_dir + split_name + "_labels.csv")