# Pull, Process, and Prepare Data

In [3]:
# import necessary packages

import requests 
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime
from scipy import stats
import os
import intake

## Step 0: Set Up Dask Resources

In [None]:
!pip install dask-cloudprovider==2021.9.0

In [None]:
from dask.distributed import Client
from dask_cloudprovider.aws import FargateCluster

In [None]:
# Build the AWS settings this cell depends on right here, so it runs on a
# fresh kernel (Restart & Run All). Previously `env` was only defined in a
# later cell, which made this cell fail with NameError unless cells were run
# out of order. Repeating these statements elsewhere is harmless.
# A missing credential variable raises KeyError here, before any cluster
# resources are requested.
os.environ['AWS_DEFAULT_REGION'] = 'us-west-1'
env = {k: os.environ[k] for k in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')}

# Start a Dask cluster on AWS Fargate with 100 workers running the pinned
# Pangeo notebook image; `env` is handed over through `environment=`.
cluster = FargateCluster(n_workers=100, image='pangeo/pangeo-notebook:2021.10.19',
                         environment=env, scheduler_timeout='10 minutes')
# Connect this notebook session to the cluster's scheduler.
client = Client(cluster)

# Print the URL of the Dask dashboard for monitoring the cluster.
print(cluster.dashboard_link)

In [None]:
# Teardown: left commented out so that running every cell top to bottom does
# not close the client and cluster before the later steps use them.
# Uncomment and run once all computation is finished.
# client.close()
# cluster.close()

## Step 1: Pull Raw Data from AWS

In [None]:
# Set the default AWS region for this process.
os.environ['AWS_DEFAULT_REGION'] = 'us-west-1'

# Copy the AWS credentials out of the process environment into a plain dict.
# Indexing os.environ directly raises KeyError if either variable is unset.
# The Step 0 cluster cell passes this dict as `environment=`.
env = {
    name: os.environ[name]
    for name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
}

In [4]:
# Open the intake catalog stored on S3 (needs AWS credentials to be
# discoverable by the S3 client).
cat = intake.open_catalog('s3://cdcat/cae.yaml')

# Iterating the catalog yields its entry names; print them as a list.
print([entry_name for entry_name in cat])

NoCredentialsError: Unable to locate credentials

In [None]:
# Look up one catalog entry by name (judging by the name: WRF downscaling of
# CESM2, SSP5-8.5, 9 km grid — confirm against the catalog metadata) and
# open it through intake's to_dask(), which presumably returns a lazily
# loaded, Dask-backed dataset.
catalog_entry = cat['wrf_cesm2_ssp585_9km']
ds = catalog_entry.to_dask()

In [None]:
# Select the 'T2' variable from the dataset (presumably WRF 2 m air
# temperature — TODO confirm against the dataset's attributes).
da = ds['T2']
# Trailing expression: display the array's summary as the cell output.
da

## Step 2: Process Raw Data and Extract the Annual Maximum Series (AMS)

In [None]:
# Block maxima: group the time axis into yearly bins ("A" is the pandas
# year-end frequency alias) and keep the maximum of each bin.
# keep_attrs=True carries the variable's attributes over to the result.
yearly_bins = da.resample(time="A")
ams = yearly_bins.max(keep_attrs=True)

In [None]:
# Evaluate the annual-maximum reduction and rebind `ams` to the computed
# result. Presumably the work runs on the Dask cluster from Step 0 when its
# client is connected — confirm. This is the expensive step of the notebook.
ams = ams.compute()
# Trailing expression: display the computed array as the cell output.
ams

In [None]:
# Write the computed annual maxima to a NetCDF file on local disk.
ams_path = './data/processed/9km/wrf_cesm2_ssp585_9km.nc'
# to_netcdf does not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
os.makedirs(os.path.dirname(ams_path), exist_ok=True)
ams.to_netcdf(ams_path)

## Step 3: Prepare Processed Data for Analysis

In [None]:
# Record on the in-memory array how the extreme-value series was derived.
# Keys and values are written in the same order as before; existing
# attributes with other names are left untouched.
ams.attrs.update({
    "extreme value extraction method": "block maxima",
    "extremes type": "maxima",
    "block size": "1 year",
    "timeseries type": "annual max series",
})

In [6]:
# subset data into appropriate periods