# [SC57 - Working with big, multi-dimensional geoscientific datasets in Python: a tutorial introduction to xarray](http://meetingorganizer.copernicus.org/EGU2017/session/25651)  
  
  
Original notebook by [Stephan Hoyer](http://stephanhoyer.com), Rossbypalooza, 2016.  

Edits by Edward Byers, Matthew Gidden and [Fabien Maussion](http://fabienmaussion.info/) for EGU General Assembly 2017, Vienna, Austria,  [Dr Chelle Gentemann](mailto:gentemann@esr.org), Earth and Space Research, USA and [Dr Marisol Garcia-Reyes](mailto:marisolgr@faralloninstitute.org) (multiple events) with help from @lewismc, [B.Storer](https://github.com/bastorer), and [M.Feen](https://github.com/melaniefeen).

# Structure of this tutorial

1. Opening data
1. Collocating satellite data with a cruise dataset



# 1. Key features of `xarray`

-------------------

## Import python packages

You are going to want numpy, pandas, matplotlib.pyplot and xarray

In [None]:
import warnings
warnings.simplefilter('ignore') # filter some warning messages

import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import cmocean
from pyproj import Proj
import cartopy.crs as ccrs


import xarray as xr


# podaacpy provides the search capabilities used below
import podaac.podaac as podaac
import podaac.podaac_utils as putil
# then create an instance of the Podaac class (used later for the granule search)
p = podaac.Podaac()

## A nice cartopy tutorial is [here](http://earthpy.org/tag/visualization.html)

# Collocate a Saildrone cruise with Satellite Observations

### The Saildrone cruise is 2 months long.

`xarray` can open multiple files at once using string pattern matching.  
  
  In this case we open all the files that match our `filestr`, i.e. all the files for the 2080s. 
  
  Each of these files (compressed) is approximately 800 MB.

# Read in the Saildrone data


In [None]:
# OPeNDAP address of the Saildrone Baja 2018 cruise file (1-minute data, per the file name)
url = 'https://podaac-opendap.jpl.nasa.gov/opendap/hyrax/allData/insitu/L2/saildrone/Baja/saildrone-gen_4-baja_2018-sd1002-20180411T180000-20180611T055959-1_minutes-v1.nc'
# Open the remote file as an xarray Dataset
ds_usv = xr.open_dataset(url)
#ds_usv #uncomment this line for information about the data (dimensions, data variables, attributes)

## The next section requires you to enter the start and end times manually

In [None]:
# Analysis window for the cruise, given to the hour (YYYY-MM-DDTHH).
# The first ten characters (the date part) are reused later to build the
# ocean-colour date range.
str_start_time = '2018-04-12T02'
str_end_time = '2018-06-10T18'

Subset the Saildrone data using the time inputs from above.

In [None]:
# Take the first trajectory, index the data by time instead of 'obs', and
# shorten the coordinate names to lon/lat
ds_usv2 = ds_usv.isel(trajectory=0).swap_dims({'obs':'time'}).rename({'longitude':'lon','latitude':'lat'})

# Keep only the observations inside the requested time window
ds_usv_subset = ds_usv2.sel(time=slice(str_start_time,str_end_time))

# Time strings for the satellite granule search, taken from the full
# (un-subset) record. Note: %M is minutes; %m would insert the month number.
start_time=pd.to_datetime(str(ds_usv2.time.min().data)).strftime('%Y-%m-%dT%H:%M:%SZ')
end_time=pd.to_datetime(str(ds_usv2.time.max().data)).strftime('%Y-%m-%dT%H:%M:%SZ')

#ds_usv_subset #uncomment this line (and make it the last line of the cell) to inspect the subset
print('start: ',start_time,'end: ',end_time)

In [None]:
logchl = np.log(ds_usv_subset.CHLOR_MEAN) #take the log of the chlorophyll data

#plot the saildrone cruise track colored by the chlorophyll data
font = {'size' : 16}
plt.rc('font', **font) # apply the font size to the text drawn by matplotlib
ax = plt.axes(projection=ccrs.PlateCarree())
cs1 = ax.scatter(ds_usv_subset.lon, ds_usv_subset.lat, s=3.0, c=logchl, edgecolor='none', cmap='jet')
ax.coastlines()
# map window: lon from x1 to x2, lat from y1 to y2
x1,x2,y1,y2 = -128,-112,25,40
ax.set_xlim(x1,x2)
ax.set_ylim(y1,y2)
# ticks every 4 degrees of longitude and 5 degrees of latitude
ax.set_xticks(np.arange(x1,x2,4))
ax.set_yticks(np.arange(y1,y2,5))
cax = plt.colorbar(cs1)
cax.set_label('Mean Log Chlorophyll')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

# Collocate with Ocean Color Observations from the Ocean Color Web Opendap 
https://oceandata.sci.gsfc.nasa.gov/opendap/.

## This example pulls chlorophyll from MODIS Aqua Level 3 Standard Mapped Image Product.

The next section requires manual inputs from the user to build a url to call data from the Opendap. Inputs include the following:
* start and end dates (start_date, end_date)
* Variable (VAR)
* Algorithm (ALG)
* Binning period (BIN)
* Spatial resolution (SRES)


To run just the [tutorial on the Ocean Color data](../Chlorophyll/CHL_dap.ipynb)

In [None]:
# Date window (YYYY-MM-DD): only the date part of the cruise time strings is used
start_date, end_date = (
    np.datetime64(stamp[:10]) for stamp in (str_start_time, str_end_time)
)

VAR = 'CHL'       # variable to load
ALG = 'chl_ocx'   # algorithm
BIN = '8D'        # binning period: DAY, 8D, MO, R32
SRES = '9km'      # spatial resolution: 4km, 9km



In [None]:
# Build the list of OPeNDAP URLs for the MODIS-Aqua L3SMI files, visiting one
# calendar day at a time from start_date up to (not including) end_date.
num_days = (end_date - start_date).tolist().days

# Dates for which a URL was generated (kept in step with dap_urls)
the_days = []

dap_urls = []

url_base = "https://oceandata.sci.gsfc.nasa.gov:443/opendap/MODISA/L3SMI/"

# Only these binning periods have a file-name pattern below; fail early with
# a clear message instead of hitting an undefined time_str inside the loop.
if BIN not in ('DAY', '8D'):
    raise ValueError("BIN = {0!r} is not supported here; use 'DAY' or '8D'".format(BIN))

for ii in range(num_days):

    curr_date = start_date + ii

    curr_year = curr_date.tolist().year
    ref_date = np.datetime64('{0:d}-01-01'.format(curr_year))

    # 1-based day of the year
    day_num = 1 + (curr_date - ref_date).tolist().days

    # The file-name time stamp depends on the binning period
    if BIN == 'DAY':
        time_str = 'A{0:d}{1:03d}'.format(curr_year, day_num)
    else:  # BIN == '8D'
        if (day_num - 1) % 8 != 0:
            # There isn't an 8D set starting here
            continue

        # The last bin of the year is cut short at the final day of the year.
        # NOTE(review): uses 366 in leap years instead of a fixed 365 - confirm
        # against the archive's leap-year file names.
        next_new_year = np.datetime64('{0:d}-01-01'.format(curr_year + 1))
        days_in_year = (next_new_year - ref_date).tolist().days
        targ_day = min(day_num + 7, days_in_year)

        time_str = 'A{0:d}{1:03d}{2:d}{3:03d}'.format(curr_year, day_num, curr_year, targ_day)

    file_url = url_base + \
            '{0:d}/{1:03d}/{2}'.format(curr_year, day_num, time_str) + \
            '.L3m_{0}_{1}_{2}_{3}'.format(BIN, VAR, ALG, SRES) + \
            '.nc'

    dap_urls.append(file_url)
    the_days.append(curr_date)

print('dap_urls containts {0:d} urls for {1} data.'.format(len(dap_urls), VAR))

In [None]:
# Sanity check: open the first generated URL to confirm it actually serves data
single_set = xr.open_dataset(dap_urls[0])

# Quick-look plot of the chlorophyll field against lon/lat
single_set['chl_ocx'].plot(x="lon", y="lat");

# Load in Satellite Sea Surface Temperature Data

### This tutorial originally used Multiscale Ultrahigh Resolution (MUR) SST, which is 0.01 degree resolution, but was updated to pull a daily product by NAVOCEANO on a 0.1 degree grid to reduce run time! More information on the product can be found [here](https://cmr.earthdata.nasa.gov/search/concepts/C1268959235-PODAAC.html).

In [None]:
# PO.DAAC dataset to query; the MUR SST id is kept here for reference
#dataset_id = 'PODAAC-GHGMR-4FJ04'  #MUR SST looked up on podaac website
dataset_id = 'PODAAC-GHK10-41N01'  #smaller data

# Search for granules between the start and end time strings
gresult = p.granule_search(
    dataset_id=dataset_id,
    start_time=start_time,
    end_time=end_time,
    items_per_page='100',
)

# Pull the OPeNDAP links out of the search result and drop the last five
# characters of each one (the html suffix)
urls = [
    link[:-5]
    for link in putil.PodaacUtils.mine_opendap_urls_from_granule_search(gresult)
]

In [None]:
# Open all the SST granule URLs as one multi-file dataset
ds_sst = xr.open_mfdataset(urls,coords='minimal')
ds_sst

# Quick-look plot of the first slice along the leading dimension
# (presumably the first time step - confirm the dimension order)
ds_sst.analysed_sst[0, ...].plot(
    x="lon",
    y="lat",
);

How big is all this data uncompressed? Will it fit into memory?
Use `.nbytes` / 1e9  to convert it into gigabytes

In [None]:
# Dataset size in bytes, divided by 1e9 to express it in gigabytes
ds_sst.nbytes / 1e9  

## The NCEI trajectory format uses 'obs' as the coordinate.  This is an example of an 'older' style of data formatting that doesn't really mesh well with modern software capabilities. 

* So, let's change that by using [.swap_dims](http://xarray.pydata.org/en/stable/generated/xarray.DataArray.swap_dims.html) to change the coordinate from `obs` to `time`
* Another thing, `latitude` and `longitude` are just long and annoying, let's [.rename](http://xarray.pydata.org/en/stable/generated/xarray.Dataset.rename.html) them to `lat` and `lon`

* Finally, the first and last part of the cruise the USV is being towed, so let's only include data from `2018-04-12T02` to `2018-06-10T18`


# Xarray interpolation won't run on chunked dimensions.  
1. First let's subset the data to make it smaller to deal with by using the cruise lat/lons
    * Find the max/min of the lat/lon using `.lon.min().data`

1. Now load the data into memory (de-Dask-ify it) using `.load()`  


In [None]:
#Step 1 from above - bounding box of the saildrone track, used to crop the satellite sst
lon_min, lon_max = ds_usv_subset.lon.min().data, ds_usv_subset.lon.max().data
lat_min, lat_max = ds_usv_subset.lat.min().data, ds_usv_subset.lat.max().data
print('min max lat lon:', lon_min, lon_max, lat_min, lat_max)

# The latitude slice runs from max to min
# (presumably the satellite latitudes are stored north-to-south - confirm)
subset_sat_sst = ds_sst.sel(
    lon=slice(lon_min, lon_max),
    lat=slice(lat_max, lat_min),
)

#subset_sat_sst #uncomment this line for info on the dimensions and variables in the satellite sst

In [None]:
#Plot the satellite SST subset

# 2-D grids of lon/lat so that every grid cell can be drawn as a scatter point
xv, yv = np.meshgrid(subset_sat_sst.lon, subset_sat_sst.lat)
font = {'size' : 16}
plt.rc('font', **font)
ax = plt.axes(projection=ccrs.PlateCarree())
# colour each point by the first slice along the leading dimension of analysed_sst
cs1 = ax.scatter(xv, yv, s=3.0, c=subset_sat_sst.analysed_sst[0,:,:], edgecolor='none', cmap='jet')
ax.coastlines()
# map window: lon from x1 to x2, lat from y1 to y2
x1,x2,y1,y2 = -128,-112,25,40
ax.set_xlim(x1,x2)
ax.set_ylim(y1,y2)
ax.set_xticks(np.arange(x1,x2,4))
ax.set_yticks(np.arange(y1,y2,5))
cax = plt.colorbar(cs1)
cax.set_label('SST (kelvin)')
plt.xlabel('Longitude')
plt.ylabel('Latitude')





In [None]:
#this is now subsetting the ocean color satellite data
def preprocess_set(dset, time):
    """Crop *dset* to the Saildrone bounding box.

    Reads the module-level ``lon_min``/``lon_max``/``lat_min``/``lat_max``.
    ``time`` is accepted but not used.
    """
    lon_window = slice(lon_min, lon_max)
    lat_window = slice(lat_max, lat_min)
    return dset.sel(lon=lon_window, lat=lat_window)

# Open every ocean-colour URL and crop it straight away
data_sets = [
    preprocess_set(xr.open_dataset(address), position)
    for address, position in zip(dap_urls, np.arange(num_days))
]

# Dates matching the URLs become the new 'time' dimension
time_array = xr.DataArray(the_days, dims='time', name='time')

# Stack the cropped files along that time dimension
subset_chl = xr.concat(data_sets, dim=time_array)


In [None]:
#Step 2 from above - load in the data subsets
# (the interpolation further down needs the data in memory rather than chunked)
subset_sat_sst.load()
subset_chl.load()
print() #comment this line for some output, but this keeps the work space nice and neat

In [None]:
#plot the subset of chlorophyll satellite data

# 2-D grids of lon/lat so that every grid cell can be drawn as a scatter point
xv2, yv2 = np.meshgrid(subset_chl.lon, subset_chl.lat)

font = {'size' : 16}
plt.rc('font', **font)
ax = plt.axes(projection=ccrs.PlateCarree())
# colour each point by the log of the first slice along the leading dimension of chl_ocx
cs1 = ax.scatter(xv2, yv2, s=3.0, c=(np.log(subset_chl.chl_ocx[0,:,:])), edgecolor='none', cmap='jet')
ax.coastlines()
# map window: lon from x1 to x2, lat from y1 to y2
x1,x2,y1,y2 = -128,-112,25,40
ax.set_xlim(x1,x2)
ax.set_ylim(y1,y2)
ax.set_xticks(np.arange(x1,x2,4))
ax.set_yticks(np.arange(y1,y2,5))
cax = plt.colorbar(cs1)
cax.set_label('Log Chl (mg/m^3)')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

# Collocate In Situ USV data with the Satellite SST data and Ocean Color Data
There are different options when you interpolate.  First, let's just do a linear interpolation using [.interp()](http://xarray.pydata.org/en/stable/generated/xarray.Dataset.interp.html#xarray.Dataset.interp)

`Dataset.interp(coords=None, method='linear', assume_sorted=False, kwargs={}, **coords_kwargs)`



In [None]:
#make sure they both have the same time stamp format otherwise the interpolation will fail
# replace the satellite time index with its to_datetimeindex() conversion
# (presumably from a cftime-based index to a pandas DatetimeIndex - confirm)
subset_sat_sst['time'] = subset_sat_sst.indexes['time'].to_datetimeindex()
#subset_sat_sst.time #uncomment this line to see that the satellite SST date format is correct


In [None]:
# Linearly interpolate the satellite SST onto the Saildrone lat/lon/time
ds_collocated_sst = subset_sat_sst.interp(
    lat=ds_usv_subset.lat,
    lon=ds_usv_subset.lon,
    time=ds_usv_subset.time,
    method='linear',
)

# Same for the satellite ocean-colour data
ds_collocated_oc = subset_chl.interp(
    lat=ds_usv_subset.lat,
    lon=ds_usv_subset.lon,
    time=ds_usv_subset.time,
    method='linear',
)

In [None]:
# SST: satellite minus Saildrone, with 273.15 added to the Saildrone
# temperature so that both are on the same scale
dif_sst_lin = ds_collocated_sst['analysed_sst'] - (ds_usv_subset['TEMP_CTD_MEAN'] + 273.15)
print('mean difference = ', dif_sst_lin.mean().data)
print('STD = ', dif_sst_lin.std().data)

# Chlorophyll: satellite minus Saildrone
dif_oc_lin = ds_collocated_oc['chl_ocx'] - ds_usv_subset['CHLOR_MEAN']
print('mean difference = ', dif_oc_lin.mean().data)
print('STD = ', dif_oc_lin.std().data)

# Collocate USV data with SST data
There are different options when you interpolate.  Now, let's just take the nearest satellite point rather than linearly interpolating the data
`method = 'nearest'`

In [None]:
# Nearest-neighbour lookup of the satellite SST at the Saildrone lat/lon/time
ds_collocated_nearest_sst = subset_sat_sst.interp(
    lat=ds_usv_subset.lat,
    lon=ds_usv_subset.lon,
    time=ds_usv_subset.time,
    method='nearest',
)

# Same for the satellite ocean-colour data
ds_collocated_nearest_oc = subset_chl.interp(
    lat=ds_usv_subset.lat,
    lon=ds_usv_subset.lon,
    time=ds_usv_subset.time,
    method='nearest',
)

## Now, calculate the difference in SSTs and print the [.mean()](http://xarray.pydata.org/en/stable/generated/xarray.DataArray.mean.html#xarray.DataArray.mean) and [.std()](http://xarray.pydata.org/en/stable/generated/xarray.DataArray.std.html#xarray.DataArray.std)
For the satellite data we need to use `analysed_sst` and for the USV data we need to use `TEMP_CTD_MEAN`

In [None]:
# Display the collocated (nearest-neighbour) satellite SST
ds_collocated_nearest_sst.analysed_sst


In [None]:
# Display the Saildrone temperature variable used in the comparison
ds_usv_subset.TEMP_CTD_MEAN

In [None]:
# SST: nearest-neighbour satellite value minus Saildrone, with 273.15 added
# to the Saildrone temperature so that both are on the same scale
dif_sst_near = ds_collocated_nearest_sst['analysed_sst'] - (ds_usv_subset['TEMP_CTD_MEAN'] + 273.15)
print('mean difference = ', dif_sst_near.mean().data)
print('STD = ', dif_sst_near.std().data)

# Chlorophyll: nearest-neighbour satellite value minus Saildrone
dif_oc_near = ds_collocated_nearest_oc['chl_ocx'] - ds_usv_subset['CHLOR_MEAN']
print('mean difference = ', dif_oc_near.mean().data)
print('STD = ', dif_oc_near.std().data)

In [None]:
# Four-panel chlorophyll comparison along the cruise track:
#   top row    - Saildrone vs. collocated satellite (both as log chlorophyll)
#   bottom row - satellite minus Saildrone for linear and nearest collocation

# Set the font size before anything is drawn so it applies to this figure
font = {'size' : 14}
plt.rc('font', **font)

fig, axes = plt.subplots(2, 2, figsize=(12,14),sharey = True,subplot_kw={'projection': ccrs.PlateCarree()})

#plot the saildrone cruise track colored by the log chlorophyll data
ax = axes[0,0]
cs00 = ax.scatter(ds_usv_subset.lon, ds_usv_subset.lat, s=3.0, c=logchl, edgecolor='none', cmap='cmo.algae')
ax.coastlines()
cax = plt.colorbar(cs00,ax=ax)
cax.set_label('Log Mean Chlorophyll (mg/m^3)')
ax.set_title('Saildrone')

#plot the cruise track colored by the collocated satellite chlorophyll;
#take the log so the panel matches its label and the saildrone panel
ax = axes[0,1]
cs01 = ax.scatter(ds_usv_subset.lon, ds_usv_subset.lat, s=3.0, c=np.log(ds_collocated_nearest_oc.chl_ocx), edgecolor='none', cmap='cmo.algae')
ax.coastlines()
cax = plt.colorbar(cs01,ax=ax)
cax.set_label('Mean Log Chlorophyll (mg/m^3)')
ax.set_title('Satellite')

#plot the cruise track colored by the linear-interpolation difference
ax = axes[1,0]
# symmetric colour limits from the 75th percentile of |difference|;
# nanpercentile ignores missing values, which would otherwise make the limit NaN
cv = np.nanpercentile(np.abs(dif_oc_lin.values), 75)
cs10 = ax.scatter(ds_usv_subset.lon, ds_usv_subset.lat, s=3.0, c=dif_oc_lin, edgecolor='none', cmap='cmo.balance',vmin=-cv,vmax=cv)
ax.coastlines()
cax = plt.colorbar(cs10,ax=ax)
cax.set_label('Difference')
ax.set_title('Linear Interpolation (mg/m^3)')

#plot the cruise track colored by the nearest-neighbour difference
ax = axes[1,1]
cv = np.nanpercentile(np.abs(dif_oc_near.values), 75)
cs11 = ax.scatter(ds_usv_subset.lon, ds_usv_subset.lat, s=3.0, c=dif_oc_near, edgecolor='none', cmap='cmo.balance',vmin=-cv,vmax=cv)
ax.coastlines()
cax = plt.colorbar(cs11,ax=ax)
cax.set_label('Difference')
ax.set_title('Nearest Neighbor (mg/m^3)')

# same map window and ticks on every panel
x1,x2,y1,y2 = -128,-112,25,40
for ax in axes.ravel():
    ax.set_xlim(x1,x2)
    ax.set_ylim(y1,y2)
    ax.set_xticks(np.arange(x1,x2,4))
    ax.set_yticks(np.arange(y1,y2,5))




In [None]:
# Four-panel temperature comparison along the cruise track:
#   top row    - Saildrone vs. collocated satellite SST, both in degC
#   bottom row - satellite minus Saildrone for linear and nearest collocation

# Set the font size before anything is drawn so it applies to this figure
font = {'size' : 14}
plt.rc('font', **font)

fig, axes = plt.subplots(2, 2, figsize=(12,14),sharey = True,subplot_kw={'projection': ccrs.PlateCarree()})

#plot the saildrone cruise track colored by the in situ temperature;
#no 273.15 offset here, so the values match the degC label
ax = axes[0,0]
cs00 = ax.scatter(ds_usv_subset.lon, ds_usv_subset.lat, s=3.0, c=ds_usv_subset.TEMP_CTD_MEAN, edgecolor='none', cmap='cmo.tempo')
ax.coastlines()
cax = plt.colorbar(cs00,ax=ax)
cax.set_label('Temperature (degC)')
ax.set_title('Saildrone')

#plot the cruise track colored by the collocated satellite SST;
#subtract 273.15 so the values match the degC label
ax = axes[0,1]
cs01 = ax.scatter(ds_usv_subset.lon, ds_usv_subset.lat, s=3.0, c=ds_collocated_nearest_sst.analysed_sst - 273.15, edgecolor='none', cmap='cmo.tempo')
ax.coastlines()
cax = plt.colorbar(cs01,ax=ax)
cax.set_label('Temperature(degC)')
ax.set_title('Satellite')

#plot the cruise track colored by the linear-interpolation SST difference
ax = axes[1,0]
# colour limits come from the SST differences being plotted (not the
# chlorophyll ones); nanpercentile ignores missing values
cv = np.nanpercentile(np.abs(dif_sst_lin.values), 75)
cs10 = ax.scatter(ds_usv_subset.lon, ds_usv_subset.lat, s=3.0, c=dif_sst_lin, edgecolor='none', cmap='cmo.balance',vmin=-cv,vmax=cv)
ax.coastlines()
cax = plt.colorbar(cs10,ax=ax)
cax.set_label('Difference')
ax.set_title('Linear Interpolation (degC)')

#plot the cruise track colored by the nearest-neighbour SST difference
ax = axes[1,1]
cv = np.nanpercentile(np.abs(dif_sst_near.values), 75)
cs11 = ax.scatter(ds_usv_subset.lon, ds_usv_subset.lat, s=3.0, c=dif_sst_near, edgecolor='none', cmap='cmo.balance',vmin=-cv,vmax=cv)
ax.coastlines()
cax = plt.colorbar(cs11,ax=ax)
cax.set_label('Difference')
ax.set_title('Nearest Neighbor (degC)')

# same map window and ticks on every panel
x1,x2,y1,y2 = -128,-112,25,40
for ax in axes.ravel():
    ax.set_xlim(x1,x2)
    ax.set_ylim(y1,y2)
    ax.set_xticks(np.arange(x1,x2,4))
    ax.set_yticks(np.arange(y1,y2,5))