# Data exploration

## Setting up

In [1]:
# Make the helper script executable (mode 755), then run it.
# Per the original note, scripts.sh installs missing libraries; its contents are not visible here.
! chmod 755 scripts.sh
! ./scripts.sh



### Imports

In [1]:
import os
from matplotlib import pyplot as plt
import matplotlib.path as mpath
import numpy as np
import pandas as pd
import xarray as xr
import cartopy
import cf_units
from datetime import datetime
from datetime import timedelta
%matplotlib inline
plt.rcParams['figure.figsize'] = 12, 6
%config InlineBackend.figure_format = 'retina' 

In [2]:
# List the files available in the data directory.
! ls ../data

MARcst-AN35km-176x148.cdf	      year_ACCESS1-3.nc2
year-MAR_ACCESS1.3-1980-2100_zen.nc2


## Global data:
The global dataset stores time in a non-standard format that needs to be fixed: values are the number of months since a reference date, where every month counts as 30 days (a 360-day year, following the CF convention).

In [3]:
# Open the global dataset without decoding the time axis: its unit
# ("month(360days)") is converted by hand in the next section.
dg = xr.open_dataset(
    '../data/year_ACCESS1-3.nc2',
    decode_times=False,
)
# Load into memory and display the dataset.
dg.load()

### Fix time format

In [4]:
# Read the time unit and the reference date from the TIME2 attributes,
# which are stored as "<unit> since <reference date>".
units, reference_date = dg.TIME2.attrs['units'].split('since', 1)
print('Units: {} and reference date: {} of global dataset'.format(units, reference_date))

# Build the first date from the reference date found in the metadata (instead
# of a hard-coded copy of it) plus the first time value. TIME2 counts months
# of 30 days each, hence the factor 30.
first_offset = timedelta(days=float(dg.TIME2.values[0]) * 30)
START = pd.to_datetime(reference_date.strip()).to_pydatetime() + first_offset

print('First date in time array: {}'.format(START))

# Replace the time axis by a yearly range starting at START, one entry per
# TIME2 value. '12M' is a month-end frequency, so dates fall on month ends.
# NOTE(review): this stores the dates under the name TIME; if the data
# variables are dimensioned on TIME2, confirm that the TIME-based grouping in
# later cells really acts on them.
dg['TIME'] = pd.date_range(start=START, periods=dg.sizes['TIME2'], freq='12M')
dg.load()

Units: month(360days)  and reference date:  1950-01-15 00:00:00 of global dataset
First date in time array: 1950-06-29 00:00:00


### Some plots:

In [None]:
# Resample to yearly means, then average per calendar year.
# Despite the name, `decades` holds one-year bins here.
decades = (
    dg.resample(TIME="Y")
    .mean()
    .groupby("TIME.year")
    .mean()
)
# One panel per year (columns) and per PLEV12_15 level (rows).
decades["TA"].plot(col="year", row="PLEV12_15")

In [None]:
# Per-year means of the global data, restricted to the first 5 years.
yearly_mean = (
    dg.groupby("TIME.year")
    .mean()
    .isel(year=slice(5))
)
# Facet TA: one column per year, one row per PLEV12_15 level.
yearly_mean["TA"].plot(col="year", row="PLEV12_15")

In [None]:
# Per-year means of the global data, restricted to the first 5 years.
yearly_mean = (
    dg.groupby("TIME.year")
    .mean()
    .isel(year=slice(5))
)
# Facet TAS: one panel per year.
yearly_mean["TAS"].plot(col="year")

## Exploring regional data:

In [None]:
# Open the regional dataset (times are decoded by xarray's defaults here).
dsr = xr.open_dataset(
    '../data/year-MAR_ACCESS1.3-1980-2100_zen.nc2'
)
# Load it into memory; `ds` is the handle used by the rest of the notebook.
ds = dsr.load()
ds

In [None]:
# Plot SMB at the first timestep. Positional indexing is 0-based, so the
# first entry is TIME=0 (TIME=1 would be the second timestep).
ds.SMB.isel(TIME=0).plot(x="X")

In [None]:
# SMB averaged over the TIME dimension, drawn with X on the horizontal axis.
ds["SMB"].mean("TIME").plot(x="X")
plt.title('Mean SMB values over time')

In [None]:
# RF averaged over the TIME dimension, drawn with X on the horizontal axis.
ds["RF"].mean("TIME").plot(x="X")
plt.title('Mean RF values over time')

In [None]:
# RU averaged over the TIME dimension, drawn with X on the horizontal axis.
ds["RU"].mean("TIME").plot(x="X")
plt.title('Mean RU values over time')

In [None]:
# SU averaged over the TIME dimension, drawn with X on the horizontal axis.
ds["SU"].mean("TIME").plot(x="X")
plt.title('Mean SU values over time')

In [None]:
# Per-year means of the regional data, restricted to the first 5 years.
yearly_mean = (
    ds.groupby("TIME.year")
    .mean()
    .isel(year=slice(5))
)
# Facet SMB: one panel per year.
yearly_mean["SMB"].plot(col="year")

In [None]:
# Average the regional data in 50-year bins, then group the bin labels by
# calendar year. Despite the name, `decades` holds 50-year means.
decades = (
    ds.resample(TIME="50Y")
    .mean()
    .groupby("TIME.year")
    .mean()
)
# Facet SF: one panel per bin.
decades["SF"].plot(col="year")

In [None]:
# Same facets as the previous cell, for SMB (uses `decades` computed there).
decades["SMB"].plot(col="year")

---

## Extracting data or "indexing" : `.sel`, `.isel`

Xarray supports

- label-based indexing using `.sel`
- position-based indexing using `.isel`

For more see https://xarray.pydata.org/en/stable/indexing.html

### Label-based indexing

Xarray inherits its label-based indexing rules from pandas; this means great
support for dates and times!

In [None]:
# Label-based selection with a partial date string: everything in July 2037.
ds.sel({"TIME": "2037-07"})

In [None]:
# Label-based slicing between two partial date strings.
ds.sel({"TIME": slice("2013-05", "2013-07")})

In [None]:
# "Nearest" lookup: pick the X coordinate closest to the requested value.
ds.sel({"X": 240.2}, method="nearest")

In [None]:
# "Nearest" lookup for several X and Y values at once.
ds.sel(
    {"X": [240.125, 234], "Y": [40.3, 50.3]},
    method="nearest",
)

### Position-based indexing

This is similar to your usual numpy `array[0, 2, 3]` but with the power of named
dimensions!

In [None]:
# Position-based selection: TIME index 0 and X index 30.
# Naming the dimensions is much clearer than bare positional indexing.
ds["SMB"].isel(TIME=0, X=30)

In [None]:
# Position-based slicing: the first 10 entries along X.
ds["SMB"].isel(X=slice(None, 10))

---

## High level computation: `groupby`, `resample`, `rolling`, `coarsen`, `weighted`

Xarray has some very useful high level objects that let you do common
computations:

1. `groupby` :
   [Bin data in to groups and reduce](https://xarray.pydata.org/en/stable/groupby.html)
1. `resample` :
   [Groupby specialized for time axes. Either downsample or upsample your data.](https://xarray.pydata.org/en/stable/time-series.html#resampling-and-grouped-operations)
1. `rolling` :
   [Operate on rolling windows of your data e.g. running mean](https://xarray.pydata.org/en/stable/computation.html#rolling-window-operations)
1. `coarsen` :
   [Downsample your data](https://xarray.pydata.org/en/stable/computation.html#coarsen-large-arrays)
1. `weighted` :
   [Weight your data before reducing](https://xarray.pydata.org/en/stable/computation.html#weighted-array-reductions)


### groupby

In [None]:
# Group by season of the TIME coordinate. The author notes that only summer
# data (June, July, August) shows up in the resulting groups.
ds.groupby(group="TIME.season")

In [None]:
# Average every variable within each season group.
season_groups = ds.groupby("TIME.season")
seasonal_mean = season_groups.mean()
seasonal_mean

### resample

In [None]:
# Resample along TIME to monthly ("M") frequency and take the mean of each bin.
ds.resample({"TIME": "M"}).mean()

---

## Visualization: `.plot`

For more see https://xarray.pydata.org/en/stable/plotting.html and
https://xarray.pydata.org/en/stable/examples/visualization_gallery.html

We have seen very simple plots earlier. Xarray has some support for visualizing
3D and 4D datasets by presenting multiple facets (or panels or subplots) showing
variations across rows and/or columns.


In [None]:
# Plot the seasonal-mean SMB. No facet arguments are passed, so xarray picks
# its default plot type for the data's shape.
seasonal_mean["SMB"].plot()

In [None]:
# Contour plot of SMB for the first sector and the first season.
first_season_smb = seasonal_mean["SMB"].isel(SECTOR1_1=0, season=0)
first_season_smb.plot.contour(levels=800, add_colorbar=True)

In [None]:
# Line plot: SMB of the first sector averaged over X, one line per season,
# with Y on the vertical axis.
(
    seasonal_mean["SMB"]
    .isel(SECTOR1_1=0)
    .mean("X")
    .plot.line(hue="season", y="Y")
)

In [None]:
import plotly.express as px

# SMB has more dimensions than TIME (it is plotted against X elsewhere), and a
# DataFrame column must be 1-D. Average over every dimension except TIME to
# get a single time series.
smb = ds.SMB
smb_series = smb.mean(dim=[d for d in smb.dims if d != "TIME"])

df = pd.DataFrame(data={"time": ds.TIME.data, "data": smb_series.data})

# Interactive line chart of the series built above.
fig = px.line(df, x="time", y="data", title='Mean SMB over time')
fig.show()

In [None]:
# Label-based selection with a partial date string: everything in July 2037.
ds.sel({"TIME": "2037-07"})