In [None]:
# Australian Geoscience Datacube
## Observation count

The [Australian Geoscience Datacube](https://github.com/data-cube/agdc-v2) provides an integrated gridded data analysis environment for decades of analysis ready earth observation satellite and related data from multiple satellite and other acquisition systems.

For instructions on using the Datacube on NCI, see: http://agdc-v2.readthedocs.io/en/develop/nci_usage.html

For instructions on setting up your own instance, see: http://agdc-v2.readthedocs.io/en/develop/install.html

This notebook touches briefly on some of the implemented features of the Datacube module, and is only intended to demonstrate functionality rather than be a tutorial.

In [1]:
%pylab notebook
#%pylab inline
#%matplotlib inline
import datacube
import xarray as xr
from datacube.storage import masking
from datacube.storage.masking import mask_to_dict
from matplotlib import pyplot as plt
import matplotlib.dates
import json
import pandas as pd
from IPython.display import display
import ipywidgets as widgets

Populating the interactive namespace from numpy and matplotlib


If you have set up your config correctly, or are using the module on NCI, you should be able to make a `Datacube` object that connects to the configured datacube system.

In [2]:
# Create the Datacube handle used by every query in this notebook (uses the configured datacube system)
dc = datacube.Datacube(app='dc-example')
# Bare expression: show the connection repr as the cell output
dc

Datacube<index=Index<db=PostgresDb<engine=Engine(postgresql://lxl554@130.56.244.227:6432/datacube)>>>

## Datacube products and measurements
The Datacube provides pandas.DataFrame representations of the available products and measurements:

In [3]:
#dc.list_products()

## Datacube Measurements
The list of measurements stored in the datacube can also be listed.

Measurements are also known as _bands_ in the imagery domain, and _data variables_ when stored in NetCDF files or when working with `xarray.Dataset` objects.

In [4]:
#dc.list_measurements()

In [5]:
#### SPATIOTEMPORAL RANGE, BANDS AND SENSORS OF INTEREST

# Temporal range as (start, end) date strings.
# TODO: end_of_epoch is fixed; a variable defining a rolling 'latest observation' is still needed here.
start_of_epoch, end_of_epoch = '1987-01-01', '2016-07-31'

# Wavelengths/bands of interest (drop the `measurements` kwarg in dc.load to retrieve all bands).
# Other bands that can be listed here: 'blue', 'green', 'red', 'swir1', 'swir2'
bands_of_interest = ['nir']

# Sensors of interest (used as product-name prefixes further down)
sensor1, sensor2, sensor3 = 'ls8', 'ls7', 'ls5'

# Manually defined upper left / lower right coordinates: a lat/lon range that
# captures a narrow area spanning both the sidelap and non sidelap areas.
lat_max, lat_min = -21.37, -21.378
lon_max, lon_min = 143.41, 143.26

# Search terms shared by every dc.load call
query = {
    'time': (start_of_epoch, end_of_epoch),
    'x': (lon_min, lon_max),
    'y': (lat_max, lat_min),
    'crs': 'EPSG:4326',
}

In [6]:
# Koppen climate class of the study area - this text is appended to the
# figure titles at the bottom of the notebook
clim_class = 'Arid desert - hot arid'

In [7]:
# Show the search terms that will be passed to dc.load.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(query)

{'y': (-21.37, -21.378), 'x': (143.26, 143.41), 'crs': 'EPSG:4326', 'time': ('1987-01-01', '2016-07-31')}


## Retrieve surface reflectance data


In [8]:
# PQ is grouped by solar day to avoid idiosyncrasies of N/S overlap differences
# in PQ algorithm performance. Fusing overlapping scenes needs the bit position
# of the 'contiguous' flag, read here from the sensor1 PQ product definition.
pq_albers_product = dc.index.products.get_by_name(sensor1 + '_pq_albers')
pixelquality_flags = pq_albers_product.measurements['pixelquality']['flags_definition']
valid_bit = pixelquality_flags['contiguous']['bits']

def pq_fuser(dest, src):
    """Merge the pixel-quality array ``src`` into ``dest`` in place.

    Passed as ``fuse_func`` to ``dc.load`` for the PQ products. Relies on the
    module-level ``valid_bit`` (bit position of the 'contiguous' flag).

    * Where ``dest`` does not have the contiguous bit set, ``src`` is copied in.
    * Where both arrays have the contiguous bit set, the result is the
      bitwise AND of the two values.

    Returns None; ``dest`` is modified, ``src`` is left untouched.
    """
    contiguous_flag = 1 << valid_bit

    # Step 1: fill every dest pixel that has no contiguous data from src
    dest_has_data = (dest & contiguous_flag).astype(bool)
    np.copyto(dest, src, where=~dest_has_data)

    # Step 2: where both now carry contiguous data, keep only the shared bits
    overlap = (contiguous_flag & dest & src).astype(bool)
    np.copyto(dest, src & dest, where=overlap)

# retrieve the NBAR and PQ for the spatiotemporal range of interest


In [9]:
# Load NBAR (bands of interest only) and PQ for sensor 1, grouped by solar day;
# overlapping PQ scenes within a day are merged with pq_fuser
sensor1_nbar = dc.load(product=sensor1 + '_nbar_albers',
                       group_by='solar_day',
                       measurements=bands_of_interest,
                       **query)
sensor1_pq = dc.load(product=sensor1 + '_pq_albers',
                     group_by='solar_day',
                     fuse_func=pq_fuser,
                     **query)
            

In [10]:
# Keep the affine transform of the sensor 1 NBAR grid; it is re-attached to the
# combined dataset once the sensors have been concatenated
affine = sensor1_nbar.affine

In [11]:
# Select from NBAR the time steps present in PQ so that there is a 1:1 match
# between NBAR and PQ before the mask is applied
sensor1_nbar = sensor1_nbar.sel(time = sensor1_pq.time)

In [12]:
# Build the PQ 'good pixel' mask and apply it so that cloud, cloud shadow and
# saturated observations become NaN in the NBAR data
s1_cloud_free = masking.make_mask(sensor1_pq, ga_good_pixel=True)
s1_good_data = s1_cloud_free.pixelquality.sel(time=slice(start_of_epoch, end_of_epoch))
sensor1_nbar = sensor1_nbar.where(s1_good_data)

In [13]:
# Load NBAR (bands of interest only) and PQ for sensor 2, grouped by solar day;
# overlapping PQ scenes within a day are merged with pq_fuser
sensor2_nbar = dc.load(product=sensor2 + '_nbar_albers',
                       group_by='solar_day',
                       measurements=bands_of_interest,
                       **query)
sensor2_pq = dc.load(product=sensor2 + '_pq_albers',
                     group_by='solar_day',
                     fuse_func=pq_fuser,
                     **query)

In [14]:
# Select from NBAR the time steps present in PQ so the two line up 1:1
sensor2_nbar = sensor2_nbar.sel(time = sensor2_pq.time)

In [15]:
# Build the PQ 'good pixel' mask for sensor 2 and apply it to the NBAR data
s2_cloud_free = masking.make_mask(sensor2_pq, ga_good_pixel=True)
s2_good_data = s2_cloud_free.pixelquality.sel(time=slice(start_of_epoch, end_of_epoch))
sensor2_nbar = sensor2_nbar.where(s2_good_data)

In [16]:
# Load NBAR (bands of interest only) and PQ for sensor 3, grouped by solar day;
# overlapping PQ scenes within a day are merged with pq_fuser
sensor3_nbar = dc.load(product=sensor3 + '_nbar_albers',
                       group_by='solar_day',
                       measurements=bands_of_interest,
                       **query)
sensor3_pq = dc.load(product=sensor3 + '_pq_albers',
                     group_by='solar_day',
                     fuse_func=pq_fuser,
                     **query)

In [17]:
# Select from NBAR the time steps present in PQ so the two line up 1:1
sensor3_nbar = sensor3_nbar.sel(time = sensor3_pq.time)

In [18]:
# Build the PQ 'good pixel' mask for sensor 3 and apply it to the NBAR data
s3_cloud_free = masking.make_mask(sensor3_pq, ga_good_pixel=True)
s3_good_data = s3_cloud_free.pixelquality.sel(time=slice(start_of_epoch, end_of_epoch))
sensor3_nbar = sensor3_nbar.where(s3_good_data)

## Combining data from multiple sensors
Having masked out cloud and cloud shadow affected pixels and calculated various indices we can now combine the measurements from the different sensors to create full depth time series

In [19]:
# Stack the three per-sensor datasets along time, then reorder the slices
# chronologically (concat leaves them grouped sensor by sensor)
nbar_clean = xr.concat([sensor1_nbar, sensor2_nbar, sensor3_nbar],
                       dim='time')
time_sorted = nbar_clean.time.argsort()
nbar_clean = nbar_clean.isel(time=time_sorted)

# Re-attach the affine transform saved earlier; it is used later to convert
# clicked coordinates into pixel indices
nbar_clean.attrs['affine'] = affine


In [20]:
# Drop the per-sensor datasets now that they are combined, to free up some memory
del sensor1_nbar, sensor2_nbar, sensor3_nbar

In [21]:
# Inspect the combined, time-sorted dataset
nbar_clean

<xarray.Dataset>
Dimensions:  (time: 1169, x: 619, y: 91)
Coordinates:
  * y        (y) float64 -2.348e+06 -2.348e+06 -2.348e+06 -2.348e+06 ...
  * x        (x) float64 1.158e+06 1.158e+06 1.158e+06 1.158e+06 1.158e+06 ...
  * time     (time) datetime64[ns] 1987-05-24 1987-05-31 1987-09-13 ...
Data variables:
    nir      (time, y, x) float64 nan nan nan nan nan nan nan nan nan nan ...
Attributes:
    affine: | 25.00, 0.00, 1157900.00|
| 0.00,-25.00,-2347725.00|
| 0.00, 0.00, 1.00|

## Plotting the observation count

In [22]:
# Report how many time slices the combined dataset holds.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print('The number of time slices at this location is')
print(nbar_clean.nir.shape[0])

The number of time slices at this location is
1169


In [25]:
#Click on this image to choose the side lap pixel (high observation count)
w = widgets.HTML("Event information appears here when you click on the figure")


def callback(event):
    """Store the clicked data coordinates in the globals x, y and echo them in the widget."""
    global x, y
    # xdata/ydata are None when the click lands outside the axes; ignore such
    # clicks instead of raising a TypeError and keep the previous selection.
    if event.xdata is None or event.ydata is None:
        return
    x, y = int(event.xdata + 0.5), int(event.ydata + 0.5)
    w.value = 'X: {}, Y: {}'.format(x,y)

fig = plt.figure(figsize =(12,6))
# Number of valid (non-NaN) nir observations per pixel over the whole time series
nbar_clean.nir.count('time').plot()

fig.canvas.mpl_connect('button_press_event', callback)
plt.show()
display(w)

<IPython.core.display.Javascript object>

In [170]:
# Show the data coordinates captured by the last click on the figure above
x, y

(1159353, -2348007)

In [195]:

# Convert the clicked map coordinates into (column, row) pixel indices by
# applying the inverse of the dataset's affine transform
inverse_affine = ~nbar_clean.affine
image_coords = inverse_affine * (x, y)
xdim, ydim = int(image_coords[0]), int(image_coords[1])
xdim, ydim

(58, 11)

In [196]:
#Click on this image to choose the NON side lap pixel (low observation count)
w = widgets.HTML("Event information appears here when you click on the figure")


def callback(event):
    """Store the clicked data coordinates in the globals x2, y2 and echo them in the widget."""
    global x2, y2
    # xdata/ydata are None when the click lands outside the axes; ignore such
    # clicks instead of raising a TypeError and keep the previous selection.
    if event.xdata is None or event.ydata is None:
        return
    x2, y2 = int(event.xdata + 0.5), int(event.ydata + 0.5)
    # Echo the point that was just clicked (x2, y2), not the first selection (x, y)
    w.value = 'X: {}, Y: {}'.format(x2, y2)

fig = plt.figure(figsize =(12,6))
# Number of valid (non-NaN) nir observations per pixel over the whole time series.
# No date title is set: the image aggregates every time step, and the previous
# title code relied on a `time_slice` variable that is never defined in this notebook.
nbar_clean.nir.count('time').plot()

fig.canvas.mpl_connect('button_press_event', callback)
plt.show()
display(w)

<IPython.core.display.Javascript object>

In [None]:
# Convert the second clicked location into (column, row) pixel indices by
# applying the inverse of the dataset's affine transform
inverse_affine = ~nbar_clean.affine
image_coords2 = inverse_affine * (x2, y2)
xdim2, ydim2 = int(image_coords2[0]), int(image_coords2[1])
xdim2, ydim2

In [198]:
# Overlay the two selected locations on the observation-count image:
# first selection in red, second selection in cyan
fig = plt.figure(figsize=(10, 5))
nbar_clean.nir.count('time').plot()
plt.scatter([x], [y], c='r')
plt.scatter([x2], [y2], c='cyan')


<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7f1f77ca43d0>

In [28]:
# Extract the nir time series at each selected pixel and drop the time steps
# that hold no data (masked or not observed)
ts = (nbar_clean.nir
      .isel(x=[xdim], y=[ydim])
      .dropna('time', how='any'))
ts2 = (nbar_clean.nir
       .isel(x=[xdim2], y=[ydim2])
       .dropna('time', how='any'))

In [30]:
# Group both series by calendar month to see the total number of observations
# per month over the full depth of the archive
tsg, tsg2 = ts.groupby('time.month'), ts2.groupby('time.month')

In [162]:
# Observations per calendar month at the two selected pixels
fig = plt.figure(figsize=(10,5))

tsg2.count().plot(label = 'not side lap')
tsg.count().plot(label = 'side lap')
plt.title('Total number of observations by month - Koppen Class = ' +clim_class)
# NOTE: '29 years' is hard-coded; keep it in step with start_of_epoch/end_of_epoch
plt.ylabel('Number of observations in 29 years')
plt.legend(bbox_to_anchor = (0, 1), loc = 2, borderaxespad = 0.)
# Fixed axes: months 1-12 on x, 0-100 observations on y
plt.axis([1 , 12 ,0, 100])
# plt.show must be called; the bare name only echoed the function object
plt.show()

<IPython.core.display.Javascript object>

<function matplotlib.pyplot.show>

In [38]:
# Group both series by calendar year to see the total number of observations per year
tsg_year, tsg2_year = ts.groupby('time.year'), ts2.groupby('time.year')

In [165]:
# Observations per year at the two selected pixels
fig = plt.figure(figsize=(13, 5))
tsg2_year.count().plot(label='not side lap')
tsg_year.count().plot(label='side lap')
plt.title('Number of observations per year - Koppen Class = ' + clim_class)
plt.ylabel('Number of observations per year')
plt.legend(bbox_to_anchor=(0, 1), loc=2, borderaxespad=0.)



<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7f1f2d608fd0>

In [167]:
fig = plt.figure(figsize=(10,5))
# Calculate a timedelta to quantify the gaps between consecutive observations.
# The results are named obs_gap/obs_gap2 rather than x/x2 so that the clicked
# pixel coordinates stored in x, y, x2, y2 are not overwritten and the earlier
# cells remain re-runnable.
obs_gap = ts.time - ts.time.shift(time=1)
obs_gap2 = ts2.time - ts2.time.shift(time=1)
days = obs_gap.astype('timedelta64[D]')
days2 = obs_gap2.astype('timedelta64[D]')

# Express the gaps as plain numbers of days
delta = days/np.timedelta64(1, 'D')
delta2 = days2/np.timedelta64(1, 'D')

# Mean gap (whole days) for each pixel, computed once and reused below
mean_gap = int(delta.mean().values)
mean_gap2 = int(delta2.mean().values)

# Single-argument print(...) behaves identically on Python 2 and Python 3
print('The average number of days between observations in the sidelap is')
print(mean_gap)
print('The average number of days between observations outside the sidelap is')
print(mean_gap2)

delta.plot(label = 'side lap, mean num days = '+str(mean_gap))
delta2.plot(label = 'not side lap, mean num days = '+str(mean_gap2))
plt.ylabel("Days between observations")
plt.legend(bbox_to_anchor = (0, 1), loc = 2, borderaxespad = 0.)
plt.title('Number of days between observations - Koppen Class = ' +clim_class)

<IPython.core.display.Javascript object>

The average number of days between observations in the sidelap is
11
The average number of days between observations outside the sidelap is
21


<matplotlib.text.Text at 0x7f1f2d431810>