# Building the dataset

## Import From Digital Earth Australia NBAR

In [1]:
%matplotlib notebook

import pandas as pd
import xarray as xr
import numpy as np
import datetime
from datetime import date, timedelta
import gdal
from gdal import *

import datacube
from datacube.helpers import ga_pq_fuser
from datacube.storage import masking
from datacube.storage.masking import mask_to_dict

from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import pyplot as plt
import matplotlib.dates

from IPython.display import display
import ipywidgets as widgets
import rasterio

## Define Bounding Box

In [9]:
study_area = 'mtbarker'

# Mt Barker in the Adelaide Hills
if study_area == 'mtbarker':
    lat_min = -35.05
    lat_max = -35.08
    lon_min = 138.85
    lon_max = 138.895
    
elif study_area == 'swmelb':
    lat_min = -37.879
    lat_max = -37.91
    lon_min = 144.705
    lon_max = 144.76
    
elif study_area == 'gunghalin':
    lat_min = -35.18
    lat_max = -35.21
    lon_min = 149.14
    lon_max = 149.17
elif study_area == 'goldengrove': 
    lat_min = -34.77
    lat_max = -34.8
    lon_min = 138.66
    lon_max = 138.73
elif study_area == 'molonglo'
    lat_min = -35.3
    lat_max = -35.33
    lon_min = 149.015
    lon_max = 149.06
elif study_area == 'nperth'
    lat_min = -31.686
    lat_max = -31.73
    lon_min = 115.79
    lon_max = 115.813
elif study_area = 'swbris'
    lat_min = -27.66
    lat_max = -27.7 
    lon_min = 152.877
    lon_max = 152.93
elif study_area == 'swsyd'
    lat_min = -33.993
    lat_max = -34.04
    lon_min = 150.715 
    lon_max = 150.78
else:
    print('Area not known')

# Datacube query is completed

In [10]:
#Temporal range, wavelengths/band and sensors of interest are defined

#temporal range is defined
start_of_epoch = '1987-01-01'
end_of_epoch =  '2017-10-31'

#wavelengths/bands of interest are defined
bands_of_interest = ['blue',
                     'green',
                     'red', 
                     'nir',
                     'swir1', 
                     'swir2']

#Landsat sensors of interest are defined
sensors = ['ls8', 'ls7', 'ls5'] 

#query is created
query = {'time': (start_of_epoch, end_of_epoch),}
query['x'] = (lon_min, lon_max)
query['y'] = (lat_max, lat_min)
query['crs'] = 'EPSG:4326'

{'time': ('1987-01-01', '2017-10-31'), 'x': (138.85, 138.895), 'y': (-35.08, -35.05), 'crs': 'EPSG:4326'}


# Data is extracted from Open Datacube

The extracted data is first filtered using the criteria in "mask_components". 
The cloudiness of the scenes is then tested, and any scenes that do not meet the given "cloud_free_threshold" are discarded.
Additionally, any pixel that is located within the ocean/sea will be converted to "nan" values with the 'land_sea' command.

In [12]:
#Create cloud mask. This will define which pixel quality (PQ) artefacts are removed from the results.
# It should be noted the "land_sea" code will remove all ocean/sea pixels.

mask_components = {'cloud_acca':'no_cloud',
'cloud_shadow_acca' :'no_cloud_shadow',
'cloud_shadow_fmask' : 'no_cloud_shadow',
'cloud_fmask' :'no_cloud',
'blue_saturated' : False,
'green_saturated' : False,
'red_saturated' : False,
'nir_saturated' : False,
'swir1_saturated' : False,
'swir2_saturated' : False,
'contiguous':True,
'land_sea': 'land'}

In [26]:
start = datetime.datetime.now()
print('Load Started At: ' + str(start))

# Data for each Landsat sensor is retrieved and saved in a dict for concatenation
sensor_clean = {}

# Connect to DataCube
dc = datacube.Datacube(app='testing')

for sensor in sensors:
    # Load the NBAR and corresponding PQ
    sensor_nbar = dc.load(product= sensor+'_nbar_albers', group_by='solar_day', 
                          measurements = bands_of_interest,  **query)
    sensor_pq = dc.load(product= sensor+'_pq_albers', group_by='solar_day', 
                        fuse_func=ga_pq_fuser, **query)
    
    # Retrieve the projection information before masking/sorting
    crs = sensor_nbar.crs
    crswkt = sensor_nbar.crs.wkt
    affine = sensor_nbar.affine        

    # Combing the pq so it is a single 
    sensor_all = xr.auto_combine([sensor_pq,sensor_nbar])
    sensor_clean[sensor] = sensor_all

    print('Loaded %s' % sensor) 
    
print('Concatenating')
nbar_clean = xr.concat(sensor_clean.values(), 'time')
nbar_clean = nbar_clean.sortby('time')
nbar_clean.attrs['crs'] = crs
nbar_clean.attrs['affin|e'] = affine    
    
print ('Load and Xarray build complete')
print('Process took ' + str(datetime.datetime.now() - start))


#Check that the concatenation worked
nbar_clean

Load Started At: 2018-01-15 13:40:01.982642
Loaded ls8
Loaded ls7
Loaded ls5
Concatenating
Load and Xarray build complete
Process took 0:06:51.762970


<xarray.Dataset>
Dimensions:       (time: 710, x: 171, y: 143)
Coordinates:
  * y             (y) float64 -3.844e+06 -3.844e+06 -3.844e+06 -3.844e+06 ...
  * x             (x) float64 6.228e+05 6.228e+05 6.229e+05 6.229e+05 ...
  * time          (time) datetime64[ns] 1987-09-20T00:00:31 ...
Data variables:
    pixelquality  (time, y, x) int16 16383 16383 16383 16383 16383 13311 ...
    blue          (time, y, x) int16 318 274 341 385 851 1397 1762 1954 1848 ...
    green         (time, y, x) int16 473 379 473 519 939 1539 2040 2221 2086 ...
    red           (time, y, x) int16 326 284 368 368 827 1450 1862 2027 1986 ...
    nir           (time, y, x) int16 3561 2752 2514 2370 2800 3561 4130 4366 ...
    swir1         (time, y, x) int16 1207 970 801 801 1241 2154 2593 2796 ...
    swir2         (time, y, x) int16 384 334 334 334 786 1490 1942 1841 2092 ...
Attributes:
    crs:      EPSG:3577
    affin|e:  | 25.00, 0.00, 622800.00|\n| 0.00,-25.00,-3844375.00|\n| 0.00, ...

## Pixel Quality Stuff

In [None]:
#     #Discard data that does not meet the cloud_free_threshold
#     mostly_good = sensor_nbar2.where(mostly_cloud_free).dropna(dim='time', how='all')
    
#     mostly_good['product'] = ('time', np.repeat(sensor, mostly_good.time.size))    
#     sensor_clean[sensor] = mostly_good

#Ensure there's PQ to go with the NBAR
#     sensor_nbar = sensor_nbar.sel(time = sensor_pq.time)
    
    #Apply the PQ masks to the NBAR
#     quality_mask = masking.make_mask(sensor_pq, **mask_components)
#     good_data = quality_mask.pixelquality.loc[start_of_epoch:end_of_epoch]
#     sensor_nbar2 = sensor_nbar.where(good_data)
    
    #Calculate the percentage cloud free for each scene
#     cloud_free = masking.make_mask(sensor_pq, cloud_acca='no_cloud', cloud_fmask='no_cloud', 
#                                    contiguous=True).pixelquality
#     mostly_cloud_free = cloud_free.mean(dim=('x','y')) >= cloud_free_threshold

In [24]:
sensor = 'ls8'
#Load the NBAR and corresponding PQ
# sensor_nbar = dc.load(product= sensor+'_nbar_albers', group_by='solar_day', 
#                       measurements = bands_of_interest,  **query)

# sensor_nbar
# sensor_pq = dc.load(product= sensor+'_pq_albers', group_by='solar_day', 
#                     fuse_func=ga_pq_fuser, **query)
xr.auto_combine([sensor_pq,sensor_nbar])
# #Retrieve the projection information before masking/sorting
# crs = sensor_nbar.crs
# crswkt = sensor_nbar.crs.wkt
# affine = sensor_nbar.affine


# sensor_combo = xr.concat([sensor_nbar, sensor_pq], 'time')
# sensor_combo


<xarray.Dataset>
Dimensions:       (time: 103, x: 171, y: 143)
Coordinates:
  * time          (time) datetime64[ns] 2013-04-20T00:35:21 ...
  * y             (y) float64 -3.844e+06 -3.844e+06 -3.844e+06 -3.844e+06 ...
  * x             (x) float64 6.228e+05 6.228e+05 6.229e+05 6.229e+05 ...
Data variables:
    pixelquality  (time, y, x) int16 16383 16383 16383 14335 14335 14335 ...
    blue          (time, y, x) int16 762 895 959 1009 1019 988 897 814 713 ...
    green         (time, y, x) int16 867 865 954 1049 1079 1005 956 914 927 ...
    red           (time, y, x) int16 1046 1049 1146 1257 1258 1182 1115 1055 ...
    nir           (time, y, x) int16 1881 1963 2042 2139 2168 2090 1966 1853 ...
    swir1         (time, y, x) int16 2435 2364 2445 2547 2535 2448 2410 2401 ...
    swir2         (time, y, x) int16 1984 1928 2001 2126 2123 2036 2005 1979 ...

## Playing

In [15]:
#Check the output

sensor_clean['ls8']

<xarray.Dataset>
Dimensions:  (time: 103, x: 171, y: 143)
Coordinates:
  * time     (time) datetime64[ns] 2013-04-20T00:35:21 2013-05-06T00:35:22 ...
  * y        (y) float64 -3.844e+06 -3.844e+06 -3.844e+06 -3.844e+06 ...
  * x        (x) float64 6.228e+05 6.228e+05 6.229e+05 6.229e+05 6.229e+05 ...
Data variables:
    blue     (time, y, x) int16 762 895 959 1009 1019 988 897 814 713 768 ...
    green    (time, y, x) int16 867 865 954 1049 1079 1005 956 914 927 959 ...
    red      (time, y, x) int16 1046 1049 1146 1257 1258 1182 1115 1055 1047 ...
    nir      (time, y, x) int16 1881 1963 2042 2139 2168 2090 1966 1853 1924 ...
    swir1    (time, y, x) int16 2435 2364 2445 2547 2535 2448 2410 2401 2229 ...
    swir2    (time, y, x) int16 1984 1928 2001 2126 2123 2036 2005 1979 1848 ...
Attributes:
    crs:      EPSG:3577

In [26]:
nbar_clean.red

<xarray.DataArray 'red' (time: 145, y: 257, x: 269)>
array([[[ 135.,  189., ...,  136.,  190.],
        [ 162.,  162., ...,  217.,  244.],
        ..., 
        [ 407.,  461., ...,  164.,  164.],
        [ 407.,  407., ...,  164.,  164.]],

       [[ 154.,  127., ...,  207.,  207.],
        [ 154.,  154., ...,  233.,  260.],
        ..., 
        [ 496.,  575., ...,  155.,  155.],
        [ 470.,  443., ...,  155.,  129.]],

       ..., 
       [[ 233.,  230., ...,  322.,  351.],
        [ 211.,  230., ...,  320.,  316.],
        ..., 
        [ 985.,  965., ...,  345.,  281.],
        [ 948.,  826., ...,  384.,  285.]],

       [[ 253.,  237., ...,  331.,  362.],
        [ 221.,  237., ...,  343.,  342.],
        ..., 
        [ 735.,  743., ...,  266.,  288.],
        [ 693.,  677., ...,  263.,  274.]]])
Coordinates:
  * y        (y) float64 -2.279e+06 -2.279e+06 -2.279e+06 -2.279e+06 ...
  * x        (x) float64 1.714e+06 1.714e+06 1.714e+06 1.714e+06 1.714e+06 ...
  * time     (tim

In [34]:
for colour in ['red','green','blue']:
    print(nbar_clean.variables['red'][0,:,:])

<xarray.Variable (y: 257, x: 269)>
array([[ 135.,  189.,  162., ...,  244.,  136.,  190.],
       [ 162.,  162.,  162., ...,  217.,  217.,  244.],
       [ 135.,  108.,  135., ...,  217.,  217.,  217.],
       ..., 
       [ 461.,  758.,  812., ...,  109.,  164.,  191.],
       [ 407.,  461.,  542., ...,  137.,  164.,  164.],
       [ 407.,  407.,  434., ...,  137.,  164.,  164.]])
Attributes:
    nodata:   -999
    units:    1
    crs:      EPSG:3577
<xarray.Variable (y: 257, x: 269)>
array([[ 135.,  189.,  162., ...,  244.,  136.,  190.],
       [ 162.,  162.,  162., ...,  217.,  217.,  244.],
       [ 135.,  108.,  135., ...,  217.,  217.,  217.],
       ..., 
       [ 461.,  758.,  812., ...,  109.,  164.,  191.],
       [ 407.,  461.,  542., ...,  137.,  164.,  164.],
       [ 407.,  407.,  434., ...,  137.,  164.,  164.]])
Attributes:
    nodata:   -999
    units:    1
    crs:      EPSG:3577
<xarray.Variable (y: 257, x: 269)>
array([[ 135.,  189.,  162., ...,  244.,  136.,  190.

In [29]:
nbar_clean

<xarray.Dataset>
Dimensions:  (time: 145, x: 269, y: 257)
Coordinates:
  * y        (y) float64 -2.279e+06 -2.279e+06 -2.279e+06 -2.279e+06 ...
  * x        (x) float64 1.714e+06 1.714e+06 1.714e+06 1.714e+06 1.714e+06 ...
  * time     (time) datetime64[ns] 2000-06-30T23:56:29.500000 ...
Data variables:
    green    (time, y, x) float64 205.0 206.0 206.0 267.0 236.0 236.0 206.0 ...
    red      (time, y, x) float64 135.0 189.0 162.0 162.0 135.0 189.0 162.0 ...
    nir      (time, y, x) float64 2.473e+03 2.278e+03 2.473e+03 2.667e+03 ...
    swir1    (time, y, x) float64 839.0 803.0 839.0 944.0 979.0 909.0 698.0 ...
    product  (time) <U3 'ls7' 'ls7' 'ls7' 'ls7' 'ls7' 'ls7' 'ls7' 'ls7' ...
Attributes:
    crs:      EPSG:3577
    affin|e:  | 25.00, 0.00, 1713675.00|\n| 0.00,-25.00,-2279200.00|\n| 0.00,...