# Training Data Picking Tool - Setup

## Imports

In [1]:
%matplotlib notebook
import os

import numpy as np
import pandas as pd
import xarray as xr

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

import gdal

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider, FloatSlider, Dropdown
from IPython.display import display

from skimage import exposure
from scipy.signal import lfilter

import warnings

## function: getData
This code is specific to the intermediate output files from Peter's urban change algorithm.

In [2]:
def getData(study_area):
    # build a list of all files in the directory (ie the folder for that location)
    location = study_area + '/'
    files = os.listdir(location)

    # build a list of all the NBAR*.img file names and which bands they represent
    NBARfiles = []
    bands = []
    for file in files:
        if file[-4::] == '.img' and file[0:4] == 'NBAR':
            NBARfiles.append(file)
            bands.append(file.split('NBAR_')[1].split('.img')[0])

    # open all the .img files with NBAR in the name, convert to numpy array, swap axes so order is (x, y, t)
    # and save to dict
    raw_data = {}
    for i in range(len(NBARfiles)):
        raw_data[bands[i]] = gdal.Open(location + NBARfiles[i]).ReadAsArray().swapaxes(0,2)
#     num_scenes = len(raw_data['red'][0][0])   # delete this?

    # build a list of all the dates represented by each band in the NBAR files
    # reuse the list of NBAR file names, but this time access the .hdr file
    in_dates = False
    dates = []
    for line in open(location + NBARfiles[0].split('.img')[0] + '.hdr'):
        if line[0] == '}':
            continue
        if in_dates:
            dates.append(line.split(',')[0].strip())
        if line[0:10] == 'band names':
            in_dates = True

    # save list of satellite originated bands
    sat_bands = bands.copy()

    # add the yet to be calculated derivative bands to the overall bands list
    bands += ['cloud_mask']

    # building the Xarray
    # define the size for the numpy array that will hold all the data for conversion into XArray
    x = len(raw_data['red'])
    y = len(raw_data['red'][0])
    t = len(raw_data['red'][0][0])
    n = len(bands)

    # create an empty numpy array of the correct size
    alldata = np.zeros((x, y, t, n), dtype=np.float32)

    # populate the numpy array with the satellite data
    # turn all no data NBAR values to NaNs
    for i in range(len(sat_bands)):
        alldata[:,:,:,i] = raw_data[sat_bands[i]]
        alldata[:,:,:,i][alldata[:,:,:,i] == -999] = np.nan

    # convert the numpy array into an xarray, with appropriate lables, and axes names
    data = xr.DataArray(alldata, coords = {'x':range(x), 'y':range(y), 'date':dates, 'band':bands},
                 dims=['x', 'y', 'date', 'band'])
    
    # import cloudmask and add to xarray
    cloudmask = gdal.Open(location + '/tsmask.img').ReadAsArray().swapaxes(0,2)
    data.loc[:,:,:,'cloud_mask'] = cloudmask
    
    return data

## function: drawTrainingPlot

In [3]:
def drawTrainingPlot(study_area, scene_num, covertype):
    # kept for easy extnesion to multiple subplots for the training view
    ax1 = drawTrainingScene(study_area, scene_num, covertype)   
    plt.draw()

## function: drawTrainingScene

In [4]:
def drawTrainingScene(study_area, scene_num, covertype):
    # get data for selected study area
    data = getData(study_area)
    
    # colour map included incase of need to display false colour or other in the future
    # could change this to an ordereddict and remove the RGB list created below...?
    colourmap = {'R':'red', 'G':'green', 'B':'blue'}
    
    # combine the data for the 3 bands to be displayed into a single numpy array
    h = data.shape[1]
    w = data.shape[0]
    t = data.shape[2]
    if scene_num > (t -1):
        scene_num = t - 1
    RGB = ['R','G','B']
    date = str(data[:,:,scene_num].date.values)
    
    # create array to store the RGB info in, and fill by looping through the colourmap variable
    # note the .T at the end, because the data array is setup as a (x,y,t), but imshow works (y,x)
    rawimg = np.zeros((h, w, 3), dtype=np.float32)
    for i in range(len(RGB)):     
        rawimg[:,:,i] = data[:,:,scene_num].sel(band=colourmap[RGB[i]]).T
        
    # equalizing for all bands together
    # goal is to make is human interpretable
    img_toshow = exposure.equalize_hist(rawimg, mask = np.isfinite(rawimg))    

    # displaying the results and formatting the axes etc
    plt.imshow(img_toshow)
    ax = plt.gca()
    ax.set_title('True Colour Landsat Scene, taken\n' + date + ', over ' + study_area)
    
    otherpicks = np.zeros((h,w), dtype=np.uint8)
    picks = np.zeros((h,w), dtype=np.uint8)
    if study_area in trainingdata.index:
        if scene_num in trainingdata.loc[study_area].index:
            temp = trainingdata.loc[study_area].loc[scene_num]
            for i in range(len(temp)):
                if temp.iloc[i]['landcover'] == landcover[covertype]:
                    position = temp.iloc[i].name
                    picks[position[0],position[1]] = 1
                else:
                    position = temp.iloc[i].name
                    otherpicks[position[0],position[1]] = 1
            allpicks = np.zeros((h,w,4), dtype = np.float32)
            allpicks[:,:,0] = picks
            allpicks[:,:,3][picks != 0] = 1
            allpicks[:,:,2] = otherpicks
            allpicks[:,:,3][otherpicks != 0] = 1
            allpicks[allpicks == 0] = np.nan

            ax.imshow(allpicks)
    
    return ax

## function: train

In [5]:
# some broad scope variables that need setting up

global xpos
global ypos
xpos = 0
ypos = 0

# easier to work with integers than strings, so map the planned training classes to integers
landcover = {'vegetation':1,'urban':2,'earth':3,'water':4}
# range of pretermined study areas to use as sources for training data
study_areas = ['mtbarker', 'molonglo', 'nperth', 'swbris', 'swmelb', 'swsyd']

# setup a multilevel heirachrical index dataframe to store the results
# storing the training data in this format is way more memory efficient than in an Xarray of same size as data
# but it takes a lot of processing and manipulation to get it into a more useable form

trainidx = pd.MultiIndex(levels = [[]]*4, labels = [[]]*4, names=['study_area', 'scene_num', 'row','column'])
traincols = ['landcover']
trainingdata = pd.DataFrame(index = trainidx, columns = traincols)

def train(study_area, scene_num, covertype):

    def onclick(event):
        # defining what to do on a click event
        
        # I don't understand why this need to be declared global again, but it breaks without these lines
        global xpos
        global ypos
        # need to cast to int as result is a float, and can't index a list with a float
        xpos = int(event.xdata)
        ypos = int(event.ydata)
        # save the results of the click to the training data dataframe
        trainingdata.loc[(study_area, scene_num, ypos, xpos)] = [landcover[covertype]]
        # redraw with the trained pixels updated on the image
        drawTrainingPlot(study_area, scene_num, covertype)
    
    # control the figure size
    fig = plt.figure(figsize=[10,10])
    axs = fig.axes
    plt.subplots_adjust(hspace = 0.6)
    
    # draw the figure
    drawTrainingPlot(study_area, scene_num, covertype)
    #connect the click event action to the figure
    cid = fig.canvas.mpl_connect('button_press_event', onclick)

# The Training Figure

In [6]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    interact(train,
             study_area = Dropdown(options=study_areas, value = study_areas[0], description='Study Area', disabled = False),
             scene_num = IntSlider(value = 1, min = 0, max = 2000,description = "Scene Number"),
             covertype = Dropdown(options=list(landcover.keys()), value=list(landcover.keys())[0], description='Landcover', disabled = False))

In [7]:
# check the outputs of the training data generation process
trainingdata

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,landcover
study_area,scene_num,row,column,Unnamed: 4_level_1
mtbarker,1,38,101,1
mtbarker,1,38,105,1
mtbarker,1,32,103,1
mtbarker,1,29,104,1
mtbarker,1,28,105,1
mtbarker,1,28,107,1
mtbarker,131,43,116,3
mtbarker,131,40,116,3
mtbarker,131,40,118,3
mtbarker,131,44,119,3


# Training Results Manipulation and Classifier Training

In [8]:
# Aim is to get the data from the dataframe (which holds references to the pixel's location, along with the
# assigned class for that pixel), use it to extract the spectral data for that pixel, format it appropriately
# and pass it to the classification algorithm to teach it.

# empty lists where the spectral data for each pixel (X) and the landcover class (Y) will be stored
X = []
Y = []

# useful variable for pulling out only the spectral data
sat_bands = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2']
dc_bands = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2', 'cloud_mask']

# loop through the different locations used for the training data.
for loc in trainingdata.index.levels[0]:
    
    # build the Xarray for that location
    data = getData(loc)
    # only look at the training data for that location
    subset = trainingdata.loc[loc]
    # for each row (ie each pick) at that location
    for i in range(len(subset)):
        # unpack the multilevel pandas index into components for accessing the correct Xarray pixel
        scene, y, x = subset.iloc[i].name
        if data[x, y, scene].sel(band='cloud_mask').values == 0:
            # if the pixel is valid (no cloud), take the spectral bands
            X_vect = data[x, y, scene].sel(band=sat_bands).values
            if np.isfinite(X_vect).all():
                # if all the bands have readings (no NaNs), save the relevant bits into X and Y
                Y.append(subset.iloc[i].values[0])        
                X.append(np.reshape(X_vect,(len(X_vect),1)).T)

# join the list of spectral vectors into a single 2D array
X = np.concatenate(X, axis = 0)
        
from sklearn import preprocessing

# scale and normalize the data
X_scaled = preprocessing.scale(X)
X_normalized = preprocessing.normalize(X_scaled)
X_normalized



array([[-0.39479834, -0.20103183, -0.4579601 ,  0.56202102, -0.32830667,
        -0.41272277],
       [-0.46247533, -0.46942592, -0.36761388,  0.22592355, -0.47991425,
        -0.3863377 ],
       [ 0.36207926,  0.46474674, -0.29363525,  0.55288416,  0.47928971,
        -0.17688371],
       [-0.10133391,  0.74710363, -0.27756268,  0.44000477,  0.39239869,
        -0.08334096],
       [-0.08646604,  0.63748747, -0.44228828,  0.44594139,  0.07938063,
        -0.43052226],
       [ 0.11182161,  0.6622445 , -0.35275033,  0.56127864,  0.29579604,
        -0.14821024],
       [-0.06411101,  0.10958982,  0.47482851, -0.34458989,  0.51077384,
         0.61545563],
       [ 0.58603477,  0.10836481,  0.46952084, -0.40408906,  0.09298779,
         0.50242996],
       [ 0.61855698,  0.11437855,  0.49557707, -0.39323774,  0.17063899,
         0.41827586],
       [ 0.33802253,  0.1401719 ,  0.60733396, -0.35957476,  0.47563392,
         0.37645262],
       [-0.29709566, -0.61186749, -0.10932432, -0.

In [9]:
from sklearn import svm

# create a support vector classifer, and fit the data to it
clf = svm.SVC()
clf.fit(X,Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# Formatting Remaining Data for Classification & Classifying It

In [12]:
# just run the classification over mt barker, will need to loop this through all eventually
data = getData('mtbarker')

#setting up the xarray to store the results for easy plotting later
for newband in ['landcover','predicted_landcover']:
    test = data[:,:,:].sel(band='red').copy()
    test.band.values = newband
    test.values[:] = np.nan
    data = xr.concat([data, test], dim='band')

shape = data.values.shape

# classify one scene at a time, save the results to the xarray    
for scene in range(1,2):
    
    # setting up dataframe multilevel indexes
    col_idx = list(range(shape[0])) * shape[1]
    row_idx = []
    study_area = 'mtbarker'
    for i in range(shape[1]):
        row_idx += [i] * shape[0]
    scene_idx = [scene] * (shape[0] * shape[1])

    # reshape the data into a 2D flat array for scikit learn
    flattened = data[:,:,scene].sel(band=dc_bands).values.reshape(shape[0] * shape[1], len(dc_bands))

    # add the data to a new DataFrame, set up the columns and index
    alldata = pd.DataFrame(flattened)
    alldata.columns = dc_bands
    alldata['row'] = row_idx
    alldata['column'] = col_idx
    alldata['scene_num'] = scene_idx
    alldata['study_area'] = study_area
    alldata = alldata.set_index(['study_area','scene_num','row','column'])

    # join in the training data. This is a SQL left join, so only adds data to current study area
    alldata = alldata.reset_index().join(trainingdata[['landcover']], on=trainingdata.index.names).set_index(alldata.index.names)

    # reduce all data down to valid pixels (ie cloudmask), and non-training pixels (ie landcover is still NaN)
    datatoclassify = alldata[alldata['cloud_mask'] == 0 & np.isnan(alldata['landcover'])].copy()
    # remove pixels with a np.nan as scikit-learn doesn't like them. Only keep spectral bands
    datatoclassify = datatoclassify[np.isnan(datatoclassify['landcover'])][sat_bands]
    # cast these relevant columns into a numpy array
    datatoclassify_np = np.array(datatoclassify)

    # results of predict() are a 1 dimensional numpy array of the same length as the input data
    # assign these results to a new column in the dataframe
    datatoclassify['predicted_landcover'] = clf.predict(datatoclassify_np)
    
    # SQL left join the results back onto the original data
    alldata = alldata.reset_index().join(datatoclassify[['predicted_landcover']], on=trainingdata.index.names).set_index(alldata.index.names)
    
    data[:,:,scene].loc[dict(band='landcover')] = alldata['landcover'].values.reshape(shape[0],shape[1])
    data[:,:,scene].loc[dict(band='predicted_landcover')] = alldata['predicted_landcover'].values.reshape(shape[0],shape[1])

# Viewing the Results

In [13]:
def drawClassifedScene(data, scene_num, alpha):
    
    # colour map included incase of need to display false colour or other in the future
    # could change this to an ordereddict and remove the RGB list created below...?
    colourmap = {'R':'red', 'G':'green', 'B':'blue'}
    
    # combine the data for the 3 bands to be displayed into a single numpy array
    h = data.shape[1]
    w = data.shape[0]
    t = data.shape[2]
    
    if scene_num > (t -1):
        scene_num = t - 1
    RGB = ['R','G','B']
    date = str(data[:,:,scene_num].date.values)
    
    # create array to store the RGB info in, and fill by looping through the colourmap variable
    # note the .T at the end, because the data array is setup as a (x,y,t), but imshow works (y,x)
    rawimg = np.zeros((h, w, 3), dtype=np.float32)
    for i in range(len(RGB)):     
        rawimg[:,:,i] = data[:,:,scene_num].sel(band=colourmap[RGB[i]]).T
        
    # equalizing for all bands together
    # goal is to make is human interpretable
    img_toshow = exposure.equalize_hist(rawimg, mask = np.isfinite(rawimg))    

    # displaying the results and formatting the axes etc
    plt.imshow(img_toshow)
    ax = plt.gca()
    ax.set_title('True Colour Landsat Scene, taken\n' + date + ', over ' + study_area)
        
    ax.imshow(data[:,:,scene_num].sel(band='predicted_landcover'), alpha = alpha)
    ax.imshow(data[:,:,scene_num].sel(band='landcover'), alpha = 1)
    
    return ax

In [14]:
def drawClassifedPlots(data, scene_num, alpha):
    # make them plot sideways
    ax1 = drawClassifedScene(data, scene_num, alpha)
#     ax2 = drawClassifedScene(data, scene_num, alpha = 1)
    plt.draw()

In [15]:
def check(data, scene_num, alpha):
 
    # control the figure size
    fig = plt.figure(figsize=[10,10])
    axs = fig.axes
    plt.subplots_adjust(hspace = 0.6)
    
    # draw the figure
    drawClassifedPlots(data, scene_num, alpha)


In [16]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    interact(check,
             data = fixed(data),
             scene_num = IntSlider(value = 1, min = 0, max = 2000,description = "Scene Number"),
             alpha= FloatSlider(value = 0.6, min = 0, max = 1, description = "Classification Transparency"))