# Validation

Every classification, whether it is produced with a machine learning algorithm such as Random Forest or by thresholding a spectral index, should be validated against in-situ data (ground data). If in-situ data are also used to generate the classification (e.g. to train the model), the data used for validation must be independent of the data used for training; otherwise the accuracy estimate will be optimistically biased.

In [1]:
import glob, os, time
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio import features

import matplotlib.pyplot as plt
import scipy
import sklearn
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from pathlib import Path
from IPython.display import display

# Report the version of each main library used in this notebook.
# The labels carry their own trailing space so the printed text is unchanged.
for lib_label, lib_module in (('Numpy ', np),
                              ('Pandas ', pd),
                              ('GeoPandas ', gpd),
                              ('Scipy', scipy),
                              ('Scikit-learn', sklearn)):
    print(f'{lib_label}: {lib_module.__version__}')

Numpy : 1.19.2
Pandas : 1.1.5
GeoPandas : 0.8.1
Scipy: 1.5.2
Scikit-learn: 0.24.1


## Set paths for input and output directories

In [14]:
# Root path of the data volume (the commented-out line is an alternative root)
#computer_path = 'X:/'
computer_path = '/Volumes/nbdid-sst-lbrat2104/'
grp_letter    = 'X'

# Directory for all work files.
# Convention in this cell: every directory variable ends with '/',
# so file names are appended directly without an extra separator.
work_path = f'{computer_path}GROUP_{grp_letter}/WORK/'


# ----- #
# INPUT #
# ----- #

# Product name: base name of the classification raster, reused for output files
prod_name = 'Classif_RF_with_NDVI'

# Classification map to validate
classif_tif = f'{work_path}CLASSIF/{prod_name}.tif'

# In-situ shapefile used to validate the classification map
in_situ_val_shp = f'{work_path}IN_SITU_SD/WALLONIA_2018_IN_SITU_ROI_val.shp'

# Name of the attribute field holding the class code
field_name_code = 'CODE'

# ------ #
# OUTPUT #
# ------ #

# Directory receiving the accuracy metrics
am_path = f'{work_path}ACCURACY_METRICS/'

# Confusion matrix CSV file.
# am_path already ends with '/': do not add another separator (avoids '//').
cm_csv = f'{am_path}{prod_name}_CM.csv'

# Create the output directory (and its parents) if it does not exist yet
Path(am_path).mkdir(parents=True, exist_ok=True)

print(f'Accuracy metrics path are set to : {am_path}')

Accuracy metrics path are set to : /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/ACCURACY_METRICS/


## Rasterize in-situ data validation shapefile

In [12]:

# Output raster: same path as the validation shapefile, with a .tif extension
in_situ_val_tif = f'{in_situ_val_shp[:-4]}.tif'

# Value burned wherever no validation polygon covers the pixel
no_data_rasterize = -999

# Open the shapefile with GeoPandas
in_situ_gdf = gpd.read_file(in_situ_val_shp)

# Open the classification map as the template (grid, CRS, dtype) for the
# rasterization. The context manager closes it even if an error occurs below.
with rasterio.open(classif_tif, "r") as src:

    # Reuse the template metadata, only changing the nodata value
    # NOTE(review): the template dtype must be able to store -999
    # (i.e. a signed integer or float type) -- confirm for other products.
    profile = src.profile
    profile.update(nodata=no_data_rasterize)

    print(f'The CRS of in-situ data is       : {in_situ_gdf.crs}')
    print(f'The CRS of classification map is : {src.crs}')

    if in_situ_gdf.crs == src.crs:
        print("CRS are the same")
    else:
        # Bring the polygons into the raster CRS instead of skipping the
        # rasterization (which left the next cell without a valid raster).
        print('CRS are different --> repoject in-situ data shapefile with "to_crs"')
        in_situ_gdf = in_situ_gdf.to_crs(src.crs.to_wkt())

    print(f'Rasterize {in_situ_val_shp}')

    # (geometry, class code) pairs consumed by the rasterizer
    geom_col = in_situ_gdf.geometry
    code_col = in_situ_gdf[field_name_code].astype(int)
    shapes = zip(geom_col, code_col)

    # Burn the class codes on the template grid. Using out_shape + fill
    # guarantees that every pixel outside the polygons holds the nodata value.
    in_situ_arr = features.rasterize(shapes=shapes,
                                     out_shape=(src.height, src.width),
                                     fill=no_data_rasterize,
                                     transform=src.transform,
                                     dtype=profile['dtype'])

    # Write the rasterized in-situ data to band 1 of the output file
    with rasterio.open(in_situ_val_tif, 'w', **profile) as dst:
        dst.write(in_situ_arr, 1)


The CRS of in-situ data is       : epsg:32631
The CRS of classification map is : EPSG:32631
CRS are the same
Rasterize /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/IN_SITU_SD/WALLONIA_2018_IN_SITU_ROI_val.shp


## Create Confusion Matrix

In [15]:
# Read the rasterized in-situ validation data (band 1)
with rasterio.open(in_situ_val_tif, "r") as src:
    val_arr = src.read(1)

# Read the classification map (band 1)
with rasterio.open(classif_tif, "r") as src:
    classif_arr = src.read(1)

# Both rasters must share the same grid to be compared pixel by pixel
if val_arr.shape != classif_arr.shape:
    raise ValueError(f'Shape mismatch: validation raster {val_arr.shape} '
                     f'vs classification map {classif_arr.shape}')

# Boolean mask of the validation pixels (everything that is not nodata)
idx = val_arr != no_data_rasterize

# Get the truth vector and the prediction vector
truth_val   = val_arr[idx]
predict_val = classif_arr[idx]

if truth_val.size == 0:
    raise ValueError('No validation pixel found in the rasterized in-situ data')

# Sorted class codes found in the truth or in the prediction.
# This is the order scikit-learn uses by default; passing it explicitly lets
# us label the matrix with the real class codes instead of positions 0..n-1.
class_codes = np.unique(np.concatenate((truth_val, predict_val)))

# Compute Confusion Matrix (rows = truth, columns = prediction)
cm = confusion_matrix(truth_val, predict_val, labels=class_codes)

# Convert CM into a pandas DataFrame labelled with the class codes and save it.
# index=False keeps the CSV layout unchanged (one header row + n data rows);
# rows follow the same class order as the header.
cm_df = pd.DataFrame(cm, index=class_codes, columns=class_codes)
display(cm_df)
cm_df.to_csv(cm_csv, index=False, sep=';')

cm_values = cm_df.values





#pd.DataFrame(cm).to_csv(cm_csv, index=False, sep=';')
#disp = ConfusionMatrixDisplay(confusion_matrix=cm)
#plot_confusion_matrix(rf, class_prediction, val_arr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,794,0,0,5,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,142,1,0,0,0,0,0,0,0,45,...,0,0,0,0,0,0,0,0,0,0
2,10,0,0,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,557,0,30,0,0,0,2,...,2,0,0,0,0,0,0,0,0,0
4,10,0,0,23,2,16,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,2,0,54,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,11,0,0,18,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,6,0,0,3,...,0,0,0,1,0,0,0,0,0,0
8,0,0,0,9,0,0,82,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,95,0,0,68,0,0,113,0,0,9523,...,123,0,0,84,0,0,0,0,8,0


## Compute accuracy metrics

### Compute Overall Accuracy

In [16]:
# Overall Accuracy = pixels on the diagonal of the confusion matrix
# (truth == prediction) divided by all validation pixels, in percent.
n_correct = np.trace(cm)
n_validated = float(np.sum(cm))
OA = (n_correct / n_validated) * 100

print(f'Overall Accuracy : {round(OA, 2)}%')

Overall Accuracy : 41.32%


### Compute F-Scores