# Validation

Whenever a classification is produced — whether with a machine-learning algorithm such as Random Forest or by thresholding a spectral index — it should be validated against in-situ data (ground data). If in-situ data are also used to generate the classification, the data used to validate it must be independent of the data used to generate it.

In [1]:
import glob, os, time
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio import features

import matplotlib.pyplot as plt
import scipy
import sklearn
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, accuracy_score, classification_report

from pathlib import Path
from IPython.display import display

# Report the version of each key library so the run can be reproduced.
# The label strings keep their original (uneven) spacing on purpose.
for label, module in (
    ('Numpy : ', np),
    ('Pandas : ', pd),
    ('GeoPandas : ', gpd),
    ('Scipy: ', scipy),
    ('Scikit-learn: ', sklearn),
):
    print(f'{label}{module.__version__}')

Numpy : 1.19.2
Pandas : 1.1.5
GeoPandas : 0.8.1
Scipy: 1.5.2
Scikit-learn: 0.24.1


## Set paths for input and output directories

In [3]:
# Root of the shared drive (uncomment the first line when working on Windows)
#computer_path = 'X:/'
computer_path = '/Volumes/nbdid-sst-lbrat2104/'
grp_letter    = 'X'

# Directory for all work files
work_path = f'{computer_path}GROUP_{grp_letter}/WORK/'


# ----- #
# INPUT #
# ----- #

# Product name (used as the base name of the classification map and of the outputs)
prod_name = 'Classif_RF_with_NDVI'

# Path of the classification map to validate
classif_tif = f'{work_path}CLASSIF/{prod_name}.tif'

# Path of the in-situ shapefile used to validate the classification map
in_situ_val_shp = f'{work_path}IN_SITU_SD/WALLONIA_2018_IN_SITU_ROI_val.shp'

# Name of the shapefile field holding the class code
field_name_code = 'CODE'

# ------ #
# OUTPUT #
# ------ #

# Directory receiving the accuracy metrics (note the trailing slash)
am_path = f'{work_path}ACCURACY_METRICS/'

# Paths of the Confusion Matrix outputs.
# `am_path` already ends with '/', so no extra separator is inserted here
# (the previous version produced '.../ACCURACY_METRICS//<name>').
cm_csv = f'{am_path}{prod_name}_CM.csv'
cm_png = f'{am_path}{prod_name}_CM.png'

# Create the output directory (and any missing parent) if it does not exist yet
Path(am_path).mkdir(parents=True, exist_ok=True)

print(f'Accuracy metrics path are set to : {am_path}')

Accuracy metrics path are set to : /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/ACCURACY_METRICS/


## Rasterize in-situ data validation shapefile

In [4]:

# Output raster: same path as the validation shapefile, with a '.tif' extension
in_situ_val_tif = f'{in_situ_val_shp[:-4]}.tif'

# Value given to every pixel that is not covered by a validation polygon
no_data_rasterize = -999

# Open the shapefile with GeoPandas
in_situ_gdf = gpd.read_file(in_situ_val_shp)

# Open the raster file used as a template for the rasterization.
# The context manager guarantees the dataset is closed even if an error occurs.
with rasterio.open(classif_tif, "r") as src:

    # Reuse the template metadata, only overriding the nodata value.
    # NOTE(review): the template dtype is kept as-is, so it must be able to
    # hold `no_data_rasterize` (i.e. a signed type) — confirm for your map.
    profile = src.profile
    profile.update(nodata=no_data_rasterize)

    print(f'The CRS of in-situ data is       : {in_situ_gdf.crs}')
    print(f'The CRS of classification map is : {src.crs}')

    if in_situ_gdf.crs == src.crs:
        print("CRS are the same")

        print(f'Rasterize {in_situ_val_shp}')

        # Build the (geometry, class code) pairs burned into the raster
        geom_col = in_situ_gdf.geometry
        code_col = in_situ_gdf[field_name_code].astype(int)
        shapes = zip(geom_col, code_col)

        # Burn the features on the template grid. `out_shape` + `fill` are used
        # (rather than `out=`) because `fill` is ignored when an output array is
        # supplied; this makes the background value explicit.
        in_situ_arr = features.rasterize(shapes=shapes,
                                         out_shape=(src.height, src.width),
                                         fill=no_data_rasterize,
                                         transform=src.transform,
                                         dtype=profile['dtype'])

        # Write the rasterized validation data out; the file is only created
        # (and closed) in this branch, so nothing is left dangling otherwise.
        with rasterio.open(in_situ_val_tif, 'w', **profile) as dst:
            dst.write(in_situ_arr, 1)

    else:
        # No raster is written in this case: the shapefile must be reprojected first
        print('CRS are different --> repoject in-situ data shapefile with "to_crs"')


The CRS of in-situ data is       : epsg:32631
The CRS of classification map is : EPSG:32631
CRS are the same
Rasterize /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/IN_SITU_SD/WALLONIA_2018_IN_SITU_ROI_val.shp


## Create Confusion Matrix

In [6]:
# Open in-situ used for validation
with rasterio.open(in_situ_val_tif, "r") as src:
    val_arr = src.read(1)

# Open classification map
with rasterio.open(classif_tif, "r") as src:
    classif_arr = src.read(1)

# The pixel-wise comparison below requires both rasters to share the same grid
assert val_arr.shape == classif_arr.shape, (
    f'Shape mismatch: validation {val_arr.shape} vs classification {classif_arr.shape}'
)

# Boolean mask of the validation pixels (everything that is not nodata)
idx = val_arr != no_data_rasterize

# Ground truth (correct) target values
y_true = val_arr[idx]
# Estimated targets as returned by a classifier.
y_pred = classif_arr[idx]

# Sorted list of every class code found in the reference or in the prediction
labels_all    = np.concatenate((y_true, y_pred))
labels_unique = np.unique(labels_all)

# Compute Confusion Matrix (rows = reference classes, columns = predicted classes).
# Passing `labels` explicitly guarantees rows/columns follow `labels_unique`.
cm = confusion_matrix(y_true, y_pred, labels=labels_unique)

# Convert CM into a pandas DataFrame labelled with the class codes and save it.
# The labels are attached BEFORE saving so the CSV keeps the class codes
# as header row and first column.
cm_df = pd.DataFrame(cm, index=labels_unique, columns=labels_unique)

cm_df.to_csv(cm_csv, sep=';')

cm_values = cm_df.values

display(cm_df)



Unnamed: 0,6,9,12,19,20,21,36,45,53,62,...,901,931,959,4111,9202,9410,9535,9741,9742,9812
6,794,0,0,5,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,142,1,0,0,0,0,0,0,0,45,...,0,0,0,0,0,0,0,0,0,0
12,10,0,0,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,1,0,0,557,0,30,0,0,0,2,...,2,0,0,0,0,0,0,0,0,0
20,10,0,0,23,2,16,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0,0,0,2,0,54,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,11,0,0,18,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45,0,0,0,0,0,0,6,0,0,3,...,0,0,0,1,0,0,0,0,0,0
53,0,0,0,9,0,0,82,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62,95,0,0,68,0,0,113,0,0,9523,...,123,0,0,84,0,0,0,0,8,0


## Compute accuracy metrics

### Compute Overall Accuracy

In [7]:

# By hand: correctly classified pixels (matrix diagonal) over all validation pixels
n_correct = np.trace(cm)
n_total = float(np.sum(cm))
OA = n_correct / n_total

# With scikit-learn function
OA_sk = accuracy_score(y_true, y_pred)

# Both values are reported as percentages rounded to two decimals
OA_pct = round(OA*100,2)
OA_sk_pct = round(OA_sk*100,2)

print(f'Overall Accuracy (by hand)      : {OA_pct}%')
print(f'Overall Accuracy (scikit-learn) : {OA_sk_pct}%')


Overall Accuracy (by hand)      : 41.32%
Overall Accuracy (scikit-learn) : 41.32%


### Build a text report showing the main classification metrics

Some labels in `y_true` don't appear in `y_pred`: these are classes that are present in the validation data but are never predicted by the classification. They are listed below, before the report:

In [8]:
# Classes present in the validation data but never predicted by the classification
classes_true = set(y_true)
classes_pred = set(y_pred)
classes_missing = classes_true - classes_pred
n_missing = len(classes_missing)
print(f'{n_missing} classes are missing in the classification (y_pred) : {classes_missing} \n')

# Per-class precision / recall / F1-score for every label seen in y_true or y_pred
acc_metrics_all = classification_report(
    y_true,
    y_pred,
    labels=labels_unique,
)

print(acc_metrics_all)

7 classes are missing in the classification (y_pred) : {9535, 743, 12, 9202, 341, 53, 959} 

              precision    recall  f1-score   support

           6       0.64      0.99      0.78       802
           9       1.00      0.01      0.01       188
          12       0.00      0.00      0.00        21
          19       0.43      0.91      0.59       613
          20       0.29      0.03      0.06        61
          21       0.54      0.96      0.69        56
          36       0.01      0.02      0.01      1173
          45       0.00      0.00      0.00       354
          53       0.00      0.00      0.00       515
          62       0.79      0.92      0.85     10301
          73       0.00      0.00      0.00       374
          91       0.00      0.00      0.00      8729
         201       0.07      0.36      0.11      1649
         311       0.80      0.42      0.55     18885
         321       0.16      0.59      0.25      2161
         341       0.00      0.00      0.0

If you decide that you are not interested in the scores of labels that were not predicted, then you can explicitly specify the labels you are interested in (which are labels that were predicted at least once).

In [9]:
# Restrict the report to the labels that were predicted at least once
labels_pred = np.unique(y_pred)
acc_metrics_pred = classification_report(
    y_true,
    y_pred,
    labels=labels_pred,
)

print(acc_metrics_pred)


              precision    recall  f1-score   support

           6       0.64      0.99      0.78       802
           9       1.00      0.01      0.01       188
          19       0.43      0.91      0.59       613
          20       0.29      0.03      0.06        61
          21       0.54      0.96      0.69        56
          36       0.01      0.02      0.01      1173
          45       0.00      0.00      0.00       354
          62       0.79      0.92      0.85     10301
          73       0.00      0.00      0.00       374
          91       0.00      0.00      0.00      8729
         201       0.07      0.36      0.11      1649
         311       0.80      0.42      0.55     18885
         321       0.16      0.59      0.25      2161
         342       0.63      0.76      0.69       307
         541       0.65      0.40      0.49       358
         542       0.00      0.00      0.00       285
         821       0.00      0.00      0.00       733
         901       0.20    