In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (IPython's default setting is 'last_expr'). This is why later cells assign
# unwanted return values to `_`.
InteractiveShell.ast_node_interactivity = 'all'

%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add a local checkout of the repository to the module search path.
# NOTE(review): presumably this is what makes the `geospatial` package imported
# below resolvable; the absolute path is machine-specific - confirm before re-running.
sys.path.append('/Source/repos/GitHub_MSFT/landcover-orinoquiaa')

In [3]:
import json
import os
import pickle
from collections import defaultdict

import rasterio
from tqdm import tqdm
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
from PIL import Image

from geospatial.visualization.raster_label_visualizer import RasterLabelVisualizer

plt.rcParams['figure.figsize'] = (10.0, 10.0)

In [4]:
# from data/tile_and_mask.py - which needs to be run in the Solaris env

def get_lon_lat_from_tile_name(tile_name):
    """Extract the '_<lon>_<lat>' suffix from a tile file name or path.

    The name is split on underscores; the second-to-last piece is taken as the
    longitude and the last piece, truncated at the first '.tif', as the latitude.

    Args:
        tile_name: tile file name or path ending in '..._<lon>_<lat>.tif'.

    Returns:
        The string '_<lon>_<lat>', including the leading underscore.

    Raises:
        IndexError: if tile_name contains no underscore.
    """
    pieces = tile_name.split('_')
    lon = pieces[-2]
    lat = pieces[-1].split('.tif')[0]
    return '_' + lon + '_' + lat

# Evaluate tiles of model predictions

In [5]:
# Visualizer built from the coarse label map JSON. The cells below read its
# num_to_name, num_classes and num_to_color attributes.
viz_util = RasterLabelVisualizer('../constants/class_lists/wcs_coarse_label_map.json')

In [6]:
# Class names ordered by their numeric class id (the ids are stored as strings,
# so they are converted to int for sorting).
label_names = [
    name
    for _num, name in sorted(viz_util.num_to_name.items(), key=lambda pair: int(pair[0]))
]
label_names

['Empty of data',
 'Urban and infrastructure',
 'Agriculture',
 'Arboreal and forestry crops',
 'Pasture',
 'Vegetation',
 'Forest',
 'Savanna',
 'Sand, rocks and bare land',
 'Unavailable',
 'Swamp',
 'Water',
 'Seasonal savanna',
 'Seasonally flooded savanna']

In [None]:
# Directory listed by the evaluation loop; every file ending in '.tif' is read as a
# tile of model predictions.
output_paths = '/Data/WCS_land_use/delivered/20200701/results_coarse_baseline_201314'

# Directory holding the label masks, looked up as 'mask_<lon>_<lat>.tif' for each tile.
mask_paths = '/Data/WCS_land_use/train_full_region_median/tiles_masks_coarse'

# Path that the evaluation summary is written to as JSON (opened as a file, even
# though the name has no extension).
eval_saved_to = '/Data/WCS_land_use/train_full_region_median/result_val_analysis_coarse_baseline'

# Number of classes; sizes the confusion matrix and the per-class count arrays.
num_classes = viz_util.num_classes

In [None]:
# Evaluate every prediction tile against its label mask, accumulating:
#   tile_accuracies        - accuracy per tile, keyed by the tile's '_<lon>_<lat>' suffix
#   cm                     - pixel-level confusion matrix, cm[y_true, y_pred]
#   true_counts            - evaluated pixels per ground-truth class (row sums of cm)
#   pred_counts            - evaluated pixels per predicted class (column sums of cm)
#   classes_present_in_gt  - class ids that occur in the evaluated label pixels
tile_accuracies = {}

cm = np.zeros((num_classes, num_classes), dtype=np.int64)

true_counts = np.zeros((num_classes), dtype=np.int64)
pred_counts = np.zeros((num_classes), dtype=np.int64)

classes_present_in_gt = set()

# sorted() makes the processing order (and the key order of tile_accuracies) deterministic
for output_tile_fn in tqdm(sorted(os.listdir(output_paths))):
    if not output_tile_fn.endswith('.tif'):
        continue

    output_tile_path = os.path.join(output_paths, output_tile_fn)
    # context managers close the underlying image files once the pixels are copied out
    with Image.open(output_tile_path) as output_img:
        output_tile = np.array(output_img, dtype=np.uint8)

    # the matching label mask is named e.g. mask_-68.423_6.054.tif
    lon_lat = get_lon_lat_from_tile_name(output_tile_fn)
    label_mask_path = os.path.join(mask_paths, f'mask{lon_lat}.tif')
    with Image.open(label_mask_path) as label_img:
        label_mask = np.array(label_img, dtype=np.uint8)

    if output_tile.shape != label_mask.shape:
        raise ValueError(
            f'Shape mismatch for tile {lon_lat}: '
            f'output {output_tile.shape} vs label mask {label_mask.shape}'
        )

    # Keep only pixels that have both a label and a prediction:
    # label 0 is outside the boundary of the region, output 0 is where no imagery
    # is available on the tile.
    valid = (label_mask != 0) & (output_tile != 0)
    # int64 so that the combined index computed below cannot overflow uint8
    labels = label_mask[valid].astype(np.int64)
    output = output_tile[valid].astype(np.int64)

    if labels.size == 0:
        # nothing to evaluate on this tile; skipping keeps an undefined accuracy
        # out of the per-tile mean
        continue

    if labels.max() >= num_classes or output.max() >= num_classes:
        raise IndexError(
            f'Tile {lon_lat} contains a class id >= num_classes ({num_classes})'
        )

    classes_present_in_gt.update(np.unique(labels).tolist())

    tile_accuracy = accuracy_score(labels, output, normalize=True)
    tile_accuracies[lon_lat] = tile_accuracy

    # Vectorized confusion matrix: encode each (y_true, y_pred) pair as a single
    # index and count occurrences, instead of looping over pixels in Python.
    tile_cm = np.bincount(
        labels * num_classes + output,
        minlength=num_classes * num_classes
    ).reshape(num_classes, num_classes)

    cm += tile_cm
    true_counts += tile_cm.sum(axis=1)
    pred_counts += tile_cm.sum(axis=0)

if not tile_accuracies:
    raise RuntimeError(f'No .tif tiles with valid pixels were found in {output_paths}')

# Unweighted mean of the per-tile accuracies: every tile counts equally,
# regardless of how many valid pixels it contains.
overall_accuracy = sum(tile_accuracies.values())/len(tile_accuracies)
print(f'Overall accuracy is {overall_accuracy}')

# Accuracy over all evaluated pixels, for comparison with the per-tile mean above.
pixel_accuracy = np.trace(cm) / cm.sum()
print(f'Pixel-weighted accuracy is {pixel_accuracy}')

In [None]:
# Accuracy of each evaluated tile, keyed by its '_<lon>_<lat>' suffix
tile_accuracies

### Accurate distribution of land types
The shapefile's area attribute did not look correct, so the distribution shown here is the number of evaluated label pixels per class instead.

In [None]:
# Number of evaluated pixels per ground-truth class, indexed by class id
true_counts

### Confusion matrix

In [None]:
# Normalize each row of the confusion matrix by its ground-truth label count, so
# that cm_norm[y_true, y_pred] is the fraction of class-y_true pixels predicted as
# y_pred. Rows of classes with no ground-truth pixels stay all zeros.
# np.float64 is used because the np.float alias was removed in NumPy 1.24.
true_counts_col = np.asarray(true_counts, dtype=np.float64).reshape(-1, 1)
cm_norm = np.divide(
    cm,
    true_counts_col,
    out=np.zeros((num_classes, num_classes), dtype=np.float64),
    where=true_counts_col != 0
)

In [None]:
# Heatmap of the confusion matrix, following the matplotlib annotated-heatmap example:
# https://matplotlib.org/3.1.3/gallery/images_contours_and_fields/image_annotated_heatmap.html#sphx-glr-gallery-images-contours-and-fields-image-annotated-heatmap-py

cm_to_plot = cm_norm
class_ticks = np.array(range(num_classes))

fig = plt.figure(figsize=(10, 10), dpi=200)  # use dpi=300 for a sharper figure
ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
im = ax.matshow(cm_to_plot, cmap=plt.cm.YlGnBu)

# major ticks sit on the cell centres and carry the class names
_ = ax.set_xticks(class_ticks)
_ = ax.set_yticks(class_ticks)
_ = ax.set_xticklabels(label_names)
_ = ax.set_yticklabels(label_names)
_ = ax.set_ylabel('Provided labels')
_ = ax.set_xlabel('Predicted by model')
ax.xaxis.tick_top()

# class names on the x axis are drawn vertically
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(90)

# minor ticks sit on the cell edges; the white minor grid separates the cells
cell_edges = class_ticks - 0.5
_ = ax.set_xticks(cell_edges, minor=True)
_ = ax.set_yticks(cell_edges, minor=True)
ax.grid(which='minor', color='white', linestyle='-', linewidth=3)

cbar = fig.colorbar(im, ax=ax)

# hide the frame around the matrix
for spine in ax.spines.values():
    spine.set_visible(False)

# Save the figure by right-clicking the rendered output - the layout isn't right otherwise.

#fig.tight_layout()
#plt.savefig('/Users/siyuyang/Source/temp_data/WCS_land_use/train_200218/result_val/evaluation/cm.png')

In [None]:
# Spot checks of individual confusion-matrix cells.
# NOTE(review): the indices 30-33 look like class ids from a label map with more
# classes than the 14 coarse class names listed earlier in this notebook; if
# num_classes is 14 these lookups raise IndexError - confirm and update the ids.
cm_norm[32][32]
cm_norm[33][33]

cm_norm[30, 33]  # row, col - ground truth, predicted
cm_norm[33][30] 

### Side by side label and output counts, in log scale

### Per-class accuracy, precision and recall

In [None]:
# Per-class accuracy, precision and recall, derived one-vs-rest from the confusion
# matrix cm[y_true, y_pred]. Only classes that occur in the ground truth are reported.
total_obs = cm.sum()
gt_totals = cm.sum(axis=1)    # pixels whose label is each class (row sums)
pred_totals = cm.sum(axis=0)  # pixels predicted as each class (column sums)

per_class_accuracy = {}
per_class_recall = {}
per_class_precision = {}

for cls in range(num_classes):
    if cls not in classes_present_in_gt:
        continue

    true_pos = cm[cls, cls]
    false_pos = pred_totals[cls] - true_pos
    false_neg = gt_totals[cls] - true_pos
    true_neg = total_obs - true_pos - false_pos - false_neg

    # plain Python floats keep the dictionaries straightforward to print and serialize
    per_class_accuracy[cls] = float((true_pos + true_neg) / total_obs)

    # precision is undefined (NaN) for a class the model never predicted; checking the
    # denominator avoids a divide-by-zero RuntimeWarning from NumPy
    num_predicted = true_pos + false_pos
    per_class_precision[cls] = float(true_pos / num_predicted) if num_predicted > 0 else float('nan')

    num_labelled = true_pos + false_neg
    per_class_recall[cls] = float(true_pos / num_labelled) if num_labelled > 0 else float('nan')

In [None]:
# Print the per-class metrics as comma-separated rows, one row per reported class.
print('Category, Accuracy, Precision, Recall')
for cls in per_class_accuracy:
    category = f'{cls} {viz_util.num_to_name[str(cls)]}'
    metrics = (per_class_accuracy[cls], per_class_precision[cls], per_class_recall[cls])
    print(','.join([category, *(f'{value}' for value in metrics)]))

# Paste the result into Pages. Rows whose class name itself contains commas
# (e.g. 'Sand, rocks and bare land') need to be fixed by hand after pasting.

Since the dataset is so unbalanced (mostly 12 - dense forest) and accuracy counts "true negatives" as a win, per-class accuracy is not a good measure of performance; precision and recall are more informative.

### Save the evaluation findings - not yet done

In [None]:
# Collect the findings computed so far and write them to eval_saved_to as JSON.
saved = dict(
    overall_accuracy=overall_accuracy,
    per_class_accuracy=per_class_accuracy,
    # calibration_summary=calibration_summary  (not included yet)
)

with open(eval_saved_to, 'w') as out_file:
    json.dump(saved, out_file, indent=4)

### Is the model well-calibrated?

Could we instead record just a 2D array, in which each cell is the confidence of the most confident class?

In [None]:
# Load the pickled scores; the cells below iterate over it as a dict of
# window -> score array.
# NOTE(review): `output_scores_path` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel - define it with the
# other paths before running.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open(output_scores_path, 'rb') as f:
    dict_scores = pickle.load(f)

In [None]:
# Class ids for which calibration curves are drawn below.
# NOTE(review): ids up to 32 suggest a label map with more classes than the 14
# coarse class names listed earlier in this notebook - confirm these ids match the
# model whose scores are loaded.
classes_to_plot = [0, 11, 12, 17, 19, 26, 32]

In [None]:
# For each class in classes_to_plot, collect per-pixel binary ground truth
# (label == class) and the model's score for that class over all scored chips.
y_true = defaultdict(list)
y_prob = defaultdict(list)

chip_size = 256  # scored chips are chip_size x chip_size pixels

# NOTE(review): label_mask is whatever tile the evaluation loop loaded last -
# confirm it is the tile that dict_scores was computed on.
for window, chip_scores in tqdm(dict_scores.items()):
    # rasterio window is (col_off x, row_off y, width, height)
    # NOTE(review): the slice below uses window[0] as the row offset and window[1]
    # as the column offset, which does not match the order stated above - confirm.

    chip_scores = chip_scores.squeeze()  # e.g. (1, 33, 256, 256) -> (33, 256, 256)
    assert chip_scores.ndim == 3 and chip_scores.shape[1:] == (chip_size, chip_size), chip_scores.shape
    # take the number of classes from the scores themselves rather than hard-coding it
    num_score_classes = chip_scores.shape[0]
    chip_scores = chip_scores.reshape((num_score_classes, -1))

    chip_labels = label_mask[window[0]:window[0] + chip_size, window[1]:window[1] + chip_size]
    # Chips that extend past the end of the tile are zero-padded. Pad in 2D BEFORE
    # flattening: padding the flattened labels instead shifts every row of a chip
    # narrower than chip_size, misaligning labels and scores.
    pad_rows = chip_size - chip_labels.shape[0]
    pad_cols = chip_size - chip_labels.shape[1]
    chip_labels = np.pad(chip_labels, ((0, pad_rows), (0, pad_cols))).reshape(-1)

    assert chip_labels.shape == (chip_size * chip_size,), chip_labels.shape

    for cls in classes_to_plot:
        # .tolist() converts to plain Python values in one pass
        y_true[cls].extend((chip_labels == cls).tolist())
        y_prob[cls].extend(chip_scores[cls].tolist())

In [None]:
# Sanity check: ground-truth and score lists for class 12 should have the same length
len(y_true[12])
len(y_prob[12])

In [None]:
# Calibration (reliability) curves, one per class in classes_to_plot.
# Dotted grey diagonal = perfect calibration.
_ = plt.plot([0.0, 1.0], color='grey', linestyle=':')

for cls in classes_to_plot:
    # calibration_curve returns (fraction of positives, mean predicted probability) per bin
    frac_positives, mean_prob_in_bin = calibration_curve(y_true[cls], y_prob[cls], n_bins=10)
    plt.plot(mean_prob_in_bin, frac_positives, label=cls, color=viz_util.num_to_color[str(cls)])

# label the axes so the figure can be read on its own
_ = plt.xlabel('Mean predicted probability')
_ = plt.ylabel('Fraction of positives')
_ = plt.legend()

In [None]:
# Bin values of the calibration curve for the last class plotted above
mean_prob_in_bin
frac_positives

#### Expected number of pixels for the whole validation area

In [None]:
# Sum the predicted scores of each class over all pixels of all chips, i.e. the
# expected number of pixels per class according to the model.
# np.float64 is used because the np.float alias was removed in NumPy 1.24.
probability_sum = np.zeros(num_classes, dtype=np.float64)

for window, chip_scores in dict_scores.items():
    # chip_scores has shape e.g. (1, 33, 256, 256); drop the singleton dimension
    per_class_scores = chip_scores.squeeze()
    # sum over the height and width dims, leaving one value per class
    probability_sum += per_class_scores.sum(axis=(1, 2))

In [None]:
# Compare, per class, the summed predicted probability with the number of labelled
# pixels, and record the relative difference where the class has any labels.
calibration_summary = {}

for cls, (prob_sum, label_sum) in enumerate(zip(probability_sum, true_counts)):
    entry = {
        'prediction_probability_sum': float(prob_sum),
        'label_sum': int(label_sum)
    }
    print('Class {} - {}, prob_sum {}, label_sum {}'.format(cls, viz_util.num_to_name[str(cls)], round(prob_sum), label_sum))
    if label_sum > 0:
        # relative difference with respect to the label count, computed once
        rel_diff = float((prob_sum - label_sum) / label_sum)
        # round AFTER scaling to percent; 100 * round(x, 3) can print float noise
        # such as 5.800000000000001
        print('    diff is {}%'.format(round(100 * rel_diff, 1)))
        entry['difference_wrt_label_sum'] = rel_diff
    calibration_summary[cls] = entry