In [None]:
# Enable IPython's autoreload extension, so that edits to imported modules are picked up without restarting the kernel
%load_ext autoreload
# Mode 2: reload all modules before executing the code of a cell
%autoreload 2

In [None]:
# Add "../src" and ".." to the module search path before the imports below
# (presumably this is where `sensai` and `config` live -- TODO confirm)
import sys; sys.path.extend(["../src", ".."])
import sensai
import logging
import config

# Project configuration object; it is used below to resolve data file paths and the temp directory.
# NOTE(review): reload=True presumably forces the configuration to be re-read -- confirm in `config`
cfg = config.get_config(reload=True)
# Configure logging through sensai's helper, using log level INFO
sensai.util.logging.configureLogging(level=logging.INFO)

# Evaluating Clustering Algorithms

The present library contains utilities for evaluating different clustering algorithms
(with or without ground truth labels). On top of the evaluation utilities there are classes for
performing parameter sweeps and model selection. Here we give an overview of the most important functionality.

In [None]:
import numpy as np
import os
from pprint import pprint
from sklearn.cluster import DBSCAN
import seaborn as sns
import geopandas as gp
import pandas as pd
import matplotlib.pyplot as plt
import logging

from sensai.geoanalytics.geopandas.coordinate_clustering import SkLearnCoordinateClustering
from sensai.hyperopt import GridSearch
from sensai.evaluation.evaluator_clustering import ClusteringModelSupervisedEvaluator, \
    ClusteringModelUnsupervisedEvaluator
from sensai.evaluation.eval_stats import ClusteringUnsupervisedEvalStats, ClusteringSupervisedEvalStats, \
    AdjustedMutualInfoScore
from sensai.geoanalytics.geopandas.coordinate_clustering_ground_truth import PolygonAnnotatedCoordinates

In [None]:
# Load the sample coordinates with geopandas.
# The location of the raw "sample" data is resolved through the project configuration.
sampleFile = cfg.datafile_path("sample", stage=cfg.RAW) # this can point to a directory or a shp/geojson file
coordinatesDF = gp.read_file(sampleFile)

## Evaluating a Single Model

For a single model that was already fitted, evaluation statistics can be extracted with `ClusteringUnsupervisedEvalStats`, see the
example below (the resulting `evalStats` object can also be used to retrieve evaluation results one by one).


In [None]:
# Build the clustering model (scikit-learn's DBSCAN wrapped for coordinate data) and fit it on the sample
dbscanEstimator = DBSCAN(eps=150, min_samples=20)
dbscan = SkLearnCoordinateClustering(dbscanEstimator)
dbscan.fit(coordinatesDF)

# Extract the unsupervised evaluation statistics from the fitted model and print all of them
evalStats = ClusteringUnsupervisedEvalStats.fromModel(dbscan)
allStats = evalStats.getAll()
pprint(allStats)

# Histogram of the cluster size distribution
clusterSizes = evalStats.clusterSizeDistribution
plt.hist(clusterSizes)
plt.show()

## Unsupervised Model Selection

For model selection we need to compare different (or differently parametrized) models that were
trained on the same dataset. The `ClusteringEvaluator` abstraction was designed with this goal in mind.
The evaluator can be used to obtain evaluation statistics for different models that are guaranteed
to be comparable with each other (always computed by the same object in the same way). Here is an example evaluating
DBSCAN's performance on metrics that don't necessitate ground truth labels.

In [None]:
# The evaluator is created for one dataset; all models evaluated with it are scored in the same way
modelEvaluator = ClusteringModelUnsupervisedEvaluator(coordinatesDF)

# fit=False: evaluate the model as it is instead of fitting it again
dbscanEvalStats = modelEvaluator.evalModel(dbscan, fit=False)  # dbscan was already fitted on this data

In [None]:
# Print all evaluation results that the evaluator computed for the fitted DBSCAN model
print("dbscan_performance: \n")
pprint(dbscanEvalStats.getAll())

One of the main purposes of evaluators is to be used within classes that perform a parameter sweep, e.g.
a `GridSearch`. All such objects return a data frame and (optionally) persist all evaluation results
in a CSV file.

In [None]:
# Parameter grid for the sweep: each key is a keyword argument of DBSCAN (the model factory
# forwards them) and lists the values to try for it
parameterOptions = {
    "min_samples": [10, 20],
    "eps": [50, 150]
}

# A named function (rather than a lambda) serves as model factory, because all objects
# need to be picklable for the grid search to run in multiple processes.
def dbscanFactory(**kwargs):
    """
    Create a new coordinate clustering model wrapping scikit-learn's DBSCAN.

    :param kwargs: keyword arguments passed on unchanged to the ``DBSCAN`` constructor
    :return: a ``SkLearnCoordinateClustering`` instance wrapping the configured ``DBSCAN``
    """
    estimator = DBSCAN(**kwargs)
    return SkLearnCoordinateClustering(estimator)

dbscanGridSearch = GridSearch(dbscanFactory, parameterOptions, csvResultsPath=os.path.join(cfg.temp, "dbscanGridSearchCsv"))

In [None]:
# the results of the grid-search are saved as a CSV file under the path provided above
# The returned data frame is sorted by the number of clusters in descending order
resultDf = dbscanGridSearch.run(modelEvaluator, sortColumnName="numClusters", ascending=False)
# Display the first rows of the results
resultDf.head()

The resulting data frame can be used to visualize the results through standard techniques,
e.g. pivoting and heatmaps.


In [None]:
# Heatmap of the Calinski-Harabasz score over the parameter grid (rows: min_samples, columns: eps)
print("calinskiHarabaszScores")
chScoreHeatmap = resultDf.pivot(
    index="min_samples",
    columns="eps",
    values="CalinskiHarabaszScore",
)
sns.heatmap(chScoreHeatmap, annot=True)
plt.show()

In [None]:
# Heatmap of the Davies-Bouldin score over the parameter grid (rows: min_samples, columns: eps).
# The pivot table gets its own name, so that it no longer overwrites the Calinski-Harabasz
# table `chScoreHeatmap` with values of a different metric.
print("daviesBouldinScores")
dbScoreHeatmap = resultDf.pivot(index="min_samples", columns="eps", values="DaviesBouldinScore")
# Reversed colormap -- presumably because for the Davies-Bouldin score lower values are better,
# unlike for the Calinski-Harabasz score
sns.heatmap(dbScoreHeatmap, cmap=sns.cm.rocket_r, annot=True)
plt.show()

In [None]:
# Heatmap of the number of clusters found over the parameter grid (rows: min_samples, columns: eps)
print("numClusters")
# The pivoted values are explicitly cast to int.
# NOTE(review): the original author observed a datatype problem at this point and suspected
# zero clusters as the cause -- not confirmed
numClustersHeatmap = resultDf.pivot(index="min_samples", columns="eps", values="numClusters").astype(int)
# fmt="d" renders the annotations as plain integers; seaborn's default format ".2g" would
# switch to scientific notation for counts of 100 and more
sns.heatmap(numClustersHeatmap, annot=True, fmt="d")
plt.show()

## Supervised Model Selection

### Obtaining Ground Truth Labels


The evaluation classes can take ground truth labels for all coordinates and use them for calculating related metrics.
However, such labels are typically hard to come by, especially if the coordinates cover a large area. Therefore the
library includes utilities for extracting labels from ground truth provided in the form of __cluster polygons in a selected
region__. The central class for dealing with this kind of data is `PolygonAnnotatedCoordinates`
(module `coordinate_clustering_ground_truth`), see examples below.

In [None]:
# The polygons can be read directly from a file, see the documentation for more details
# Here the sample coordinates are combined with the ground truth data file of the same sample,
# whose path is resolved through the project configuration
groundTruthClusters = PolygonAnnotatedCoordinates(coordinatesDF, cfg.datafile_path("sample", stage=cfg.GROUND_TRUTH))

As usual, the object has methods for plotting and exporting to geo data frames.
These can be very useful for inspecting the provided data.

In [None]:
# Plot the annotated coordinates with small markers
# (NOTE(review): markersize/cmap are presumably forwarded to the underlying plotting call -- confirm)
groundTruthClusters.plot(markersize=0.2, cmap="plasma")
plt.show()

# Export to a geo data frame and display its first rows
groundTruthClusters.toGeoDF().head()

### Supervised Evaluation Metrics

We can extract the coordinates and labels for the annotated region and use them in evaluation. In the following,
we will evaluate a slight adaptation of DBSCAN which uses an additional bound, i.e. it will ultimately reject clusters that do not reach a minimum size.
We will train it on datapoints in the ground truth region and evaluate the results against the true labels.

In [None]:
# Coordinates of the annotated region together with their ground truth labels
groundTruthCoordinates, groundTruthLabels = groundTruthClusters.getCoordinatesLabels()

# DBSCAN with an additional bound on the minimum cluster size
boundedDbscanEstimator = DBSCAN(eps=150, min_samples=20)
boundedDbscan = SkLearnCoordinateClustering(boundedDbscanEstimator, minClusterSize=100)

# Evaluate the model on the ground truth coordinates against the true labels
supervisedEvaluator = ClusteringModelSupervisedEvaluator(groundTruthCoordinates, trueLabels=groundTruthLabels)
supervisedEvalStats = supervisedEvaluator.evalModel(boundedDbscan)

print("Supervised evaluation metrics of bounded dbscan:")
supervisedMetrics = supervisedEvalStats.getAll()
pprint(supervisedMetrics)

### Comparing Unsupervised Evaluation Metrics

It can also be instructive to compare unsupervised evaluation metrics.

In [None]:
# Unsupervised metrics of the ground truth labelling itself ...
groundTruthStats = ClusteringUnsupervisedEvalStats(groundTruthCoordinates, groundTruthLabels)
groundTruthUnsupervisedMetrics = groundTruthStats.metricsDict()
# ... and of the bounded DBSCAN model
boundedDbscanStats = ClusteringUnsupervisedEvalStats.fromModel(boundedDbscan)
boundedDbscanUnsupervisedMetrics = boundedDbscanStats.metricsDict()

# Side-by-side table: one column per clustering, indexed by the metric names of the ground truth dict
metricsByClustering = {
    "bounded DBSCAN": boundedDbscanUnsupervisedMetrics,
    "ground truth": groundTruthUnsupervisedMetrics,
}
pd.DataFrame(metricsByClustering, index=groundTruthUnsupervisedMetrics.keys())

The bounded DBSCAN is already performing quite well with the given parameters, although we see that it segregates clusters too
much and has a general tendency towards smaller clusters. These tendencies can be seen visually by comparing the ground
truth and the bounded DBSCAN cluster plots.

In [None]:
# Plot both clusterings without noise points for a visual comparison:
# first the ground truth clusters ...
groundTruthClusters.plot(markersize=0.2, cmap="plasma", includeNoise=False)

# ... then the clusters of the bounded DBSCAN
boundedDbscan.plot(markersize=0.2, includeNoise=False)

### Parameter Search

We can now bring everything together by running a grid search and evaluating against ground truth. Very little code
is needed for that:

In [None]:
# Parameter grid around the parameters of the bounded DBSCAN evaluated above (min_samples=20, eps=150)
parameterOptions = {
    "min_samples": [19, 20, 21],
    "eps": [140, 150, 160]
}


# The grid search has to instantiate the same kind of model as the baseline it is compared with,
# i.e. DBSCAN with the minimum cluster size bound; the plain `dbscanFactory` would drop that bound.
# A named function is used instead of a lambda, so that the factory stays picklable.
def boundedDbscanFactory(**kwargs):
    """
    Create a new bounded DBSCAN coordinate clustering model.

    :param kwargs: keyword arguments passed on unchanged to the ``DBSCAN`` constructor
    :return: a ``SkLearnCoordinateClustering`` instance wrapping the configured ``DBSCAN``,
        using the same ``minClusterSize=100`` as the bounded DBSCAN evaluated above
    """
    return SkLearnCoordinateClustering(DBSCAN(**kwargs), minClusterSize=100)


# The results are persisted as a CSV file under cfg.temp
supervisedGridSearch = GridSearch(boundedDbscanFactory, parameterOptions,
    csvResultsPath=os.path.join(cfg.temp, "bounded_dbscan_grid_search.csv"))
# Sort by adjusted mutual information score in descending order
supervisedResultDf = supervisedGridSearch.run(supervisedEvaluator, sortColumnName=AdjustedMutualInfoScore.name,
    ascending=False)
supervisedResultDf

According to the adjusted mutual information score, we have now found a new parameter combination (see rightmost columns of first row) which yields results even closer to the ground truth.