In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

from google.colab import files

!pip install Levenshtein
!pip install fastkml
!pip install OSMPythonTools
!pip install flickrapi
!pip install ultralytics

In [2]:
#System Imports
import json
import glob
import os

#Library Imports
import pandas as pd
from scipy.spatial import KDTree
import sklearn
from sklearn.cluster import KMeans
import Levenshtein
import numpy as np
from sklearn.cluster import DBSCAN

In [3]:
# Root folder of the GESTALT project on the mounted Drive.
# NOTE(review): this is a relative path, so it presumably only resolves while
# the working directory is the folder that contains the `gdrive` mount point
# (the Drive is mounted at /content/gdrive) -- confirm before changing the cwd.
# The value ends with "/", so concatenating it with a suffix that starts with
# "/" yields a doubled separator.
directory_name = "gdrive/MyDrive/Colab Notebooks/GESTALT/"

In [4]:
#Variables (update these to change the python script that runs)
INPUT_DIRECTORY=directory_name+"/data/nic_output/datacollection"
OUTPUT_DIRECTORY=directory_name+"/data/nic_output/ownershipAssignment"
# Epsilon here = 0.1/6371 (i.e. approx 180m in lat/long)
EPSILON=0.000015696123058
MIN_CLUSTER_SIZE=3

!export PYTHONPATH=src
!python "gdrive/MyDrive/Colab Notebooks/GESTALT/gestalt.py" --ownershipAssignment "dbscan" --inputDirectory "gdrive/MyDrive/Colab Notebooks/GESTALT/data/nic_output/dataCollection" --outputDirectory "gdrive/MyDrive/Colab Notebooks/GESTALT/data/nic_output/ownershipAssignment" --epsilon 0.000015696123058 --numClusters 3

Adding extractedKML_Swan_Valley.json to K-Means
Adding osmwinerybrewery.json to K-Means
Adding flickr_metadata_objects.json to K-Means
Adding osm_-31.90009882641578115.96168231510637-31.77307863942101116.05029961853784_allobjects.json to K-Means
Adding locations_-31.90009882641578115.96168231510637-31.77307863942101116.05029961853784_alllocations.json to K-Means
Adding objects__-31.90009882641578115.96168231510637-31.77307863942101116.05029961853784_allobjects.json to K-Means
Adding objects_flickr_metadata_objects.json to K-Means
Adding objects_KML_Swan_Valley.json to K-Means
Converted objects and OSM details to DataFrames
Clustering with DBScan
                        name  ...  cluster
osm_32485733        crossing  ...        0
osm_32522370        crossing  ...        1
osm_32522378  level_crossing  ...        0
osm_32522385  level_crossing  ...       -1
osm_32522391  level_crossing  ...        2
...                      ...  ...      ...
kml_142         picnic_table  ...      175
km

In [5]:
class ClusteringMetrics:
    """Per-location precision and recall for the DBSCAN location assignment.

    The input CSV must contain the columns ``true_location`` and
    ``predicted_location_dbscan``. Two boolean columns are derived per row:

    * ``TP`` -- the Levenshtein ratio between the true and the predicted
      location name is at least ``match_threshold``.
    * ``F_`` -- the complement of ``TP``. It is counted as a false negative
      when rows are grouped by true location (recall) and as a false positive
      when rows are grouped by predicted location (precision).

    Labels that have no rows at all are left out of every result instead of
    raising ``ZeroDivisionError``; an average over zero labels is ``nan``.
    """

    # Ground-truth labels scored by the recall metrics unless the constructor
    # is given a different list.
    true_locations = ('Alis_Vineyard', 'Little_River_Winery', 'Faber_Vineyard', 'Ugly_Duckling_Wines', 'Oakover_Grounds', 'Lancaster_Wines')

    # Minimum Levenshtein ratio for a predicted name to count as a match.
    match_threshold = 0.7

    def __init__(self, object_assignment_filename, true_locations=None):
        """Load the assignment CSV and flag every row as match / non-match.

        Args:
            object_assignment_filename: path of the CSV to score.
            true_locations: optional iterable of ground-truth labels to use
                for the recall metrics; defaults to the class-level list.
        """
        self.metrics_df = pd.read_csv(object_assignment_filename)[['true_location','predicted_location_dbscan']].copy()
        if true_locations is not None:
            self.true_locations = tuple(true_locations)

        # Compute the ratio once per row; F_ is by definition "not TP".
        is_match = np.array(
            [
                Levenshtein.ratio(true_name, predicted_name) >= self.match_threshold
                for true_name, predicted_name in zip(
                    self.metrics_df['true_location'],
                    self.metrics_df['predicted_location_dbscan'],
                )
            ],
            dtype=bool,
        )
        self.metrics_df['TP'] = is_match
        self.metrics_df['F_'] = ~is_match

    def _count_by_label(self, column, labels):
        """Return ``{label: (matched, unmatched)}`` for rows where ``column == label``.

        Labels without any row are omitted, so callers never divide by zero.
        Insertion order follows ``labels``.
        """
        counts = {}
        for label in labels:
            rows = self.metrics_df[self.metrics_df[column] == label]
            matched = int(rows['TP'].eq(True).sum())
            unmatched = int(rows['F_'].eq(True).sum())
            if matched + unmatched == 0:
                # Nothing to score for this label (absent from the data, or NaN).
                continue
            counts[label] = (matched, unmatched)
        return counts

    @staticmethod
    def _macro_average(counts):
        """Return ``(unweighted mean ratio, {label: matched / class size})``."""
        ratios = {
            label: matched / (matched + unmatched)
            for label, (matched, unmatched) in counts.items()
        }
        average = np.mean(list(ratios.values())) if ratios else float('nan')
        return average, ratios

    @staticmethod
    def _weighted_average(counts):
        """Return ``(size-weighted mean ratio, {label: size * ratio}, {label: size})``."""
        weighted = {}
        sizes = {}
        for label, (matched, unmatched) in counts.items():
            class_size = matched + unmatched
            weighted[label] = class_size * (matched / class_size)
            sizes[label] = class_size
        if not sizes:
            return float('nan'), weighted, sizes
        average = np.sum(list(weighted.values())) / np.sum(list(sizes.values()))
        return average, weighted, sizes

    def _predicted_labels(self):
        """Distinct predicted locations; NaN (no cluster assigned) is excluded."""
        return self.metrics_df['predicted_location_dbscan'].dropna().unique()

    def clustering_recall(self):
        """Unweighted recall over the true locations.

        Returns:
            ``(avg_recall, recall_dict)`` where ``recall_dict`` maps each true
            location that has rows to ``TP / (TP + FN)``.
        """
        return self._macro_average(self._count_by_label('true_location', self.true_locations))

    def clustering_weighted_recall(self):
        """Recall over the true locations, weighted by class size.

        Returns:
            ``(weighted_avg_recall, weighted_recall_dict, total_dict)`` where
            ``weighted_recall_dict`` maps label to ``class_size * recall`` and
            ``total_dict`` maps label to its row count.
        """
        return self._weighted_average(self._count_by_label('true_location', self.true_locations))

    def clustering_precision(self):
        """Unweighted precision over the predicted locations.

        Returns:
            ``(avg_precision, precision_dict)`` where ``precision_dict`` maps
            each predicted location to ``TP / (TP + FP)``.
        """
        return self._macro_average(self._count_by_label('predicted_location_dbscan', self._predicted_labels()))

    def clustering_weighted_precision(self):
        """Precision over the predicted locations, weighted by class size.

        Returns:
            ``(weighted_avg_precision, weighted_precision_dict, total_dict)``
            where ``weighted_precision_dict`` maps label to
            ``class_size * precision`` and ``total_dict`` maps label to its
            row count.
        """
        return self._weighted_average(self._count_by_label('predicted_location_dbscan', self._predicted_labels()))

if __name__ == "__main__":
    # Score the object/location assignments stored in obj_df.csv and print
    # each metric on its own line, in a fixed order.
    assignment_csv = directory_name+'/data/nic_output/ownershipAssignment/obj_df.csv'
    metrics = ClusteringMetrics(assignment_csv)

    # (heading, bound method) pairs; each method is evaluated right before
    # its line is printed.
    report = (
        ("RECALL: ", metrics.clustering_recall),
        ("WEIGHTED RECALL: ", metrics.clustering_weighted_recall),
        ("PRECISION: ", metrics.clustering_precision),
        ("WEIGHTED PRECISION: ", metrics.clustering_weighted_precision),
    )
    for heading, compute in report:
        print(heading, compute())

RECALL:  (0.9272366522366523, {'Alis_Vineyard': 1.0, 'Little_River_Winery': 1.0, 'Faber_Vineyard': 0.9166666666666666, 'Ugly_Duckling_Wines': 1.0, 'Oakover_Grounds': 0.8285714285714286, 'Lancaster_Wines': 0.8181818181818182})
WEIGHTED RECALL:  (0.9178082191780822, {'Alis_Vineyard': 18.0, 'Little_River_Winery': 19.0, 'Faber_Vineyard': 22.0, 'Ugly_Duckling_Wines': 28.0, 'Oakover_Grounds': 29.000000000000004, 'Lancaster_Wines': 18.0}, {'Alis_Vineyard': 18, 'Little_River_Winery': 19, 'Faber_Vineyard': 24, 'Ugly_Duckling_Wines': 28, 'Oakover_Grounds': 35, 'Lancaster_Wines': 22})
PRECISION:  (1.0, {"Ali's Vineyard": 1.0, 'Little River Winery and Café': 1.0, 'Faber Vineyard': 1.0, 'Ugly Duckling Wines': 1.0, 'Oakover Grounds': 1.0, 'Lancaster Wines': 1.0})
WEIGHTED PRECISION:  (1.0, {"Ali's Vineyard": 18.0, 'Little River Winery and Café': 19.0, 'Faber Vineyard': 22.0, 'Ugly Duckling Wines': 28.0, 'Oakover Grounds': 29.0, 'Lancaster Wines': 18.0}, {"Ali's Vineyard": 18, 'Little River Winery an