In [7]:
from google.colab import drive
drive.mount('/content/gdrive')

from google.colab import files

!pip install Levenshtein
!pip install fastkml
!pip install OSMPythonTools
!pip install flickrapi
!pip install ultralytics

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
#System Imports
import json
import glob
import os

#Library Imports
import pandas as pd
from scipy.spatial import KDTree
import sklearn
from sklearn.cluster import KMeans
import Levenshtein
import numpy as np
from sklearn.cluster import DBSCAN

In [8]:
directory_name = "gdrive/MyDrive/Colab Notebooks/GESTALT/"

In [9]:
#Variables (update these to change the python script that runs)
INPUT_DIRECTORY=directory_name+"/data/nic_output/datacollection"
OUTPUT_DIRECTORY=directory_name+"/data/nic_output/ownershipAssignment"
# Epsilon here = 0.1/6371 (i.e. approx 180m in lat/long)
EPSILON=0.000015696123058
MIN_CLUSTER_SIZE=3

!export PYTHONPATH=src
!python "gdrive/MyDrive/Colab Notebooks/GESTALT/gestalt.py" --ownershipAssignment "dbscan" --inputDirectory "gdrive/MyDrive/Colab Notebooks/GESTALT/data/nic_output/dataCollection" --outputDirectory "gdrive/MyDrive/Colab Notebooks/GESTALT/data/nic_output/ownershipAssignment" --epsilon 0.000015696123058 --numClusters 3

Adding extractedKML_Swan_Valley.json to K-Means
Adding osmwinerybrewery.json to K-Means
Adding flickr_metadata_objects.json to K-Means
Adding osm_-31.90009882641578115.96168231510637-31.77307863942101116.05029961853784_allobjects.json to K-Means
Adding locations_-31.90009882641578115.96168231510637-31.77307863942101116.05029961853784_alllocations.json to K-Means
Adding objects__-31.90009882641578115.96168231510637-31.77307863942101116.05029961853784_allobjects.json to K-Means
Adding objects_flickr_metadata_objects.json to K-Means
Adding objects_KML_Swan_Valley.json to K-Means
Converted objects and OSM details to DataFrames
Clustering with DBScan
                        name  ...  cluster
osm_32485733        crossing  ...        0
osm_32522370        crossing  ...        1
osm_32522378  level_crossing  ...        0
osm_32522385  level_crossing  ...       -1
osm_32522391  level_crossing  ...        2
...                      ...  ...      ...
kml_142         picnic_table  ...      175
km

In [139]:
#System Imports
import json

#Library Imports
import pandas as pd
from scipy.spatial import KDTree
import sklearn
from sklearn.cluster import KMeans
import Levenshtein
import numpy as np
from sklearn.cluster import DBSCAN

import matplotlib.pyplot as plt

class OwnershipAssigner():
	def __init__(self,locationData, objData):
		self._locationDict = locationData
		self._objectDict = objData

	def flatten_locations(self, locationsFile):
		'''
		Function to take the fully expressive Locations from openStreetMaps and squash into a flatter dict to be made into data frames
		Input Args:
			locationsFile - dict of dicts - contains all the locations within the bounding box from OSM.
		Operations:
			- Iterate through the dictionary, generate lists
		Output
			- flatLocations - dict of lists
		'''

		flatLocations = {}															#Initialize vars
		locations = []
		latitudes = []
		longitudes = []

		for loc in locationsFile.keys():										#Loop through dict & append vals to list
			locations.append(loc)
			latitudes.append(locationsFile[loc]['latitude'])
			longitudes.append(locationsFile[loc]['longitude'])

		flatLocations["location"] = locations 										#Create the flattened dict of lists.
		flatLocations["latitude"] = latitudes
		flatLocations["longitude"] = longitudes

		return flatLocations

	def flatten_objects_from_osm_dump(self, objectsDict):
		print("Starting to flatten_objects_from_osm_dump")

		flatObjects = {}
		flatObjects["object"] = []
		flatObjects["latitude"] = []
		flatObjects["longitude"] = []

		for object in objectsDict.keys():
			flatObjects["object"].append(objectsDict[object]['name'])
			flatObjects["latitude"].append(objectsDict[object]['latitude'])
			flatObjects["longitude"].append(objectsDict[object]['longitude'])

		return flatObjects




	def flatten_objects_from_kml(self, region):
		'''
		Function to take the fully expressive objects from the KML and squash into a flatter dict to be made into data frames
		Input Args:
			region - string - the name of the region within the dict of objects to be flattened.
			(implicit) self._objectDict - dict of dicts - contains all the objects within the bounding box from OSM.
		Operations:
			- Iterate through the dictionary, generate lists
		Output
			- flatOBJ - dict of lists
		'''

		flatOBJ = {} 															# Initialize vars
		locations = []
		objects = []
		latitudes = []
		longitudes = []

		attributeNumbers = []
		attributes = set() 														# Set used to generate list of unique descriptors

		for region in self._objDict.keys():										# Loop to get all the attribute descriptors.
			for loc in self._objDict[region].keys():
				for obj in self._objDict[region][loc].keys():
					if self._objDict[region][loc][obj]["description"] != None:
						attributeNumbers.append(len(self._objDict[region][loc][obj]["description"]))
						for key in self._objDict[region][loc][obj]["description"].keys():
							attributes.add(list(self._objDict[region][loc][obj]["description"][key].keys())[0])

					else:
						attributeNumbers.append(0)


		flatOBJ["object"] = [] 											# Construct the dictionary
		flatOBJ["latitude"] = []
		flatOBJ["longitude"] = []
		flatOBJ["true_location"] = []
		for attribute in attributes: 									# Add in keys and empty lists for each descriptor
			flatOBJ[attribute] = []



		for loc in self._objDict[region].keys():								# Loop through each object
			for obj in self._objDict[region][loc].keys():
				#print(self._objDict[region][loc][obj])
				flatOBJ["object"].append(self._objDict[region][loc][obj]['name'])
				flatOBJ["latitude"].append(self._objDict[region][loc][obj]['latitude'])
				flatOBJ["longitude"].append(self._objDict[region][loc][obj]['longitude'])
				flatOBJ["true_location"].append(loc)

				usedDescriptors = [] 											#Loop through each descriptor for an object, append to respective list or append None
				if self._objDict[region][loc][obj]['description'] is not None:
					for descriptor in self._objDict[region][loc][obj]['description'].keys():
						for key in self._objDict[region][loc][obj]['description'][descriptor].keys():
							#try:
							flatOBJ[key].append((self._objDict[region][loc][obj]['description'][descriptor][key]))
							#except KeyError:
							#	flatOBJ[key] = []
							#	flatOBJ[key].append((self._objDict[region][loc][obj]['description'][descriptor][key]))
							usedDescriptors.append(key)

				for attribute in attributes:
					if attribute not in usedDescriptors:
						#try:
						flatOBJ[attribute].append(None)
						usedDescriptors.append(attribute)
						#except KeyError:
						#	flatOBJ[attribute] = []
						#	flatOBJ[attribute].append(None)


		for obj in flatOBJ.keys():
			if len(flatOBJ[obj]) > len(flatOBJ["object"]): #Hacky workaround to get dataframes to be same length. TODO: Fix bug.
				del flatOBJ[obj][-1]

		return flatOBJ

	def convertToDataFrame(self, flatLocations, flatObjects):								# Convert two flattened dictionaries into data frames

		self._df_locations = pd.DataFrame.from_dict(flatLocations, orient="index")

		self._df_objects = pd.DataFrame.from_dict(flatObjects, orient="index")
		self._locationCoordinates = []
		self._locationIndex = {}

		i = 0
		for index, row in self._df_locations.iterrows():
			elem = [row[2],row[1]]								#Long, Lat
			self._locationCoordinates.append(elem)
			#print("LOCATION INDEX", row[0])
			#print("LOCATION COORDINATES", elem)

			self._locationIndex[i] = row[0]
			i+=1

		self._location_kdTree = KDTree(self._locationCoordinates)

		self._objectCoordinates = []

		for index, row in self._df_objects.iterrows():
			elem = [row[2],row[1]]							#Long, lat
			self._objectCoordinates.append(elem)
			#self._objectIndex[index] = row[0]

		self._objects_kdTree = KDTree(self._objectCoordinates)

		print("Converted objects and OSM details to DataFrames")

		return ((self._df_locations, self._df_objects))

	def printToFile(self):
		'''
		Function to flatten coordiantes into a 0-100 grid for vizualization.
		Input Args:
			- boundingBox - list of floats - defines the max and min x and y coords to serve as 0 and 100.
			- (implicit) self._df_osm - pandas dataframe containing the names, lat and longs of locations.
			- (implicit) self._df_obj - pandas dataframe containing the names, lat, longs and parent locations of objects.
		Actions:
			- Use minimax normalization to make the bounding box go from 0:100
			- Append minimax normalized coordinates to end of dataframe
		Returns:
			- Prints to csv the modified self._df_osm and self._df_obj dataframes.
 		'''

 		#Print the dataframes to file.
		directory_name = "gdrive/MyDrive/Colab Notebooks/GESTALT/"  #NSCH
		self._df_locations.to_csv(directory_name+"data/osm_df.csv", index=False)  #NSCH
		self._df_objects.to_csv(directory_name+"data/obj_df.csv", index=False)  #NSCH

	def kMeans_membership(self, objs_to_cluster_df, numberOfClusters, fuzzy=False):
		print("Clustering with kMeans")
		kmeans = KMeans(n_clusters=numberOfClusters, random_state=0, n_init="auto")
		objs_to_cluster_df['cluster'] = kmeans.fit_predict(objs_to_cluster_df[['latitude','longitude']])
		centroids = kmeans.cluster_centers_

		self.inferLocation(objs_to_cluster_df, centroids,"kmeans")
		print(self._df_objects)

		plt.xlabel('Longitude')
		plt.ylabel('Latitude')
		plt.scatter(y=self._df_objects.latitude ,x=self._df_objects.longitude, c=self._df_objects.cluster, alpha =0.6, s=10)
		plt.scatter(y=self._df_locations['latitude'],x=self._df_locations['longitude'], label="Locations",alpha =0.6, s=10 )
		plt.show()  #NSCH
		#plt.savefig('../data/output/clusters.png')  #NSCH

	def dbscan_membership(self, epsilon=0.5/6371., minCluster=3, fuzzy_threshold=1.0):  # default is exact assignment
		#1/6371 is ~100m
		print("Clustering with DBScan")
		loc_arr = np.array(self._objectCoordinates)

		db_cluster =  DBSCAN(eps=epsilon, min_samples=minCluster).fit(np.radians(loc_arr))
		self._df_objects['cluster'] = db_cluster.labels_

		centroids = self.calculateCentroids(db_cluster.labels_)

		dists = []
		for idx, row in self._df_objects.iterrows():
			obj_coord = (row['latitude'], row['longitude'])
			centroid_coord = centroids[row['cluster']]  # look up in centroid list
			dists.append(self.__distance__(obj_coord, centroid_coord))

		self._df_objects['assignment_prob'] = 1 - self.__normalize_probs__(dists, mask=list(self._df_objects['cluster'] != -1))
		display(self._df_objects)

		# Fuzzy multiple asn
		if fuzzy_threshold < 1.0:
				df_multi_asn_objects = self._df_objects.copy()
				for idx, row in self._df_objects.iterrows():
						for centroid in centroids:
								obj_centroid_dist = self.__distance__((row['latitude'], row['longitude']), centroid)
								# if obj-centroid distance is within THRESHOLD% of range of obj-centroid distances we saw during exact assignment
								threshold = fuzzy_threshold * (self.cluster_max_dist - self.cluster_min_dist)
								print("THRESHOLD : ", threshold)
								print("range : ", self.cluster_max_dist - self.cluster_min_dist)
								print("ACTIAL: ", obj_centroid_dist - self.cluster_min_dist)
								if (obj_centroid_dist - self.cluster_min_dist) < threshold:
										#print("doubling up on: ", idx)
										df_multi_asn_objects = df_multi_asn_objects.append(row)
										#display(pd.concat([df_multi_asn_objects, pd.DataFrame(row, index=[idx], columns=df_multi_asn_objects.columns)], axis=1, ignore_index=True).tail())


		self._df_objects = df_multi_asn_objects

		self.inferLocation(self._df_objects, centroids,"dbscan")

		self._df_objects = df_multi_asn_objects
		display(self._df_objects)

	def __distance__(self, point1, point2):
		return np.linalg.norm(point1 - point2)

	def __normalize_probs__(self, column, mask):
		# expects a list mask of booleans, where True means we account for the datapoint as valid max or min
		valid_data =  np.array(column)[np.array(mask)]

		# Set class vars for fuzzy multiple assignment if applicable
		self.cluster_min_dist = np.min(valid_data)
		self.cluster_max_dist = np.max(valid_data)

		return_col = np.array((column - np.min(valid_data)) / (np.max(valid_data) - np.min(valid_data)))
		return_col[~np.array(mask)] = 0.5  # forcing the ones we don't count to have prob = 0.5
		return return_col

	def calculateCentroids(self, clusters):
		print("Calculating Centroids")
		centroids = []

		for cluster in range (0, (max(clusters)+1)): 								#+1 to account for indexing from 0
			cluster_df = self._df_objects.loc[self._df_objects['cluster'] == cluster]		# Get only the coords belonging to this cluster
			coords = []

			for index, obj in cluster_df.iterrows(): 								# Make the coords into a list, then numpy array
				coords.append([obj.latitude, obj.longitude])
			np_coords = np.array(coords)

			centroid = np.mean(np_coords,axis=0) 									# Get the midpoint of the array
			centroids.append(centroid) 												# Build list of centroids

		return(centroids)


	def inferLocation(self, objs_to_assign_df, centroids, method):
		print("Inferring object location")
		mappings = {} 																#Dict so that arbitrary number of clusters can be used
		for centroid in range (0, (len(centroids))): 								# For each centroid
			d, i = self._location_kdTree.query(centroids[centroid],1) 				# Look up its nearest neighbour in the KD tree
			#idx = (list(self._locationIndex.keys()))[i]
			#print(self._locationIndex[idx])
#			print(self._locationIndex[i])  # NSCH shhh
#			print(centroids[centroid])  # NSCH shhh
			mappings[centroid] = self._locationIndex[i]

		objs_to_assign_df['predicted_location'] = objs_to_assign_df.cluster.map(mappings) 		# Infer that the nearest neighbour is the cluster location


	def evaluateClusters(self, df_to_eval, method):
		#Move this to own function later. Use Levenshtein at 0.7 to handle labelling differences.
		matches = []

		for index,row in self._df_objects.iterrows():
			if Levenshtein.ratio(row['predicted_location'], row["true_location"]) >= 0.7:
				matches.append("True")
			else:
				matches.append("False")

		df_to_eval[method+"_correct"] = matches

		print(df_to_eval)

In [120]:
import warnings
warnings.filterwarnings("ignore")

In [142]:
# FROM gestalt.py DBSCAN ownersip assignemnt

prefix = "gdrive/MyDrive/Colab Notebooks/GESTALT/data/nic_output/dataCollection"														#Read parameters for the file
outputFile = "gdrive/MyDrive/Colab Notebooks/GESTALT/data/nic_output/ownershipAssignment"
numClusters = 3
epsilon= 0.000015696123058
minCluster = 3

objectsDict = {}																			#Initilaize the Dicts
locationsDict = {}

for file in os.listdir(prefix):																# Load in the files
    print("Adding",file,"to DBSCAN")
    if file.startswith("objects"): 															# Get the objects files
        with open(prefix+"/"+file, "r") as inObjs:
	  	 	    objects = json.load(inObjs)
        objectsDict.update(objects)

    if file.startswith("locations"):														# Get the locations files
        with open(prefix+"/"+file, "r") as inLocs:
            locations = json.load(inLocs)
        locationsDict.update(locations)

ownerAssigner = OwnershipAssigner(locations, objects) 										# Initalize the ownership assigner

df_locations, df_objects = ownerAssigner.convertToDataFrame(locationsDict, objectsDict)		# Convert the dictionaries created from the JSON inputs to Pandas Dataframes

ownerAssigner._df_locations.to_csv(outputFile+"/locationsNSCH.csv", index=False) 				# Write the dataframes to file before clustering
ownerAssigner._df_objects.to_csv(outputFile+"/objectsNSCH.csv", index=False)


ownerAssigner.dbscan_membership(epsilon,minCluster,fuzzy_threshold=0.2) 										# Cluster the objects
clusters = ownerAssigner._df_objects["cluster"] 											# Infer the location
print(clusters.value_counts())

ownerAssigner._df_objects.to_csv(outputFile+"/DBSCAN_PredictedLocations.csv", index=False)	# Save to file

Adding extractedKML_Swan_Valley.json to DBSCAN
Adding osmwinerybrewery.json to DBSCAN
Adding flickr_metadata_objects.json to DBSCAN
Adding osm_-31.90009882641578115.96168231510637-31.77307863942101116.05029961853784_allobjects.json to DBSCAN
Adding locations_-31.90009882641578115.96168231510637-31.77307863942101116.05029961853784_alllocations.json to DBSCAN
Adding objects__-31.90009882641578115.96168231510637-31.77307863942101116.05029961853784_allobjects.json to DBSCAN
Adding objects_flickr_metadata_objects.json to DBSCAN
Adding objects_KML_Swan_Valley.json to DBSCAN
Converted objects and OSM details to DataFrames
Clustering with DBScan
Calculating Centroids


Unnamed: 0,name,longitude,latitude,date,origin,source,object_prob,assignment_prob,cluster
osm_32485733,crossing,115.982075,-31.895145,15-06-23 21:33:37,osm,https://www.openstreetmap.org/node/32485733,0.7,0.986951,0
osm_32522370,crossing,115.976219,-31.898668,15-06-23 21:33:37,osm,https://www.openstreetmap.org/node/32522370,0.7,0.832233,1
osm_32522378,level_crossing,115.981850,-31.895455,15-06-23 21:33:37,osm,https://www.openstreetmap.org/node/32522378,0.7,0.989414,0
osm_32522385,level_crossing,115.988869,-31.892207,15-06-23 21:33:37,osm,https://www.openstreetmap.org/node/32522385,0.7,0.500000,-1
osm_32522391,level_crossing,115.996081,-31.891174,15-06-23 21:33:37,osm,https://www.openstreetmap.org/node/32522391,0.7,0.954510,2
...,...,...,...,...,...,...,...,...,...
kml_142,picnic_table,115.990902,-31.851874,05-06-23 10:19:26,kml,../data/input/Swan_Valley.kml,1.0,0.980118,175
kml_143,picnic_table,115.990921,-31.851803,05-06-23 10:19:26,kml,../data/input/Swan_Valley.kml,1.0,0.978155,175
kml_144,picnic_table,115.990867,-31.851802,05-06-23 10:19:26,kml,../data/input/Swan_Valley.kml,1.0,0.981154,175
kml_145,picnic_table,115.990801,-31.851801,05-06-23 10:19:26,kml,../data/input/Swan_Valley.kml,1.0,0.984722,175


Inferring object location


Unnamed: 0,name,longitude,latitude,date,origin,source,object_prob,assignment_prob,cluster,predicted_location
osm_32485733,crossing,115.982075,-31.895145,15-06-23 21:33:37,osm,https://www.openstreetmap.org/node/32485733,0.7,0.986951,0,Chapel of St Mary & St George
osm_32522370,crossing,115.976219,-31.898668,15-06-23 21:33:37,osm,https://www.openstreetmap.org/node/32522370,0.7,0.832233,1,Guildford Garden Centre
osm_32522378,level_crossing,115.981850,-31.895455,15-06-23 21:33:37,osm,https://www.openstreetmap.org/node/32522378,0.7,0.989414,0,Chapel of St Mary & St George
osm_32522385,level_crossing,115.988869,-31.892207,15-06-23 21:33:37,osm,https://www.openstreetmap.org/node/32522385,0.7,0.500000,-1,
osm_32522391,level_crossing,115.996081,-31.891174,15-06-23 21:33:37,osm,https://www.openstreetmap.org/node/32522391,0.7,0.954510,2,The Cheesecake Shop
...,...,...,...,...,...,...,...,...,...,...
kml_142,picnic_table,115.990902,-31.851874,05-06-23 10:19:26,kml,../data/input/Swan_Valley.kml,1.0,0.980118,175,Lancaster Wines
kml_143,picnic_table,115.990921,-31.851803,05-06-23 10:19:26,kml,../data/input/Swan_Valley.kml,1.0,0.978155,175,Lancaster Wines
kml_144,picnic_table,115.990867,-31.851802,05-06-23 10:19:26,kml,../data/input/Swan_Valley.kml,1.0,0.981154,175,Lancaster Wines
kml_145,picnic_table,115.990801,-31.851801,05-06-23 10:19:26,kml,../data/input/Swan_Valley.kml,1.0,0.984722,175,Lancaster Wines


-1      1138
 1       528
 48      462
 22      300
 67      282
        ... 
 134       6
 23        6
 34        6
 24        6
 161       6
Name: cluster, Length: 177, dtype: int64
