In [72]:
# Load required libraries
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist
import random
import pandas as pd

In [73]:
# Randomly generate 2 tables of latitudes and longitudes which match up to a small random shift.
# We want to join these tables by comparing the lat/longs using bipartite matching,
# then see how accurate the match is.
SEED = 42                 # fixed seed so a fresh-kernel run regenerates exactly the same tables
N_POINTS = 1000           # number of rows generated for each table
MAX_SHIFT_DEGREES = 0.25  # largest absolute perturbation applied to each table_2 coordinate

# Seeding here (rather than relying on kernel state) keeps this cell idempotent when re-run.
random.seed(SEED)

table_1 = []
table_2 = []
for i in range(N_POINTS):
    table_1_lat = random.uniform(-90, 90)
    table_1_long = random.uniform(-180, 180)
    table_1.append({'table_1_id': i, 'lat': table_1_lat, 'long': table_1_long})
    # table_2 carries the originating table_1_id so the match can be scored afterwards.
    # The shifted values are not clamped, so they can land slightly outside
    # [-90, 90] / [-180, 180].
    table_2.append({
        'table_1_id': i,
        'lat': table_1_lat + random.uniform(-MAX_SHIFT_DEGREES, MAX_SHIFT_DEGREES),
        'long': table_1_long + random.uniform(-MAX_SHIFT_DEGREES, MAX_SHIFT_DEGREES),
    })

In [74]:
# Shuffle the table_2 records in place so their row order no longer mirrors table_1,
# then turn both lists of records into pandas dataframes.
random.shuffle(table_2)
table_1, table_2 = (
    pd.DataFrame.from_records(records) for records in (table_1, table_2)
)

In [75]:
# A degree of latitude and a degree of longitude cover different ground distances,
# so each axis is converted to miles with its own factor before distances are taken.
# This matters when dealing with actual lat/long values.
# NOTE(review): a single longitude factor is only accurate near one latitude, while the
# generated latitudes span -90..90 — confirm this approximation is acceptable for real data.
for frame in (table_1, table_2):
    frame['lat_miles'] = frame['lat'] * 69
    frame['long_miles'] = frame['long'] * 54.6

In [76]:
# Pairwise distance matrix between the two tables: rows follow table_1, columns follow table_2.
# These distances serve as the weights (costs) for the bipartite matching.
mile_columns = ['lat_miles', 'long_miles']
table_1_points = table_1[mile_columns]
table_2_points = table_2[mile_columns]
distances = cdist(table_1_points, table_2_points)

In [77]:
# Solve the bipartite matching on the distance matrix.
# The solver returns a pair of index arrays: matched row indices and matched column indices.
matched_rows, matched_cols = linear_sum_assignment(distances)
assignment = (matched_rows, matched_cols)

In [78]:
# Put table_2's rows in the order chosen by the matching and record, for each row,
# the table_1 row it was paired with.
# The predicted value is a table_1 row position; it is comparable to table_1_id
# because table_1 was built with table_1_id equal to its row position.
table_1_rows, table_2_rows = assignment
table_2 = table_2.reindex(table_2_rows).assign(predicted_index=table_1_rows)

In [83]:
# Check the proportion matched correctly.
# The mean of the boolean match mask is the fraction of rows whose predicted index equals
# the true table_1_id; it is computed vectorised rather than by iterating the Series with
# the builtin sum(), and yields NaN instead of raising if table_2 is empty.
(table_2['table_1_id'] == table_2['predicted_index']).mean()

0.996

In [81]:
# Show the rows that were matched to the wrong table_1 record, to see how close
# the confused points are to each other.
is_mismatch = table_2['table_1_id'].ne(table_2['predicted_index'])
table_2[is_mismatch]

Unnamed: 0,table_1_id,lat,long,lat_miles,long_miles,predicted_index
175,150,8.904399,104.720638,614.403524,5717.746836,145
323,145,8.928629,104.715262,616.075389,5717.453287,150
264,700,51.170766,141.812026,3530.782867,7742.93663,153
612,153,50.876547,141.694298,3510.48171,7736.508644,700
