In [None]:
! pip3 install pandas
! pip3 install geopandas
! pip3 install levenshtein


In [2]:
import pandas as pd
import geopandas as gpd
from Levenshtein import ratio

In [3]:
# Notebook-wide pandas display settings: show up to 500 rows and 500 columns,
# allow 1000 characters per printed line, and format floats with two decimals.
pd.set_option(
    "display.max_rows", 500,
    "display.max_columns", 500,
    "display.width", 1000,
    "display.float_format", lambda value: "%.2f" % value,
)


In [None]:
# Match rate
#
# Reports (1) the share of input addresses for which the geocoder returned a
# coordinate and (2) the share of those geocoded results that lie inside the
# city boundary.

# Load the geocoding results into a DataFrame
df = pd.read_csv("output/here-geocoding-result-scenario-1.csv")

# Flag each row as matched ("yes") or unmatched ("no"); a row counts as
# matched when its longitude is present.
df["is_geocoded"] = df["longitude"].notna().map({True: "yes", False: "no"})

# Calculate the match rate. An empty result file yields NaN instead of a
# ZeroDivisionError.
all_addresses_count = len(df)
matched_addresses_count = int((df["is_geocoded"] == "yes").sum())
match_rate = (
    matched_addresses_count * 100 / all_addresses_count
    if all_addresses_count
    else float("nan")
)
print(f"match rate: {match_rate}%")

# Convert the geocoded rows into a GeoDataFrame in EPSG:4326 (lon/lat).
# `.copy()` detaches the filtered rows from `df` before a geometry column is
# attached to them.
geocoded_addresses = df[df["is_geocoded"] == "yes"].copy()
geocoded_addresses = gpd.GeoDataFrame(
    geocoded_addresses,
    geometry=gpd.points_from_xy(
        geocoded_addresses["longitude"], geocoded_addresses["latitude"]
    ),
).set_crs("epsg:4326")

# Load the city boundary dataset and declare its CRS as EPSG:4326
city_boundary = gpd.read_file(
    "input/city-boundary.geojson").set_crs("epsg:4326")

# Merge every boundary feature into a single geometry. `union_all()` is the
# replacement for `unary_union` in newer GeoPandas releases; fall back to
# `unary_union` where `union_all()` does not exist.
city_geometries = city_boundary["geometry"]
city_boundary = (
    city_geometries.union_all()
    if hasattr(city_geometries, "union_all")
    else city_geometries.unary_union
)

# Keep only the geocoded points that fall within the city boundary
geocoded_addresses_within_city = geocoded_addresses[
    geocoded_addresses["geometry"].within(city_boundary)
]

# The reported figure is a proportion *of geocoded results*, so the denominator
# is the number of geocoded rows (not the number of input addresses).
proportion = (
    round(len(geocoded_addresses_within_city)
          * 100 / matched_addresses_count, 1)
    if matched_addresses_count
    else float("nan")
)
print(
    f"proportion of geocoded results that fall within city boundaries: {proportion}%")


In [None]:
# Positional accuracy
#
# Distance between each baseline coordinate and the coordinate returned by the
# geocoder for the same row.

# Read the reference coordinates and the geocoder output
baseline = pd.read_csv("input/input-addresses.csv")
geocoder = pd.read_csv("output/here-geocoding-result-scenario-1.csv")

# Build point geometries from the lon/lat columns, declare them as EPSG:4326,
# then reproject to EPSG:3776 (a local projection for Alberta) so that
# distances come out in meters.
baseline_points = gpd.points_from_xy(
    baseline["longitude"], baseline["latitude"])
baseline = gpd.GeoDataFrame(baseline, geometry=baseline_points)
baseline = baseline.set_crs("epsg:4326")
baseline = baseline.to_crs("epsg:3776")

geocoder_points = gpd.points_from_xy(
    geocoder["longitude"], geocoder["latitude"])
geocoder = gpd.GeoDataFrame(geocoder, geometry=geocoder_points)
geocoder = geocoder.set_crs("epsg:4326")
geocoder = geocoder.to_crs("epsg:3776")

# Row-wise distance from each baseline point to the geocoded point.
# NOTE(review): assumes both files list the same addresses in the same row
# order — confirm.
baseline["distance"] = baseline["geometry"].distance(geocoder["geometry"])

# Descriptive statistics of the raw distances
summary_stats = baseline["distance"].describe()
print(summary_stats)


# Bucket the distances into five bins; each label names the upper bound of its
# bin, and the last bin collects everything beyond 10 km.
labels = ["10m", "100m", "1km", "10km", "10km+"]
classes = [0, 10, 100, 1000, 10000, float("inf")]

baseline["distance_class"] = pd.cut(
    baseline["distance"],
    bins=classes,
    labels=labels,
    include_lowest=True,  # a distance of exactly 0 belongs to the first bin
)

# Per-bin counts (kept in bin order) followed by a categorical summary
counts = baseline["distance_class"].value_counts(sort=False)
print(counts)

summary_stats = baseline["distance_class"].describe()
print(summary_stats)


In [None]:
# Positional similarity
#
# Distance between the coordinates returned by two geocoders for the same row.

# Load data into DataFrames
geocoder_1 = pd.read_csv("output/esri-geocoding-result-scenario-1.csv")
geocoder_2 = pd.read_csv("output/here-geocoding-result-scenario-1.csv")

# Convert DataFrames into GeoDataFrames with a local projection system (EPSG:3776 for Alberta) to calculate distances in meters
geocoder_1 = (
    gpd.GeoDataFrame(
        geocoder_1,
        geometry=gpd.points_from_xy(
            geocoder_1["longitude"], geocoder_1["latitude"]),
    )
    .set_crs("epsg:4326")
    .to_crs("epsg:3776")
)
geocoder_2 = (
    gpd.GeoDataFrame(
        geocoder_2,
        geometry=gpd.points_from_xy(
            geocoder_2["longitude"], geocoder_2["latitude"]),
    )
    .set_crs("epsg:4326")
    .to_crs("epsg:3776")
)

# Calculate the row-wise distance between the two geocoders' points.
# NOTE(review): assumes both files list the same addresses in the same row
# order — confirm.
df = pd.DataFrame(
    {"distance": geocoder_1["geometry"].distance(geocoder_2["geometry"])}
)

# Calculate summary statistics
summary_stats = df["distance"].describe()
print(summary_stats)

# Classify distance values into 5 groups to better understand their
# distribution; each label names the upper bound of its bin.
labels = ["10m", "100m", "1km", "10km", "10km+"]
classes = [0, 10, 100, 1000, 10000, float("inf")]

df["distance_class"] = pd.cut(
    df["distance"], bins=classes, labels=labels, include_lowest=True
)

# Summarise the classes computed in THIS cell (`df`), not the `baseline` frame
# left over from the positional-accuracy cell.
counts = df["distance_class"].value_counts(sort=False)
print(counts)

summary_stats = df["distance_class"].describe()
print(summary_stats)


In [None]:
# Lexical accuracy with provided labels
#
# Levenshtein ratio between the baseline's address label and the geocoder's
# address label, row by row.

# Read the baseline results and the geocoder results
baseline = pd.read_csv("output/canada-post-geocoding-result-scenario-1.csv")
geocoder = pd.read_csv("output/esri-geocoding-result-scenario-1.csv")

# Keep only the columns needed and give each label a source-specific name so
# the two can sit side by side in one frame.
baseline_labels = baseline[["id", "label"]]
baseline = baseline_labels.rename(columns={"label": "label_baseline"})
geocoder_labels = geocoder[["label"]]
geocoder = geocoder_labels.rename(columns={"label": "label_geocoder"})

# Column-wise concatenation pairs rows by index.
# NOTE(review): assumes both files list the same addresses in the same row
# order — confirm.
df = pd.concat([baseline, geocoder], axis="columns")

# Score every row with the Levenshtein ratio of its two labels
df["lexical_accuracy"] = df.apply(
    lambda row: ratio(row["label_baseline"], row["label_geocoder"]),
    axis="columns",
)

# Descriptive statistics of the raw scores
summary_stats = df["lexical_accuracy"].describe()
print(summary_stats)

# Bucket the scores into four equal-width bins; each label names the upper
# bound of its bin.
labels = ["0.25", "0.5", "0.75", "1"]
classes = [0, 0.25, 0.5, 0.75, 1]

df["lexical_accuracy_class"] = pd.cut(
    df["lexical_accuracy"],
    bins=classes,
    labels=labels,
    include_lowest=True,  # a score of exactly 0 belongs to the first bin
)

# Per-bin counts (kept in bin order) followed by a categorical summary
counts = df["lexical_accuracy_class"].value_counts(sort=False)
print(counts)

summary_stats = df["lexical_accuracy_class"].describe()
print(summary_stats)


In [None]:
# Lexical accuracy with constructed labels
#
# Same comparison as with provided labels, but each label is assembled from
# the individual address-component columns instead of read from `label`.

# Read the baseline results and the geocoder results
baseline = pd.read_csv("output/canada-post-geocoding-result-scenario-1.csv")
geocoder = pd.read_csv("output/esri-geocoding-result-scenario-1.csv")

# Assemble one label string per row from the address components.
# NOTE(review): the baseline label omits street_number and the comma after
# street_name, while the geocoder label includes both — confirm this asymmetry
# is intended, since it lowers every score.
baseline["label_baseline"] = baseline.apply(
    lambda row: f"{row['street_name']} {row['city']}, {row['province']}, {row['postal_code']}, {row['country']}",
    axis="columns",
)
geocoder["label_geocoder"] = geocoder.apply(
    lambda row: f"{row['street_number']} {row['street_name']}, {row['city']}, {row['province']}, {row['postal_code']}, {row['country']}",
    axis="columns",
)

# Column-wise concatenation pairs rows by index.
# NOTE(review): assumes both files list the same addresses in the same row
# order — confirm.
df = pd.concat([baseline, geocoder], axis="columns")


# Score every row with the Levenshtein ratio of its two labels
df["lexical_accuracy"] = df.apply(
    lambda row: ratio(row["label_baseline"], row["label_geocoder"]),
    axis="columns",
)

# Descriptive statistics of the raw scores
summary_stats = df["lexical_accuracy"].describe()
print(summary_stats)

# Bucket the scores into four equal-width bins; each label names the upper
# bound of its bin.
labels = ["0.25", "0.5", "0.75", "1"]
classes = [0, 0.25, 0.5, 0.75, 1]

df["lexical_accuracy_class"] = pd.cut(
    df["lexical_accuracy"],
    bins=classes,
    labels=labels,
    include_lowest=True,  # a score of exactly 0 belongs to the first bin
)

# Per-bin counts (kept in bin order) followed by a categorical summary
counts = df["lexical_accuracy_class"].value_counts(sort=False)
print(counts)

summary_stats = df["lexical_accuracy_class"].describe()
print(summary_stats)


In [None]:
# Lexical similarity with provided labels
#
# Levenshtein ratio between the address labels returned by two geocoders,
# row by row.

# Read both geocoders' results
geocoder_1 = pd.read_csv("output/esri-geocoding-result-scenario-1.csv")
geocoder_2 = pd.read_csv("output/here-geocoding-result-scenario-1.csv")

# Keep only the columns needed and give each label a source-specific name so
# the two can sit side by side in one frame.
geocoder_1_labels = geocoder_1[["id", "label"]]
geocoder_1 = geocoder_1_labels.rename(columns={"label": "label_geocoder_1"})
geocoder_2_labels = geocoder_2[["label"]]
geocoder_2 = geocoder_2_labels.rename(columns={"label": "label_geocoder_2"})

# Column-wise concatenation pairs rows by index.
# NOTE(review): assumes both files list the same addresses in the same row
# order — confirm.
df = pd.concat([geocoder_1, geocoder_2], axis="columns")

# Score every row with the Levenshtein ratio of its two labels
df["lexical_similarity"] = df.apply(
    lambda row: ratio(row["label_geocoder_1"], row["label_geocoder_2"]),
    axis="columns",
)

# Descriptive statistics of the raw scores
summary_stats = df["lexical_similarity"].describe()
print(summary_stats)

# Bucket the scores into four equal-width bins; each label names the upper
# bound of its bin.
labels = ["0.25", "0.5", "0.75", "1"]
classes = [0, 0.25, 0.5, 0.75, 1]

df["lexical_similarity_class"] = pd.cut(
    df["lexical_similarity"],
    bins=classes,
    labels=labels,
    include_lowest=True,  # a score of exactly 0 belongs to the first bin
)

# Per-bin counts (kept in bin order) followed by a categorical summary
counts = df["lexical_similarity_class"].value_counts(sort=False)
print(counts)

summary_stats = df["lexical_similarity_class"].describe()
print(summary_stats)


In [None]:
# Lexical similarity with constructed labels
#
# Levenshtein ratio between two geocoders' address labels, where each label is
# assembled from the individual address-component columns.

# Load data into DataFrames
geocoder_1 = pd.read_csv("output/esri-geocoding-result-scenario-1.csv")
geocoder_2 = pd.read_csv("output/here-geocoding-result-scenario-1.csv")


def _build_label(row):
    """Return the address label for one result row.

    Both geocoders' labels are built from this single template so that any
    difference in the similarity score comes from the address components
    themselves, not from differing label formats.
    """
    return (
        f"{row['street_number']} {row['street_name']}, {row['city']}, "
        f"{row['province']}, {row['postal_code']}, {row['country']}"
    )


# Construct labels and join DataFrames
geocoder_1["label_geocoder_1"] = geocoder_1.apply(_build_label, axis=1)
geocoder_2["label_geocoder_2"] = geocoder_2.apply(_build_label, axis=1)

# Column-wise concatenation pairs rows by index.
# NOTE(review): assumes both files list the same addresses in the same row
# order — confirm.
df = pd.concat([geocoder_1, geocoder_2], axis=1)

# Calculate lexical similarity
df["lexical_similarity"] = df.apply(
    lambda x: ratio(x["label_geocoder_1"], x["label_geocoder_2"]), axis=1
)

# Calculate summary statistics
summary_stats = df["lexical_similarity"].describe()
print(summary_stats)

# Classify similarity scores into 4 equal-width groups to better understand
# their distribution; each label names the upper bound of its bin.
labels = ["0.25", "0.5", "0.75", "1"]
classes = [0, 0.25, 0.5, 0.75, 1]

df["lexical_similarity_class"] = pd.cut(
    df["lexical_similarity"], bins=classes, labels=labels, include_lowest=True
)

# Calculate summary statistics
counts = df["lexical_similarity_class"].value_counts(sort=False)
print(counts)

summary_stats = df["lexical_similarity_class"].describe()
print(summary_stats)
