In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, udf, array
from pyspark.sql.types import ArrayType, StringType, FloatType, IntegerType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator



In [2]:
spark = SparkSession.builder \
    .appName("Entity Resolution Evaluation") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/12 09:31:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
ground_truth_data = [
    (1, 2, 1),  # (id1, id2, label) where label 1 = match, 0 = no match
    (1, 3, 0),
    (2, 3, 1),
    (2, 4, 0),
    (3, 4, 1)
]
ground_truth_columns = ["id1", "id2", "label"]
ground_truth_df = spark.createDataFrame(ground_truth_data, schema=ground_truth_columns)

In [4]:
data = [
    (1, "John Smith", "1234 Elm St, Springfield, IL"),
    (2, "john smith", "1234 elm street, springfield, il"),
    (3, "Jane Doe", "5678 Oak St, Springfield, IL"),
    (4, "JANE DOE", "5678 OAK STREET, SPRINGFIELD, IL")
]
columns = ["id", "name", "address"]
df = spark.createDataFrame(data, schema=columns)

In [5]:
def preprocess(text_col):
    return lower(regexp_replace(text_col, "[^a-zA-Z0-9\\s]", ""))

In [6]:
def jaccard_similarity(set1, set2):
    set1, set2 = set(set1), set(set2)
    intersection_len = len(set1.intersection(set2))
    union_len = len(set1.union(set2))
    return float(intersection_len) / union_len if union_len != 0 else 0.0

jaccard_udf = udf(jaccard_similarity, FloatType())

In [7]:
preprocessed_df = df \
    .withColumn("name_tokens", udf(lambda x: x.split(), ArrayType(StringType()))(preprocess(col("name")))) \
    .withColumn("address_tokens", udf(lambda x: x.split(), ArrayType(StringType()))(preprocess(col("address"))))


In [8]:
cross_df = preprocessed_df.alias("df1").crossJoin(preprocessed_df.alias("df2"))

In [9]:
result_df = cross_df \
    .withColumn("name_jaccard_similarity", jaccard_udf(col("df1.name_tokens"), col("df2.name_tokens"))) \
    .withColumn("address_jaccard_similarity", jaccard_udf(col("df1.address_tokens"), col("df2.address_tokens"))) \
    .withColumn("combined_similarity", (col("name_jaccard_similarity") + col("address_jaccard_similarity")) / 2.0) \
    .withColumn("prediction", (col("combined_similarity") > 0.8).cast(IntegerType())) \
    .select(col("df1.id").alias("id1"), col("df2.id").alias("id2"), "prediction")

In [10]:
evaluation_df = result_df.join(ground_truth_df, on=["id1", "id2"], how="left") \
    .na.fill(0, subset=["label"])  # Fill NaNs in label column with 0 (no match)

In [11]:
TP = evaluation_df.filter((col("prediction") == 1) & (col("label") == 1)).count()
TN = evaluation_df.filter((col("prediction") == 0) & (col("label") == 0)).count()
FP = evaluation_df.filter((col("prediction") == 1) & (col("label") == 0)).count()
FN = evaluation_df.filter((col("prediction") == 0) & (col("label") == 1)).count()


24/08/12 09:33:00 WARN ExtractPythonUDFFromJoinCondition: The join condition:((cast((jaccard_similarity(name_tokens#13, name_tokens#29)#40 + jaccard_similarity(address_tokens#19, address_tokens#28)#53) as double) / 2.0) > 0.8) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.
24/08/12 09:33:22 WARN ExtractPythonUDFFromJoinCondition: The join condition:((cast((jaccard_similarity(name_tokens#13, name_tokens#29)#40 + jaccard_similarity(address_tokens#19, address_tokens#28)#53) as double) / 2.0) <= 0.8) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.
24/08/12 09:33:43 WARN ExtractPythonUDFFromJoinCondition: The join condition:((cast((jaccard_similarity(name_tokens#13, name_tokens#29)#40 + jaccard_similarity(address_tokens#19, address_tokens#28)#53) as double) / 2.0) > 0.8) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned 

In [12]:
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0


In [13]:
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")


Precision: 0.25
Recall: 0.67
F1 Score: 0.36


In [14]:
spark.stop()