In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    array,
    array_remove,
    col,
    lower,
    regexp_replace,
    split,
    udf,
)
from pyspark.sql.types import ArrayType, StringType, FloatType

In [2]:
# Obtain the notebook's Spark session (getOrCreate reuses an active one if present).
spark = (
    SparkSession.builder
    .appName("Similarity Calculation")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/12 09:28:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Sample records as (id, name, address) tuples.
# Records 1/2 and 3/4 carry the same name and address written with different
# casing and "St" vs "Street", so they exercise the similarity scoring below.
data = [
    (1, "John Smith", "1234 Elm St, Springfield, IL"),
    (2, "john smith", "1234 elm street, springfield, il"),
    (3, "Jane Doe", "5678 Oak St, Springfield, IL"),
    (4, "JANE DOE", "5678 OAK STREET, SPRINGFIELD, IL")
]

In [4]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = [
    "id",
    "name",
    "address",
]
# Build the input DataFrame; column types are inferred from the tuples.
df = spark.createDataFrame(data=data, schema=columns)

In [5]:
def preprocess(text_col):
    """Normalize a text column before tokenization.

    Every character that is not an ASCII letter, a digit, or whitespace is
    removed, and the remaining text is lowercased.

    Args:
        text_col: Spark column expression holding the raw text.

    Returns:
        A Spark column expression with the cleaned, lowercased text.
    """
    without_punctuation = regexp_replace(text_col, "[^a-zA-Z0-9\\s]", "")
    return lower(without_punctuation)

In [6]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two token collections.

    The similarity is the number of distinct tokens shared by both inputs
    divided by the number of distinct tokens present in either input.

    Args:
        set1: Iterable of hashable tokens, or None.
        set2: Iterable of hashable tokens, or None.

    Returns:
        A float between 0.0 and 1.0. Returns 0.0 when neither input
        contains any token.
    """
    # A null array column is handed to a Python UDF as None; treat it as
    # "no tokens" instead of letting set(None) raise TypeError.
    tokens_a = set(set1) if set1 is not None else set()
    tokens_b = set(set2) if set2 is not None else set()

    union_len = len(tokens_a | tokens_b)
    if union_len == 0:
        # Both sides empty: define the similarity as 0.0 rather than divide by zero.
        return 0.0
    return len(tokens_a & tokens_b) / union_len

In [7]:
# Wrap the Python Jaccard function as a Spark UDF that yields a float column.
jaccard_udf = udf(f=jaccard_similarity, returnType=FloatType())

In [8]:
# Tokenize with Spark built-ins instead of an anonymous Python UDF per column.
# Splitting on runs of whitespace and then dropping empty strings produces the
# same tokens as Python's str.split() for the preprocessed text, while keeping
# the work inside the JVM. Null text propagates as a null token array rather
# than raising inside the tokenizer.
preprocessed_df = (
    df
    .withColumn(
        "name_tokens",
        array_remove(split(preprocess(col("name")), "\\s+"), ""),
    )
    .withColumn(
        "address_tokens",
        array_remove(split(preprocess(col("address")), "\\s+"), ""),
    )
)

In [9]:
# Pair every record with every record (self-pairs and both orderings included).
# The "df1"/"df2" aliases let later cells tell the two sides apart.
cross_df = (
    preprocessed_df.alias("df1")
    .crossJoin(preprocessed_df.alias("df2"))
)

In [10]:
# Score each record pair on name tokens and on address tokens, then keep only
# the two record ids and the two similarity scores.
result_df = (
    cross_df
    .withColumn(
        "name_jaccard_similarity",
        jaccard_udf(col("df1.name_tokens"), col("df2.name_tokens")),
    )
    .withColumn(
        "address_jaccard_similarity",
        jaccard_udf(col("df1.address_tokens"), col("df2.address_tokens")),
    )
    .select(
        col("df1.id").alias("id1"),
        col("df2.id").alias("id2"),
        "name_jaccard_similarity",
        "address_jaccard_similarity",
    )
)

In [11]:
# Print the pairwise scores without truncating cell values (20 is show()'s default row limit).
result_df.show(n=20, truncate=False)

                                                                                

+---+---+-----------------------+--------------------------+
|id1|id2|name_jaccard_similarity|address_jaccard_similarity|
+---+---+-----------------------+--------------------------+
|1  |1  |1.0                    |1.0                       |
|1  |2  |1.0                    |0.6666667                 |
|1  |3  |0.0                    |0.42857143                |
|1  |4  |0.0                    |0.25                      |
|2  |1  |1.0                    |0.6666667                 |
|2  |2  |1.0                    |1.0                       |
|2  |3  |0.0                    |0.25                      |
|2  |4  |0.0                    |0.42857143                |
|3  |1  |0.0                    |0.42857143                |
|3  |2  |0.0                    |0.25                      |
|3  |3  |1.0                    |1.0                       |
|3  |4  |1.0                    |0.6666667                 |
|4  |1  |0.0                    |0.25                      |
|4  |2  |0.0            

In [12]:
# Stop the Spark session now that the results have been displayed.
spark.stop()