In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, udf
from pyspark.sql.types import ArrayType, StringType

In [2]:
# Obtain the Spark session used by every DataFrame operation in this notebook;
# getOrCreate() returns an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Data Preprocessing for Entity Resolution")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/12 09:43:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ["id", "name", "address"]

# Toy records: rows 1-2 and rows 3-4 carry the same name and address, differing
# only in letter case and in "St" vs "street".
data = [
    (1, "John Smith", "1234 Elm St, Springfield, IL"),
    (2, "john smith", "1234 elm street, springfield, il"),
    (3, "Jane Doe", "5678 Oak St, Springfield, IL"),
    (4, "JANE DOE", "5678 OAK STREET, SPRINGFIELD, IL"),
]

df = spark.createDataFrame(data, schema=columns)

In [4]:
def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Args:
        text: The string to split; may be ``None`` or empty.

    Returns:
        A list of the whitespace-separated pieces of ``text``, or an empty
        list when ``text`` is falsy (``None`` or ``""``).
    """
    return text.split() if text else []

# Expose `tokenize` to Spark as a column function that yields an array of strings.
tokenize_udf = udf(tokenize, returnType=ArrayType(elementType=StringType()))

In [5]:
def preprocess_df(df, text_columns=("name", "address")):
    """Normalize free-text columns and append a token-array column for each.

    For every column named in ``text_columns`` the value is lower-cased and
    every character that is not an ASCII letter, a digit, or whitespace is
    removed; the cleaned string replaces the original column. A companion
    ``<column>_tokens`` column, produced by applying ``tokenize_udf`` to the
    cleaned value, is then appended.

    Args:
        df: Spark DataFrame that contains every column in ``text_columns``.
        text_columns: Names of the string columns to clean and tokenize.
            Defaults to ``("name", "address")``, which reproduces the
            behavior of calling this function with ``df`` alone.

    Returns:
        A new DataFrame with the cleaned columns and the added
        ``*_tokens`` columns; the input ``df`` is left untouched.
    """
    # Anything outside ASCII letters, digits and whitespace is dropped.
    # NOTE: this also strips accented / non-ASCII letters.
    disallowed_chars = "[^a-zA-Z0-9\\s]"

    for column_name in text_columns:
        cleaned = regexp_replace(lower(col(column_name)), disallowed_chars, "")
        # Overwrite the column first so the tokens are built from the cleaned text.
        df = df.withColumn(column_name, cleaned)
        df = df.withColumn(f"{column_name}_tokens", tokenize_udf(col(column_name)))

    return df

In [6]:
# Run the cleaning/tokenizing pipeline on the sample records.
preprocessed_df = preprocess_df(df)

# Print up to 20 rows (the default limit) without shortening long cell values.
preprocessed_df.show(n=20, truncate=False)

                                                                                

+---+----------+------------------------------+-------------+------------------------------------+
|id |name      |address                       |name_tokens  |address_tokens                      |
+---+----------+------------------------------+-------------+------------------------------------+
|1  |john smith|1234 elm st springfield il    |[john, smith]|[1234, elm, st, springfield, il]    |
|2  |john smith|1234 elm street springfield il|[john, smith]|[1234, elm, street, springfield, il]|
|3  |jane doe  |5678 oak st springfield il    |[jane, doe]  |[5678, oak, st, springfield, il]    |
|4  |jane doe  |5678 oak street springfield il|[jane, doe]  |[5678, oak, street, springfield, il]|
+---+----------+------------------------------+-------------+------------------------------------+



In [7]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()