In [None]:
import os
import sys

import pyspark
from pyspark import SparkContext
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, expr

from pyspark.sql import SparkSession

# Make Spark use the interpreter that is running this notebook, both for the
# workers (PYSPARK_PYTHON) and for the driver (PYSPARK_DRIVER_PYTHON).
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

# Build (or reuse) the session for this notebook: 16g of driver memory,
# application name 'chapter_2'.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "16g")
    .appName('chapter_2')
    .getOrCreate()
)

# Single definition of the input path so both reads below stay in sync.
BLOCK_1_CSV = "data/linkage/donation/block_1/block_1.csv"

# First pass: read the CSV with no header or schema options at all.
prev = spark.read.csv(BLOCK_1_CSV)

# A bare `prev` in the middle of a cell displays nothing, so print the
# DataFrame's repr explicitly to see the column names and types Spark assigned.
print(prev)

# Peek at the first two rows of the naively loaded data.
prev.show(2)

# Second pass: treat the first line as a header, let Spark infer column types,
# and read "?" as null.
parsed = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("nullValue", "?")
    .csv(BLOCK_1_CSV)
)

# Inspect the inferred column types.
parsed.printSchema()

# Display the first five rows.
parsed.show(5)

# cache() is lazy: marking the DataFrame before running the count lets that
# action populate the cache instead of scanning the file a second time later.
parsed.cache()

# Total number of rows; printed because a mid-cell expression is not displayed.
print(parsed.count())

# Print the schema again (caching does not alter it).
parsed.printSchema()

# DataFrame API: tally the rows for each 'is_match' value, largest group first.
(
    parsed
    .groupBy("is_match")
    .count()
    .orderBy(col("count").desc())
    .show()
)

# Expose the DataFrame to Spark SQL under the view name "linkage".
parsed.createOrReplaceTempView("linkage")

# The same tally, this time written as a SQL query against that view.
spark.sql(
"""
SELECT is_match, COUNT(*) cnt
FROM linkage
GROUP BY is_match
ORDER BY cnt DESC
"""
).show()

# describe() over the full dataset: one row per statistic, labelled by the
# 'summary' column.
summary = parsed.describe()

# Show the statistic labels next to two of the columns only.
summary.select("summary", "cmp_fname_c1", "cmp_fname_c2").show()

# Split the rows on the match flag. The first filter is written as a SQL
# string predicate, the second as a Column expression.
matches = parsed.where("is_match = true")
misses = parsed.filter(col("is_match") == False)

# Summarize each side separately so the two can be compared later.
match_summary = matches.describe()
miss_summary = misses.describe()

# Bring the summary to the driver as a pandas DataFrame.
summary_p = summary.toPandas()

# Mid-cell expressions are never displayed, so print the preview and the
# shape explicitly.
print(summary_p.head())
print(summary_p.shape)

# Transpose so that each original column becomes a row:
#   - the 'summary' labels become the column headers,
#   - the former column names land in a column called 'field',
#   - the leftover columns-axis name is cleared.
summary_p = (
    summary_p
    .set_index('summary')
    .transpose()
    .reset_index()
    .rename(columns={'index': 'field'})
    .rename_axis(None, axis=1)
)

# Shape after the transpose, printed for the same reason as above.
print(summary_p.shape)

# Hand the transposed table back to Spark and inspect the resulting types.
summaryT = spark.createDataFrame(summary_p)
summaryT.printSchema()

from pyspark.sql.types import DoubleType

# 'field' holds names and is left alone; every other column is recast to
# DoubleType so it can be used in arithmetic.
for c in summaryT.columns:
    if c != 'field':
        summaryT = summaryT.withColumn(c, summaryT[c].cast(DoubleType()))

# Confirm the new column types.
summaryT.printSchema()

# Defining a function to automate the summary statistic transposition and conversion
def pivot_summary(desc):
    """Transpose a summary DataFrame so that each field becomes a row.

    Args:
        desc: Spark DataFrame with a 'summary' column labelling each row and
            one further column per field (the call sites in this notebook
            pass the output of ``DataFrame.describe()``).

    Returns:
        Spark DataFrame with a 'field' column plus one DoubleType column for
        each label that appeared in 'summary'.
    """
    # Collect to pandas and flip rows and columns there.
    transposed = (
        desc.toPandas()
        .set_index('summary')
        .transpose()
        .reset_index()
        .rename(columns={'index': 'field'})
        .rename_axis(None, axis=1)
    )
    # Back to a Spark DataFrame.
    pivoted = spark.createDataFrame(transposed)
    # Everything except 'field' is a metric: convert it from string to double.
    metric_columns = [name for name in pivoted.columns if name != 'field']
    for name in metric_columns:
        pivoted = pivoted.withColumn(name, pivoted[name].cast(DoubleType()))
    return pivoted

# Transpose the match and miss summaries so each field is one row.
match_summaryT = pivot_summary(match_summary)
miss_summaryT = pivot_summary(miss_summary)

# Register both as temporary views so they can be joined in SQL.
match_summaryT.createOrReplaceTempView("match_desc")
miss_summaryT.createOrReplaceTempView("miss_desc")

# Per field: combined count across matches and misses, and the difference of
# the two means. The id columns are excluded. `.show()` is required here:
# without it the query result is built but never displayed.
spark.sql("""
SELECT a.field, a.count + b.count total, a.mean - b.mean delta
FROM match_desc a INNER JOIN miss_desc b ON a.field = b.field
WHERE a.field NOT IN ("id_1", "id_2")
ORDER BY delta DESC, total DESC
""").show()

# Columns that feed the score.
good_features = ["cmp_lname_c1", "cmp_plz", "cmp_by", "cmp_bd", "cmp_bm"]

# SQL expression adding those columns together: "cmp_lname_c1 + cmp_plz + ...".
sum_expression = " + ".join(good_features)

# Replace nulls in the feature columns with 0 so they do not null out the sum,
# compute the score, and keep only the score and the match flag.
scored = (
    parsed
    .fillna(0, subset=good_features)
    .withColumn('score', expr(sum_expression))
    .select('score', 'is_match')
)

# Defining a function to create cross tabs for score thresholds
def crossTabs(scored: DataFrame, t: float) -> DataFrame:
    """Cross-tabulate ``score >= t`` against ``is_match``.

    Args:
        scored: DataFrame with a 'score' column and an 'is_match' column.
        t: Score threshold. It is a plain Python number (not a Spark
            ``DoubleType``) and is interpolated into the SQL expression.

    Returns:
        DataFrame with one row per value of 'above' and one count column for
        each of the pivot values "true" and "false".
    """
    return (
        scored
        .selectExpr(f"score >= {t} as above", "is_match")
        .groupBy("above")
        .pivot("is_match", ["true", "false"])
        .count()
    )

# Show the cross-tabulation at a strict threshold first, then a looser one.
for score_cutoff in (4.0, 2.0):
    crossTabs(scored, score_cutoff).show()


In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, explode
from pyspark.sql.types import ArrayType, StringType, DoubleType
from pyspark.ml.feature import Tokenizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import DataFrame

# Obtain the Spark session (getOrCreate reuses one if it already exists).
spark = (
    SparkSession.builder
    .appName("EntityResolution")
    .getOrCreate()
)

# Load the dataset with a header row and inferred column types.
# NOTE(review): unlike the first load in this notebook, no "nullValue" option
# is set here, so "?" markers are read as literal text - confirm that this is
# intended, since it affects the inferred column types.
data = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("data/linkage/donation/block_1/block_1.csv")
)

# Define a UDF for normalization (e.g., lowercasing, removing non-alphanumeric characters)
def normalize(text):
    """Lowercase the input and strip non-alphanumeric characters per token.

    Args:
        text: A string, a sequence of string tokens (this is what the UDF
            receives when applied to an array column such as Tokenizer
            output), or None.

    Returns:
        list[str]: The lowercase tokens with every non-alphanumeric character
        removed. Tokens that end up empty are dropped. None yields [].
    """
    if text is None:
        return []
    # Split a plain string on whitespace *before* filtering: filtering first
    # would delete the separators and collapse everything into one token.
    raw_tokens = text.split() if isinstance(text, str) else text
    cleaned = (
        ''.join(filter(str.isalnum, token.lower()))
        for token in raw_tokens
        if token is not None
    )
    return [token for token in cleaned if token]

# Wrap normalize as a Spark UDF that returns an array of strings.
normalize_udf = udf(normalize, returnType=ArrayType(StringType()))

# Tokenize 'cmp_fname_c1' into 'tokenized', then run the normalization UDF
# over the tokens to produce 'normalized'.
# NOTE(review): Tokenizer expects a string input column - confirm that
# 'cmp_fname_c1' is actually loaded as a string.
# NOTE(review): 'tokenized' is an array column, so normalize is called with a
# list of tokens rather than a single string.
tokenizer = Tokenizer(inputCol="cmp_fname_c1", outputCol="tokenized")
data = (
    tokenizer.transform(data)
    .withColumn("normalized", normalize_udf(col("tokenized")))
)

# Define a UDF to compute Jaccard similarity between two arrays of tokens
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two token collections.

    The inputs are compared as sets, so duplicates are ignored. The result is
    the size of the intersection divided by the size of the union.

    Args:
        list1: First collection of tokens (may be None or empty).
        list2: Second collection of tokens (may be None or empty).

    Returns:
        float: A value between 0.0 and 1.0; 0.0 when either input is
        None or empty.
    """
    if list1 and list2:
        first, second = set(list1), set(list2)
        shared = len(first.intersection(second))
        combined = len(first.union(second))
        return float(shared / combined)
    return 0.0

jaccard_udf = udf(jaccard_similarity, DoubleType())

# Threshold for determining whether the pair is a match.
threshold = 0.5

# Self-join on 'id_1' so that rows sharing an id_1 are compared. After the
# join every non-key column exists once per side, so each reference below is
# qualified with its alias ("df1." / "df2.").
pairs = data.alias("df1").join(data.alias("df2"), "id_1")

# Jaccard similarity between the normalized tokens of the two sides.
similarity_score = jaccard_udf(col("df1.normalized"), col("df2.normalized"))

# Keep only what the evaluator needs:
#   - label:      df1's is_match (an unqualified "is_match" is ambiguous
#                 after the self-join),
#   - prediction: 1.0 when the similarity exceeds the threshold, else 0.0.
# Both are cast to double because MulticlassClassificationEvaluator requires a
# double prediction column.
data = pairs.select(
    col("df1.is_match").cast("double").alias("label"),
    (similarity_score > threshold).cast("double").alias("prediction"),
)

# Calculate precision, recall, and F1-score. The precision and recall metrics
# requested here are the evaluator's *weighted* variants (averaged over both
# classes), not the positive-class precision/recall.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
precision = evaluator.evaluate(data, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(data, {evaluator.metricName: "weightedRecall"})
f1_score = evaluator.evaluate(data, {evaluator.metricName: "f1"})

# Output the evaluation metrics.
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_score}")
