The aim of this notebook is to implement, in Apache Spark, the basic expectation-maximisation (EM) approach used by the R fastLink package.

In [None]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType
import pyspark.sql.functions as f

# Obtain the Spark session through the builder, which is the documented entry
# point: it reuses an already-running session/context if there is one and
# creates one otherwise.
spark = SparkSession.builder.getOrCreate()
# Keep the underlying SparkContext available under its usual name.
sc = spark.sparkContext

In [None]:
# Load the input CSVs. header=True takes column names from the first row;
# no schema is inferred, so every column is read as a string.
df_l = spark.read.csv("em_in_spark/input_data/left_table.csv", header=True)
# The right-hand table is registered as "df_r" and cross joined in the next
# cell, so it has to be loaded here as well.
# NOTE(review): path assumed by analogy with left_table.csv -- confirm.
df_r = spark.read.csv("em_in_spark/input_data/right_table.csv", header=True)

# Labelled (row_id_l, row_id_r) pairs, joined on below to attach `label`.
labels = spark.read.csv("em_in_spark/input_data/real_matches.csv", header=True)

In [None]:
# Expose the three DataFrames to Spark SQL. createOrReplaceTempView replaces
# registerTempTable, which has been deprecated since Spark 2.0.
df_l.createOrReplaceTempView("df_l")
df_r.createOrReplaceTempView("df_r")
labels.createOrReplaceTempView("labels")

# Build every (left, right) record pair with a cross join and attach the label
# for pairs present in `labels`; pairs with no label row get 0.
sql = """
select 
    df_l.row_id_l, 
    df_l.mob_l, 
    df_l.surname_l, 
    df_r.row_id_r, 
    df_r.mob_r, 
    df_r.surname_r,
    coalesce(labels.label, 0)  as label
from df_l
cross join df_r
left join labels
on labels.row_id_l = df_l.row_id_l and 
labels.row_id_r = df_r.row_id_r

"""
df = spark.sql(sql)


In [None]:
# Expose the pairwise table to Spark SQL. createOrReplaceTempView replaces
# registerTempTable, which has been deprecated since Spark 2.0.
df.createOrReplaceTempView("df")

# Add one comparison ("gamma") column per compared field:
#   gamma_0: 1 when mob_l equals mob_r, otherwise 0.
#   gamma_1: 1 when the surnames are equal, 0.5 when their Levenshtein
#            distance is below 4, otherwise 0.
sql = """
select *, 

case when
mob_l = mob_r then 1 
else 0
end
as gamma_0,

case when
surname_l = surname_r then 1 
when levenshtein(surname_l, surname_r)<4 then 0.5
else 0
end
as gamma_1


from df 

"""
df_with_gamma = spark.sql(sql)
# This DataFrame is fed to every expectation step below, so cache it rather
# than recomputing the cross join and comparisons each time.
df_with_gamma.persist()

In [None]:
# Starting parameter values passed to run_expectation_step / update_params.
#
# "λ": a single scalar -- presumably the prior proportion of record pairs that
#      are matches, as in fastLink; confirm against em_in_spark.fns.
# "π": for each gamma column, one probability distribution over its comparison
#      levels for matches ("prob_dist_match") and one for non-matches
#      ("prob_dist_non_match"). Each level's "value" is the value the gamma
#      column takes in df_with_gamma; the "probability" entries of one
#      distribution must sum to 1.
params = {
    "λ": 0.12,
    "π": {
        "gamma_0": {
            "desc": "Month of birth match",
            "type": "exact_match_only",
            "prob_dist_match": {
                "level_0": {
                    "value": 0,
                    "probability": 0.4
                },
                "level_1": {
                    "value": 1,
                    "probability": 0.6
                }
            },
            "prob_dist_non_match": {
                "level_0": {
                    "value": 0,
                    "probability": 0.6
                },
                "level_1": {
                    "value": 1,
                    "probability": 0.4
                }
            },

        },
        "gamma_1": {
            "desc": "Surname match",
            "type": "include_approximate_match",
            "prob_dist_match": {
                "level_0": {
                    "value": 0,
                    "probability": 0.3
                },
                "level_1": {
                    "value": 0.5,
                    # Was 0.4, which made this distribution sum to 1.1.
                    # 0.3 mirrors the non-match distribution below, the same
                    # way gamma_0's match/non-match distributions mirror
                    # each other.
                    "probability": 0.3
                },
                "level_2": {
                    "value": 1,
                    "probability": 0.4
                }
            },
            "prob_dist_non_match": {
                "level_0": {
                    "value": 0,
                    "probability": 0.4
                },
                "level_1": {
                    "value": 0.5,
                    "probability": 0.3
                },
                "level_2": {
                    "value": 1,
                    "probability": 0.3
                }
            },

        }
    }
}

## Expectation step and parameter updates

In [None]:
%autoreload True

In [None]:
# Import only the two helpers this cell uses instead of a star import, so it
# is clear where each name comes from.
from em_in_spark.fns import run_expectation_step, update_params

# Number of expectation/update rounds to run.
NUM_EM_ITERATIONS = 5

# The first round starts from the initial guesses in `params`; every later
# round starts from the parameters returned by the previous update.
new_params = params
for _ in range(NUM_EM_ITERATIONS):
    df_e = run_expectation_step(df_with_gamma, spark, new_params)
    new_params = update_params(df_e, spark, new_params)

# Show the output of the final expectation step.
df_e.show()

In [None]:
# Display the parameters returned by the last update_params call.
new_params