In [None]:
%autoreload 2

In [2]:
import logging

# Set up the root logger with the standard library's default configuration.
logging.basicConfig()

# Per-module debug switches: uncomment a line to get DEBUG output for that logger.
# logging.getLogger("sql").setLevel("DEBUG")
# logging.getLogger("gammas").setLevel("DEBUG")
# logging.getLogger("expectation_step").setLevel("DEBUG")
# logging.getLogger("maximisation_step").setLevel("DEBUG")
# logging.getLogger("comparison_evaluation").setLevel("DEBUG")

# Keep the "sparklink" logger quiet: only ERROR and above get through.
logging.getLogger("sparklink").setLevel("ERROR")

In [3]:
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType
import pyspark.sql.functions as f

# WARNING:
# These settings are only appropriate when Spark runs locally on this machine.
# Do not carry them over to a shared cluster.
conf = (
    SparkConf()
    .set("spark.driver.memory", "8g")
    .set("spark.sql.shuffle.partitions", "8")
)

# getOrCreate returns the already-running SparkContext if there is one,
# otherwise it starts a new one using `conf`.
sc = SparkContext.getOrCreate(conf=conf)

# Wrap the context in a session so the DataFrame / SQL API is available.
spark = SparkSession(sc)

In [4]:
from sparklink.sql import *
from sparklink.blocking import *
from sparklink.gammas import *
from sparklink.params import *
from sparklink.expectation_step import *
from sparklink.maximisation_step import *
from sparklink.iterate import *
from sparklink.comparison_evaluation import *

In [5]:
# Read the input records; the first line of the CSV supplies the column names.
df = spark.read.option("header", True).csv("data/data.csv")
df.show()

+---------+---+-------+-----+
|unique_id|mob|surname|group|
+---------+---+-------+-----+
|        1| 10|Linacre|    1|
|        2| 10|Linacre|    1|
|        3| 10|Linacer|    1|
|        4|  7|  Smith|    2|
|        5|  8|  Smith|    2|
|        6|  8|  Smith|    2|
|        7|  8|  Jones|    3|
+---------+---+-------+-----+



In [6]:
# Build the table of candidate record pairs that the following cells score.
# The function name (spelled `cartestian_block` in sparklink) suggests a full
# cartesian comparison of `df` with itself rather than a blocking rule; the
# pairs printed by the next cell are consistent with that.
# TODO confirm against sparklink.blocking.
df_comparison = cartestian_block(df, ["mob", "surname", "group"], spark=spark)


In [7]:
# One entry per compared column, giving the number of comparison levels.
# In the output below, "mob" (2 levels) shows up as gamma_0 and "surname"
# (3 levels) as gamma_1, so the gamma columns appear to be numbered in the
# order the columns are listed here.
gamma_settings = {
    "mob": {"levels": 2},
    "surname": {"levels": 3},
}

# include_orig_cols=True keeps the *_l / *_r source columns next to the gammas,
# as the displayed table shows.
df_gammas = add_gammas(
    df_comparison,
    gamma_settings,
    spark,
    include_orig_cols=True,
)
df_gammas.show()

+-----------+-----------+-----+-----+---------+---------+-------+-------+
|unique_id_l|unique_id_r|mob_l|mob_r|surname_l|surname_r|gamma_0|gamma_1|
+-----------+-----------+-----+-----+---------+---------+-------+-------+
|          1|          2|   10|   10|  Linacre|  Linacre|      1|      2|
|          1|          3|   10|   10|  Linacre|  Linacer|      1|      1|
|          1|          4|   10|    7|  Linacre|    Smith|      0|      0|
|          1|          5|   10|    8|  Linacre|    Smith|      0|      0|
|          1|          6|   10|    8|  Linacre|    Smith|      0|      0|
|          1|          7|   10|    8|  Linacre|    Jones|      0|      0|
|          2|          3|   10|   10|  Linacre|  Linacer|      1|      1|
|          2|          4|   10|    7|  Linacre|    Smith|      0|      0|
|          2|          5|   10|    8|  Linacre|    Smith|      0|      0|
|          2|          6|   10|    8|  Linacre|    Smith|      0|      0|
|          2|          7|   10|    8| 

In [8]:
import copy

params = Params(gamma_settings, starting_lambda=0.2)

# Starting pi values, one row per (gamma column, match status).
# The list position of each probability is the gamma level it applies to.
initial_pi_values = [
    ("gamma_0", "match", [0.2, 0.8]),
    ("gamma_0", "non_match", [0.8, 0.2]),
    ("gamma_1", "match", [0.2, 0.4, 0.4]),
    ("gamma_1", "non_match", [0.8, 0.15, 0.05]),
]

for gamma_col, match_status, level_probs in initial_pi_values:
    for gamma_level, pi_value in enumerate(level_probs):
        params.set_pi_value(gamma_col, gamma_level, match_status, pi_value)



In [9]:
# Run a single iteration of the estimation loop over the gamma table
# (presumably one expectation + maximisation pass, going by the sparklink
# module names — TODO confirm).
# NOTE: `params` appears to be updated in place by this call: when it is
# displayed afterwards, lambda is no longer the starting value of 0.2.
df_e = iterate(df_gammas, spark, params, num_iterations=1)

In [10]:
# Show the scored comparisons: alongside the gamma columns, the result carries
# the per-gamma match / non-match probabilities and a match_probability column.
df_e.show()

+-----------+-----------+-----+-----+---------+---------+-------+-------+----------------------+------------------+----------------------+------------------+-----------------+
|unique_id_l|unique_id_r|mob_l|mob_r|surname_l|surname_r|gamma_0|gamma_1|prob_gamma_0_non_match|prob_gamma_0_match|prob_gamma_1_non_match|prob_gamma_1_match|match_probability|
+-----------+-----------+-----+-----+---------+---------+-------+-------+----------------------+------------------+----------------------+------------------+-----------------+
|          1|          2|   10|   10|  Linacre|  Linacre|      1|      2|                   0.2|               0.8|                  0.05|               0.4|   0.888888888889|
|          1|          3|   10|   10|  Linacre|  Linacer|      1|      1|                   0.2|               0.8|                  0.15|               0.4|   0.727272727273|
|          1|          4|   10|    7|  Linacre|    Smith|      0|      0|                   0.8|               0.2|     

In [11]:
# Bare expression so the notebook renders the Params summary.
# These are the estimates after the one-iteration run, not the starting values
# set by hand earlier (lambda was initialised to 0.2).
params

λ (proportion of matches) = 0.21423761546611786
------------------------------------
gamma_0: Comparison of mob

Probability distribution of gamma values amongst matches:
    value 0: 0.192636 (level represents lowest category of string similarity)
    value 1: 0.807364 (level represents highest category of string similarity)

Probability distribution of gamma values amongst non-matches:
    value 0: 0.856513 (level represents lowest category of string similarity)
    value 1: 0.143487 (level represents highest category of string similarity)
------------------------------------
gamma_1: Comparison of surname

Probability distribution of gamma values amongst matches:
    value 0: 0.133363 (level represents lowest category of string similarity)
    value 1: 0.323305 
    value 2: 0.543332 (level represents highest category of string similarity)

Probability distribution of gamma values amongst non-matches:
    value 0: 0.872674 (level represents lowest category of string similarity)
    

In [12]:
import copy

# Rebuild the starting parameters from scratch. The earlier iterate() call
# left `params` holding updated estimates (lambda was no longer 0.2 when it was
# displayed), so the two-iteration run below needs a fresh object to start from
# the same point.
params = Params(gamma_settings, starting_lambda=0.2)

# Starting pi values, one row per (gamma column, match status).
# The list position of each probability is the gamma level it applies to.
initial_pi_values = [
    ("gamma_0", "match", [0.2, 0.8]),
    ("gamma_0", "non_match", [0.8, 0.2]),
    ("gamma_1", "match", [0.2, 0.4, 0.4]),
    ("gamma_1", "non_match", [0.8, 0.15, 0.05]),
]

for gamma_col, match_status, level_probs in initial_pi_values:
    for gamma_level, pi_value in enumerate(level_probs):
        params.set_pi_value(gamma_col, gamma_level, match_status, pi_value)



In [13]:
# Same gamma table and the same hand-set starting parameters as the
# single-iteration run, but with two iterations this time.
# `params` was re-created just before this call because iterate() appears to
# modify it in place.
df_e_2 = iterate(df_gammas, spark, params, num_iterations=2)

In [14]:
# Parameter estimates after two iterations, for comparison with the
# single-iteration estimates displayed earlier in the notebook.
params

λ (proportion of matches) = 0.22355400025844574
------------------------------------
gamma_0: Comparison of mob

Probability distribution of gamma values amongst matches:
    value 0: 0.136957 (level represents lowest category of string similarity)
    value 1: 0.863043 (level represents highest category of string similarity)

Probability distribution of gamma values amongst non-matches:
    value 0: 0.880510 (level represents lowest category of string similarity)
    value 1: 0.119490 (level represents highest category of string similarity)
------------------------------------
gamma_1: Comparison of surname

Probability distribution of gamma values amongst matches:
    value 0: 0.106618 (level represents lowest category of string similarity)
    value 1: 0.399400 
    value 2: 0.493981 (level represents highest category of string similarity)

Probability distribution of gamma values amongst non-matches:
    value 0: 0.889245 (level represents lowest category of string similarity)
    