In [1]:
from splink_data_generation.generate_data_exact import generate_df_gammas_exact
from splink_data_generation.generate_data_random import generate_df_gammas_random
from splink_data_generation.match_prob import add_match_prob

In [21]:
settings_2 = {
    "proportion_of_matches": 0.2,
    "link_type": "dedupe_only",
    "comparison_columns": [
        {
            "col_name": "col_1",
            "m_probabilities": [0.3, 0.7],  # Probability of typo
            "u_probabilities": [0.9, 0.1],  # Probability of collision
        },
        {
            "col_name": "col_2",
            "m_probabilities": [0.1, 0.9],  # Probability of typo
            "u_probabilities": [0.975, 0.025],  # Probability of collision
        },
        {
            "col_name": "col_3",
            "m_probabilities": [0.05, 0.95],  # Probability of typo
            "u_probabilities": [0.8, 0.2],  # Probability of collision
        },
    ],
    "max_iterations": 200,
    "em_convergence": 0.0001,
     "additional_columns_to_retain": [
        "true_match", "true_match_probability"
    ]
}

In [22]:
df = generate_df_gammas_exact(settings_2)
df = add_match_prob(df, settings_2)



{'proportion_of_matches': 0.5, 'link_type': 'dedupe_only', 'comparison_columns': [{'col_name': 'col_1', 'm_probabilities': [0.3, 0.7], 'u_probabilities': [0.9, 0.1]}, {'col_name': 'col_2', 'm_probabilities': [0.1, 0.9], 'u_probabilities': [0.975, 0.025]}, {'col_name': 'col_3', 'm_probabilities': [0.05, 0.95], 'u_probabilities': [0.8, 0.2]}], 'max_iterations': 200, 'em_convergence': 0.0001, 'additional_columns_to_retain': ['true_match', 'true_match_probability']}


In [23]:
df.head()

Unnamed: 0,gamma_col_1,gamma_col_2,gamma_col_3,true_match_l,true_match_r,unique_id_l,unique_id_r,numerator,true_match_probability_l,true_match_probability_r
0,0,0,0,1,1,5c534588,68027012,0.00075,0.002132,0.002132
1,0,1,0,1,1,05c70eed,ce8dbfc9,0.00675,0.428571,0.428571
2,0,1,0,1,1,00b4166a,ab2fd237,0.00675,0.428571,0.428571
3,0,1,0,1,1,0e0fd78e,698befa4,0.00675,0.428571,0.428571
4,0,1,0,1,1,3fcd1ede,670b1263,0.00675,0.428571,0.428571


In [24]:
# f1 = df["gamma_col_1"] == 0
# f2 = df["gamma_col_2"] == 0
# f3 = df["gamma_col_3"] == 0
# df[f1&f2&f3]

In [25]:
import logging 
logging.basicConfig()  # Means logs will print in Jupyter Lab

# Set to DEBUG if you want splink to log the SQL statements it's executing under the hood
logging.getLogger("splink").setLevel(logging.INFO)

from pyspark.context import SparkContext
from pyspark.sql import SparkSession
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [26]:
# Now use Splink to estimate the params from the data
from splink_data_generation.estimate_splink import estimate

settings_2["proportion_of_matches"] = df["true_match_l"].mean()
df_e, linker = estimate(df, settings_2 ,spark)

INFO:splink.iterate:Iteration 0 complete
INFO:splink.params:The maximum change in parameters was 2.384185793236071e-08 for key π_gamma_col_1_prob_dist_non_match_level_0_probability
INFO:splink.iterate:EM algorithm has converged


In [27]:
df_e.limit(5).toPandas()

Unnamed: 0,match_probability,unique_id_l,unique_id_r,gamma_col_1,gamma_col_2,gamma_col_3,true_match_l,true_match_r,true_match_probability_l,true_match_probability_r
0,0.002132,5c534588,68027012,0,0,0,1,1,0.002132,0.002132
1,0.428571,05c70eed,ce8dbfc9,0,1,0,1,1,0.428571,0.428571
2,0.428571,00b4166a,ab2fd237,0,1,0,1,1,0.428571,0.428571
3,0.428571,0e0fd78e,698befa4,0,1,0,1,1,0.428571,0.428571
4,0.428571,3fcd1ede,670b1263,0,1,0,1,1,0.428571,0.428571


In [28]:
from splink.truth import roc_chart

df_labels = df[["true_match_r", "unique_id_l", "unique_id_r"]]
df_labels["true_match_r"] = df_labels["true_match_r"].astype(float)
df_labels = df_labels.rename(columns={"true_match_r": "clerical_match_score"})
df_labels.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labels["true_match_r"] = df_labels["true_match_r"].astype(float)


Unnamed: 0,clerical_match_score,unique_id_l,unique_id_r
0,1.0,5c534588,68027012
1,1.0,05c70eed,ce8dbfc9
2,1.0,00b4166a,ab2fd237
3,1.0,0e0fd78e,698befa4
4,1.0,3fcd1ede,670b1263


In [29]:
df_labels_sp = spark.createDataFrame(df_labels)
roc_chart(df_labels_sp, df_e, settings_2, spark)



In [30]:
# Now we're in a position to subset df into only rows where col_1 matches and re-estimate

f1 = df["gamma_col_1"] == 1
df_blocked = df[f1]
df_blocked.head()

Unnamed: 0,gamma_col_1,gamma_col_2,gamma_col_3,true_match_l,true_match_r,unique_id_l,unique_id_r,numerator,true_match_probability_l,true_match_probability_r
30,1,0,0,1,1,daa57c09,643fdf2c,0.00175,0.042945,0.042945
31,1,1,0,1,1,4f183f22,7bb5617c,0.01575,0.940299,0.940299
32,1,1,0,1,1,0df18ec0,f6ddc796,0.01575,0.940299,0.940299
33,1,1,0,1,1,ea2b589e,1e9b81bd,0.01575,0.940299,0.940299
34,1,1,0,1,1,42fa4edd,8950c40b,0.01575,0.940299,0.940299


In [38]:
settings_2["comparison_columns"] = [  {
            "col_name": "col_2",
            "m_probabilities": [0.1, 0.9],  # Probability of typo
            "u_probabilities": [0.975, 0.025],  # Probability of collision
        },
        {
            "col_name": "col_3",
            "m_probabilities": [0.05, 0.95],  # Probability of typo
            "u_probabilities": [0.8, 0.2],  # Probability of collision
        },]
settings_2["proportion_of_matches"] = df_blocked["true_match_l"].mean()
settings_2

{'proportion_of_matches': 0.875,
 'link_type': 'dedupe_only',
 'comparison_columns': [{'col_name': 'col_2',
   'm_probabilities': [0.1, 0.9],
   'u_probabilities': [0.975, 0.025]},
  {'col_name': 'col_3',
   'm_probabilities': [0.05, 0.95],
   'u_probabilities': [0.8, 0.2]}],
 'max_iterations': 200,
 'em_convergence': 0.0001,
 'additional_columns_to_retain': ['true_match', 'true_match_probability'],
 'retain_intermediate_calculation_columns': False,
 'retain_matching_columns': False,
 'unique_id_column_name': 'unique_id'}

In [39]:
df_e, linker = estimate(df_blocked, settings_2 ,spark)

INFO:splink.iterate:Iteration 0 complete
INFO:splink.params:The maximum change in parameters was 2.384185793236071e-08 for key π_gamma_col_2_prob_dist_match_level_1_probability
INFO:splink.iterate:EM algorithm has converged


In [40]:
linker.params

λ (proportion of matches) = 0.875
------------------------------------
gamma_col_2: Comparison of col_2

Probability distribution of gamma values amongst matches:
    value 0: 0.100000 (level represents lowest category of string similarity)
    value 1: 0.900000 (level represents highest category of string similarity)

Probability distribution of gamma values amongst non-matches:
    value 0: 0.975000 (level represents lowest category of string similarity)
    value 1: 0.025000 (level represents highest category of string similarity)
------------------------------------
gamma_col_3: Comparison of col_3

Probability distribution of gamma values amongst matches:
    value 0: 0.050000 (level represents lowest category of string similarity)
    value 1: 0.950000 (level represents highest category of string similarity)

Probability distribution of gamma values amongst non-matches:
    value 0: 0.800000 (level represents lowest category of string similarity)
    value 1: 0.200000 (level repr

In [41]:
# Try again without starting params 
settings_2["comparison_columns"] = [{
            "col_name": "col_2",
        
        },
        {
            "col_name": "col_3",
  
        }]
settings_2["proportion_of_matches"] = 0.8


In [42]:
df_e, linker = estimate(df_blocked, settings_2 ,spark)

INFO:splink.iterate:Iteration 0 complete
INFO:splink.params:The maximum change in parameters was 0.08415516614913943 for key π_gamma_col_3_prob_dist_non_match_level_0_probability
INFO:splink.iterate:Iteration 1 complete
INFO:splink.params:The maximum change in parameters was 0.002641439437866211 for key π_gamma_col_3_prob_dist_non_match_level_0_probability
INFO:splink.iterate:Iteration 2 complete
INFO:splink.params:The maximum change in parameters was 0.0005588233470916748 for key π_gamma_col_3_prob_dist_non_match_level_1_probability
INFO:splink.iterate:Iteration 3 complete
INFO:splink.params:The maximum change in parameters was 0.00011701881885528564 for key π_gamma_col_3_prob_dist_non_match_level_1_probability
INFO:splink.iterate:Iteration 4 complete
INFO:splink.params:The maximum change in parameters was 2.4437904357910156e-05 for key π_gamma_col_3_prob_dist_non_match_level_0_probability
INFO:splink.iterate:EM algorithm has converged


In [43]:
linker.params

λ (proportion of matches) = 0.8699336051940918
------------------------------------
gamma_col_2: Comparison of col_2

Probability distribution of gamma values amongst matches:
    value 0: 0.103110 (level represents lowest category of string similarity)
    value 1: 0.896890 (level represents highest category of string similarity)

Probability distribution of gamma values amongst non-matches:
    value 0: 0.920114 (level represents lowest category of string similarity)
    value 1: 0.079886 (level represents highest category of string similarity)
------------------------------------
gamma_col_3: Comparison of col_3

Probability distribution of gamma values amongst matches:
    value 0: 0.042763 (level represents lowest category of string similarity)
    value 1: 0.957237 (level represents highest category of string similarity)

Probability distribution of gamma values amongst non-matches:
    value 0: 0.819187 (level represents lowest category of string similarity)
    value 1: 0.18081

In [None]:
from splink.roc import precision_recall_chart
precision_recall_chart(df_labels, df_e, settings, spark)

0.9824561403508771

In [33]:
df_blocked.head()

Unnamed: 0,gamma_col_1,gamma_col_2,gamma_col_3,true_match_l,true_match_r,unique_id_l,unique_id_r,numerator,true_match_probability_l,true_match_probability_r
30,1,0,0,1,1,3939c7b2,522ec328,0.003111,0.304348,0.304348
31,1,1,0,1,1,b3d4a2bd,6c8ff567,0.028,0.940299,0.940299
32,1,1,0,1,1,92e6fbb5,a26eaf26,0.028,0.940299,0.940299
33,1,1,0,1,1,1e67bbfb,b5a6e147,0.028,0.940299,0.940299
34,1,1,0,1,1,05845774,d37b6e19,0.028,0.940299,0.940299


In [34]:
matches = df_blocked["true_match_l"].sum()
matches

1400

In [35]:
len(df_blocked)

1425

In [49]:
df_blocked.pivot_table(index="gamma_col_2",columns = 'true_match_r', values="true_match_l", aggfunc="count")

true_match_r,0,1
gamma_col_2,Unnamed: 1_level_1,Unnamed: 2_level_1
0,20,140
1,5,1260
