In [1]:
from splink.postgres.postgres_linker import PostgresLinker
from psycopg2 import connect
from splink.postgres.postgres_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)
import os

import pandas as pd

# Load the sample records from CSV into a pandas DataFrame; this frame is the
# input table handed to the linker further down.
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")

# Splink settings dictionary passed to PostgresLinker below.
settings = {
    # NOTE(review): "dedupe_only" presumably means the single input table is
    # deduplicated against itself rather than linked to a second table —
    # confirm against the Splink settings reference.
    "link_type": "dedupe_only",
    # Each rule is a condition over the `l` and `r` aliases of a record pair.
    # NOTE(review): presumably a pair is scored by predict() if it satisfies
    # at least one of these rules — confirm.
    "blocking_rules_to_generate_predictions": [
        "l.first_name = r.first_name",
        "l.surname = r.surname",
    ],
    # One comparison per column; these are the same five columns the training
    # log reports parameter estimates for.
    "comparisons": [
        # NOTE(review): the 2 is presumably the Levenshtein distance
        # threshold — confirm against the comparison library docs.
        levenshtein_at_thresholds("first_name", 2),
        exact_match("surname"),
        exact_match("dob"),
        # Only `city` enables term frequency adjustments.
        exact_match("city", term_frequency_adjustments=True),
        exact_match("email"),
    ],
}

# Open a psycopg2 connection using parameters taken from SPLINK_DB_*
# environment variables, so no credentials are hard-coded in the notebook.
# os.getenv returns None for any variable that is unset.
# NOTE(review): how psycopg2 treats a None parameter (e.g. falling back to
# its own defaults) is not visible here — confirm before relying on it.
connection = connect(
    dbname=os.getenv('SPLINK_DB_NAME'),
    user=os.getenv('SPLINK_DB_USER'),
    password=os.getenv('SPLINK_DB_PASSWORD'),
    host=os.getenv('SPLINK_DB_HOST'),
    port=os.getenv('SPLINK_DB_PORT'),
)

# Build the linker over the DataFrame, the settings dict and the open
# Postgres connection.
linker = PostgresLinker(df, settings, connection=connection)

# Step 1: estimate the u probabilities by random sampling.
# NOTE(review): max_pairs presumably caps the number of sampled record
# pairs — confirm against the Splink docs.
linker.estimate_u_using_random_sampling(max_pairs=1e6)

# Step 2: estimate the m probabilities with two EM training sessions, each
# using a different blocking rule. Per the training log captured with this
# cell, comparisons on columns used in the blocking rule cannot be estimated
# in that session: blocking on first_name + surname yields estimates for
# dob, city and email only.
blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

# Second session blocks on dob instead.
# NOTE(review): presumably this is what lets first_name and surname be
# estimated (and leaves dob out) — the captured log is truncated before this
# session, so confirm by re-running.
blocking_rule_for_training = "l.dob = r.dob"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

# Step 3: score candidate record pairs with the trained model.
pairwise_predictions = linker.predict()

# Step 4: group the pairwise predictions into clusters.
# NOTE(review): 0.95 is presumably a match-probability threshold — confirm.
clusters = linker.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.95)
# Final expression of the cell: pulls at most 5 rows into pandas for display
# (the table shown after the log output has 5 rows).
clusters.as_pandas_dataframe(limit=5)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - surname (no m values are trained).
    - dob (no m values are trained).
    - city (no m values are trained).
    - email (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name and l.surname = r.surname

Parameter estimates will be made for the following comparison(s):
    - dob
    - city
    - email

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - surname

Iteration 1: Largest change in params was 0.244 in the m_probability of city, level `All other comparisons`
Iteration 2: Largest change in params was 0.0943 in probability_two_random_records_match
Iteration 3: Largest change in p

Unnamed: 0,cluster_id,unique_id,first_name,surname,dob,city,email,group,tf_city
0,0,0,Julia,,2015-10-29,London,hannah88@powers.com,0,0.28844
1,1,1,Julia,Taylor,2015-07-31,London,hannah88@powers.com,0,0.28844
2,1,2,Julia,Taylor,2016-01-27,London,hannah88@powers.com,0,0.28844
3,3,3,Julia,Taylor,2015-10-29,,hannah88opowersc@m,0,
4,4,4,oNah,Watson,2008-03-23,Bolton,matthew78@ballard-mcdonald.net,1,0.015713
