In [None]:
from splink.duckdb.duckdb_linker import DuckDBLinker
from try_settings import settings_dict
import pandas as pd
# Comparison definition for the "full_name" column.
# Levels, in listed order: null level (also covers names shorter than two
# characters), exact match (with term-frequency adjustment), three Levenshtein
# distance bands, and the catch-all ELSE level.
full_name_cc = dict(
    column_name="full_name",
    comparison_levels=[
        dict(
            sql_condition="full_name_l IS NULL OR full_name_r IS NULL or length(full_name_l) < 2 or length(full_name_r) < 2",
            label_for_charts="Comparison includes null",
            is_null_level=True,
        ),
        dict(
            sql_condition="full_name_l = full_name_r",
            label_for_charts="Exact match",
            m_probability=0.7,
            u_probability=0.1,
            tf_adjustment_column="full_name",
            tf_adjustment_weight=1.0,
        ),
        # The three fuzzy levels differ only in their distance threshold; they
        # all carry the same m/u probabilities (0.2 / 0.1).
        *[
            {
                "sql_condition": f"levenshtein(full_name_l, full_name_r) <= {max_distance}",
                "m_probability": 0.2,
                "u_probability": 0.1,
                "label_for_charts": f"Levenstein <= {max_distance}",
            }
            for max_distance in (2, 4, 8)
        ],
        dict(
            sql_condition="ELSE",
            label_for_charts="All other comparisons",
            m_probability=0.1,
            u_probability=0.8,
        ),
    ],
)


# Comparison definition for the "dob" column: null level, exact match, and a
# catch-all ELSE level.
dob_cc = dict(
    column_name="dob",
    comparison_levels=[
        dict(
            sql_condition="dob_l IS NULL OR dob_r IS NULL",
            label_for_charts="Comparison includes null",
            is_null_level=True,
        ),
        dict(
            sql_condition="dob_l = dob_r",
            label_for_charts="Exact match",
            m_probability=0.9,
            u_probability=0.1,
        ),
        dict(
            sql_condition="ELSE",
            label_for_charts="All other comparisons",
            m_probability=0.1,
            u_probability=0.9,
        ),
    ],
)

# Comparison definition for the "birth_place" column: null level, exact match,
# and a catch-all ELSE level.
birth_place_cc = dict(
    column_name="birth_place",
    comparison_levels=[
        dict(
            sql_condition="birth_place_l IS NULL OR birth_place_r IS NULL",
            label_for_charts="Comparison includes null",
            is_null_level=True,
        ),
        dict(
            sql_condition="birth_place_l = birth_place_r",
            label_for_charts="Exact match",
            m_probability=0.9,
            u_probability=0.1,
        ),
        dict(
            sql_condition="ELSE",
            label_for_charts="All other comparisons",
            m_probability=0.1,
            u_probability=0.9,
        ),
    ],
)

# Comparison definition for the "postcode" column: null level, exact match
# (with term-frequency adjustment on postcode), and a catch-all ELSE level.
postcode_cc = dict(
    column_name="postcode",
    comparison_levels=[
        dict(
            sql_condition="postcode_l IS NULL OR postcode_r IS NULL",
            label_for_charts="Comparison includes null",
            is_null_level=True,
        ),
        dict(
            sql_condition="postcode_l = postcode_r",
            label_for_charts="Exact match",
            m_probability=0.9,
            u_probability=0.1,
            tf_adjustment_column="postcode",
            tf_adjustment_weight=1.0,
        ),
        dict(
            sql_condition="ELSE",
            label_for_charts="All other comparisons",
            m_probability=0.1,
            u_probability=0.9,
        ),
    ],
)


# Comparison definition for the "occupation" column: null level, exact match
# (with term-frequency adjustment on occupation), and a catch-all ELSE level.
occupation_cc = dict(
    column_name="occupation",
    comparison_levels=[
        dict(
            sql_condition="occupation_l IS NULL OR occupation_r IS NULL",
            label_for_charts="Comparison includes null",
            is_null_level=True,
        ),
        dict(
            sql_condition="occupation_l = occupation_r",
            label_for_charts="Exact match",
            m_probability=0.9,
            u_probability=0.1,
            tf_adjustment_column="occupation",
            tf_adjustment_weight=1.0,
        ),
        dict(
            sql_condition="ELSE",
            label_for_charts="All other comparisons",
            m_probability=0.1,
            u_probability=0.9,
        ),
    ],
)

# Linker settings for a dedupe-only job over the five comparisons defined above.
# Note: this assignment rebinds `settings_dict`, replacing the object imported
# from `try_settings` at the top of the notebook.
settings_dict = dict(
    proportion_of_matches=0.01,
    link_type="dedupe_only",
    # NOTE(review): DuckDB's substr takes (string, start, length), so
    # substr(l.dob,4,5) in the last rule is five characters starting at
    # position 4 — confirm that is intended rather than "characters 4 to 5".
    blocking_rules_to_generate_predictions=[
        "l.postcode = r.postcode and substr(l.full_name,1,2) = substr(r.full_name,1,2)",
        "l.dob = r.dob and substr(l.postcode,1,2) = substr(r.postcode,1,2)",
        "l.postcode = r.postcode and substr(l.dob,1,3) = substr(r.dob,1,3)",
        "l.postcode = r.postcode and substr(l.dob,4,5) = substr(r.dob,4,5)",
    ],
    comparisons=[
        full_name_cc,
        dob_cc,
        birth_place_cc,
        postcode_cc,
        occupation_cc,
    ],
    retain_matching_columns=False,
    retain_intermediate_calculation_columns=False,
    additional_columns_to_retain=["cluster"],
    max_iterations=2,
)




### Create an on-disk linker

In [None]:
# Load the input records from the benchmarking parquet file.
df_orig = pd.read_parquet("./benchmarking/synthetic_data_all.parquet")


# Create the linker with the dataframe registered under the name "main".
# connection="1m.duckdb" is presumably the on-disk DuckDB database file
# referred to by the section heading — TODO confirm.
linker = DuckDBLinker(
    settings_dict, input_tables={"main": df_orig}, connection="1m.duckdb"
)


# Design note: the model is trained as a dedupe job.
# When trained that way, the left-hand table for incremental linking would be
# '__splink__df_concat_with_tf'.

# This needs a 'link_incremental' method that treats
# '__splink__df_concat_with_tf' as the left table and 'main' as the right
# table of a link_only job.


# Term-frequency tables for the three columns whose exact-match levels set
# "tf_adjustment_column" in the comparison definitions.
linker.compute_tf_table("full_name")
linker.compute_tf_table("postcode")
linker.compute_tf_table("occupation")
# Estimate the u probabilities by random sampling, targeting 1e6 rows.
linker.train_u_using_random_sampling(target_rows=1e6)

# Two expectation-maximisation passes for the m probabilities, each with a
# different SQL condition (presumably the blocking rule for that pass).
linker.train_m_using_expectation_maximisation("l.full_name = r.full_name")

linker.train_m_using_expectation_maximisation(
    "l.dob = r.dob and substr(l.postcode,1,2) = substr(r.postcode,1,2)"
)

### Compare two records interactively using ipywidgets

In [None]:


# Fresh linker with no input tables, pointed at the "1m.duckdb" connection.
linker = DuckDBLinker(settings_dict, input_tables={}, connection="1m.duckdb")

import ipywidgets as widgets

# One text box per field, for each side of the comparison.
fields = [
    "unique_id",
    "full_name",
    "birth_place",
    "dob",
    "postcode",
    "gender",
    "occupation",
    "cluster",
]

left_text_boxes = [widgets.Text(description=field_name) for field_name in fields]
right_text_boxes = [widgets.Text(description=field_name) for field_name in fields]

# Map "<field>_l" / "<field>_r" to the matching text box, keeping the
# left-then-right order for every field.
inputs_to_interactive_output = {
    f"{field_name}{suffix}": text_box
    for field_name, left_box, right_box in zip(
        fields, left_text_boxes, right_text_boxes
    )
    for suffix, text_box in (("_l", left_box), ("_r", right_box))
}

# Lay the two sets of boxes out as side-by-side columns.
b1 = widgets.VBox(left_text_boxes)
b2 = widgets.VBox(right_text_boxes)
ui = widgets.HBox([b1, b2])

def myfn(**kwargs):
    """Compare the two records described by the keyword arguments.

    Each keyword is expected to be named ``<field>_l`` or ``<field>_r``. The
    two-character suffix selects the left or right record and is stripped to
    obtain the field name. Empty-string values are replaced with ``None``.
    Keywords with neither suffix are ignored.

    Uses the module-level ``linker``: it switches on retention of matching and
    intermediate calculation columns on ``linker.settings_obj``, calls
    ``linker.compare_two_records`` and passes the resulting records to
    ``waterfall_chart``. Returns ``None``.
    """
    from splink.charts import waterfall_chart

    record_left = {}
    record_right = {}

    for key, value in kwargs.items():
        if value == '':
            value = None
        # A key cannot end with both suffixes, so the branches are exclusive.
        if key.endswith("_l"):
            record_left[key[:-2]] = value
        elif key.endswith("_r"):
            record_right[key[:-2]] = value

    # settings_dict sets both retain_* options to False; they are switched on
    # here before the comparison is run.
    linker.settings_obj._retain_intermediate_calculation_columns = True
    linker.settings_obj._retain_matching_columns = True

    df_two = linker.compare_two_records(record_left, record_right)
    # Convert to pandas once; the previous version converted twice and threw
    # the first result away.
    recs = df_two.as_pandas_dataframe().to_dict(orient="records")
    # To inspect the raw comparison row instead of the chart:
    #     display(df_two.as_pandas_dataframe().T)

    # NOTE(review): the return value of waterfall_chart is discarded. If it
    # returns a chart object rather than rendering it itself, it needs to be
    # wrapped in display(...) to show up in the output widget — confirm.
    waterfall_chart(recs, linker.settings_obj)

# Bind myfn to the text boxes: each box's value is passed to myfn under its
# "<field>_l" / "<field>_r" key, and `out` holds whatever myfn displays.
out = widgets.interactive_output(myfn, inputs_to_interactive_output)

# Show the output area together with the two columns of input boxes.
display(out, ui)


### Incremental linkage against an existing database

In [None]:
# Rebind `linker` to a new DuckDBLinker with no input tables, using the
# "1m.duckdb" connection. Presumably this relies on tables already stored in
# that database by the training section — TODO confirm.
linker = DuckDBLinker(
    settings_dict, input_tables={}, connection="1m.duckdb"
)


In [None]:
from splink.charts import waterfall_chart

import ipywidgets as widgets
# Let pandas show up to 1000 columns when displaying wide frames.
pd.set_option("display.max_columns", 1000)

@widgets.interact(full_name='Rudyard Kipling', birth_place="Mumbai", dob="1865-12-30", postcode="B95 5DG", gender="male", occupation='writer')
def interactive_link(full_name, birth_place, dob, postcode, gender, occupation):
    """Link one hand-entered record against the data held by ``linker``.

    The six arguments are supplied by the ``widgets.interact`` decorator, whose
    keyword values above are the initial widget values. They are combined with
    a fixed ``unique_id`` (123987) and ``cluster`` (1) into a single record.
    String values that are empty or whitespace-only are replaced with ``None``.

    Uses the module-level ``linker``: it switches on retention of matching and
    intermediate calculation columns on ``linker.settings_obj``, calls
    ``linker.incremental_link`` with three exact-match blocking rules and a
    match weight threshold of -1000, sorts the result by ``match_weight``
    descending and passes the records to ``waterfall_chart``. Returns ``None``.
    """
    # settings_dict sets both retain_* options to False; they are switched on
    # here before linking.
    linker.settings_obj._retain_intermediate_calculation_columns = True
    linker.settings_obj._retain_matching_columns = True

    entered_values = {
        'unique_id': 123987,
        'full_name': full_name,
        'birth_place': birth_place,
        'dob': dob,
        'postcode': postcode,
        'gender': gender,
        'occupation': occupation,
        'cluster': 1,
    }

    # Build the record in one pass instead of rewriting the dict while
    # iterating over it; isinstance replaces the exact-type comparison.
    record = {
        key: None if isinstance(value, str) and value.strip() == "" else value
        for key, value in entered_values.items()
    }

    blocking_rules = ['l.full_name = r.full_name', 'l.dob = r.dob', 'l.postcode = r.postcode']

    df_inc = linker.incremental_link(
        [record],
        blocking_rules=blocking_rules,
        match_weight_threshold=-1000,
    ).as_pandas_dataframe()
    df_inc = df_inc.sort_values("match_weight", ascending=False)
    recs = df_inc.to_dict(orient="records")

    # NOTE(review): the return value of waterfall_chart is discarded and this
    # function returns None. If waterfall_chart returns a chart object rather
    # than rendering it itself, it should be returned or wrapped in
    # display(...) — confirm.
    waterfall_chart(recs, linker.settings_obj)

