In [None]:
import pandas as pd 
from splink.duckdb.duckdb_linker import DuckDBLinker
import splink.duckdb.duckdb_comparison_library as cl
import splink.duckdb.duckdb_comparison_level_library as cll

# Load the demo dataset from the repository-relative test fixtures path, then
# report the column types, the row count, and a five-row preview.
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
print(df.dtypes)
print(f"The number of rows is: {len(df):d}")
df.head()

In [None]:
# Deliberately invalid date strings: every entry has a month above 12
# and/or a day above 31.
tricky_dates = ['2021-13-21', '2000-14-22', '1999-10-42', '2002-11-52', '2019-15-55']
# Cycle through the bad dates so there is exactly one value per row of `df`.
# Repeating the list a whole number of times would come up short whenever the
# row count is not a multiple of len(tricky_dates), and the later column
# assignment would then fail on a length mismatch.
# NOTE: despite the `_df` suffix this is a plain list, not a DataFrame.
tricky_dates_df = [tricky_dates[i % len(tricky_dates)] for i in range(df.shape[0])]

In [None]:
# Wrap the repeated bad-date list in a single-column frame named 'dob', then
# show the resulting column type and a preview of the first rows.
df_test = pd.DataFrame({'dob': tricky_dates_df})
print(df_test.dtypes)
df_test.head()

In [None]:
# Build a second, independent frame that matches `df` except that its 'dob'
# column is replaced by the bad date strings from `df_test`.
df_2 = df.assign(dob=df_test['dob'].values)

In [None]:
# Show the column types and first rows of the original frame `df`.
print(df.dtypes)
df.head()

In [None]:
# Show the column types and first rows of `df_2`, the copy whose 'dob' column
# holds the bad date strings.
print(df_2.dtypes)
df_2.head()

In [None]:
# Initialise the linker, passing in the input dataset(s)
# linker = DuckDBLinker([df, df], input_table_aliases=["df", "df_2"])

In [None]:
# Initialise the linker, passing in the input dataset(s)
# Only the dataframe is supplied here (no settings); `linker` is rebound to a
# linker constructed with settings in a later cell.
linker = DuckDBLinker(df_2)

In [None]:
# linker.missingness_chart("df_2")

In [None]:
# linker.missingness_chart("df")

In [None]:
# Exact-match comparison on the first_name column; print its human-readable
# description for inspection.
first_name_comparison = cl.exact_match("first_name")
print(first_name_comparison.human_readable_description)

In [None]:
# Display the dictionary representation of the first_name comparison.
first_name_comparison.as_dict()

In [None]:
# Date-difference comparison on the string-typed 'dob' column. The thresholds
# and metrics are given as parallel lists (presumably 1 day and 5 years --
# confirm against the Splink docs), and string values are cast to dates.
dob_comparison = cl.datediff_at_thresholds(
    "dob",
    date_thresholds=[1, 5],
    date_metrics=["day", "year"],
    cast_strings_to_date=True,
)
dob_comparison.as_dict()

In [None]:
# dob_comparison = {
#     'output_column_name': 'dob_comparison',
#     'comparison_description': 'dob jaro',
#     'comparison_levels': [
#         cll.null_level("dob"),
#         cll.exact_match_level("dob"),
#         cll.datediff_level("dob", date_threshold=3, date_metric='month', cast_strings_to_date=True)

#     ]
# }
# from splink.comparison import Comparison
# print(Comparison(dob_comparison).human_readable_description)

In [None]:
# Blocking rules used when generating predictions. A single tight(ish) rule to
# start with: both first name and surname must agree.
blocking_rules_predict = ["l.first_name = r.first_name and l.surname = r.surname"]

In [None]:
# Exact-match comparison on the city column with term_frequency_adjustments
# switched on.
city = cl.exact_match("city", term_frequency_adjustments=True)

In [None]:
# Settings dictionary for a dedupe-only linker: it names the unique id column,
# keeps matching and intermediate calculation columns in the output, sets the
# iteration limit and EM convergence value, and wires in the comparisons and
# the prediction blocking rules defined in earlier cells.
settings = dict(
    link_type="dedupe_only",
    unique_id_column_name="unique_id",
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
    max_iterations=10,
    em_convergence=0.01,
    comparisons=[city, first_name_comparison, dob_comparison],
    blocking_rules_to_generate_predictions=blocking_rules_predict,
)


In [None]:
# settings['link_type'] = 'link_and_dedupe'

In [None]:
# Re-create the linker on df_2, this time with the settings dictionary; this
# rebinds the `linker` name used by the cells below.
linker = DuckDBLinker(df_2, settings)
# Disabled alternative: pass both frames (original and bad-date copy) instead.
# linker = DuckDBLinker([df, df_2], settings)

In [None]:
# SQL conditions over the left (l) and right (r) record of a pair; the list is
# passed to estimate_probability_two_random_records_match below.
deterministic_rules = [
    # same first name, dob strings within Levenshtein distance 1
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    # same surname, dob strings within Levenshtein distance 1
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    # same first name, surnames within Levenshtein distance 2
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    # identical email
    "l.email = r.email"
]

# NOTE(review): recall=0.7 is a hand-picked value -- confirm it suits this dataset.
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)

In [None]:
# Estimate the u parameters by random sampling, with target_rows set to 5000.
# NOTE(review): no seed is passed here -- results may vary between runs; confirm.
linker.estimate_u_using_random_sampling(target_rows=5000)

In [None]:
# Run expectation maximisation on pairs that agree on both first name and
# surname, keeping the object returned by the call for later inspection.
training_blocking_rule = "l.first_name = r.first_name and l.surname = r.surname"
training_session_fname_sname = (
    linker.estimate_parameters_using_expectation_maximisation(
        training_blocking_rule
    )
)

In [None]:
# from case_statements import (
#     sql_gen_case_stmt_name_inversion_4,
#     sql_gen_case_stmt_levenshtein_abs_3,
#     sql_gen_case_stmt_jaro_3,
#     sql_gen_case_stmt_numeric_abs_3,
#     sql_gen_case_stmt_array_intersect_2,
#     sql_gen_case_smnt_strict_equality_2,
#     _size_intersect, 
#     _jaro_winkler_array,
# )


# # Edited from common_case_statements.py
# def dob_case_statement_leven(dob_colname, leven_distance=1):
#     # Too many dates of birth often fall on the first of January,
#     # typically because an unavailable exact dob is rounded to the nearest year
#     return f"""
#     case
#     when {dob_colname}_l is null or {dob_colname}_r is null then -1
#     when {dob_colname}_l = {dob_colname}_r  and substr({dob_colname}_l, -5) = '01-01'  then 4
#     when {dob_colname}_l = {dob_colname}_r  then 5
#     when levenshtein({dob_colname}_l, {dob_colname}_r) <= {leven_distance} then 3
#     when abs(datediff({dob_colname}_l, {dob_colname}_r)) <= 365 then 2
#     when abs(datediff({dob_colname}_l, {dob_colname}_r)) <= 10*365 then 1
#     else 0 end
#     """