In [None]:
# Selects the back end this notebook exercises; the Spark session setup
# further down only runs when this is 'spark'.
back_end = 'duckdb'
import duckdb

In [None]:
import pandas as pd
import pytest


# Both linker classes are imported unconditionally because later cells compare
# `Linker` against each of them by name.
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.spark.spark_linker import SparkLinker

# Bind `cll`, `cl` and `Linker` to a single dialect chosen by `back_end`.
# Importing both dialects under the same aliases would leave only the last
# import in effect, regardless of which linker is in use.
if back_end == 'spark':
    import splink.spark.spark_comparison_level_library as cll
    import splink.spark.spark_comparison_library as cl

    Linker = SparkLinker
else:
    import splink.duckdb.duckdb_comparison_level_library as cll
    import splink.duckdb.duckdb_comparison_library as cl

    Linker = DuckDBLinker

In [None]:
if back_end == 'spark':
    # Spark-only setup: build a session with the splink similarity jar attached
    # and register the `jaro_winkler` Java UDF.
    import os
    import tempfile

    from pyspark.context import SparkContext, SparkConf
    from pyspark.sql import SparkSession, types
    from pyspark.sql.functions import col, udf
    from splink.spark.jar_location import similarity_jar_location

    conf = SparkConf()
    path = similarity_jar_location()
    conf.set("spark.jars", path)
    # NOTE(review): presumably enabled so that unparseable dates raise instead
    # of silently becoming NULL — confirm.
    conf.set("spark.sql.ansi.enabled", True)
    # conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

    sc = SparkContext.getOrCreate(conf=conf)
    spark = SparkSession(sc)

    # Checkpoint location: overridable through SPARK_CHECKPOINT_DIR, otherwise a
    # directory under the system temp dir, so the notebook does not depend on a
    # user-specific absolute path.
    checkpoint_dir = os.environ.get(
        "SPARK_CHECKPOINT_DIR",
        os.path.join(tempfile.gettempdir(), "spark_checkpoint_dir"),
    )
    spark.sparkContext.setCheckpointDir(checkpoint_dir)

    # Register the jaro winkler custom udf
    spark.udf.registerJavaFunction(
        "jaro_winkler", "uk.gov.moj.dash.linkage.JaroWinklerSimilarity", types.DoubleType()
    )

In [None]:
# Two-row fixture: ids, first names and day-first date-of-birth strings.
df = pd.DataFrame(
    {
        "unique_id": [1, 2],
        "first_name": ["Tom", "Robin"],
        "dob": ["02-03-1993", "30-01-1992"],
    }
)


In [None]:
# exact_match_fn = cl.exact_match("first_name")

# # For testing the cll version
# dob_diff = {
#     "output_column_name": "dob",
#     "comparison_levels": [
#         cll.null_level("dob"),
#         cll.exact_match_level("dob"),
#         cll.datediff_level(
#             date_col="dob",
#             date_threshold=30,
#             date_metric="day",
#             cast_strings_to_date=True,
#         ),
#         cll.datediff_level(
#             date_col="dob",
#             date_threshold=12,
#             date_metric="month",
#             cast_strings_to_date=True,
#         ),
#         cll.datediff_level(
#             date_col="dob",
#             date_threshold=5,
#             date_metric="year",
#             cast_strings_to_date=True,
#         ),
#         cll.datediff_level(
#             date_col="dob",
#             date_threshold=100,
#             date_metric="year",
#             cast_strings_to_date=True,
#         ),
#         cll.else_level(),
#     ],
# }

In [None]:
# settings = {
#     "link_type": "dedupe_only",
#     "comparisons": [exact_match_fn, dob_diff],
# }

# settings_cl = {
#     "link_type": "dedupe_only",
#     "comparisons": [
#         exact_match_fn,
#         cl.datediff_at_thresholds(
#             "dob", [30, 12, 5, 100], ["day", "month", "year", "year"],
#             cast_strings_to_date=True
#         ),
#     ],
# }


In [None]:
# def simple_dob_linker(df, dobs=[], date_format_param='', Linker=None):
#     settings = {
#     "link_type": "dedupe_only",
#     "comparisons": [
#         exact_match_fn,
#         cl.datediff_at_thresholds(
#             "dob", [30, 12, 5, 100], ["day", "month", "year", "year"],
#             cast_strings_to_date=True, date_format=date_format_param
#         ),
#     ],
#     }
#     if len(dobs) == df.shape[0]:
#         df['dob'] = dobs
#     if back_end == 'spark':
#         df = spark.createDataFrame(df)
#         df.persist()
#     linker = Linker(df, settings)   
#     df_e = linker.predict().as_pandas_dataframe()
#     return df_e


In [None]:
def simple_dob_linker(df, dobs=None, date_format_param='', Linker=Linker):
    """Run a dedupe-only splink prediction comparing ``first_name`` and ``dob``.

    The ``dob`` comparison is assembled from comparison-level-library calls:
    a null level, an exact-match level, date-difference levels at 30 days,
    12 months, 5 years and 100 years, and finally an else level.

    Args:
        df: input records as a pandas DataFrame. It is not modified; any
            ``dob`` override is applied to a new frame.
        dobs: optional replacement values for the ``dob`` column. Applied only
            when its length equals the number of rows in ``df``; any other
            length (including the default) leaves ``dob`` as it is.
        date_format_param: format string forwarded as ``date_format`` to every
            date-difference level.
        Linker: linker class to instantiate. When it is ``SparkLinker`` the
            frame is converted using the module-level ``spark`` session, which
            must already exist.

    Returns:
        The predictions converted with ``as_pandas_dataframe()``.
    """
    if dobs is None:
        dobs = []

    # (threshold, metric) pairs, in the order the levels are declared.
    date_thresholds = [(30, "day"), (12, "month"), (5, "year"), (100, "year")]

    dob_diff = {
        "output_column_name": "dob",
        "comparison_levels": [
            cll.null_level("dob"),
            cll.exact_match_level("dob"),
            *[
                cll.datediff_level(
                    date_col="dob",
                    date_threshold=threshold,
                    date_metric=metric,
                    cast_strings_to_date=True,
                    date_format=date_format_param,
                )
                for threshold, metric in date_thresholds
            ],
            cll.else_level(),
        ],
    }

    settings = {
        "link_type": "dedupe_only",
        "comparisons": [cl.exact_match("first_name"), dob_diff],
    }

    if len(dobs) == df.shape[0]:
        # assign() returns a new frame, so the caller's DataFrame keeps its
        # own dob values from one call to the next.
        df = df.assign(dob=dobs)

    if Linker == SparkLinker:
        df = spark.createDataFrame(df)
        df.persist()

    linker = Linker(df, settings)
    return linker.predict().as_pandas_dataframe()

In [None]:
# Rebuild the two-row input frame (same contents as the earlier fixture cell).
df = pd.DataFrame(
    [
        {"unique_id": uid, "first_name": name, "dob": dob}
        for uid, name, dob in [
            (1, "Tom", "02-03-1993"),
            (2, "Robin", "30-01-1992"),
        ]
    ]
)

In [None]:

# %-directive date formats; the first entry is used by the next cell.
valid_date_formats = [
    '%d/%m/%Y',
    '%d-%m-%Y',
    '%m/%d/%Y',
    '%Y/%m/%d',
    '%Y-%m-%d',
]

In [None]:
# Run once with slash-separated dobs and the first format in the list.
df_test = simple_dob_linker(
    df,
    dobs=['03/04/1994', '19/02/1993'],
    date_format_param=valid_date_formats[0],
    Linker=Linker,
)

In [None]:
# NOTE(review): `settings_new` is not assigned in any cell visible above, so this
# raises NameError on a fresh kernel — confirm whether this cell should be removed.
settings_new['comparisons']

In [None]:
# NOTE(review): '19/14/1993' has no valid month under '%d/%m/%Y' — presumably this
# call is meant to exercise the failure path; confirm and assert the expected error.
simple_dob_linker(
    df,
    dobs=['03/04/1994', '19/14/1993'],
    date_format_param='%d/%m/%Y',
    Linker=Linker,
)

In [None]:
# Stand-alone build of the dob comparison dict so it can be inspected directly.
date_format_param = '%d/%m/%Y'
dob_diff = {
    "output_column_name": "dob",
    "comparison_levels": [
        cll.null_level("dob"),
        cll.exact_match_level("dob"),
        # One date-difference level per (threshold, metric) pair, in order.
        *(
            cll.datediff_level(
                date_col="dob",
                date_threshold=limit,
                date_metric=unit,
                cast_strings_to_date=True,
                date_format=date_format_param,
            )
            for limit, unit in (
                (30, "day"),
                (12, "month"),
                (5, "year"),
                (100, "year"),
            )
        ),
        cll.else_level(),
    ],
}
dob_diff

In [None]:
# with pytest.raises(duckdb.InvalidInputException):
#     simple_linker(df, settings, Linker)
# Each call below pairs dob strings with the date format they are written in
# (day/month/year with slashes, with dashes, then month/day/year).
# Only the last call's result is displayed by the notebook.
simple_dob_linker(df, dobs=['03/04/1994', '19/02/1993'], date_format_param='%d/%m/%Y', Linker=Linker)
simple_dob_linker(df, dobs=['03-04-1994', '19-02-1993'], date_format_param='%d-%m-%Y', Linker=Linker)
simple_dob_linker(df, dobs=['04/05/1994', '10/02/1993'], date_format_param='%m/%d/%Y', Linker=Linker)

In [None]:
# Date format strings are dialect-specific, so pick the list for the active linker.
if Linker == SparkLinker:
    # Spark datetime patterns: 'MM' is month-of-year ('mm' means minute-of-hour)
    # and 'y' is the calendar year ('Y' means week-based year).
    valid_date_formats = ['dd/MM/yyyy', 'dd-MM-yyyy', 'MM/dd/yyyy', 'yyyy/MM/dd']
elif Linker == DuckDBLinker:
    valid_date_formats = ['%d/%m/%Y', '%d-%m-%Y', '%m/%d/%Y', '%Y/%m/%d']

In [None]:
# Spark-only check: 'y-M-d' is a Spark-style pattern and the expected error type
# comes from the Py4J bridge, so the assertion is skipped for other linkers.
if Linker == SparkLinker:
    # Imported here because no visible cell brings `py4j` into scope.
    import py4j.protocol

    # Month 14 in the first dob cannot be parsed with 'y-M-d'.
    with pytest.raises(py4j.protocol.Py4JJavaError):
        simple_dob_linker(df, dobs=['1994-14-15', '1994-12-03'],
                          date_format_param='y-M-d', Linker=Linker)