In [1]:
# Backend selector: later cells branch on the values 'duckdb' and 'spark'.
back_end = 'spark'
import duckdb

In [2]:
import pandas as pd
import pytest

# Bind the backend-specific comparison libraries (cl / cll) and the Linker
# class used by the cells below, according to `back_end`.
if back_end == 'duckdb':
    import splink.duckdb.duckdb_comparison_level_library as cll
    import splink.duckdb.duckdb_comparison_library as cl
    from splink.duckdb.duckdb_linker import DuckDBLinker
    Linker = DuckDBLinker
elif back_end == 'spark':
    import splink.spark.spark_comparison_level_library as cll
    import splink.spark.spark_comparison_library as cl
    from splink.spark.spark_linker import SparkLinker
    Linker = SparkLinker
else:
    # Fail here instead of with a NameError on cl / cll / Linker further down.
    raise ValueError(
        f"Unsupported back_end {back_end!r}; expected 'duckdb' or 'spark'"
    )

In [3]:
# Create the Spark session used by the Spark backend (skipped for other backends).
if back_end == 'spark':
    import tempfile
    from pathlib import Path

    from pyspark.context import SparkContext, SparkConf
    from pyspark.sql import SparkSession, types
    from pyspark.sql.functions import col, udf
    from splink.spark.jar_location import similarity_jar_location
    conf = SparkConf()
    # Put the jar returned by similarity_jar_location() on the classpath; the
    # Java UDF class registered at the end of this cell is expected to live there.
    path = similarity_jar_location()
    conf.set("spark.jars", path)
    # NOTE(review): ANSI mode is presumably enabled so that unparseable dates
    # raise instead of silently becoming NULL - confirm.
    conf.set("spark.sql.ansi.enabled",True)
    # conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

    sc = SparkContext.getOrCreate(conf=conf)
    spark = SparkSession(sc)
    # Keep checkpoints under the system temp directory rather than a path inside
    # one user's home directory, so the notebook runs on any machine.
    checkpoint_dir = Path(tempfile.gettempdir()) / "spark_checkpoint_dir"
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    spark.sparkContext.setCheckpointDir(str(checkpoint_dir))

    # Register the jaro winkler custom udf
    spark.udf.registerJavaFunction(
        "jaro_winkler", "uk.gov.moj.dash.linkage.JaroWinklerSimilarity", types.DoubleType()
    )

23/03/21 14:42:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/03/21 14:42:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Two-record input frame; `dob` is held as a day-month-year string, not a date.
df = pd.DataFrame(
    {
        "unique_id": [1, 2],
        "first_name": ["Tom", "Robin"],
        "dob": ["02-03-1993", "30-01-1992"],
    }
)


In [5]:
# df = pd.DataFrame(
#         [
#             {
#                 "unique_id": 1,
#                 "first_name": "Tom",
#                 "dob": "2000-13-01",
#             },
#             {
#                 "unique_id": 2,
#                 "first_name": "Robin",
#                 "dob": "2000-01-24",
#             },
#             {
#                 "unique_id": 3,
#                 "first_name": "Zoe",
#                 "dob": "1995-14-30",
#             },
#             {
#                 "unique_id": 4,
#                 "first_name": "Sam",
#                 "dob": "1966-07-05",
#             },
#             {
#                 "unique_id": 5,
#                 "first_name": "Andy",
#                 "dob": "1996-32-15",
#             },
#             {
#                 "unique_id": 6,
#                 "first_name": "Alice",
#                 "dob": "2000-03-25",
#             },
#             {
#                 "unique_id": 7,
#                 "first_name": "Afua",
#                 "dob": "1960-01-01",
#             },
#         ]
#     )


In [6]:
# Display the input frame (bare last expression of the cell).
df

Unnamed: 0,unique_id,first_name,dob
0,1,Tom,02-03-1993
1,2,Robin,30-01-1992


In [7]:
# Exact-match comparison on first_name, shared by the settings dicts below.
exact_match_fn = cl.exact_match("first_name")

# For testing the cll version: the dob comparison assembled level by level.
# Order: null, exact match, one datediff level per (threshold, metric) pair
# from tightest to loosest, then the catch-all else level.
dob_diff = {
    "output_column_name": "dob",
    "comparison_levels": [
        cll.null_level("dob"),
        cll.exact_match_level("dob"),
        *(
            cll.datediff_level(
                date_col="dob",
                date_threshold=threshold,
                date_metric=metric,
                cast_strings_to_date=True,
            )
            for threshold, metric in (
                (30, "day"),
                (12, "month"),
                (5, "year"),
                (100, "year"),
            )
        ),
        cll.else_level(),
    ],
}

In [8]:
# cll variant: dob comparison comes from the hand-built dob_diff levels.
settings = dict(
    link_type="dedupe_only",
    comparisons=[exact_match_fn, dob_diff],
)

# cl variant: same thresholds and metrics, built by cl.datediff_at_thresholds.
settings_cl = dict(
    link_type="dedupe_only",
    comparisons=[
        exact_match_fn,
        cl.datediff_at_thresholds(
            "dob",
            [30, 12, 5, 100],
            ["day", "month", "year", "year"],
            cast_strings_to_date=True,
        ),
    ],
)

In [9]:
# df["dob"] = pd.to_datetime(df["dob"])

In [10]:
# if back_end == 'spark':
#     df_new = spark.createDataFrame(df)
#     df_new.persist()

In [11]:
# linker = Linker(df, settings)
# df_e = linker.predict().as_pandas_dataframe()
# linker = Linker(df, settings_cl)
# cl_df_e = linker.predict().as_pandas_dataframe()

In [12]:
# settings_cl = {
#     "link_type": "dedupe_only",
#     "comparisons": [
#         exact_match_fn,
#         cl.datediff_at_thresholds(
#             "dob", [30, 12, 5, 100], ["day", "month", "year", "year"],
#             cast_strings_to_date=True, date_format=date_format_param
#         ),
#     ],
# }

In [13]:
def simple_dob_linker(df, dobs=None, date_format_param='', Linker=None):
    """Predict pairwise matches for `df` using a first_name + dob comparison.

    Builds dedupe-only settings whose dob comparison is
    ``cl.datediff_at_thresholds`` (30 days, 12 months, 5 years, 100 years) with
    ``cast_strings_to_date=True`` and ``date_format=date_format_param``, runs
    ``Linker(...).predict()`` and returns the result as a pandas DataFrame.

    Relies on the notebook globals ``exact_match_fn``, ``cl``, ``back_end`` and,
    for the Spark backend, ``spark``.

    Args:
        df: pandas DataFrame of input records. It is not modified; a dob
            override is applied to a copy.
        dobs: optional sequence of replacement dob values. It is applied only
            when its length equals the number of rows in ``df``; otherwise
            (including the default ``None``) the existing dob column is kept.
        date_format_param: forwarded unchanged as ``date_format``. The accepted
            syntax depends on the backend in use.
        Linker: linker class, instantiated as ``Linker(data, settings)``.

    Returns:
        The output of ``linker.predict().as_pandas_dataframe()``.

    Raises:
        TypeError: if ``Linker`` is not supplied.
    """
    if Linker is None:
        # Same exception type as calling None would raise, but raised up front
        # with a message that says what is missing.
        raise TypeError("simple_dob_linker requires a Linker class (got None)")

    linker_settings = {
        "link_type": "dedupe_only",
        "comparisons": [
            exact_match_fn,
            cl.datediff_at_thresholds(
                "dob", [30, 12, 5, 100], ["day", "month", "year", "year"],
                cast_strings_to_date=True, date_format=date_format_param
            ),
        ],
    }

    records = df
    if dobs is not None and len(dobs) == df.shape[0]:
        # assign() returns a new frame, so the caller's DataFrame keeps its
        # original dob values between calls.
        records = df.assign(dob=dobs)

    if back_end == 'spark':
        # The Spark backend is given a Spark DataFrame rather than a pandas one.
        records = spark.createDataFrame(records)
        records.persist()

    linker = Linker(records, linker_settings)
    return linker.predict().as_pandas_dataframe()

In [14]:
# NOTE(review): with back_end = 'spark' this call fails - the recorded traceback
# below reports that Spark cannot recognise the 'd/m/Y' pattern (week-based 'Y'
# is unsupported since Spark 3.0).
simple_dob_linker(df, dobs=['03/04/1994', '19/02/1993'], date_format_param='d/m/Y', Linker=Linker)

  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
23/03/21 14:42:47 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/03/21 14:42:50 WARN DataSource: All paths were ignored:                      
  file:/Users/alice.oleary/Documents/spark_checkpoint_dir/ffb6d686-aa41-478e-a33a-b015897210c2/__splink__df_concat_with_tf_35659bfda
23/03/21 14:42:51 ERROR FileFormatWriter: Abor

Py4JJavaError: An error occurred while calling o133.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:231)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:188)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:874)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'd/m/Y' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:196)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:185)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:109)
	at org.apache.spark.sql.catalyst.util.TimestampFormatter$.getFormatter(TimestampFormatter.scala:300)
	at org.apache.spark.sql.catalyst.util.TimestampFormatter$.apply(TimestampFormatter.scala:333)
	at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter(datetimeExpressions.scala:72)
	at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter$(datetimeExpressions.scala:67)
	at org.apache.spark.sql.catalyst.expressions.ToTimestamp.getFormatter(datetimeExpressions.scala:918)
	at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.$anonfun$formatterOption$1(datetimeExpressions.scala:64)
	at scala.Option.map(Option.scala:230)
	at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption(datetimeExpressions.scala:64)
	at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption$(datetimeExpressions.scala:62)
	at org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption$lzycompute(datetimeExpressions.scala:918)
	at org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption(datetimeExpressions.scala:918)
	at org.apache.spark.sql.catalyst.expressions.ToTimestamp.doGenCode(datetimeExpressions.scala:978)
	at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:146)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:141)
	at org.apache.spark.sql.catalyst.expressions.CastBase.doGenCode(Cast.scala:853)
	at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:146)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:141)
	at org.apache.spark.sql.catalyst.expressions.CastBase.genCode(Cast.scala:848)
	at org.apache.spark.sql.catalyst.expressions.BinaryExpression.nullSafeCodeGen(Expression.scala:608)
	at org.apache.spark.sql.catalyst.expressions.BinaryExpression.defineCodeGen(Expression.scala:591)
	at org.apache.spark.sql.catalyst.expressions.DateDiff.doGenCode(datetimeExpressions.scala:1869)
	at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:146)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:141)
	at org.apache.spark.sql.catalyst.expressions.UnaryExpression.nullSafeCodeGen(Expression.scala:519)
	at org.apache.spark.sql.catalyst.expressions.UnaryExpression.defineCodeGen(Expression.scala:503)
	at org.apache.spark.sql.catalyst.expressions.Abs.doGenCode(arithmetic.scala:150)
	at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:146)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:141)
	at org.apache.spark.sql.catalyst.expressions.BinaryExpression.nullSafeCodeGen(Expression.scala:608)
	at org.apache.spark.sql.catalyst.expressions.BinaryExpression.defineCodeGen(Expression.scala:591)
	at org.apache.spark.sql.catalyst.expressions.BinaryComparison.doGenCode(predicates.scala:768)
	at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:146)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:141)
	at org.apache.spark.sql.catalyst.expressions.CaseWhen.$anonfun$multiBranchesCodegen$1(conditionalExpressions.scala:211)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at scala.collection.TraversableLike.map(TraversableLike.scala:238)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.catalyst.expressions.CaseWhen.multiBranchesCodegen(conditionalExpressions.scala:210)
	at org.apache.spark.sql.catalyst.expressions.CaseWhen.doGenCode(conditionalExpressions.scala:293)
	at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:146)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:141)
	at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext.$anonfun$subexpressionEliminationForWholeStageCodegen$3(CodeGenerator.scala:1051)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at scala.collection.TraversableLike.map(TraversableLike.scala:238)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
	at scala.collection.immutable.List.map(List.scala:298)
	at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext.commonExprVals$lzycompute$1(CodeGenerator.scala:1051)
	at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext.commonExprVals$1(CodeGenerator.scala:1051)
	at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext.subexpressionEliminationForWholeStageCodegen(CodeGenerator.scala:1072)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:71)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:194)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:149)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce(WholeStageCodegenExec.scala:483)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce$(WholeStageCodegenExec.scala:456)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:54)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:655)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:718)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:177)
	... 33 more
Caused by: java.lang.IllegalArgumentException: All week-based patterns are unsupported since Spark 3.0, detected: Y, Please use the SQL function EXTRACT instead
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.$anonfun$convertIncompatiblePattern$4(DateTimeFormatterHelper.scala:323)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.$anonfun$convertIncompatiblePattern$4$adapted(DateTimeFormatterHelper.scala:321)
	at scala.collection.TraversableLike$WithFilter.$anonfun$foreach$1(TraversableLike.scala:877)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.immutable.StringOps.foreach(StringOps.scala:33)
	at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:876)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.$anonfun$convertIncompatiblePattern$2(DateTimeFormatterHelper.scala:321)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.map(TraversableLike.scala:238)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.convertIncompatiblePattern(DateTimeFormatterHelper.scala:318)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter(DateTimeFormatterHelper.scala:121)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter$(DateTimeFormatterHelper.scala:117)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.getOrCreateFormatter(TimestampFormatter.scala:59)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter$lzycompute(TimestampFormatter.scala:68)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter(TimestampFormatter.scala:67)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:108)
	... 123 more


In [None]:
# with pytest.raises(duckdb.InvalidInputException):
#     simple_linker(df, settings, Linker)
# Three dob/format pairs using %-prefixed (strptime-style) formats: day/month/year
# with '/', day-month-year with '-', and month/day/year with '/'.
# Only the last call's result is displayed by the notebook.
simple_dob_linker(df, dobs=['03/04/1994', '19/02/1993'], date_format_param='%d/%m/%Y', Linker=Linker)
simple_dob_linker(df, dobs=['03-04-1994', '19-02-1993'], date_format_param='%d-%m-%Y', Linker=Linker)
simple_dob_linker(df, dobs=['04/05/1994', '10/02/1993'], date_format_param='%m/%d/%Y', Linker=Linker)

In [None]:
# Date formats expected to parse on the active backend. Branch on `back_end`
# rather than on the linker classes: only one of SparkLinker / DuckDBLinker is
# ever imported, so naming both raises NameError on the DuckDB backend.
if back_end == 'spark':
    # Spark datetime patterns: d = day, M = month, y = year. Lower-case 'm' is
    # minute-of-hour and upper-case 'Y' is week-based year, which Spark 3 rejects.
    valid_date_formats = ['dd/MM/yyyy', 'dd-MM-yyyy', 'MM/dd/yyyy', 'yyyy/MM/dd']
elif back_end == 'duckdb':
    valid_date_formats = ['%d/%m/%Y', '%d-%m-%Y', '%m/%d/%Y', '%Y/%m/%d']
else:
    raise ValueError(
        f"Unsupported back_end {back_end!r}; expected 'duckdb' or 'spark'"
    )


In [24]:
# Spark backend only. Import the exception in this cell so the check does not
# depend on a later `import py4j` cell having been executed first.
from py4j.protocol import Py4JJavaError

# The first dob has 14 in the month position of 'y-M-d', so the Spark job is
# expected to fail and surface as a Py4JJavaError.
with pytest.raises(Py4JJavaError):
    simple_dob_linker(df, dobs=['1994-14-15', '1994-12-03'],
                      date_format_param='y-M-d', Linker=Linker)

  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
23/03/21 14:52:02 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/03/21 14:52:02 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/03/21 14:52:02 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/03/21 14:52:02 WARN SimpleF

In [23]:
import py4j

In [22]:
# Unguarded version of the invalid-month case: the recorded traceback below
# shows a DateTimeParseException for '1994/14/15' (month 14 is out of range).
simple_dob_linker(df, dobs=['1994/14/15', '1994/12/03'], 
                    date_format_param='y/M/d', Linker=Linker)

  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
23/03/21 14:48:47 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/03/21 14:48:47 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/03/21 14:48:47 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/03/21 14:48:47 WARN SimpleF

Py4JJavaError: An error occurred while calling o1008.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:231)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:188)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:874)
	at sun.reflect.GeneratedMethodAccessor198.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 43.0 failed 1 times, most recent failure: Lost task 1.0 in stage 43.0 (TID 421) (192.168.0.3 executor driver): java.time.format.DateTimeParseException: Text '1994/14/15' could not be parsed: Invalid value for MonthOfYear (valid values 1 - 12): 14
	at java.time.format.DateTimeFormatter.createError(DateTimeFormatter.java:1920)
	at java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1781)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:78)
	at scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:77)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.project_subExpr_3$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:265)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.time.DateTimeException: Invalid value for MonthOfYear (valid values 1 - 12): 14
	at java.time.temporal.ValueRange.checkValidIntValue(ValueRange.java:330)
	at java.time.temporal.ChronoField.checkValidIntValue(ChronoField.java:722)
	at java.time.chrono.IsoChronology.resolveYMD(IsoChronology.java:550)
	at java.time.chrono.IsoChronology.resolveYMD(IsoChronology.java:123)
	at java.time.chrono.AbstractChronology.resolveDate(AbstractChronology.java:472)
	at java.time.chrono.IsoChronology.resolveDate(IsoChronology.java:492)
	at java.time.chrono.IsoChronology.resolveDate(IsoChronology.java:123)
	at java.time.format.Parsed.resolveDateFields(Parsed.java:352)
	at java.time.format.Parsed.resolveFields(Parsed.java:257)
	at java.time.format.Parsed.resolve(Parsed.java:244)
	at java.time.format.DateTimeParseContext.toResolved(DateTimeParseContext.java:331)
	at java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:1955)
	at java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1777)
	... 18 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:200)
	... 32 more
Caused by: java.time.format.DateTimeParseException: Text '1994/14/15' could not be parsed: Invalid value for MonthOfYear (valid values 1 - 12): 14
	at java.time.format.DateTimeFormatter.createError(DateTimeFormatter.java:1920)
	at java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1781)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:78)
	at scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:77)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.project_subExpr_3$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:265)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.time.DateTimeException: Invalid value for MonthOfYear (valid values 1 - 12): 14
	at java.time.temporal.ValueRange.checkValidIntValue(ValueRange.java:330)
	at java.time.temporal.ChronoField.checkValidIntValue(ChronoField.java:722)
	at java.time.chrono.IsoChronology.resolveYMD(IsoChronology.java:550)
	at java.time.chrono.IsoChronology.resolveYMD(IsoChronology.java:123)
	at java.time.chrono.AbstractChronology.resolveDate(AbstractChronology.java:472)
	at java.time.chrono.IsoChronology.resolveDate(IsoChronology.java:492)
	at java.time.chrono.IsoChronology.resolveDate(IsoChronology.java:123)
	at java.time.format.Parsed.resolveDateFields(Parsed.java:352)
	at java.time.format.Parsed.resolveFields(Parsed.java:257)
	at java.time.format.Parsed.resolve(Parsed.java:244)
	at java.time.format.DateTimeParseContext.toResolved(DateTimeParseContext.java:331)
	at java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:1955)
	at java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1777)
	... 18 more


In [None]:
# with pytest.raises(duckdb.InvalidInputException):
#     simple_linker(df, settings, Linker)
# Same dobs and 'd/m/Y' format as the first simple_dob_linker call in this
# notebook, whose recorded Spark traceback rejects that pattern.
simple_dob_linker(df, dobs=['03/04/1994', '19/02/1993'], date_format_param='d/m/Y', Linker=Linker)

In [None]:


# Repeat of the three %-prefixed (strptime-style) format calls from an earlier cell.
simple_dob_linker(df, dobs=['03/04/1994', '19/02/1993'], date_format_param='%d/%m/%Y', Linker=Linker)
simple_dob_linker(df, dobs=['03-04-1994', '19-02-1993'], date_format_param='%d-%m-%Y', Linker=Linker)
simple_dob_linker(df, dobs=['04/05/1994', '10/02/1993'], date_format_param='%m/%d/%Y', Linker=Linker)


In [None]:
# Rebinds valid_date_formats to three %-prefixed formats regardless of backend,
# replacing whatever an earlier cell assigned.
valid_date_formats = ['%d/%m/%Y', '%d-%m-%Y', '%m/%d/%Y']
#valid_date_formats = ['d/m/%Y', 'd-m-%Y', 'm/d/%Y']

In [None]:
# Slash-separated dobs checked against the first entry of valid_date_formats.
simple_dob_linker(
    df,
    dobs=['03/04/1994', '19/02/1993'],
    date_format_param=valid_date_formats[0],
    Linker=Linker,
)

In [None]:

# Same dob/separator pairings as the '%'-prefixed cell, but with the format
# strings written without any '%' prefixes.

# '/' separator, d/m/Y
simple_dob_linker(
    df,
    dobs=['03/04/1994', '19/02/1993'],
    date_format_param='d/m/Y',
    Linker=Linker,
)

# '-' separator, d-m-Y
simple_dob_linker(
    df,
    dobs=['03-04-1994', '19-02-1993'],
    date_format_param='d-m-Y',
    Linker=Linker,
)

# '/' separator, m/d/Y
simple_dob_linker(
    df,
    dobs=['04/05/1994', '10/02/1993'],
    date_format_param='m/d/Y',
    Linker=Linker,
)


In [None]:
# '/'-separated dobs with an un-prefixed m/d/Y format string.
simple_dob_linker(
    df,
    dobs=['04/05/1994', '10/02/1993'],
    date_format_param='m/d/Y',
    Linker=Linker,
)

# Year-first dobs separated by '/' against a '-'-separated format string;
# the second dob also carries 14 in its middle component.
simple_dob_linker(
    df,
    dobs=['1994/05/04', '1993/14/02'],
    date_format_param='Y-m-d',
    Linker=Linker,
)


In [None]:
# Year-first dobs with out-of-range components (55, 54, 14), separated by '/',
# checked against a '-'-separated '%Y-%m-%d' format string.
simple_dob_linker(
    df,
    dobs=['1994/55/54', '1993/14/02'],
    date_format_param='%Y-%m-%d',
    Linker=Linker,
)


In [None]:
# Incompatible date formats: the dobs are '-' separated but the format string expects '/'.
# The exception type raised for unparseable dates depends on the backend, so select it
# from `back_end` instead of hard-coding the DuckDB exception, which does not match
# the error surfaced when the Spark backend is active.
if back_end == 'spark':
    from py4j.protocol import Py4JJavaError as incompatible_format_error
else:
    incompatible_format_error = duckdb.InvalidInputException

with pytest.raises(incompatible_format_error):
    simple_dob_linker(
        df,
        dobs=['03-04-1994', '19-02-1993'],
        date_format_param='%d/%m/%Y',
        Linker=Linker,
    )


In [None]:
# Replacement dob values, '-' separated.
# NOTE(review): '31-02-1993' is not a real calendar date (February has no 31st);
# presumably deliberate, to exercise invalid-date handling — confirm.
dobs = ['03-04-1994', '31-02-1993']
# Overwrites the 'dob' column of df in place, so every cell run afterwards sees
# these values; the list length has to match the number of rows in df.
df['dob'] = dobs

In [None]:
# Display the settings object that the next cell passes to simple_linker.
# NOTE(review): settings_cl is not defined in the nearby cells — confirm it is
# created earlier in the notebook and is not leftover kernel state.
settings_cl

In [None]:
# Run the linker helper on df using the settings_cl settings and the selected Linker class.
simple_linker(df, settings_cl, Linker)
# NOTE(review): df_e is not assigned in this cell; presumably simple_linker sets it
# or it is left over from an earlier cell — confirm before relying on this preview.
df_e.head()

In [None]:
# Expected number of record pairs at each gamma_dob level: {gamma value: pair count}
size_gamma_lookup = {1: 11, 2: 6, 3: 3, 4: 1}

# Prediction frames to validate, keyed by the label printed in the progress output
linker_outputs = {"cll": df_e, "cl": cl_df_e}

# Every frame must contain the expected number of pairs at each gamma level
for gamma, gamma_lookup in size_gamma_lookup.items():
    for linker_pred in linker_outputs.values():
        at_level = linker_pred["gamma_dob"] == gamma
        assert sum(at_level) == gamma_lookup

# Specific ID pairs must land on specific gamma levels:
# {gamma value: [(left unique_id, right unique_id), ...]}
size_gamma_lookup = {
    4: [(1, 2)],
    3: [(3, 5), (1, 6), (2, 6)],
    2: [(1, 3), (2, 3), (1, 5), (2, 5), (3, 6), (5, 6)],
}

for gamma, id_pairs in size_gamma_lookup.items():
    for left, right in id_pairs:
        for linker_name, linker_pred in linker_outputs.items():
            print(f"Checking IDs: {left}, {right} for {linker_name}")

            # Select the row(s) for this exact (left, right) pair
            is_pair = (linker_pred.unique_id_l == left) & (
                linker_pred.unique_id_r == right
            )
            matched_rows = linker_pred.loc[is_pair]
            assert matched_rows["gamma_dob"].values[0] == gamma


In [None]:
# Unguarded run of the month-14 case: '1993/14/02' against '%Y/%m/%d'.
simple_dob_linker(
    df,
    dobs=['1994/05/04', '1993/14/02'],
    date_format_param='%Y/%m/%d',
    Linker=Linker,
)

In [None]:
# Capture whatever exception the month-14 dob triggers, then show its type.
with pytest.raises(Exception) as e:
    simple_dob_linker(
        df,
        dobs=['1994/05/04', '1993/14/02'],
        date_format_param='%Y/%m/%d',
        Linker=Linker,
    )

e.type

In [None]:
# Capture the exception raised when the %m position holds 14 and 22, then show its type.
with pytest.raises(Exception) as e:
    simple_dob_linker(
        df,
        dobs=['03-14-1994', '19-22-1993'],
        date_format_param='%d-%m-%Y',
        Linker=Linker,
    )

e.type

In [None]:
# Capture the exception raised when '-'-separated dobs meet a '/'-separated format,
# then show its type.
with pytest.raises(Exception) as e:
    simple_dob_linker(
        df,
        dobs=['20-04-1993', '19-02-1993'],
        date_format_param='%d/%m/%Y',
        Linker=Linker,
    )

e.type

In [None]:
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.spark.spark_linker import SparkLinker

In [None]:
# Import the submodule explicitly: a bare `import py4j` only exposes `py4j.protocol`
# when some other module has already imported it.
import py4j.protocol

# Exception type the tests expect from each backend when a dob cannot be parsed.
if Linker is SparkLinker:
    expected_bad_dates_error = py4j.protocol.Py4JJavaError
elif Linker is DuckDBLinker:
    expected_bad_dates_error = duckdb.InvalidInputException
else:
    # Fail here rather than leaving the name unbound (or stale from an earlier run),
    # which would only surface later as a confusing error in the pytest.raises cells.
    raise ValueError(f"No expected bad-date error configured for Linker {Linker!r}")

In [None]:
# (dobs, format) combinations that must raise the backend-specific bad-date error
bad_date_cases = [
    # 14 in the %m position
    (['1994/05/04', '1993/14/02'], '%Y/%m/%d'),
    # 14 and 22 in the %m position
    (['03-14-1994', '19-22-1993'], '%d-%m-%Y'),
    # '-' separated dobs against a '/' separated format
    (['20-04-1993', '19-02-1993'], '%d/%m/%Y'),
]

for bad_dobs, bad_format in bad_date_cases:
    with pytest.raises(expected_bad_dates_error):
        simple_dob_linker(
            df,
            dobs=bad_dobs,
            date_format_param=bad_format,
            Linker=Linker,
        )


In [None]:
    # Differing lengths between thresholds and units: one threshold, four units.
    # NOTE(review): the disabled sibling cases below are each wrapped in
    # pytest.raises(ValueError); presumably this call is expected to raise as well
    # and is left bare to show the error — confirm.

cl.datediff_at_thresholds("dob", [1], ["day", "month", "year", "year"])
    # Remaining input-validation cases, currently disabled:
    # # Negative threshold
    # with pytest.raises(ValueError):
    #     cl.datediff_at_thresholds("dob", [-1], ["day"])
    # # Invalid metric
    # with pytest.raises(ValueError):
    #     cl.datediff_at_thresholds("dob", [1], ["dy"])
    # # Threshold len == 0
    # with pytest.raises(ValueError):
    #     cl.datediff_at_thresholds("dob", [], ["dy"])
    # # Metric len == 0
    # with pytest.raises(ValueError):
    #     cl.datediff_at_thresholds("dob", [1], [])

In [None]:
# One threshold supplied against four units (mismatched lengths).
cl.datediff_at_thresholds(
    "dob",
    [1],
    ["day", "month", "year", "year"],
)