In [None]:
import pandas as pd 
from splink.spark.spark_linker import SparkLinker
import splink.spark.spark_comparison_library as cl
import splink.spark.spark_comparison_level_library as cll

# Read the fake demo records from the test-datasets CSV into pandas.
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")

# Report the inferred column dtypes and the row count, then preview the
# first five records as the cell output.
print(df.dtypes)
print(f"The number of rows is: {len(df):d}")
df.head(n=5)

In [None]:
# Deliberately invalid date strings: every entry has an impossible month
# (13, 14, 15) and/or an impossible day (42, 52, 55).
tricky_dates = ['2021-13-21', '2000-14-22', '1999-10-42', '2002-11-52', '2019-15-55']

# Cycle the bad dates so there is exactly one per row of `df`. Repeating with a
# ceiling count and then trimming keeps the lengths equal even when the row
# count is not a multiple of len(tricky_dates); plain truncating division
# would leave the list short and break a later column assignment onto `df`.
n_rows = df.shape[0]
n_repeats = -(-n_rows // len(tricky_dates))  # ceiling division
# NOTE: despite its name, `tricky_dates_df` is a plain Python list.
tricky_dates_df = (tricky_dates * n_repeats)[:n_rows]

# Single-column frame holding the bad dates under the `dob` column name.
df_test = pd.DataFrame(tricky_dates_df, columns=['dob'])
print(df_test.dtypes)
df_test.head()

In [None]:
# Independent copy of the demo records whose `dob` column is overwritten with
# the invalid date strings. Assigning the raw array (rather than the Series)
# makes the assignment positional instead of index-aligned.
df_2 = df.copy()
df_2["dob"] = df_test["dob"].to_numpy()

In [None]:
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession, types
from pyspark.sql.functions import col, udf
from splink.spark.jar_location import similarity_jar_location
# sc = SparkContext.getOrCreate()

# spark=(
#     SparkSession.builder.master('local[*]')
#     .appName('test')
#     .config('spark.sql.ansi.enabled','true')
#     .getOrCreate())

# hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()

In [None]:
import os
import tempfile

# Spark configuration for this notebook:
#   * spark.jars                        -> the similarity jar shipped with splink
#   * spark.sql.ansi.enabled            -> True
#   * spark.sql.legacy.timeParserPolicy -> LEGACY
conf = SparkConf()
path = similarity_jar_location()
conf.set("spark.jars", path)
conf.set("spark.sql.ansi.enabled",True)
conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

# NOTE(review): getOrCreate presumably returns an already-running context if
# one exists, in which case `conf` may not take effect — restart the kernel
# before changing these settings.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

# Checkpoint directory: taken from the SPARK_CHECKPOINT_DIR environment
# variable when set, otherwise a folder under the system temp directory.
# This replaces a hard-coded path inside one user's home directory, which made
# the notebook fail (or write to an unexpected place) on any other machine.
checkpoint_dir = os.environ.get(
    "SPARK_CHECKPOINT_DIR",
    os.path.join(tempfile.gettempdir(), "spark_checkpoint_dir"),
)
spark.sparkContext.setCheckpointDir(checkpoint_dir)

# Register the jaro winkler custom udf
spark.udf.registerJavaFunction(
    "jaro_winkler", "uk.gov.moj.dash.linkage.JaroWinklerSimilarity", types.DoubleType()
)

In [None]:
from pyspark.sql.types import StructType

# Explicit Spark schema for the demo records: an integer `unique_id` followed
# by string columns (note that `dob` is kept as a string). Every field is
# nullable and carries empty metadata.
my_schema = StructType.fromJson(
    {
        'type': 'struct',
        'fields': [
            {'metadata': {}, 'name': field_name, 'nullable': True, 'type': field_type}
            for field_name, field_type in [
                ('unique_id', 'integer'),
                ('first_name', 'string'),
                ('surname', 'string'),
                ('dob', 'string'),
                ('city', 'string'),
                ('email', 'string'),
                ('group', 'string'),
            ]
        ],
    }
)

In [None]:
# Convert the pandas frame `df` into a Spark DataFrame using the explicit schema.
# NOTE(review): this uses `df`, not `df_2` (the copy with the invalid dates) —
# confirm which frame is intended here.
spark_df = spark.createDataFrame(df,schema=my_schema)

The pandas DataFrame must be converted to a Spark DataFrame (i.e. registered with Spark) before it can be passed to the linker.

In [None]:
# Create a linker over the Spark DataFrame without any settings.
# The name `linker` is rebound further down once the settings dict is built.
linker = SparkLinker(spark_df)

In [None]:
# Exact-match comparison on the `first_name` column; print the library's
# human-readable description of it.
first_name_comparison = cl.exact_match("first_name")
print(first_name_comparison.human_readable_description)

In [None]:
# Display the comparison's dictionary form as the cell output.
first_name_comparison.as_dict()

In [None]:
# Date-difference comparison on `dob`. The column holds strings, so the
# comparison is asked to cast them to dates using the `yyyy-MM-dd` format.
# Thresholds pair up positionally with metrics: 1 day, 2 year, 3 month.
# NOTE(review): 2 years sits before 3 months in the list — confirm this
# ordering is intentional.
dob_comparison = cl.datediff_at_thresholds(
    "dob",
    date_thresholds=[1, 2, 3],
    date_metrics=["day", "year", "month"],
    cast_strings_to_date=True,
    date_format="yyyy-MM-dd",
)

# Show the resulting comparison definition as the cell output.
dob_comparison.as_dict()

In [None]:
# Blocking rules used when generating predictions. Start with a single
# tight(ish) rule: both first name and surname must match.
blocking_rules_predict = ["l.first_name = r.first_name and l.surname = r.surname"]

In [None]:
# Exact-match comparison on `city` with term-frequency adjustments switched on.
city = cl.exact_match("city", term_frequency_adjustments=True)

In [None]:
# Settings dictionary passed to the linker: a dedupe-only job keyed on
# `unique_id`, using the three comparisons and the prediction blocking rules
# defined in earlier cells.
settings = {
    "link_type": "dedupe_only",
    "unique_id_column_name": "unique_id",
    # Keep the matching columns and intermediate calculation columns in the output.
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
    # presumably the EM iteration cap and convergence tolerance — TODO confirm
    "max_iterations": 10,
    "em_convergence": 0.01,
    "comparisons": [
        city,
        first_name_comparison,
        dob_comparison,
    ],
    "blocking_rules_to_generate_predictions": blocking_rules_predict,
}


In [None]:
# Re-create the linker, this time with the settings dictionary. This rebinds
# the `linker` name that an earlier cell created without settings.
linker = SparkLinker(spark_df, settings)

In [None]:
# SQL rules over record pairs (`l` / `r`): same first name or surname with a
# near-identical dob string (levenshtein <= 1), same first name with a close
# surname (levenshtein <= 2), or identical email.
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email"
]

# Estimate the probability that two random records match from these rules,
# passing recall=0.7.
# NOTE(review): 0.7 looks like an assumed recall for the rules — confirm its source.
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)

In [None]:
# Estimate the u parameters via random sampling, with target_rows set to 5000.
linker.estimate_u_using_random_sampling(target_rows=5000)

In [None]:
# Run expectation-maximisation parameter estimation, blocking on pairs that
# share both first name and surname; keep the returned training session.
training_blocking_rule = "l.first_name = r.first_name and l.surname = r.surname"
training_session_fname_sname = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule)

In [None]:
# Read the `spark.sql.ansi.enabled` value back from the SparkContext's configuration.
ansi_conf = spark.sparkContext.getConf().get("spark.sql.ansi.enabled")

In [None]:
# Display the ANSI setting that was read back from the Spark configuration.
ansi_conf

In [None]:
# Display the linker object's repr as the cell output.
linker

In [None]:
# Same ANSI lookup, but through the Spark session attached to the linker.
linker.spark.sparkContext.getConf().get("spark.sql.ansi.enabled")

In [None]:
# Exploratory: list the attribute names available on the linker's settings object.
# NOTE(review): `_settings_obj` is a private attribute — this may not be stable
# across splink versions.
test = dir(linker._settings_obj)
test

In [None]:
# Display the settings dictionary held by the linker's settings object.
# NOTE(review): relies on private attributes (`_settings_obj`, `_settings_dict`).
linker._settings_obj._settings_dict

In [None]:
# Pull the "comparisons" entry out of the linker's stored settings dictionary.
comparisons_as_list = linker._settings_obj._settings_dict["comparisons"]


A way to check whether the linker settings have picked up the date-string casting: look for `to_timestamp` in the stored comparisons.

In [None]:
# True if any stored comparison mentions `to_timestamp` anywhere in the string
# form of its values. Iterates over the comparisons directly (no index loop)
# and uses a generator so `any` can stop at the first hit.
any(
    'to_timestamp' in str(comparison.values())
    for comparison in comparisons_as_list
)