In [1]:
import pandas as pd 
from splink.spark.spark_linker import SparkLinker
import splink.spark.spark_comparison_library as cl
import splink.spark.spark_comparison_level_library as cll

df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
print(df.dtypes)
print(f"The number of rows is: {df.shape[0]:d}")
df.head(5)

unique_id      int64
first_name    object
surname       object
dob           object
city          object
email         object
group          int64
dtype: object
The number of rows is: 1000


Unnamed: 0,unique_id,first_name,surname,dob,city,email,group
0,0,Julia,,2015-10-29,London,hannah88@powers.com,0
1,1,Julia,Taylor,2015-07-31,London,hannah88@powers.com,0
2,2,Julia,Taylor,2016-01-27,London,hannah88@powers.com,0
3,3,Julia,Taylor,2015-10-29,,hannah88opowersc@m,0
4,4,oNah,Watson,2008-03-23,Bolton,matthew78@ballard-mcdonald.net,1


In [2]:
tricky_dates = ['2021-13-21', '2000-14-22', '1999-10-42', '2002-11-52', '2019-15-55']
tricky_dates_df = tricky_dates * int(df.shape[0]/len(tricky_dates))
df_test = pd.DataFrame(tricky_dates_df, columns=['dob'])
print(df_test.dtypes)
df_test.head()

dob    object
dtype: object


Unnamed: 0,dob
0,2021-13-21
1,2000-14-22
2,1999-10-42
3,2002-11-52
4,2019-15-55


In [3]:
# make a second dataframe and add the bad date strings
df_2 = df.copy(deep=True)
df_2['dob'] = df_test['dob'].values

In [4]:
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession, types
from pyspark.sql.functions import col, udf
from splink.spark.jar_location import similarity_jar_location
# sc = SparkContext.getOrCreate()

# spark=(
#     SparkSession.builder.master('local[*]')
#     .appName('test')
#     .config('spark.sql.ansi.enabled','true')
#     .getOrCreate())

# hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()

In [5]:
conf = SparkConf()
path = similarity_jar_location()
conf.set("spark.jars", path)
conf.set("spark.sql.ansi.enabled",False)
conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)
spark.sparkContext.setCheckpointDir('/Users/alice.oleary/Documents/spark_checkpoint_dir')

# Register the jaro winkler custom udf
spark.udf.registerJavaFunction(
    "jaro_winkler", "uk.gov.moj.dash.linkage.JaroWinklerSimilarity", types.DoubleType()
)

23/03/14 11:13:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/03/14 11:13:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
from pyspark.sql.types import StructType

my_schema = StructType.fromJson({'fields': [{'metadata': {},'name': 'unique_id','nullable': True,'type': 'integer'},
  {'metadata': {}, 'name': 'first_name', 'nullable': True, 'type': 'string'},
  {'metadata': {}, 'name': 'surname', 'nullable': True, 'type': 'string'},
  {'metadata': {}, 'name': 'dob','nullable': True,'type': 'string'},
  {'metadata': {}, 'name': 'city','nullable': True,'type': 'string'},
  {'metadata': {}, 'name': 'email','nullable': True,'type': 'string'},
  {'metadata': {}, 'name': 'group','nullable': True,'type': 'string'}],
 'type': 'struct'})

In [7]:
spark_df = spark.createDataFrame(df,schema=my_schema)

  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):


Need to register dataframe / convert to spark dataframe in order to run the linker

In [8]:
linker = SparkLinker(spark_df)

23/03/14 11:13:26 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.


In [9]:
first_name_comparison = cl.exact_match("first_name")
print(first_name_comparison.human_readable_description)

Comparison 'Exact match vs. anything else' of `first_name`.
Similarity is assessed using the following ComparisonLevels:
    - 'Null' with SQL rule: `first_name_l` IS NULL OR `first_name_r` IS NULL
    - 'Exact match' with SQL rule: `first_name_l` = `first_name_r`
    - 'All other comparisons' with SQL rule: ELSE



In [10]:
first_name_comparison.as_dict()

{'output_column_name': 'first_name',
 'comparison_levels': [{'sql_condition': '`first_name_l` IS NULL OR `first_name_r` IS NULL',
   'label_for_charts': 'Null',
   'is_null_level': True},
  {'sql_condition': '`first_name_l` = `first_name_r`',
   'label_for_charts': 'Exact match'},
  {'sql_condition': 'ELSE', 'label_for_charts': 'All other comparisons'}],
 'comparison_description': 'Exact match vs. anything else'}

In [11]:
dob_comparison = cl.datediff_at_thresholds("dob",date_thresholds=[1,2,3], date_metrics =["day","year","month"],\
                                            cast_strings_to_date=True, date_format="yyyy-MM-dd")
dob_comparison.as_dict()

{'output_column_name': 'dob',
 'comparison_levels': [{'sql_condition': '`dob_l` IS NULL OR `dob_r` IS NULL',
   'label_for_charts': 'Null',
   'is_null_level': True},
  {'sql_condition': '`dob_l` = `dob_r`', 'label_for_charts': 'Exact match'},
  {'sql_condition': "\n        abs(datediff(to_timestamp(`dob_l`,'yyyy-MM-dd'),to_timestamp(`dob_r`,'yyyy-MM-dd'))) <= 1\n    ",
   'label_for_charts': 'Within 1 day'},
  {'sql_condition': "\n        floor(abs(months_between(to_timestamp(`dob_l`,'yyyy-MM-dd'),to_timestamp(`dob_r`, 'yyyy-MM-dd')) / 12)) <= 2\n    ",
   'label_for_charts': 'Within 2 years'},
  {'sql_condition': "\n        floor(abs(months_between(to_timestamp(`dob_l`,'yyyy-MM-dd'),to_timestamp(`dob_r`, 'yyyy-MM-dd')))) <= 3\n    ",
   'label_for_charts': 'Within 3 months'},
  {'sql_condition': 'ELSE', 'label_for_charts': 'All other comparisons'}],
 'comparison_description': 'Exact match vs. Dates within the following thresholds Day(s): 1, Year(s): 2, Month(s): 3 vs. anything else'}

In [12]:
blocking_rules_predict = [
   
    # Tight(ish) blocking rule to start
    "l.first_name = r.first_name and l.surname = r.surname",
]

In [13]:
city = cl.exact_match("city", term_frequency_adjustments=True)

In [14]:
settings = {
    "link_type": "dedupe_only",
    "unique_id_column_name": "unique_id",
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
    "max_iterations": 10,
    "em_convergence": 0.01,
    "comparisons": [
        city,
        first_name_comparison,
        dob_comparison,
    ],
    "blocking_rules_to_generate_predictions": blocking_rules_predict,
}


In [15]:
linker = SparkLinker(spark_df, settings)

WARN: You are using datediff comparison with str-casting and ANSI is not enabled. Bad dates e.g. 1999-13-54 will not trigger an exception and might mess up comparisons. Ensure date strings are cleaned to remove bad dates
23/03/14 11:13:26 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/03/14 11:13:26 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/03/14 11:13:26 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/03/14 11:13:26 WARN SimpleFunctionRegistry: The function dmetaphone replaced a previously registered function.
23/03/14 11:13:26 WARN SimpleFunctionRegistry: The function dmetaphonealt replaced a previously registered function.
23/03/14 11:13:26 WARN SimpleFunctionRegistry: The function qgramtokeniser replaced a previously registered function.


In [16]:
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email"
]

linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)

  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
Probability two random records match is estimated to be  0.0246.                
This means that amongst all possible pairwise record comparisons, one in 40.72 are expected to match.  With 499,500 total possible comparisons, we expect a total of around 12,267.14 matching pairs


In [17]:
linker.estimate_u_using_random_sampling(target_rows=5000)

----- Estimating u probabilities using random sampling -----
23/03/14 11:13:37 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
23/03/14 11:13:37 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
23/03/14 11:13:37 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 75.08% for 9 writers
23/03/14 11:13:37 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 67.58% for 10 writers
23/03/14 11:13:37 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 61.43% for 11 writers
23/03/14 11:13:37 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 56.31% for 12 writers
23/03/14 11:13:37 

In [18]:
training_blocking_rule = "l.first_name = r.first_name and l.surname = r.surname"
training_session_fname_sname = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name and l.surname = r.surname

Parameter estimates will be made for the following comparison(s):
    - city
    - dob

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
23/03/14 11:13:41 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
23/03/14 11:13:41 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
23/03/14 11:13:41 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 75.08% for 9 writers
23/03/14 11:13:41 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 67.58% for 10 writers
23/03/14 11:1

Way to see if linker settings have used the datestr thingL

In [19]:
type(settings)

dict

In [20]:
not isinstance(settings, (dict, type(None)))

False

In [21]:
(dict, type(None))

(dict, NoneType)

Want to test a bunch of dates/ formats which should throw errors. Catch exceptions that are specific for each backend