In [None]:
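# %pip install "splink<4"  # uncomment if splink is missing; the splink.spark.* imports used later are the Splink 3 module layout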

In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
# Load the two cleaned company datasets into pandas DataFrames.
# Replace <your_bucket> with your own bucket name before running.
# NOTE(review): reading gs:// URLs through pandas presumably needs gcsfs installed — confirm.
df_m = pd.read_csv('gs://<your_bucket>/handsonentityresolution/mari_clean.csv')
df_c = pd.read_csv('gs://<your_bucket>/handsonentityresolution/basic_clean.csv')

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import types

# Shuffle partitions and default parallelism are both set to 240; per the
# original note this was chosen for a cluster of 4 n2-standard-4 machines.
conf = (
    SparkConf()
    .set("spark.sql.shuffle.partitions", "240")
    .set("spark.default.parallelism", "240")
)

# Reuse the running SparkContext if there is one, otherwise start one with `conf`.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

# Checkpoints go under the bucket path; replace <your_bucket> before running.
sc.setCheckpointDir("gs://<your_bucket>/handsonentityresolution/")

# Register the Java Jaro-Winkler UDF under the SQL name
# "jaro_winkler_similarity", returning a double.
# NOTE(review): presumably the jar providing this class must already be on the
# cluster classpath — confirm.
spark.udf.registerJavaFunction(
    "jaro_winkler_similarity",
    "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
    returnType=types.DoubleType(),
)

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Spark schema applied to both inputs: two string columns plus an integer id.
schema = StructType(
    [
        StructField("Postcode", StringType()),
        StructField("CompanyName", StringType()),
        StructField("unique_id", IntegerType()),
    ]
)

In [None]:
# Keep only the three columns named in `schema`, in the same order.
df_c = df_c.loc[:, ['Postcode', 'CompanyName', 'unique_id']]

In [None]:
# Convert both pandas DataFrames to Spark DataFrames using the shared schema.
# NOTE(review): unlike df_c, df_m is not reduced to the schema's three columns
# before conversion — presumably mari_clean.csv already holds exactly those
# columns in that order; confirm.
dfs_m = spark.createDataFrame(df_m, schema)
dfs_c = spark.createDataFrame(df_c, schema)

In [None]:
import splink.spark.spark_comparison_library as cl

# Splink model settings passed to the linker.
settings = {
    "link_type": "link_only",
    # Candidate pairs are generated where either the postcode or the
    # company name is equal between the left and right records.
    "blocking_rules_to_generate_predictions": [
        "l.Postcode = r.Postcode",
        "l.CompanyName = r.CompanyName",
    ],
    # A single comparison: Jaro-Winkler on CompanyName at thresholds 0.9 and 0.8.
    "comparisons": [
        cl.jaro_winkler_at_thresholds("CompanyName", [0.9, 0.8]),
    ],
    "retain_intermediate_calculation_columns": True,
    "retain_matching_columns": True,
}

In [None]:
from splink.spark.spark_linker import SparkLinker
# Build the Spark-backed linker over both Spark DataFrames with the settings
# dict; the aliases name the two input tables "dfs_m" and "dfs_c".
linker = SparkLinker([dfs_m, dfs_c], settings, input_table_aliases=["dfs_m", "dfs_c"])

In [None]:
# Estimate the model's u parameters by random sampling, with target_rows=1e4.
# NOTE(review): no seed is passed, so the estimates may differ between runs —
# confirm whether the installed Splink version accepts a seed here.
linker.estimate_u_using_random_sampling(target_rows=1e4)

In [None]:
# Persist the current settings to JSON (overwriting any existing file), then
# load settings back from JSON.
# NOTE(review): the save writes "Ch7_Splink_Settings.json" relative to the
# working directory, while the load reads from "./<your_path>/..." — replace
# <your_path> so it points at the saved file, otherwise the load will not find it.
linker.save_settings_to_json("Ch7_Splink_Settings.json", overwrite=True)
linker.load_settings_from_json("./<your_path>/Ch7_Splink_Settings.json")

In [None]:
# Display the linker's match-weights chart (rendered as the cell output).
linker.match_weights_chart()

In [None]:
# Display the linker's m/u parameters chart (rendered as the cell output).
linker.m_u_parameters_chart()

In [None]:
# Calculate predictions with a match-probability threshold of 0.5 and
# convert the result to a pandas DataFrame.

df_pred = linker.predict(threshold_match_probability=0.5).as_pandas_dataframe()
# Last expression of the cell: shows the number of predicted pairs returned.
len(df_pred)