## End to End Workflow

## Step 1:  Imports and setup

The following is just boilerplate code that sets up the Spark session and sets some other non-essential configuration options.

In [49]:
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType
import pyspark.sql.functions as f
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [50]:
import os
import time
import json
import requests
import xml.etree.ElementTree as ET
import datetime

# Derive the cloud storage root (scheme + bucket) from the Hive warehouse location
# configured in hive-site.xml, e.g. "s3a://bucket/some/path" -> "s3a://bucket".
tree = ET.parse('/etc/hadoop/conf/hive-site.xml')
root = tree.getroot()

# Start from None so that a missing property is reported explicitly below instead of
# surfacing as a NameError on the first use of `storage`.
storage = None

for prop in root.findall('property'):
    # findtext() returns None for a <property> without a <name> child instead of raising.
    if prop.findtext('name') == "hive.metastore.warehouse.dir":
        warehouse_dir = prop.findtext('value') or ""
        parts = warehouse_dir.split("/")
        if len(parts) < 3:
            raise ValueError(
                "hive.metastore.warehouse.dir is not a '<scheme>://<bucket>/...' URL: {!r}".format(warehouse_dir)
            )
        # parts[0] is the scheme including the colon ("s3a:"), parts[2] is the bucket/authority.
        # No break: as before, the last matching property in the file wins.
        storage = parts[0] + "//" + parts[2]

if storage is None:
    raise RuntimeError("hive.metastore.warehouse.dir was not found in /etc/hadoop/conf/hive-site.xml")

print("The correct Cloud Storage URL is: {}".format(storage))

# Exposed through the environment; the Spark session configuration reads os.environ['STORAGE'].
os.environ['STORAGE'] = storage

The correct Cloud Storage URL is: s3a://demo-aws-2


In [51]:
# A SparkConf/SparkContext based setup (which additionally loaded the graphframes jar or
# package and set the checkpoint directory "temp_graphframes/") is intentionally not used
# here; the similarity jar is configured directly on the session builder instead.

# The scala-udf-similarity jar provides extended string comparison functions
# such as Jaro Winkler, which are registered as SQL functions below.
spark = (
    SparkSession.builder
    .appName("Entity Resolution with Lineage")
    .config("spark.hadoop.fs.s3a.s3guard.ddb.region", "us-east-1")
    .config("spark.yarn.access.hadoopFileSystems", os.environ['STORAGE'])
    .config("spark.driver.extraClassPath", "jars/scala-udf-similarity-0.0.6.jar")
    .config("spark.jars", "jars/scala-udf-similarity-0.0.6.jar")
    .getOrCreate()
)

# Register UDFs: (SQL function name, Java class in the jar, return type)
from pyspark.sql import types

java_udfs = (
    ('jaro_winkler_sim', 'uk.gov.moj.dash.linkage.JaroWinklerSimilarity', types.DoubleType()),
    ('Dmetaphone', 'uk.gov.moj.dash.linkage.DoubleMetaphone', types.StringType()),
)
for udf_name, java_class, return_type in java_udfs:
    spark.udf.registerJavaFunction(udf_name, java_class, return_type)

In [52]:
# Display the SparkSession created above (the last expression of a cell is rendered as its output).
spark

In [53]:
import pandas as pd

# Allow up to 500 columns to be shown when a pandas DataFrame is displayed.
pd.set_option("display.max_columns", 500)

In [54]:
import logging

# Give the root logger a default handler so that log records are printed in Jupyter Lab.
logging.basicConfig()

# INFO is the level used here; switch to logging.DEBUG if you want splink to log
# the SQL statements it is executing under the hood.
splink_logger = logging.getLogger("splink")
splink_logger.setLevel(logging.INFO)

## Step 2: Read in the data

The `l` and `r` stand for 'left' and 'right'.  It doesn't matter which of the two datasets you choose as the left; performance and results will be the same.

⚠️ Note that `splink` makes the following assumptions about your data:

-  There is a field containing a unique record identifier in each dataset
-  The two datasets being linked have common column names - e.g. date of birth is represented in both datasets in a field of the same name.   In many cases, this means that the user needs to rename columns prior to using `splink`


##### READING FROM PHOENIX INTO A SPARK DF

In [55]:
class Db:
    """Small wrapper around a Phoenix connection to the operational database.

    Creating an instance opens the connection (with autocommit enabled) and a cursor.
    Credentials and endpoint are read from the environment variables WORKLOAD_USER,
    WORKLOAD_PASSWORD and OPDB_ENDPOINT_AWS2; a missing variable raises KeyError.
    """

    def __init__(self):
        # BASIC authentication options passed through to phoenixdb.connect.
        opts = {}
        opts['authentication'] = 'BASIC'
        opts['avatica_user'] = os.environ["WORKLOAD_USER"]
        opts['avatica_password'] = os.environ["WORKLOAD_PASSWORD"]
        database_url = os.environ["OPDB_ENDPOINT_AWS2"]
        # Kept for compatibility; get_data() does not read this attribute.
        self.TABLENAME = "test_table_paul"
        self.conn = phoenixdb.connect(database_url, autocommit=True, **opts)
        self.curs = self.conn.cursor()

    def get_data(self):
        """Return every row of CML_WORKSHOP_TABLE_RIGHT, as produced by the cursor's fetchall()."""
        query = "SELECT * FROM CML_WORKSHOP_TABLE_RIGHT"

        # Use this instance's own cursor. The previous code went through the global name
        # `model`, so the method only worked when an instance happened to be bound to it.
        self.curs.execute(query)
        rows = self.curs.fetchall()

        return rows

In [56]:
import logging
# NOTE(review): logging.basicConfig() was already called in an earlier cell. Per the standard
# library documentation a repeated call does nothing once the root logger has handlers
# (unless force=True is passed), so this line may not actually switch the level to DEBUG.
logging.basicConfig( level=logging.DEBUG)

import os
import phoenixdb
# Instantiating Db opens the Phoenix connection and cursor; its constructor reads
# WORKLOAD_USER, WORKLOAD_PASSWORD and OPDB_ENDPOINT_AWS2 from the environment.
model = Db()

In [57]:
# Despite its name, this holds the raw result of the cursor's fetchall() (the fetched rows),
# not a DataFrame; it is turned into a Spark DataFrame further down.
phoenix_df = model.get_data()

In [58]:
# Schema for the rows fetched from Phoenix: seven columns, each declared as a nullable string.
column_names = ['unique_id', 'first_name', 'surname', 'dob', 'city', 'email', 'group']
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in column_names]
)

In [59]:
# Right-hand dataset: build a Spark DataFrame from the rows fetched from Phoenix,
# applying the explicit all-string schema.
right_df = spark.createDataFrame(phoenix_df, schema=schema)

##### READING FROM HIVE INTO A SPARK DF

In [60]:
# Left-hand dataset: read every column of the Hive table default.mytable through Spark SQL.
left_df = spark.sql("SELECT * FROM default.mytable")

In [61]:
# Preview the first five rows of each input dataset.
left_df.show(5)
right_df.show(5)

+---------+----------+-------+----------+------+--------------------+-----+
|unique_id|first_name|surname|       dob|  city|               email|group|
+---------+----------+-------+----------+------+--------------------+-----+
|        1|    Julia | Taylor|2015-07-31|London| hannah88@powers.com|    0|
|        2|    Julia | Taylor|2016-01-27|London| hannah88@powers.com|    0|
|        3|    Julia | Taylor|2015-10-29|  null|  hannah88opowersc@m|    0|
|        5|     Noah | Watson|2008-03-23|Bolton|matthew78@ballard...|    1|
|        6|    Watson|  Noah |2008-03-23|  null|matthew78@ballard...|    1|
+---------+----------+-------+----------+------+--------------------+-----+
only showing top 5 rows

+---------+----------+-------+----------+--------------+--------------------+-----+
|unique_id|first_name|surname|       dob|          city|               email|group|
+---------+----------+-------+----------+--------------+--------------------+-----+
|        0|    Julia |   None|2015-10-2

## Step 3:  Configure splink using the `settings` object

Most of `splink`'s configuration options are stored in a settings dictionary.  This dictionary allows significant customisation, and can therefore get quite complex.  

💥 We provide a tool for helping to author valid settings dictionaries, which includes tooltips and autocomplete, which you can find [here](http://robinlinacre.com/splink_settings_editor/).

Customisation overrides default values built into splink.  For the purposes of this demo, we will specify a simple settings dictionary, which means we will be relying on these sensible defaults.

To help with authoring and validation of the settings dictionary, we have written a [json schema](https://json-schema.org/), which can be found [here](https://github.com/moj-analytical-services/splink/blob/master/splink/files/settings_jsonschema.json).  




In [62]:
# SQL CASE expression behind the custom "name_inversion" comparison:
#   2 -> first name and surname are both equal
#   1 -> first name and surname are equal but swapped between the two records
#   0 -> anything else
name_inversion_case = (
    "CASE WHEN first_name_l = first_name_r AND surname_l = surname_r THEN 2 "
    "WHEN first_name_l = surname_r AND surname_l = first_name_r THEN 1 "
    "ELSE 0 END"
)

name_inversion_comparison = dict(
    custom_name="name_inversion",
    custom_columns_used=["first_name", "surname", "dob"],
    case_expression=name_inversion_case,
    num_levels=3,
)

# Columns compared on their own, with no further options specified.
single_column_comparisons = [{"col_name": column} for column in ("dob", "city", "email")]

settings = dict(
    link_type="link_only",
    max_iterations=5,
    # Only record pairs produced by at least one of these rules are compared.
    blocking_rules=[
        'l.first_name = r.first_name',
        'l.surname = r.surname',
        'l.dob = r.dob',
    ],
    comparison_columns=[name_inversion_comparison] + single_column_comparisons,
    # Carried through to the results but not used in any comparison.
    additional_columns_to_retain=["group"],
)

In words, this setting dictionary says:

- We are performing a data linking task (the other options are `dedupe_only`, or `link_and_dedupe`)
- Rather than generate all possible comparisons (the cartesian product of the input datasets), we are going restrict record comparisons to those generated by at least one of the rules in the specified array
- When comparing records, we will use information from the `first_name`, `surname`, `dob`, `city` and `email` columns to compute a match score.
- `first_name` and `surname` are compared together through a custom `name_inversion` comparison, which has three levels:
    - Level 2: First name and surname both match exactly
    - Level 1: First name and surname match, but are swapped between the two records
    - Level 0: No match
- `dob`, `city` and `email` are each compared on their own, relying on splink's defaults
- This settings dictionary does not request any term frequency adjustments
- We will retain the `group` column in the results even though this is not used as part of comparisons.  This is a labelled dataset and `group` contains the true match - i.e. where group matches, the records pertain to the same person

## Step 4:  Estimate match scores using the Expectation Maximisation algorithm

In [63]:
from splink import Splink

# Build the linker from the settings dictionary and the two input DataFrames, then
# compute the scored record comparisons.
# NOTE(review): the output recorded for this cell is
# "TypeError: missing a required argument: 'spark'", i.e. this call does not match the
# constructor of the splink version that was installed when the notebook was last run.
# Confirm the constructor signature of the installed version (some releases appear to
# take the input DataFrames as a list followed by the Spark session) and adjust the call.
linker = Splink(settings, spark, df_l=left_df, df_r=right_df)
target_df = linker.get_scored_comparisons()

# persist() caches the scored comparisons so that the later cells which reuse them
# do not trigger a recomputation.
target_df.persist()  

TypeError: missing a required argument: 'spark'

In [None]:
# Inspect main dataframe that contains the match scores
# NOTE: toPandas() brings the whole scored DataFrame to the driver before 5 rows are sampled.
target_df.toPandas().sample(5)

In [None]:
import os
import jaydebeapi

# Open a JDBC connection to Impala over HTTPS. Host and credentials are taken from
# the environment (IMPALA_HOST, WORKLOAD_USER, WORKLOAD_PASSWORD).
conn = jaydebeapi.connect("com.cloudera.impala.jdbc.DataSource",
                          "jdbc:impala://"+os.environ["IMPALA_HOST"]+":443/;ssl=1;transportMode=http;httpPath=cliservice;AuthMech=3;",
                          {'UID': os.environ["WORKLOAD_USER"], 'PWD': os.environ["WORKLOAD_PASSWORD"]},
                          '/home/cdsw/impala_drivers/ImpalaJDBC41.jar')
try:
    cursor = conn.cursor()
    try:
        # NOTE(review): neither `upsert_data_jaydebeapi` nor `df` is defined in the visible
        # part of this notebook; both must exist before this cell can run.
        upsert_data_jaydebeapi(df, records=100)

        #cursor.fetchall()
    finally:
        # The cursor is named `cursor`; the previous `curs.close()` referred to an undefined name.
        cursor.close()
finally:
    # Always release the connection, even if the upsert fails.
    conn.close()

In [12]:
# Save the scored comparisons in parquet format as the table 'target_ER_TABLE';
# "overwrite" mode replaces the table's contents if it already exists.
target_df.write.format('parquet').mode("overwrite").saveAsTable('target_ER_TABLE')

## Step 5: Inspect results 



The `params` property of the `linker` is an object that contains a lot of diagnostic information about how the match probability was computed.  The following cells demonstrate some of its functionality

An alternative representation of the parameters displays them in terms of the effect different values in the comparison vectors have on the match probability:

In [45]:
# Bind `params` explicitly: it was never assigned anywhere in this notebook, so this cell
# only worked with a variable left over from an earlier kernel session.
# NOTE(review): the text above describes it as the `params` property of the linker;
# confirm that attribute name against the installed splink version.
params = linker.params

# Chart of the effect each comparison-vector value has on the match probability.
params.bayes_factor_chart()

alt.Chart(...)

In [46]:
# If charts aren't displaying correctly in your notebook, you can write them to a file (by default splink_charts.html)
# overwrite=True presumably replaces a file of the same name left by a previous run.
params.all_charts_write_html_file("splink_charts.html", overwrite=True)

You can also generate a report which explains how the match probability was computed for an individual comparison row.  

Note that you need to convert the row to a dictionary for this to work

In [17]:
from splink.intuition import intuition_report

# Take one scored comparison at random and convert it to a plain dictionary, which is the
# form intuition_report needs. The scored comparisons live in `target_df`; this cell
# previously read from `df_e`, a name that is never defined in this notebook.
# NOTE: toPandas() brings the whole scored DataFrame to the driver before sampling.
row_dict = target_df.toPandas().sample(1).to_dict(orient="records")[0]
print(intuition_report(row_dict, params))


Initial probability of match (prior) = λ = 0.3851017951965332

Comparison of name_inversion.  Values are:
name_inversion_l: Patel, Mohammed , 1989-03-19
name_inversion_r: Mohammed , Patel, 1989-03-19
Comparison has 3 levels
𝛾 for this comparison = gamma_name_inversion = 1
Amongst matches, m = P(𝛾|match) = 0.12372622638940811
Amongst non matches, u = P(𝛾|non-match) = 2.1532594018935924e-06
Bayes factor = m/u = 57459.97267240647
New probability of match (updated belief): 0.9999722124537389

Comparison of dob.  Values are:
dob_l: 1989-03-19
dob_r: 1989-03-19
Comparison has 2 levels
𝛾 for this comparison = gamma_dob = 1
Amongst matches, m = P(𝛾|match) = 0.856454074382782
Amongst non matches, u = P(𝛾|non-match) = 0.048313207924366
Bayes factor = m/u = 17.727120826328797
New probability of match (updated belief): 0.9999984324428572

Comparison of city.  Values are:
city_l: Sheffield
city_r: Sheffield
Comparison has 2 levels
𝛾 for this comparison = gamma_city = 1
Amongst matches, m = P(𝛾|mat

In [41]:
from splink.diagnostics import splink_score_histogram
from pyspark.sql.functions import expr 

# Histogram of match probabilities, leaving out comparisons scored at 0.001 or below.
# The scored comparisons live in `target_df`; this cell previously read from `df_e`,
# a name that is never defined in this notebook.
splink_score_histogram(target_df.filter(expr('match_probability > 0.001')), spark)

alt.Chart(...)

## Step 6: Create a Custom Atlas Type (Process) reflecting the EM algorithm

First we need to instantiate the connection to Atlas in CDP

In [18]:
import atlasclient

Endpoint, Username and Password are stored as CML project variables and passed dynamically

In [19]:
from atlasclient.client import Atlas

# Endpoint and credentials are read from environment variables, so nothing sensitive
# is written into the notebook itself.
client = Atlas(
    os.environ["atlas_endpoint"],
    port='',
    username=os.environ["atlas_username"],
    password=os.environ["atlas_password"],
)

Verify the Client connection is working by querying a random Atlas entity

In [20]:
# GUID of an arbitrary existing Atlas entity, used only to check that the connection works.
# NOTE(review): hardcoded; this value is presumably specific to one Atlas instance.
guid = "c845eb62-d85d-4591-8abe-0c31449cdd95"

In [21]:
# Look the entity up in Atlas by its GUID.
entity = client.entity_guid(guid)

In [22]:
# Display the entity's attributes; getting a populated dictionary back shows the connection works.
entity.entity['attributes']

ERROR:atlasclient.base:Missing attr EntityGuid: searchParameters


{'createTime': 1607119259591,
 'description': None,
 'displayName': None,
 'metadata': {'default_project_engine_type': 'legacy_engine',
  'project_description': 'Build an XGBoost model to predict churn using customer telco data.',
  'project_visibility': 'private',
  'service_name': 'mlgov/mlgovapiserver.mlx.cloudera.site@DEMO-AWS.YLCU-ATMI.CLOUDERA.SITE',
  'session_username': 'alexbleakley'},
 'modifiedTime': 1607119259591,
 'name': 'Churn Modeling with XGBoost - alexbleakley',
 'owner': 'alexbleakley',
 'qualifiedName': 'crn:cdp:ml:us-west-1:8a1e15cd-04c2-48aa-8f35-b4a8c11997d3:workspace:ab34ab61-368e-46a2-923e-6c5830585734/affaa215-c794-436c-af69-dcccfffb3d02',
 'replicatedFrom': None,
 'replicatedTo': None,
 'userDescription': None}

Looks like we have successfully established the connection. Next we can create a custom Atlas type (process) reflecting the EM algorithm

In [23]:
# The one attribute declared on the custom type: an optional, single-valued string.
start_time_attribute = {
    "name": "startTime",
    "isOptional": True,
    "isUnique": False,
    "isIndexable": False,
    "typeName": "string",
    "valuesMaxCount": 1,
    "cardinality": "SINGLE",
    "valuesMinCount": 0,
}

# Entity definition for the EM linkage step; it derives from Atlas' "Process" type.
em_linkage_entity_def = {
    "superTypes": ["Process"],
    "name": "EM_algorithm_linkage",
    "description": "custom_type_for_Entity_Resolution",
    "attributeDefs": [start_time_attribute],
}

# Full typedef payload: only an entity definition is supplied, the other sections are empty.
typedef_dict = {
    "enumTypes": [],
    "structTypes": [],
    "classificationDefs": [],
    "entityDefs": [em_linkage_entity_def],
}

And we can now register the new type with Atlas. For more on the Atlas type model, please visit this page: https://docs.cloudera.com/runtime/7.2.7/cdp-governance-overview/topics/atlas-metadata-model-overview.html

In [24]:
#Has already run once so will not run again
#client.typedefs.create(data=typedef_dict)

## Step 7: Instantiate the EM algorithm in Atlas along with lineage reflecting our Linkage Job above

Notice: we need to pass the Atlas guid for the two datasets we compared above as they were registered in Atlas when they were stored as a Spark table

In [30]:
# Instance of the custom "EM_algorithm_linkage" process type. Its "inputs" and "outputs"
# reference existing Atlas entities by GUID, which is what produces the lineage.
process_entity_dict = {
    "entity": {
        # Negative placeholder GUID; the create response maps it to the GUID Atlas assigns.
        "guid": "-2089428075574333",
        "status": "ACTIVE",
        "createdBy": "pdefusco",
        "updatedBy": "pdefusco",
        "createTime": "12342",
        "updateTime": "12342",
        "version": "12342",
        "relationshipAttributes": {},
        "classifications": [],
        "typeName": "EM_algorithm_linkage",
        "attributes": {
            "startTime": "123",
            "qualifiedName": "EM Record Linkage",
            "name": "EM Record Linkage",
            "description": "Record Linkage Algorithm",
            "owner": "pdefusco",
            # The two datasets that were compared.
            "inputs": [
                {"guid": "aa955089-5a11-46d9-9dbf-2f6b75f4d65b", "typeName": "hive_table"},
                {"guid": "43d788ce-4af4-4253-af0b-465ea45c1b93", "typeName": "hive_table"},
            ],
            # Entity references use the key "typeName", as in "inputs"; this entry
            # previously used "type_name", which did not match.
            "outputs": [
                {"guid": "ac1bdcb3-73c8-4198-a8e6-0aa104c606bb", "typeName": "hive_table"},
            ],
        },
    },
}

In [31]:
# Create the process entity in Atlas. The response's 'guidAssignments' maps the
# placeholder GUID from the payload to the GUID that Atlas assigned.
client.entity_post.create(data=process_entity_dict)

{'guidAssignments': {'-2089428075574333': '44848fe5-6950-4a73-a89c-9775b736b4c9'}}

## Step 8: Create a new Atlas Process Type related to writing a Spark Dataset to Impala and Instantiate it with source and target

In [42]:
# Typedef payload for a second custom process type, "Write_to_Impala". It has the same
# shape as the EM linkage type: a "Process" subtype with one optional string attribute.
# NOTE(review): no cell in the visible notebook registers this definition with Atlas.
write_to_impala_entity_def = dict(
    superTypes=["Process"],
    name="Write_to_Impala",
    description="write_to_impala",
    attributeDefs=[
        dict(
            name="startTime",
            isOptional=True,
            isUnique=False,
            isIndexable=False,
            typeName="string",
            valuesMaxCount=1,
            cardinality="SINGLE",
            valuesMinCount=0,
        )
    ],
)

typedef_dict = dict(
    enumTypes=[],
    structTypes=[],
    classificationDefs=[],
    entityDefs=[write_to_impala_entity_def],
)

In [None]:
# Instance of the custom "Write_to_Impala" process type, linking a source table to the
# table written in Impala.
process_entity_dict = {
    "entity": {
        # Negative placeholder GUID (presumably replaced by Atlas when the entity is created).
        "guid": "-2089428075574888",
        "status": "ACTIVE",
        "createdBy": "pdefusco",
        "updatedBy": "pdefusco",
        "createTime": "12342",
        "updateTime": "12342",
        "version": "12342",
        "relationshipAttributes": {},
        "classifications": [],
        "typeName": "Write_to_Impala",
        "attributes": {
            "startTime": "123",
            "qualifiedName": "Write_to_Impala",
            "name": "Write_to_Impala",
            # NOTE(review): this description looks copied from the EM linkage entity; confirm the wording.
            "description": "Record Linkage Algorithm",
            "owner": "pdefusco",
            "inputs": [
                {"guid": "740ab8a4-c752-4ee7-b6be-6f4082ec9fee", "typeName": "hive_table"},
            ],
            # Entity references use the key "typeName", as in "inputs"; this entry
            # previously used "type_name", which did not match.
            # TODO: the output GUID is still empty and must be filled in with the target
            # table's GUID before this entity is posted.
            "outputs": [
                {"guid": "", "typeName": "hive_table"},
            ],
        },
    },
}

![title](images/ER_atlas_lineage.png)

Next we can optionally remove the EM Algorithm instance from Atlas via the client

In [32]:
# Look up the EM process entity by the GUID Atlas assigned to it in the create response.
# NOTE(review): hardcoded; the assigned GUID will differ whenever the entity is created again.
entity = client.entity_guid("44848fe5-6950-4a73-a89c-9775b736b4c9")

In [35]:
# Check the owner attribute to confirm this is the process entity created earlier.
entity.entity['attributes']["owner"]

'pdefusco'

In [36]:
# Remove the EM process entity from Atlas (the optional clean-up step described above).
entity.delete()

## We have completed our introduction to Splink and the Atlas Client. 
## Next we will simulate a real world Application with CML Jobs and COD (Cloudera Operational Database)