In [138]:
# Install the third-party packages imported further down in this notebook
# (jellyfish as `jf`, numpy as `np`).
!pip install jellyfish numpy

Collecting numpy
  Using cached https://files.pythonhosted.org/packages/de/37/fe7db552f4507f379d81dcb78e58e05030a8941757b1f664517d581b5553/numpy-1.15.4-cp27-cp27mu-manylinux1_x86_64.whl
Installing collected packages: numpy
Successfully installed numpy-1.15.4


In [48]:
import apache_beam as beam
from apache_beam.io import BigQuerySource

from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions
from apache_beam.options.pipeline_options import SetupOptions

In [102]:
# Pipeline configuration shared by every pipeline built in this notebook.
options = PipelineOptions()

# Google Cloud settings: billing project plus the GCS locations used for
# staged files and temporary data.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'anz-pso-nfaggian'
# Dataflow job names may only contain lowercase letters, digits and hyphens,
# so a hyphen is used instead of an underscore; this keeps the options valid
# if the runner below is switched to 'DataflowRunner'.
google_cloud_options.job_name = 'distance-calculation'
google_cloud_options.staging_location = 'gs://anz-pso-nfaggian/stage'
google_cloud_options.temp_location = 'gs://anz-pso-nfaggian/temp'

# Execute locally with the DirectRunner.
options.view_as(StandardOptions).runner = 'DirectRunner'

In [142]:
# Candidate generation query (BigQuery Standard SQL).
#
# For every donor it collects the neighbouring records (one before, the record
# itself, one after) in name order and in address order. The ordering is given
# inside each OVER clause: an ORDER BY in a subquery does not define the order
# in which an analytic function sees its rows, so without it the neighbours
# would be arbitrary. donor_id is used as a tie-breaker so that records with
# equal names/addresses get a stable neighbourhood.
query = """
WITH
  name_index AS (
  -- name: sorted neighbourhood indexing method
  SELECT
    donor_id,
    name,
    address,
    ARRAY_AGG(STRUCT(donor_id, name, address)) OVER (ORDER BY name, donor_id ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS name_candidates
  FROM
    dedup.processed_donors ),
  address_index AS (
  -- address: sorted neighbourhood indexing method
  SELECT
    donor_id,
    name,
    address,
    ARRAY_AGG(STRUCT(donor_id, name, address)) OVER (ORDER BY address, donor_id ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS address_candidates
  FROM
    dedup.processed_donors )
SELECT
  name_index.donor_id,
  name_index.name,
  name_index.name_candidates,
  address_index.address,
  address_index.address_candidates
FROM
  address_index
JOIN
  name_index
ON
  address_index.donor_id = name_index.donor_id
LIMIT 500
"""

In [None]:
import jellyfish as jf
import numpy as np

class indexer(beam.DoFn): 
    """
    Forms candidate record pairs from one row of the indexing query.

    Each input row carries a donor record together with two arrays of
    neighbouring records ('address_candidates' and 'name_candidates'); one
    pair is emitted per neighbour.
    """
    @staticmethod
    def _as_record(row):
        """
        Copy the compared fields of a row into a plain dict, with name and
        address coerced to unicode.
        """
        return {'donor_id': row['donor_id'],
                'name': unicode(row['name']),
                'address': unicode(row['address'])}

    def process(self, element):
        """
        Split the candidates.

        Args:
            element: mapping with the keys 'donor_id', 'name', 'address',
                'address_candidates' and 'name_candidates'; each candidate is
                itself a mapping with 'donor_id', 'name' and 'address'.

        Yields:
            dicts of the form {'record_a': <record>, 'record_b': <candidate>}.
        """
        candidate_groups = ['address_candidates', 'name_candidates']
        for group in candidate_groups:
            for candidate in element[group]:
                # The candidate arrays also contain the record itself (the
                # window frame includes the current row). Comparing a record
                # with itself only yields a trivial "duplicate", so skip it.
                # NOTE(review): assumes donor_id uniquely identifies a record.
                if candidate['donor_id'] == element['donor_id']:
                    continue
                # record_a is rebuilt for every pair so that downstream steps
                # never share one dict between emitted elements.
                yield {'record_a': self._as_record(element),
                       'record_b': self._as_record(candidate)}
                
def comparator(element):
    """
    Compute string-similarity features for one candidate pair.

    Args:
        element: mapping with 'record_a' and 'record_b', each holding
            'donor_id', 'name' and 'address'.

    Returns:
        dict with both donor ids plus the Jaro-Winkler and
        Damerau-Levenshtein scores of the two names and the two addresses.
    """
    left = element['record_a']
    right = element['record_b']

    features = {}
    features['donor_id1'] = left['donor_id']
    features['donor_id2'] = right['donor_id']

    # Name comparison.
    name_a, name_b = left['name'], right['name']
    features['jaro_name'] = jf.jaro_winkler(name_a, name_b)
    features['damerau_name'] = jf.damerau_levenshtein_distance(name_a, name_b)

    # Address comparison.
    address_a, address_b = left['address'], right['address']
    features['jaro_address'] = jf.jaro_winkler(address_a, address_b)
    features['damerau_address'] = jf.damerau_levenshtein_distance(address_a, address_b)

    return features
                
def baseline_classifier(element):
    """
    Score a candidate pair with an unweighted vote over its features.

    Every metric casts one vote: the Jaro scores vote "duplicate" when they
    exceed 0.67, the Damerau distances when they are below 9. All votes count
    equally and the result is the fraction of positive votes.

    Args:
        element: mapping with 'donor_id1', 'donor_id2', 'jaro_name',
            'jaro_address', 'damerau_name' and 'damerau_address'.

    Returns:
        dict with 'donor_id1', 'donor_id2' and 'classification' (mean vote).
    """
    similarity_votes = [element[key] > 0.67
                        for key in ('jaro_name', 'jaro_address')]
    distance_votes = [element[key] < 9
                      for key in ('damerau_name', 'damerau_address')]
    score = np.mean(similarity_votes + distance_votes)

    outcome = {}
    outcome['donor_id1'] = element['donor_id1']
    outcome['donor_id2'] = element['donor_id2']
    outcome['classification'] = score
    return outcome


# BigQuery schema of the output table. The column names must match the keys of
# the rows produced by baseline_classifier ('classification', not
# 'classfication').
schema = 'donor_id1:STRING, donor_id2:STRING, classification:FLOAT'

def printfn(x):
    """Print a single value; handy as a debugging step inside a pipeline."""
    print(x)

In [None]:
# Build the pipeline explicitly instead of inside a `with` block: leaving a
# `with beam.Pipeline(...)` block already runs the pipeline and waits for it,
# so an additional p.run() afterwards would execute the whole job (BigQuery
# read and truncating write) a second time.
p = beam.Pipeline(options=options)

_ = (p
     # Read the candidate neighbourhoods from BigQuery.
     | "query" >> beam.io.Read(beam.io.BigQuerySource(query=query,
                                                      project='anz-pso-nfaggian',
                                                      use_standard_sql=True))
     # Expand every row into candidate record pairs.
     | "record generator" >> beam.ParDo(indexer())
     # Compute the string-similarity features of each pair.
     | "feature extraction" >> beam.Map(comparator)
     # Turn the features into a duplicate score.
     | "duplicate classifier" >> beam.Map(baseline_classifier)
     # Replace the contents of the output table with the new scores.
     | "store" >> beam.io.Write(beam.io.BigQuerySink('dedup.classification',
                                                     schema=schema,
                                                     create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                     write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
     )

# Run exactly once and block until the job has finished; `result` holds the
# value returned by wait_until_finish().
result = p.run().wait_until_finish()