<a href="https://colab.research.google.com/github/nfaggian/record_linkage/blob/master/simple_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Record linkage: Simple classifier

## Dependencies
- - - 

Jellyfish is a library for comparing strings: [github](https://github.com/jamesturk/jellyfish)

In [3]:
!pip install jellyfish tqdm matplotlib



In [4]:
import numpy as np
import pandas as pd
import tqdm

import uuid

import jellyfish as jf
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import HTML, display, clear_output

## GCP authentication
- - - 

In [5]:
# Authenticate against Google Cloud.
# 1. Prefer the interactive Colab sign-in when the notebook runs on Colab.
# 2. Otherwise fall back to whatever google.auth.default() can discover.
# `except Exception` (rather than a bare `except:`) keeps Ctrl-C / kernel
# interrupts from being silently swallowed during the sign-in flow.
try:
    from google.colab import auth
    auth.authenticate_user()
except Exception:
    try:
        import google.auth
        credentials, project = google.auth.default()
    except Exception as error:
        # Keep the original failure attached so the real cause stays visible.
        raise Exception('Google cloud authentication required!') from error



In [6]:
# Export the project id as an environment variable for this kernel
# (presumably picked up by the Google Cloud client libraries - TODO confirm).
%env GOOGLE_CLOUD_PROJECT=anz-pso-nfaggian

# The same project id, passed explicitly to the read_gbq calls further down.
project_id = 'anz-pso-nfaggian'

env: GOOGLE_CLOUD_PROJECT=anz-pso-nfaggian


## Retrieve donor and contributions data

In [7]:
# Pull a sample of the donors table: only rows whose fingerprinted donor_id
# is divisible by 1000 are kept (presumably ~0.1% of donors - TODO confirm).
# The contributions query uses the same filter expression on donor_id.
query = f"""
SELECT
  * 
FROM
  dedup.donors
WHERE MOD(ABS(FARM_FINGERPRINT(CAST(donor_id AS STRING))), 1000) = 0
"""

# Run the query in `project_id` and order the result by donor_id.
raw_donors = pd.io.gbq.read_gbq(query, project_id=project_id, dialect='standard')
raw_donors = raw_donors.sort_values(by='donor_id')
# Show (rows, columns) and then a preview of the first rows.
print(raw_donors.shape)
raw_donors.head()



(726, 10)


Unnamed: 0,donor_id,last_name,first_name,address_1,address_2,city,state,zip,employer,occupation
220,404,1-28-07 cash deposits,,,,,il,,,
548,422,12 congressional district afl-cio cope cont. fund,,59 magnolia dr.,,belleville,il,62221.0,,
295,616,1420 west lexington partnership,,1420 west lexington partnership,,chicago,il,60607.0,,
309,988,18th ward democratic org,,8150 s kedzie ave,,chicago,il,60652.0,,
300,2158,35th & indiana currency exchange,,126 e. 35th street,,chicago,il,60616.0,,


In [8]:
# Pull donor_id and amount from the contributions table, keeping only rows
# whose fingerprinted donor_id is divisible by 1000 - the same filter
# expression that the donors query applies.
query = f"""
SELECT
  donor_id, amount
FROM
  dedup.contributions
WHERE MOD(ABS(FARM_FINGERPRINT(CAST(donor_id AS STRING))), 1000) = 0
"""

# Run the query in `project_id` and order the result by donor_id.
raw_donations = pd.io.gbq.read_gbq(query, project_id=project_id, dialect='standard')
raw_donations = raw_donations.sort_values(by='donor_id')
# Show (rows, columns) and then a preview of the first rows.
print(raw_donations.shape)
raw_donations.head()

(1662, 2)


Unnamed: 0,donor_id,amount
1483,404,129.0
543,422,300.0
879,616,600.0
346,988,200.0
1052,2158,500.0


**Data preparation**
- - - 

In [9]:
def _blank_if_missing(value):
  """
  Return `value` unchanged, or a single space when it is missing.

  "Missing" covers None, NaN/NA (which pandas uses for absent values and
  which is truthy, so a plain `value or ' '` would let it through as the
  literal text "nan") and any other falsy value such as an empty string.
  """
  if value is None or pd.isna(value) or not value:
    return ' '
  return value


def transform(record):
  """
  Normalise a raw donor record into the fields used for comparison.

  Parameters
  ----------
  record : object with attribute access to `donor_id`, `city`,
      `first_name`, `last_name`, `address_1`, `address_2`, `state`, `zip`,
      `occupation` and `employer` (e.g. a row from `DataFrame.iterrows()`).

  Returns
  -------
  dict with keys `donor_id`, `city`, `name`, `address`, `state`, `zip`,
  `occupation`, `employer` and `is_person`. Text fields are stripped and
  lower-cased; missing values become the empty string. `name` is first and
  last name joined by a space, `address` is address_1 and address_2 joined
  by a space. `is_person` is True when a first name is present.
  """

  def text(*parts):
    # Join the parts with single spaces, then trim and lower-case the result.
    return ' '.join(f"{_blank_if_missing(part)}" for part in parts).strip().lower()

  return {
    'donor_id': record.donor_id,
    'city': text(record.city),
    'name': text(record.first_name, record.last_name),
    'address': text(record.address_1, record.address_2),
    'state': text(record.state),
    'zip': text(record.zip),
    'occupation': text(record.occupation),
    'employer': text(record.employer),
    # NaN counts as "no first name", just like None.
    'is_person': not pd.isna(record.first_name),
  }

In [10]:
# Normalise every raw donor row with transform() and collect the resulting
# dicts into a single DataFrame (one row per donor), with a progress bar.
donors = pd.DataFrame([
    transform(raw_record)
    for _, raw_record in tqdm.tqdm(raw_donors.iterrows(), total=len(raw_donors))
])
donors.head()

100%|██████████| 726/726 [00:00<00:00, 2343.38it/s]


Unnamed: 0,address,city,donor_id,employer,is_person,name,occupation,state,zip
0,,,404,,False,1-28-07 cash deposits,,il,
1,59 magnolia dr.,belleville,422,,False,12 congressional district afl-cio cope cont. fund,,il,62221.0
2,1420 west lexington partnership,chicago,616,,False,1420 west lexington partnership,,il,60607.0
3,8150 s kedzie ave,chicago,988,,False,18th ward democratic org,,il,60652.0
4,126 e. 35th street,chicago,2158,,False,35th & indiana currency exchange,,il,60616.0


## Comparing duplicated donor data
- - -

In [11]:
data = []

def comparator(record_a, record_b):
  """
  Score how similar two donor records are.

  For each of the `name` and `address` fields three jellyfish metrics are
  computed between the two records: `jaro_winkler`,
  `match_rating_comparison` and `damerau_levenshtein_distance`.

  Returns a dict holding both donor ids (`donor_id1`, `donor_id2`) plus one
  `jaro_<field>`, `phonetic_<field>` and `damerau_<field>` entry per field.
  """
  scores = {
    'donor_id1': record_a.donor_id,
    'donor_id2': record_b.donor_id,
  }
  for field in ('name', 'address'):
    # Item access is used because `.name` on a pandas Series is the row
    # label, not the "name" column.
    left, right = record_a[field], record_b[field]
    scores[f'jaro_{field}'] = jf.jaro_winkler(left, right)
    scores[f'phonetic_{field}'] = jf.match_rating_comparison(left, right)
    scores[f'damerau_{field}'] = jf.damerau_levenshtein_distance(left, right)
  return scores


In [None]:
# Compare every donor against every other donor (both (a, b) and (b, a) are
# produced, skipping a record paired with itself).
#
# The rows are materialised once up front: calling donors.iterrows() inside
# the outer loop rebuilds a Series for every row on every pass, which
# dominates the runtime of this quadratic comparison.
donor_rows = list(donors.iterrows())
pairwise_scores = [
    comparator(record_a, record_b)
    for row_a, record_a in tqdm.tqdm(donor_rows, total=len(donor_rows))
    for row_b, record_b in donor_rows
    if row_a != row_b
]
distance_table = pd.DataFrame(pairwise_scores)

 22%|██▏       | 162/726 [01:04<04:25,  2.13it/s]

In [None]:
# Order the pairs by their jaro-winkler name score, highest first.
distance_table = distance_table.sort_values(by='jaro_name', ascending=False)

From the dataset (in the small sample) we can see the following patterns of similarity:

* Using the jaro-winkler distance we can see that there are patches of highly similar values (close to 1). 
* Using the damerau distance the patterns are a bit different; it is more sensitive to larger edits and is unbounded. 

In [None]:
# Apply the seaborn style before the figure exists; styling set after the
# axes are created does not affect them.
sns.set(style="ticks")
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(10, 8))

# One donor-by-donor heatmap per metric: (axis, metric column, panel title).
heatmap_panels = [
    (ax1, 'jaro_name', 'name (jaro)'),
    (ax2, 'damerau_name', 'name (damerau)'),
    (ax3, 'jaro_address', 'address (jaro)'),
    (ax4, 'damerau_address', 'address (damerau)'),
]
for panel_ax, metric, title in heatmap_panels:
    # Keyword arguments: recent pandas no longer accepts index/columns/values
    # positionally in DataFrame.pivot.
    pairwise_matrix = distance_table.pivot(index='donor_id1', columns='donor_id2', values=metric)
    sns.heatmap(pairwise_matrix,
                xticklabels=False, yticklabels=False, annot=False, ax=panel_ax)
    panel_ax.set_title(title)


## Building a simple baseline model
- - -

Using the metrics we just calculated - let us build a simple duplicate detector, based on the following rules:

* jaro distances which are closer to 1.0 are good examples of duplicates.
* damerau distances which are closer to zero are good examples of duplicates. 

We can set this threshold by looking at the distribution of the distance metrics for our small dataset, under the assumption that our small sample contains some of the common errors we will see in the larger dataset.

In [None]:
# Distribution of each distance metric over all donor pairs, one panel each.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases -
# confirm the installed version before relying on it.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(10, 8))
sns.set(style="ticks")
metric_axes = zip(
    ['jaro_name', 'damerau_name', 'jaro_address', 'damerau_address'],
    [ax1, ax2, ax3, ax4],
)
for metric, metric_ax in metric_axes:
    sns.distplot(distance_table[metric], ax=metric_ax)

In [None]:
# Tail quantiles of each metric: the 99th percentile for the jaro scores
# and the 1st percentile for the damerau distances.
jaro_name_q99 = distance_table['jaro_name'].quantile(0.99)
jaro_address_q99 = distance_table['jaro_address'].quantile(0.99)
damerau_name_q01 = distance_table['damerau_name'].quantile(0.01)
damerau_address_q01 = distance_table['damerau_address'].quantile(0.01)

print(f"""
 jaro_name    :{jaro_name_q99}
 jaro_address :{jaro_address_q99}
 damerau_name    :{damerau_name_q01}
 damerau_address :{damerau_address_q01}
""")

Using the mean operator we can form a simple classifier that assumes equal importance of each distance metric:

In [None]:
def baseline_classifier(record, jaro_threshold=0.67, damerau_threshold=9):
    """
    Simple voting classifier.

    Each of the four distance metrics casts one equally weighted vote for
    "duplicate":

    * `jaro_name` and `jaro_address` vote when strictly above
      `jaro_threshold`;
    * `damerau_name` and `damerau_address` vote when strictly below
      `damerau_threshold`.

    Parameters
    ----------
    record : mapping (dict or DataFrame row) with the keys `jaro_name`,
        `jaro_address`, `damerau_name` and `damerau_address`.
    jaro_threshold : float, default 0.67
        Score a jaro metric must exceed to vote.
    damerau_threshold : int or float, default 9
        Distance a damerau metric must stay under to vote.

    Returns
    -------
    The fraction of metrics that voted, between 0.0 and 1.0.
    """
    votes = [
        record['jaro_name'] > jaro_threshold,
        record['jaro_address'] > jaro_threshold,
        record['damerau_name'] < damerau_threshold,
        record['damerau_address'] < damerau_threshold,
    ]
    return np.mean(votes)

## Using the baseline model

In [None]:
# Score every donor pair with the baseline classifier, in table order.
classification_column = [
    baseline_classifier(pair)
    for _, pair in tqdm.tqdm(distance_table.iterrows(), total=len(distance_table))
]

In [None]:
# Attach the classifier output to the pair table: the raw vote fraction and
# a boolean flag for pairs where more than half of the metrics voted.
duplicate_scores = np.array(classification_column)
distance_table['duplicate'] = duplicate_scores > 0.5
distance_table['duplicate_score'] = duplicate_scores

# Donor-by-donor view of the score. Keyword arguments are used because
# recent pandas no longer accepts index/columns/values positionally.
sns.heatmap(distance_table.pivot(index='donor_id1', columns='donor_id2', values='duplicate_score'),
            xticklabels=False, yticklabels=False, annot=False);