In [None]:
# %pip install dj_database_url  # uncomment and run once if the package is missing

In [None]:
# %pip install psycopg2-binary  # uncomment and run once if the package is missing

In [None]:
# %pip install dedupe  # uncomment and run once if the package is missing

In [1]:
import argparse
import csv
import io
import itertools
import locale
import logging
import optparse
import os
import time

import dedupe
import dj_database_url
import numpy
import psycopg2
import psycopg2.extras
from psycopg2 import sql


In [2]:
from psycopg2.extensions import register_adapter, AsIs

# Register each numpy scalar type with psycopg2 using the AsIs adapter, so
# values of these types can be passed as query parameters.
for numpy_type in (numpy.int32, numpy.int64, numpy.float32, numpy.float64):
    register_adapter(numpy_type, AsIs)

In [3]:
class Readable(object):
    """File-like adapter that serves an iterator of rows as CSV text.

    Each call to :meth:`read` pulls rows from the wrapped iterator, writes
    them through a ``csv.writer`` into an in-memory buffer and returns the
    buffered text, so the rows never have to be materialised all at once.
    """

    def __init__(self, iterator):
        """Wrap ``iterator``, an iterator yielding one row (sequence) per item."""
        self.output = io.StringIO()
        self.writer = csv.writer(self.output)
        self.iterator = iterator

    def read(self, size=-1):
        """Return the CSV text for the next batch of rows.

        Args:
            size: Maximum number of *rows* (not characters) to consume from
                the iterator. ``None`` or a negative value drains the
                iterator completely, mirroring the usual file ``read()``
                convention.

        Returns:
            The CSV-encoded rows as a string; ``''`` once the iterator is
            exhausted.
        """
        # islice() rejects negative stop values, so "read everything" has to
        # bypass it rather than being passed through.
        if size is None or size < 0:
            rows = self.iterator
        else:
            rows = itertools.islice(self.iterator, size)
        self.writer.writerows(rows)

        chunk = self.output.getvalue()
        # Reset the buffer so the next call only returns newly written rows.
        self.output.seek(0)
        self.output.truncate(0)

        return chunk


def record_pairs(result_set):
    """Turn four-column rows into pairs of ``(record_id, record)`` tuples.

    Each row of ``result_set`` must unpack into
    ``(a_record_id, a_record, b_record_id, b_record)``. For every row the
    generator yields ``((a_record_id, a_record), (b_record_id, b_record))``.
    The zero-based row index is printed for every 10000th row (starting with
    row 0) once the consumer asks for the following item.
    """
    for position, (left_id, left_record, right_id, right_record) in enumerate(result_set):
        yield (left_id, left_record), (right_id, right_record)

        # Progress indicator; runs after the pair has been consumed.
        if not position % 10000:
            print(position)


def cluster_ids(clustered_dupes):
    """Flatten clusters into ``(donor_id, cluster_id, score)`` rows.

    ``clustered_dupes`` is an iterable of ``(cluster, scores)`` pairs. The
    first member of each cluster is used as the id of the whole cluster, and
    members are paired positionally with their scores.
    """
    for members, member_scores in clustered_dupes:
        canonical_id = members[0]
        yield from (
            (member_id, canonical_id, member_score)
            for member_id, member_score in zip(members, member_scores)
        )


if __name__ == '__main__':
    # ## Logging

    # Dedupe uses Python logging to show or suppress verbose output. Added
    # for convenience.  To enable verbose output, run `python
    # pgsql_big_dedupe_example.py -v` (once for INFO, twice or more for
    # DEBUG).
    optp = argparse.ArgumentParser()
    optp.add_argument('-v', '--verbose', dest='verbose', action='count',
                      help='Increase verbosity (specify multiple times for more)'
                      )
    # parse_known_args() instead of parse_args(): inside a Jupyter kernel
    # sys.argv carries the kernel's own flags (e.g. `-f <connection file>`),
    # which a strict parser rejects by exiting. Unrecognised arguments are
    # collected in `args` and otherwise ignored.
    (opts, args) = optp.parse_known_args()

    # `opts.verbose` is None when -v was not given, otherwise the number of
    # times it was repeated.
    log_level = logging.WARNING
    if opts.verbose:
        log_level = logging.INFO if opts.verbose == 1 else logging.DEBUG
    logging.getLogger().setLevel(log_level)

    # ## Setup
    # Paths of the learned settings and of the labeled training examples.
    settings_file = 'pgsql_big_dedupe_example_settings'
    training_file = 'pgsql_big_dedupe_example_training.json'

In [4]:
    # ## Setup
    # File names used by the training cells below:
    #   settings_file - learned settings, read and written in binary mode
    #   training_file - labeled example pairs, read and written as text
    settings_file = 'pgsql_big_dedupe_example_settings'
    training_file = 'pgsql_big_dedupe_example_training.json'

In [5]:
    start_time = time.time()

    # Connection parameters shared by both connections. Each value can be
    # overridden through the matching PG* environment variable, so that host
    # names and credentials do not have to live in the notebook; the
    # fallbacks are the values this notebook was originally run with.
    db_conf = {
        'database': os.environ.get('PGDATABASE', 'campaign-finance'),
        'user': os.environ.get('PGUSER', 'postgres'),
        'password': os.environ.get('PGPASSWORD', ''),
        'host': os.environ.get('PGHOST', '172.16.238.13'),
        'port': os.environ.get('PGPORT', '5432'),
    }

    # Read connection: cursors use RealDictCursor, so rows can be indexed by
    # column name.
    read_con = psycopg2.connect(cursor_factory=psycopg2.extras.RealDictCursor,
                                **db_conf)

    # Write connection: plain cursors.
    write_con = psycopg2.connect(**db_conf)

In [6]:
    # Query selecting the account id plus every attribute listed in the
    # field definitions below.
    DONOR_SELECT = ("SELECT account_id, city, name, zip, state, address, occupation, employer "
                    "from processed_accounts")

    # ## Training

    if not os.path.exists(settings_file):

        # Define the fields dedupe will pay attention to.
        #
        # Every field is flagged as possibly missing so that the learned
        # model takes absent values into account. `name` and `address` are
        # compared as String, the remaining fields as ShortString.
        fields = [
            {'field': 'name', 'type': 'String', 'has missing': True},
            {'field': 'address', 'type': 'String', 'has missing': True},
        ] + [
            {'field': short_field, 'type': 'ShortString', 'has missing': True}
            for short_field in ('city', 'state', 'zip', 'occupation', 'employer')
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields, num_cores=4)

        # Named cursor runs server side with psycopg2; rows are keyed by
        # their position in the result set.
        with read_con.cursor('donor_select') as cur:
            cur.execute(DONOR_SELECT)
            training_rows = dict(enumerate(cur))

        # If labeled examples were saved by a previous run, hand them to
        # dedupe together with the data.
        #
        # __Note:__ to train from scratch, delete the training_file.
        if not os.path.exists(training_file):
            deduper.prepare_training(training_rows)
        else:
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                deduper.prepare_training(training_rows, tf)

        # The in-memory copy of the table is not needed after this point.
        del training_rows
    else:
        # Settings from an earlier run exist: load them instead of training.
        print('reading from ', settings_file)
        with open(settings_file, 'rb') as sf:
            deduper = dedupe.StaticDedupe(sf, num_cores=4)
    print("Done.")

INFO:dedupe.canopy_index:Removing stop word 01
INFO:dedupe.canopy_index:Removing stop word 20
INFO:dedupe.canopy_index:Removing stop word 07
INFO:dedupe.canopy_index:Removing stop word 30
INFO:dedupe.canopy_index:Removing stop word 72
INFO:dedupe.canopy_index:Removing stop word 00
INFO:dedupe.canopy_index:Removing stop word 27
INFO:dedupe.canopy_index:Removing stop word 80
INFO:dedupe.canopy_index:Removing stop word 03
INFO:dedupe.canopy_index:Removing stop word 40
INFO:dedupe.canopy_index:Removing stop word 05
INFO:dedupe.canopy_index:Removing stop word 75
INFO:dedupe.canopy_index:Removing stop word 78
INFO:dedupe.canopy_index:Removing stop word 83
INFO:dedupe.canopy_index:Removing stop word 15
INFO:dedupe.canopy_index:Removing stop word 10
INFO:dedupe.canopy_index:Removing stop word 61
INFO:dedupe.canopy_index:Removing stop word 76
INFO:dedupe.canopy_index:Removing stop word 84
INFO:dedupe.canopy_index:Removing stop word 60
INFO:dedupe.canopy_index:Removing stop word 82
INFO:dedupe.c

INFO:dedupe.canopy_index:Removing stop word ou
INFO:dedupe.canopy_index:Removing stop word rt
INFO:dedupe.canopy_index:Removing stop word  h
INFO:dedupe.canopy_index:Removing stop word 4 
INFO:dedupe.canopy_index:Removing stop word ay
INFO:dedupe.canopy_index:Removing stop word hi
INFO:dedupe.canopy_index:Removing stop word wa
INFO:dedupe.canopy_index:Removing stop word  p
INFO:dedupe.canopy_index:Removing stop word 12
INFO:dedupe.canopy_index:Removing stop word 6 
INFO:dedupe.canopy_index:Removing stop word dr
INFO:dedupe.canopy_index:Removing stop word iv
INFO:dedupe.canopy_index:Removing stop word on
INFO:dedupe.canopy_index:Removing stop word ri
INFO:dedupe.canopy_index:Removing stop word ve
INFO:dedupe.canopy_index:Removing stop word  f
INFO:dedupe.canopy_index:Removing stop word en
INFO:dedupe.canopy_index:Removing stop word re
INFO:dedupe.canopy_index:Removing stop word  s
INFO:dedupe.canopy_index:Removing stop word ar
INFO:dedupe.canopy_index:Removing stop word s 
INFO:dedupe.c

INFO:dedupe.canopy_index:Removing stop word et
INFO:dedupe.canopy_index:Removing stop word n 
INFO:dedupe.canopy_index:Removing stop word to
INFO:dedupe.canopy_index:Removing stop word  b
INFO:dedupe.canopy_index:Removing stop word 2 
INFO:dedupe.canopy_index:Removing stop word st
INFO:dedupe.canopy_index:Removing stop word 7 
INFO:dedupe.canopy_index:Removing stop word  r
INFO:dedupe.canopy_index:Removing stop word ch
INFO:dedupe.canopy_index:Removing stop word oa
INFO:dedupe.canopy_index:Removing stop word ro
INFO:dedupe.canopy_index:Removing stop word  a
INFO:dedupe.canopy_index:Removing stop word t 
INFO:dedupe.canopy_index:Removing stop word o 
INFO:dedupe.canopy_index:Removing stop word an
INFO:dedupe.canopy_index:Removing stop word te
INFO:dedupe.canopy_index:Removing stop word ee
INFO:dedupe.canopy_index:Removing stop word 10
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, occupation), TfidfNGramCanopyPredicate: (0.8, name)

Done.


In [7]:
        # ## Active learning
        #
        # Labeling and training only apply to a trainable deduper. When the
        # settings file already existed, the previous cell loaded `deduper`
        # as a dedupe.StaticDedupe (already trained), so this cell is
        # skipped instead of failing on a full re-run of the notebook.
        if isinstance(deduper, dedupe.StaticDedupe):
            print('settings already learned; skipping active labeling')
        else:
            print('starting active labeling...')
            # Starts the training loop. Dedupe will find the next pair of
            # records it is least certain about and ask you to label them as
            # duplicates or not.

            # use 'y', 'n' and 'u' keys to flag duplicates
            # press 'f' when you are finished
            dedupe.console_label(deduper)
            # When finished, save our labeled training pairs to disk
            with open(training_file, 'w') as tf:
                deduper.write_training(tf)

            # Notice our argument here
            #
            # `recall` is the proportion of true dupe pairs that the learned
            # rules must cover. You may want to reduce this if you are
            # making too many blocks and too many comparisons.
            deduper.train(recall=0.90)

            # Persist the learned settings so later runs can skip training.
            with open(settings_file, 'wb') as sf:
                deduper.write_settings(sf)

            # We can now remove some of the memory hogging objects we used
            # for training
            deduper.cleanup_training()

name : jessica hartman
address : None
city : None
state : nc
zip : None
occupation : None
employer : None

name : jessica hartman
address : 8542 alicia brittany lane 
city : charlotte
state : nc
zip : 28212
occupation : marketing communions mgr
employer : iris by lowes

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


starting active labeling...
u


name : anedot
address : None
city : None
state : None
zip : None
occupation : None
employer : None

name : anedot
address : 1340 poydras st 
city : new orleans
state : la
zip : 70112
occupation : None
employer : None

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : morris lawson
address : 4644 cindy lane apt a 
city : kinston, nc 28501
state : nc
zip : None
occupation : None
employer : None

name : morris lawson
address : 4644 cindy lane apt a 
city : kinston, nc 28501
state : nc
zip : None
occupation : unemployeed
employer : None

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
name : 7 eleven
address : 2701 the plaza 
city : charlotte
state : nc
zip : 28213
occupation : None
employer : None

name : 7-eleven
address : None
city : charlotte
state : nc
zip : None
occupation : None
employer : None

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : steven cogburn
address : 64 pisgah view ranch rd 
city : candler
state : nc
zip : 28715-7116
occupation : None
employer : None

name : steven cogburn
address : 64 pisgah view ranch rd 
city : candler
state : nc
zip : 28715-7116
occupation : clerk of superior court
employer : state of north carolina

2/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : brian ricci
address : 3032 rolston rd 
city : greenville
state : nc
zip : 27858
occupation : attorney
employer : brian ricci

name : brian ricci
address : po box 483 
city : greenville
state : nc
zip : 27835
occupation : attorney
employer : ricci law firm

3/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, zip), SimplePredicate: (fingerprint, name))
name : barker french
address : 1005 monmouth ave 
city : durham
state : nc
zip : 27701-1711
occupation : retired from investments
employer : not employed

name : barker french
address : 1005 monmouth avenue 
city : durham
state : nc
zip : 27701
occupation : retired
employer : n/a

4/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, zip), SimplePredicate: (fingerprint, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (suffixArray, occupation))
name : troy dover
address : 2928 polo club rd 
city : nashville
state : tn
zip : 37221
occupation : business exec
employer : ica

name : troy dover
address : 2928 polo club road 
city : nashville
state : tn
zip : 37221
occupation : vp
employer : summit design

5/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (sameSevenCharStartPredicate, occupation), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, zip), SimplePredicate: (fingerprint, name))
name : teresa van duyn
address : 27 busbee rd 
city : asheville
state : nc
zip : 28803
occupation : senator
employer : ncga

name : teresa van duyn
address : 27 busbee rd 
city : ashville
state : nc
zip : 28803
occupation : state senator
employer : state of nc

6/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (sameSevenCharStartPredicate, occupation), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.4, zip), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, zip), SimplePredicate: (fingerprint, name))
name : carl page
address : 5214 diamond 
city : san francisco
state : ca
zip : 94131
occupation : not employed
employer : not employed

name : carl page
address : 5214 diamond 
city : san francisco
state : ca
zip : 94131
occupation : retired
employer : retired

7/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (commonSixGram, occupation))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.4, zip), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, zip), SimplePredicate: (fingerprint, name))
name : bennie bradsher
address : 3104 darien dr 
city : raleigh
state : nc
zip : 27607
occupation : gregory poole equipement co
employer : sales

name : bennie bradsher jr
address : 3104 darien dr 
city : raleigh
state : nc
zip : 27607-6706
occupation : construction
employer : gregory poole

8/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (commonSixGram, occupation))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (2, city), TfidfNGramCanopyPredicate: (0.8, address))
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.4, zip), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, zip), SimplePredicate: (fingerprint, name))
name : james davis
address : 110 marion dr 
city : erwin
state : nc
zip : 28339
occupation : power plant operator
employer : goodyear tire and rubber co.

name : james davis
address : 37 georgia rd 
city : franklin
state : nc
zip : 28734
occupation : dentist
employer : franklin dental

9/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, name), SimplePredicate: (wholeFieldPredicate, address))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (commonSixGram, occupation))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.4, zip), TfidfNGramCanopyPredicate: (0.6, name))
name : fred j. stanback jr.
address : 507 w innes st # 207
city : salisbury
state : nc
zip : 28144-4267
occupation : retired
employer : none

name : fred stanback
address : 507 w innes st ste 270
city : salisbury
state : nc
zip : 28144-4265
occupation : stanback headache powder
employer : retired

9/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : john h mcpherson jr
address : 895 ridge gate dr 
city : lewisville
state : nc
zip : 27023-8691
occupation : realtor
employer : mkt real estate group of coldwell banker advantage

name : john mcpherson
address : 895 ridge gate dr 
city : lewisville
state : nc
zip : 27023-8691
occupation : realtor
employer : mcpherson real estate group, llc

10/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.4, zip), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (suffixArray, occupation))
name : gregory j. gallagher
address : 127 delta st. 
city : forest city
state : nc
zip : 28043
occupation : high school teacher
employer : chase high school

name : lucy s marshall
address : None
city : None
state : nc
zip : None
occupation : None
employer : unc hospitals

11/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : chris mull
address : 8917 caratoke hwy. 
city : point harbor
state : nc
zip : 27964
occupation : manager
employer : griggs lumber

name : stephanie a kline
address : None
city : None
state : nc
zip : None
occupation : None
employer : spponsor ctv

11/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : nc general assembly
address : 16 w joes st 
city : raleigh
state : nc
zip : 27601
occupation : None
employer : None

name : nc general assembly
address : 16 w. jones st. 
city : raleigh
state : nc
zip : 27601
occupation : None
employer : None

11/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : committee to elect michele presnell
address : 316 woodstock drive 
city : burnsville, nc
state : nc
zip : 28714
occupation : None
employer : None

name : committee to elect michele presnell
address : 68 woodstock drive 
city : burnsville
state : nc
zip : 28714
occupation : None
employer : None

12/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (nearIntegersPredicate, zip), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (suffixArray, occupation))
name : fred stanback
address : 507 w innes st 
city : salisbury
state : nc
zip : 28144
occupation : retired
employer : retired

name : fred stanback
address : 507 w innes st ste 270
city : salisbury
state : nc
zip : 28144-4265
occupation : stanback headache powder
employer : retired

13/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (3, name), SimplePredicate: (oneGramFingerprint, zip))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, zip), SimplePredicate: (firstTokenPredicate, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (suffixArray, occupation))
name : jessie t. bunn
address : 3800 camp mangum wynd 
city : raleigh
state : nc
zip : 27612
occupation : ceo
employer : us tobacco cooperative, inc.

name : jessie t. bunn
address : 3800 camp mangum wynd 
city : raleigh
state : nc
zip : 27612-5340
occupation : cooperative president
employer : us tobacco cooperative

14/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.4, zip), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (tokenFieldPredicate, zip))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (suffixArray, occupation))
name : ngpvan
address : 1445 new york ave nw ste 200
city : washington
state : dc
zip : 20005-2158
occupation : None
employer : None

name : ngpvan, inc.
address : 1445 new york ave. nw suite 200
city : washington
state : dc
zip : 20005
occupation : None
employer : None

15/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : mcguirewoods federal pac
address : 800 e canal street 
city : richmond
state : va
zip : 23219
occupation : None
employer : None

name : mcguirewoods federal pac
address : 901 east cary street one james center
city : richmond
state : va
zip : 23219-4030
occupation : None
employer : None

16/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (2, address), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.6, employer), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (suffixArray, occupation))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (tokenFieldPredicate, zip))
name : susan pierce
address : 201 rhoades street 
city : wendell
state : nc
zip : 27591
occupation : retired
employer : None

name : susan pierce
address : 201 rhodes st 
city : wendell
state : nc
zip : 27591
occupation : n/a
employer : n/a

17/10 positive, 4/10 negative
Do these records refer to the same thing?


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (2, address), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (commonThreeTokens, name))
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.6, employer), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (suffixArray, occupation))
name : kellie falk
address : 618 n boylan avenue apt 924
city : raleigh
state : nc
zip : 27603
occupation : managing director
employer : drucker + falk

name : kellie falk
address : 7200 creedmor rd 300
city : raleigh
state : nc
zip : 27613
occupation : executive
employer : drucker & falf llc

18/10 positive, 4/10 nega

y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, address), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (tokenFieldPredicate, zip))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (suffixArray, occupation))
name : amazon
address : 1516 2nd ave 
city : seattle
state : wa
zip : 98101
occupation : None
employer : None

name : amazon
address : 440 terry avenue n 
city : seattle
state : wa
zip : 98109
occupation : None
employer : None

19/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonSixGram, city), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, name), SimplePredicate: (suffixArray, occupation))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (tokenFieldPredicate, zip))
name : 13th cong dist dec
address : 220 hillsborough st 
city : raleigh
state : nc
zip : 27603
occupation : None
employer : None

name : 13th cong. dist. republican party
address : po box 99096 
city : raleigh
state : nc
zip : 27624-9096
occupation : None
employer : None

20/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : jessica cannon
address : 2220 s live oak pkwy 
city : wilmington
state : nc
zip : 28403-6113
occupation : md retired
employer : wilington health

name : jessica parks
address : 617 dungannon blvd 
city : wilmington
state : nc
zip : 28403
occupation : attorney
employer : kator parks weiser & harris pllc

20/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : sam searcy
address : 201 danagher ct 
city : holly springs
state : nc
zip : 27540-5788
occupation : coo
employer : graybeard distillery

name : shauna searcy
address : 201 danagher ct 
city : holly springs
state : nc
zip : 27540-5788
occupation : stay at home mother
employer : n/a

20/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : aggregated individual contribution
address : 1219 n south st 
city : mount airy
state : nc
zip : 27030
occupation : None
employer : None

name : aggregated individual contribution
address : 1709 inglebrook trail 
city : mt airy
state : nc
zip : 27030
occupation : None
employer : None

20/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : 3m company politcal action committee
address : 3m center bldg 224-5n-42
city : saint paul
state : mn
zip : 55144-1001
occupation : None
employer : None

name : 3m politcal action committee
address : 3m center, bldg 224-5n-42 
city : st paul
state : mn
zip : 55144
occupation : None
employer : None

20/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : caroline finklea sullivan
address : 2709 fairview rd 
city : raleigh
state : nc
zip : 27608-1349
occupation : executive director
employer : ncbce

name : caroline sullilvan
address : 2709 fairview rd 
city : raleigh
state : nc
zip : 27608-1349
occupation : executive director
employer : ncbce

21/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : diane d 'diane' wheatley
address : 9774 ramsey street 
city : linden
state : nc
zip : 28356
occupation : retired
employer : None

name : diane wheatly
address : 9774 ramsey st 
city : lindon
state : nc
zip : 28356
occupation : None
employer : None

22/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : godaddy.com
address : 1020 enterprise way # 300
city : sunnyvale
state : ca
zip : 94089
occupation : None
employer : None

name : godaddy.com
address : 14455 n hayden rd suite 100 
city : scottsdale
state : az
zip : 85260
occupation : None
employer : None

23/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : vista print
address : 275 wyman st 
city : waltham
state : ma
zip : 02451-1200
occupation : None
employer : None

name : vista print
address : 275 wyman street 
city : san jose
state : ca
zip : 02451
occupation : None
employer : None

24/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : arthur williams
address : 108 arbor drive 
city : washington
state : nc
zip : 27889
occupation : former legislator
employer : state of nc

name : arthur williams
address : 277 royal poinciana way #135 
city : palm beach
state : fl
zip : 33480
occupation : business executive
employer : None

25/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameThreeCharStartPredicate, zip), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (2, address), TfidfNGramCanopyPredicate: (0.6, name))
name : carolyn hopper
address : None
city : None
state : nc
zip : None
occupation : None
employer : None

name : carolyn weber
address : 4059 longview dr 
city : chamblee
state : ga
zip : 30341-1505
occupation : manager
employer : at&t

25/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : n c gop
address : 1506 hillsborough st 
city : raleigh
state : nc
zip : 27615
occupation : None
employer : None

name : nc gop
address : 1506hillsborough street 
city : raleigh
state : nc
zip : 27605
occupation : None
employer : None

25/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.010000, score 0.26653022001351756
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (metaphoneToken, city), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), SimplePredicate: (oneGramFingerprint, zip))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, name), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (alphaNumericPredicate, address), SimplePredicate: (commonIntegerPredicate, name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (2, city), TfidfNGramCanopyPredicate: (0.8, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, employer), SimplePredicate: (commonThreeTokens, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, employer), TfidfNGramCanopyPredicate: (0.6, zip))
INFO:dedupe.training:(Levenshtein

In [8]:
# Commit whatever transaction is currently open on the write connection.
# NOTE(review): presumably this clears state left behind by the preceding
# training cell before the blocking step starts -- confirm.
write_con.commit()

In [9]:
# Commit whatever transaction is currently open on the read connection.
# NOTE(review): presumably needed so the named cursors opened in the next
# cell start from a clean transaction -- confirm.
read_con.commit()

In [10]:
    # ## Blocking
    print('blocking...')

    # The data set is too large to block in memory, so the blocking keys are
    # kept next to their record ids in a dedicated table that is rebuilt here.
    print('creating blocking_map database')
    with write_con, write_con.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS blocking_map")
        cur.execute("CREATE TABLE blocking_map "
                    "(block_key text, canon_account_id INTEGER)")

    # Index predicates learned by dedupe need one pass over the distinct
    # values of every indexed field before fingerprints can be produced.
    print('creating inverted index')

    for field in deduper.fingerprinter.index_fields:
        with read_con.cursor('field_values') as cur:
            cur.execute("SELECT DISTINCT %s FROM processed_accounts" % field)
            # Rows are looked up by column name, so the cursor must return
            # mapping-style rows.
            field_data = (record[field] for record in cur)
            deduper.fingerprinter.index(field_data, field)

    # Stream (account_id, record) pairs through the fingerprinter and
    # bulk-load what it yields into blocking_map via COPY.
    print('writing blocking map')

    with read_con.cursor('donor_select') as read_cur:
        read_cur.execute(DONOR_SELECT)

        full_data = ((record['account_id'], record) for record in read_cur)
        b_data = deduper.fingerprinter(full_data)

        # Readable adapts the generator to the file-like object COPY expects;
        # the write transaction is committed when the block exits cleanly.
        with write_con, write_con.cursor() as write_cur:
            write_cur.copy_expert('COPY blocking_map FROM STDIN WITH CSV',
                                  Readable(b_data),
                                  size=10000)
    print("done")

blocking...
creating blocking_map database
creating inverted index


INFO:dedupe.canopy_index:Removing stop word  c
INFO:dedupe.canopy_index:Removing stop word an
INFO:dedupe.canopy_index:Removing stop word er
INFO:dedupe.canopy_index:Removing stop word ne
INFO:dedupe.canopy_index:Removing stop word  d
INFO:dedupe.canopy_index:Removing stop word al
INFO:dedupe.canopy_index:Removing stop word es
INFO:dedupe.canopy_index:Removing stop word ic
INFO:dedupe.canopy_index:Removing stop word is
INFO:dedupe.canopy_index:Removing stop word na
INFO:dedupe.canopy_index:Removing stop word on
INFO:dedupe.canopy_index:Removing stop word ri
INFO:dedupe.canopy_index:Removing stop word st
INFO:dedupe.canopy_index:Removing stop word  r
INFO:dedupe.canopy_index:Removing stop word ca
INFO:dedupe.canopy_index:Removing stop word li
INFO:dedupe.canopy_index:Removing stop word de
INFO:dedupe.canopy_index:Removing stop word rd
INFO:dedupe.canopy_index:Removing stop word h 
INFO:dedupe.canopy_index:Removing stop word  m
INFO:dedupe.canopy_index:Removing stop word ee
INFO:dedupe.c

writing blocking map


INFO:dedupe.blocking:10000, 35.3593622 seconds
INFO:dedupe.blocking:20000, 70.3045252 seconds
INFO:dedupe.blocking:30000, 98.6571682 seconds
INFO:dedupe.blocking:40000, 127.4286202 seconds
INFO:dedupe.blocking:50000, 160.6351412 seconds
INFO:dedupe.blocking:60000, 189.8467942 seconds
INFO:dedupe.blocking:70000, 217.6259102 seconds
INFO:dedupe.blocking:80000, 247.7355472 seconds
INFO:dedupe.blocking:90000, 279.8001152 seconds
INFO:dedupe.blocking:100000, 309.1083582 seconds
INFO:dedupe.blocking:110000, 339.5845032 seconds
INFO:dedupe.blocking:120000, 370.1143732 seconds
INFO:dedupe.blocking:130000, 398.9231382 seconds
INFO:dedupe.blocking:140000, 426.9268082 seconds
INFO:dedupe.blocking:150000, 457.7762552 seconds
INFO:dedupe.blocking:160000, 484.8780952 seconds
INFO:dedupe.blocking:170000, 516.8220882 seconds
INFO:dedupe.blocking:180000, 542.4971122 seconds
INFO:dedupe.blocking:190000, 566.4715962 seconds
INFO:dedupe.blocking:200000, 594.0993432 seconds
INFO:dedupe.blocking:210000, 622

done


In [11]:
    # The in-memory indices are not needed once the blocking map is written;
    # resetting them frees memory.
    deduper.fingerprinter.reset_indices()

    logging.info("indexing block_key")
    # Single combined context: the cursor is closed first, then the
    # transaction on write_con is committed.
    with write_con, write_con.cursor() as cur:
        cur.execute("CREATE UNIQUE INDEX ON blocking_map "
                    "(block_key text_pattern_ops, canon_account_id)")


INFO:root:indexing block_key


In [14]:
# Commit any open transaction on both connections (read first, then write).
for connection in (read_con, write_con):
    connection.commit()

In [15]:
    # ## Clustering

    # Start every run from an empty entity_map table.
    with write_con, write_con.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS entity_map")

        print('creating entity_map database')
        cur.execute("CREATE TABLE entity_map "
                    "(original_id INTEGER, canon_id INTEGER, "
                    " cluster_score FLOAT, PRIMARY KEY(original_id))")

    # The named cursor is forced to psycopg2's plain cursor class so each row
    # unpacks positionally into the four values record_pairs() expects.
    #
    # The query builds the candidate pairs: distinct (east, west) account ids
    # that share at least one block_key, with east < west, each joined back to
    # processed_accounts and its fields packed into JSON with row_to_json.
    with read_con.cursor('pairs', cursor_factory=psycopg2.extensions.cursor) as read_cur:
        read_cur.execute("""
               select a.account_id,
                      row_to_json((select d from (select a.city,
                                                         a.name,
                                                         a.zip,
                                                         a.state,
                                                         a.address,
                                                         a.occupation,
                                                         a.employer) d)),
                      b.account_id,
                      row_to_json((select d from (select b.city,
                                                         b.name,
                                                         b.zip,
                                                         b.state,
                                                         b.address,
                                                         b.occupation,
                                                         b.employer) d))
               from (select DISTINCT l.canon_account_id as east, r.canon_account_id as west
                     from blocking_map as l
                     INNER JOIN blocking_map as r
                     using (block_key)
                     where l.canon_account_id < r.canon_account_id) ids
               INNER JOIN processed_accounts a on ids.east=a.account_id
               INNER JOIN processed_accounts b on ids.west=b.account_id""")

        print('clustering...')
        clustered_dupes = deduper.cluster(
            deduper.score(record_pairs(read_cur)),
            threshold=0.5,
        )

        # ## Writing out results

        # cluster_ids() flattens each cluster into (record id, cluster id,
        # score) rows, which are bulk-loaded into entity_map through COPY.
        print('writing results')
        with write_con, write_con.cursor() as write_cur:
            write_cur.copy_expert('COPY entity_map FROM STDIN WITH CSV',
                                  Readable(cluster_ids(clustered_dupes)),
                                  size=10000)

    # Index the canonical id column of entity_map.
    with write_con, write_con.cursor() as cur:
        cur.execute("CREATE INDEX head_index ON entity_map (canon_id)")

    # TODO: report the number of duplicates found (nothing prints it yet).

creating entity_map database
clustering...
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
writing results


In [16]:
    # Report the elapsed wall-clock time of the dedupe run so far.
    #
    # The connections are deliberately NOT closed here: the cells further down
    # still query through read_con, and closing it at this point makes them
    # fail on a top-to-bottom run. The last cell of the notebook closes both
    # read_con and write_con.
    print('ran in', time.time() - start_time, 'seconds')

ran in 3005.6060020923615 seconds


In [None]:
 # An empty locale string selects the user's default locale for all categories.
 locale.setlocale(locale.LC_ALL, '')  # for pretty printing numbers

In [None]:
# Commit whatever transaction is currently open on the read connection
# before the reporting queries below run.
read_con.commit()

In [None]:
    with read_con.cursor() as cur:
        # e_map is a TEMPORARY table created by a later cell, so it does not
        # exist in a fresh session. IF EXISTS keeps this cell runnable in that
        # case instead of raising and leaving the transaction aborted.
        cur.execute("DROP TABLE IF EXISTS e_map")

In [None]:
 with read_con.cursor() as cur:

        # Total the contributions per donor row as stored (no deduplication)
        # and keep the ten largest totals.
        cur.execute(
            "SELECT CONCAT_WS(' ', donors.name) as name, "
            "SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            "FROM donors INNER JOIN contributions "
            "USING (donor_id) "
            "GROUP BY (donor_id) "
            "ORDER BY totals DESC "
            "LIMIT 10"
        )

        print("Top Donors (raw)")
        # Rows are formatted by column name, so the cursor must return
        # mapping-style rows. The totals are printed exactly as returned.
        for row in cur:
            print('%(totals)20s: %(name)s' % row)

   

In [None]:
    with read_con.cursor() as cur:

        # Map every donor to its canonical id; donors that have no row in
        # entity_map stand for themselves (COALESCE falls back to donor_id).
        # entity_map is created with the columns (original_id, canon_id,
        # cluster_score), so the join must be on original_id: a
        # USING(donor_id) join fails because entity_map has no donor_id column.
        # NOTE(review): assumes donors.donor_id and entity_map.original_id
        # share the same id space -- confirm.
        cur.execute("CREATE TEMPORARY TABLE e_map "
                    "AS SELECT COALESCE(canon_id, donor_id) AS canon_id, donor_id "
                    "FROM entity_map "
                    "RIGHT JOIN donors ON entity_map.original_id = donors.donor_id")

        # The subquery picks the ten largest per-canonical-id totals; the outer
        # ORDER BY is required because the join does not preserve the
        # subquery's ordering.
        # NOTE(review): the subquery reads contributions.canon_id; verify that
        # contributions really has that column (e_map also provides canon_id).
        cur.execute(
            "SELECT donors.name AS name, "
            "donation_totals.totals AS totals "
            "FROM donors INNER JOIN "
            "(SELECT contributions.canon_id, SUM(CAST(amount AS FLOAT)) AS totals "
            " FROM contributions INNER JOIN e_map "
            " USING (donor_id) "
            " GROUP BY (contributions.canon_id) "
            " ORDER BY totals "
            " DESC LIMIT 10) "
            "AS donation_totals ON donors.donor_id=donation_totals.canon_id "
            "WHERE donors.donor_id = donation_totals.canon_id "
            "ORDER BY totals DESC"
        )

        print("Top Donors (deduped)")
        # Rows are formatted by column name, so the cursor must return
        # mapping-style rows.
        for row in cur:
            print('%(totals)20s: %(name)s' % row)
   

In [None]:
    # Release both database connections (read first, then write).
    for connection in (read_con, write_con):
        connection.close()

    # Total wall-clock time since start_time was recorded.
    print('ran in', time.time() - start_time, 'seconds')