In [None]:
pip install dj_database_url

In [None]:
##pip install psycopg2-binary

In [None]:
pip install dedupe

In [None]:
pip install dedupe-variable-address

In [None]:
pip install dedupe-variable-name

In [1]:
import argparse
import csv
import io
import itertools
import locale
import logging
import optparse
import os
import time

import dedupe
import dj_database_url
import numpy
import psycopg2
import psycopg2.extras


In [2]:
from psycopg2.extensions import register_adapter, AsIs
# Register each numpy scalar type used here with psycopg2's AsIs adapter so
# such values can be passed as query parameters.
for _numpy_type in (numpy.int32, numpy.int64, numpy.float32, numpy.float64):
    register_adapter(_numpy_type, AsIs)

In [3]:
class Readable(object):
    """Minimal read-only, file-like view over an iterable of rows.

    Every call to ``read`` pulls rows from the wrapped iterable, renders them
    as CSV text and returns that text, so the rows never have to be held in
    memory all at once.
    """

    def __init__(self, iterator):
        """Wrap ``iterator``, an iterable of rows accepted by ``csv.writer``."""
        # Scratch buffer the csv writer renders into; emptied after each read.
        self.output = io.StringIO()
        self.writer = csv.writer(self.output)
        # iter() is a no-op for real iterators, but for re-iterable inputs
        # (e.g. a list) it stops islice() from restarting at the first row
        # on every read() call.
        self.iterator = iter(iterator)

    def read(self, size=-1):
        """Return the next chunk of CSV text.

        ``size`` is the maximum number of *rows* (not characters) to consume.
        ``None`` or a negative value consumes everything that is left,
        mirroring the convention of file objects. Returns ``''`` once the
        wrapped iterator is exhausted.
        """
        limit = None if size is None or size < 0 else size
        self.writer.writerows(itertools.islice(self.iterator, limit))

        chunk = self.output.getvalue()
        # Reset the buffer so the next call only returns newly written rows.
        self.output.seek(0)
        self.output.truncate(0)

        return chunk


def record_pairs(result_set):
    """Turn four-column rows into pairs of ``(record_id, record)`` tuples.

    Each row of ``result_set`` must unpack into
    ``(a_record_id, a_record, b_record_id, b_record)``. The generator yields
    ``((a_record_id, a_record), (b_record_id, b_record))`` per row and prints
    the zero-based row index after every 10000th row (starting with row 0)
    as a progress indicator.
    """
    position = 0
    for left_id, left_record, right_id, right_record in result_set:
        yield (left_id, left_record), (right_id, right_record)

        # Progress is reported only after the consumer asks for the next pair.
        if not position % 10000:
            print(position)
        position += 1


def cluster_ids(clustered_dupes):
    """Flatten clusters into ``(member_id, cluster_id, score)`` rows.

    ``clustered_dupes`` is an iterable of ``(cluster, scores)`` pairs. The
    first member of each cluster is used as that cluster's id, and members
    are paired positionally with their scores.
    """
    for members, confidences in clustered_dupes:
        canonical_id = members[0]
        yield from (
            (member_id, canonical_id, confidence)
            for member_id, confidence in zip(members, confidences)
        )


if __name__ == '__main__':
    # ## Logging

    # Dedupe uses Python logging to show or suppress verbose output. Added
    # for convenience.  To enable verbose output, run `python
    # pgsql_big_dedupe_example.py -v`
    #
    # parse_known_args() is used so that arguments this code does not own are
    # tolerated: inside a Jupyter kernel sys.argv carries the kernel's own
    # `-f <connection file>` option, which a strict parser rejects by exiting
    # before the setup below can run. Unrecognised arguments end up in `args`.
    optp = argparse.ArgumentParser()
    optp.add_argument('-v', '--verbose', dest='verbose', action='count',
                      help='Increase verbosity (specify multiple times for more)'
                      )
    (opts, args) = optp.parse_known_args()

    # opts.verbose is None when -v was not given, otherwise the number of
    # times it was given.
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    # ## Setup
    # Where the learned settings and the labeled training pairs are stored
    # between runs.
    settings_file = 'pgsql_big_dedupe_example_settings'
    training_file = 'pgsql_big_dedupe_example_training.json'

In [4]:
    # ## Setup
    #
    # Files persisted between runs. `settings_file` is read and written in
    # binary mode elsewhere in this notebook and stores the trained deduper
    # settings; `training_file` stores the labeled example pairs.
    # NOTE(review): these repeat the assignments at the end of the `__main__`
    # block above -- presumably so the names exist even when that block does
    # not run to completion; confirm before removing either copy.
    settings_file = 'pgsql_big_dedupe_example_settings'
    training_file = 'pgsql_big_dedupe_example_training.json'

In [5]:
    # Wall-clock start of the run.
    start_time = time.time()

    # Connection parameters shared by both connections. Each one can be
    # overridden through the standard libpq environment variable; the
    # fallbacks are the values this notebook was originally written with, so
    # behaviour is unchanged when no variable is set. Prefer setting
    # PGPASSWORD in the environment over editing a password into this cell.
    db_conf = {
        'database': os.environ.get('PGDATABASE', 'campaign-finance'),
        'user': os.environ.get('PGUSER', 'postgres'),
        'password': os.environ.get('PGPASSWORD', ''),
        'host': os.environ.get('PGHOST', '172.16.238.13'),
        'port': os.environ.get('PGPORT', '5432'),
    }

    # Read connection: RealDictCursor makes fetched rows behave like dicts
    # keyed by column name.
    read_con = psycopg2.connect(cursor_factory=psycopg2.extras.RealDictCursor,
                                **db_conf)

    # Write connection: uses psycopg2's default cursor.
    write_con = psycopg2.connect(**db_conf)

In [6]:
    # Query that pulls every donor record, with the columns dedupe compares.
    DONOR_SELECT = ("SELECT account_id, city, name, zip, state, address, occupation, employer "
                    "from processed_accounts")

    # ## Training

    if not os.path.exists(settings_file):

        # Data model: which columns dedupe compares and with which variable
        # type. Every field is flagged 'has missing' so the learned model
        # accounts for records where the value is absent.
        fields = [
            {'field': column, 'type': kind, 'has missing': True}
            for column, kind in (
                ('name', 'Name'),
                ('address', 'Address'),
                ('city', 'ShortString'),
                ('state', 'ShortString'),
                ('zip', 'ShortString'),
                ('occupation', 'ShortString'),
                ('employer', 'ShortString'),
            )
        ]

        # Build a fresh, untrained deduper from the data model.
        # NOTE(review): this path uses 8 cores while the settings-file path
        # below uses 4 -- confirm the difference is intended.
        deduper = dedupe.Dedupe(fields, num_cores=8)

        # Giving the cursor a name makes psycopg2 run it server side.
        with read_con.cursor('donor_select') as cur:
            cur.execute(DONOR_SELECT)
            # Key each fetched row by its position in the result set.
            donor_records = dict(enumerate(cur))

        # Reuse labeled examples saved by an earlier run when they exist.
        #
        # __Note:__ to train from scratch, delete the training_file.
        if not os.path.exists(training_file):
            deduper.prepare_training(donor_records)
        else:
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                deduper.prepare_training(donor_records, tf)

        # The fetched records are no longer needed once training is prepared.
        del donor_records
    else:
        # A settings file from a previous run exists: load the already
        # trained deduper instead of training again.
        print('reading from ', settings_file)
        with open(settings_file, 'rb') as sf:
            deduper = dedupe.StaticDedupe(sf, num_cores=4)
    print("Done.")

INFO:dedupe.canopy_index:Removing stop word rd
INFO:dedupe.canopy_index:Removing stop word box
INFO:dedupe.canopy_index:Removing stop word dr
INFO:dedupe.canopy_index:Removing stop word drive
INFO:dedupe.canopy_index:Removing stop word road
INFO:dedupe.canopy_index:Removing stop word st
INFO:dedupe.canopy_index:Removing stop word street
INFO:dedupe.canopy_index:Removing stop word  b
INFO:dedupe.canopy_index:Removing stop word  t
INFO:dedupe.canopy_index:Removing stop word 5 
INFO:dedupe.canopy_index:Removing stop word e 
INFO:dedupe.canopy_index:Removing stop word in
INFO:dedupe.canopy_index:Removing stop word nd
INFO:dedupe.canopy_index:Removing stop word po
INFO:dedupe.canopy_index:Removing stop word t 
INFO:dedupe.canopy_index:Removing stop word ur
INFO:dedupe.canopy_index:Removing stop word  r
INFO:dedupe.canopy_index:Removing stop word 30
INFO:dedupe.canopy_index:Removing stop word co
INFO:dedupe.canopy_index:Removing stop word oo
INFO:dedupe.canopy_index:Removing stop word rd
INF

INFO:dedupe.canopy_index:Removing stop word te
INFO:dedupe.canopy_index:Removing stop word en
INFO:dedupe.canopy_index:Removing stop word nt
INFO:dedupe.canopy_index:Removing stop word ar
INFO:dedupe.canopy_index:Removing stop word t 
INFO:dedupe.canopy_index:Removing stop word y 
INFO:dedupe.canopy_index:Removing stop word  d
INFO:dedupe.canopy_index:Removing stop word ar
INFO:dedupe.canopy_index:Removing stop word de
INFO:dedupe.canopy_index:Removing stop word ic
INFO:dedupe.canopy_index:Removing stop word nc
INFO:dedupe.canopy_index:Removing stop word pa
INFO:dedupe.canopy_index:Removing stop word rt
INFO:dedupe.canopy_index:Removing stop word es
INFO:dedupe.canopy_index:Removing stop word s 
INFO:dedupe.canopy_index:Removing stop word  e
INFO:dedupe.canopy_index:Removing stop word el
INFO:dedupe.canopy_index:Removing stop word he
INFO:dedupe.canopy_index:Removing stop word la
INFO:dedupe.canopy_index:Removing stop word n 
INFO:dedupe.canopy_index:Removing stop word  g
INFO:dedupe.c

INFO:dedupe.canopy_index:Removing stop word al
INFO:dedupe.canopy_index:Removing stop word el
INFO:dedupe.canopy_index:Removing stop word l 
INFO:dedupe.canopy_index:Removing stop word y 
INFO:dedupe.canopy_index:Removing stop word  s
INFO:dedupe.canopy_index:Removing stop word rs
INFO:dedupe.canopy_index:Removing stop word ss
INFO:dedupe.canopy_index:Removing stop word un
INFO:dedupe.canopy_index:Removing stop word ve
INFO:dedupe.canopy_index:Removing stop word me
INFO:dedupe.canopy_index:Removing stop word ca
INFO:dedupe.canopy_index:Removing stop word ol
INFO:dedupe.canopy_index:Removing stop word th
INFO:dedupe.canopy_index:Removing stop word at
INFO:dedupe.canopy_index:Removing stop word ic
INFO:dedupe.canopy_index:Removing stop word ti
INFO:dedupe.canopy_index:Removing stop word es
INFO:dedupe.canopy_index:Removing stop word  f
INFO:dedupe.canopy_index:Removing stop word fo
INFO:dedupe.canopy_index:Removing stop word ou
INFO:dedupe.canopy_index:Removing stop word el
INFO:dedupe.c

Done.


In [7]:
        # ## Active learning

        # Labeling and training only apply to a trainable `dedupe.Dedupe`.
        # When the settings file already existed, `deduper` was loaded from it
        # as a `dedupe.StaticDedupe`, so this whole step is skipped instead of
        # failing part-way through.
        if isinstance(deduper, dedupe.StaticDedupe):
            print('settings were loaded from ', settings_file,
                  '- skipping active labeling')
        else:
            print('starting active labeling...')
            # Starts the training loop. Dedupe will find the next pair of
            # records it is least certain about and ask you to label them as
            # duplicates or not.

            # use 'y', 'n' and 'u' keys to flag duplicates
            # press 'f' when you are finished
            dedupe.console_label(deduper)
            # When finished, save our labeled, training pairs to disk
            with open(training_file, 'w') as tf:
                deduper.write_training(tf)

            # Notice our argument here
            #
            # `recall` is the proportion of true dupes pairs that the learned
            # rules must cover. You may want to reduce this if you are making
            # too many blocks and too many comparisons.
            deduper.train(recall=0.90)

            # Persist the learned settings so later runs can load them
            # instead of training again.
            with open(settings_file, 'wb') as sf:
                deduper.write_settings(sf)

            # We can now remove some of the memory hogging objects we used
            # for training
            deduper.cleanup_training()

name : us postmaster
address : None
city : None
state : None
zip : None
occupation : None
employer : None

name : us postmaster
address : 418 gallimore rd 
city : greensboro
state : nc
zip : 27409
occupation : None
employer : None

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


starting active labeling...
n


name : us postal service
address : None
city : None
state : None
zip : None
occupation : None
employer : None

name : us postal service
address : 119 west washington st 
city : rockingham
state : nc
zip : 28379
occupation : None
employer : None

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : james a barrett
address : 270 cumberland ave 
city : asheville
state : nc
zip : 28801
occupation : attorney
employer : pisgah legal services

name : james barrett
address : 270 cumberland ave 
city : asheville
state : nc
zip : 28801
occupation : lawyer
employer : pisgah legal services

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : barbara lankton
address : 2665 grosvenor pl apt a
city : winston salem
state : nc
zip : 27106-5253
occupation : doctor
employer : retired

name : barbara lankton
address : 2665 grosvenor pl apt a
city : winston salem
state : nc
zip : 27106-5253
occupation : physician
employer : retired

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (tokenFieldPredicate, zip), TfidfNGramCanopyPredicate: (0.8, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, employer), TfidfNGramCanopyPredicate: (0.6, name))
name : justine tobin
address : 215 north pine street 
city : charlotte
state : nc
zip : 28202
occupation : accountant
employer : self employed

name : justine tobin
address : 215 north pine street 
city : charlotte
state : nc
zip : 28202
occupation : banker
employer : self employed

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (fingerprint, address), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, employer), TfidfNGramCanopyPredicate: (0.6, name))
name : paul kauffmann
address : 2124 sprunt ave 
city : durham
state : nc
zip : 27705-3254
occupation : retired teacher
employer : not employed

name : paul kauffmann
address : 2124 sprunt avenue 
city : durham
state : nc
zip : 27705
occupation : not employed
employer : n/a

3/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, employer))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, employer), TfidfNGramCanopyPredicate: (0.6, name))
name : jane barnett
address : 900 river road 
city : robbins
state : nc
zip : 27325
occupation : not employed
employer : not employed

name : jane barnett
address : 900 river road 
city : robbins
state : nc
zip : 27325
occupation : retired teacher
employer : teacher

4/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (fingerprint, address), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, employer), TfidfNGramCanopyPredicate: (0.6, name))
name : douglas auer
address : 1036 15th ave nw 
city : hickory
state : nc
zip : 28601
occupation : sales & marketing
employer : custom design group llc

name : douglas auer
address : 1036 15th ave, nw 
city : hickory
state : nc
zip : 28601
occupation : jeanne supin for congress outreach coord
employer : jeanne supin for congress

5/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (oneGramFingerprint, address))
name : george christie
address : 5212 twin pines ln 
city : durham
state : nc
zip : 27705
occupation : retired
employer : duke law

name : nick christie
address : 5212 twin pines ln 
city : durham
state : nc
zip : 27705
occupation : student
employer : n/a

6/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : charles e schroeder iii
address : 5701 buffalo gap rd 
city : abilene
state : tx
zip : 79606
occupation : oil exploration executive
employer : chisholm operating inc

name : charles e schroeder iii
address : 5701 buffalo gap rd. suite b
city : abilene
state : tx
zip : 79606
occupation : wildcatter
employer : oil and gas industry

6/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : james casteen
address : 474 horsebranch rd 
city : burgaw
state : nc
zip : 28425
occupation : retired
employer : retired

name : james taylor
address : 441 new lenox road 
city : lenox
state : ma
zip : 01240
occupation : musician
employer : self employed

7/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address), TfidfNGramCanopyPredicate: (0.8, name))
name : jeffrey qualls
address : 610 morningside drive 
city : durham
state : nc
zip : 27713
occupation : senior editor
employer : american board of pediatrics

name : jeffrey watters
address : 2103 whitney st 
city : houston
state : tx
zip : 77006
occupation : attorney
employer : baker botts llp

7/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : dexter b edwards
address : 323 willard edwards rd 
city : beulaville
state : nc
zip : 28518
occupation : farmer
employer : self employed

name : dexter b edwards
address : 323 willard edwards road 
city : beulaville
state : nc
zip : 28518
occupation : farmer
employer : murphy brown

7/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : charles byers
address : 243 mccall drive 
city : forest city
state : nc
zip : 28043
occupation : president ceo
employer : cfp

name : charles p. byers
address : 243 mccall drive 
city : forest city
state : nc
zip : 28043
occupation : president/ceo
employer : cfp

8/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (tokenFieldPredicate, zip), TfidfNGramCanopyPredicate: (0.8, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address), TfidfNGramCanopyPredicate: (0.8, name))
name : jennifer mcgovern
address : 1011 minerva ave 
city : durham
state : nc
zip : 27701
occupation : occupational therapy
employer : legacy health

name : jennifer mcgovern
address : 1011 minerva avenue 
city : durham
state : nc
zip : 27701
occupation : tutor
employer : self-employed

9/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (tokenFieldPredicate, zip), TfidfNGramCanopyPredicate: (0.8, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
name : julia cleaver
address : 295 river forest rd 
city : pittsboro
state : nc
zip : 27312
occupation : librarian
employer : ipas

name : julia cleaver
address : 295 river forest road 
city : pittsboro
state : nc
zip : 27312
occupation : librarian
employer : pharmintell

10/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (fingerprint, address))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
name : frank hannah
address : 1622 e marion st 
city : shelby
state : nc
zip : 28150-4939
occupation : physician/surgeon
employer : morganton eye physicians, pa

name : frank hannah
address : 1622 e marion street 
city : shelby
state : nc
zip : 28150
occupation : physician
employer : morganton eye physicians

11/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (3, address), SimplePredicate: (doubleMetaphone, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
name : carter worthy
address : 2300 white oak rd 
city : raleigh
state : nc
zip : 27608
occupation : broker
employer : carter worthy commercial inc

name : carter worthy
address : 2300 white oak road 
city : raleigh
state : nc
zip : 27608-1456
occupation : real estate broker
employer : None

12/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialPredicate: (suffixArray, address, StreetName), SimplePredicate: (doubleMetaphone, name))
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.6, name, Surname), PartialPredicate: (nearIntegersPredicate, address, StreetName))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, employer))
name : david l boliek jr
address : 3218 jura dr 
city : fayetteville
state : nc
zip : 28303
occupation : attorney
employer : None

name : david l boliek jr
address : 3218 jura drive 
city : fayetteville
state : nc
zip : 28303
occupation : attorney
employer : williford hollers crenshaw & boliek

13/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : susan boyles
address : 1001 w 4th st 
city : winston salem
state : nc
zip : 27101-2410
occupation : attorney
employer : kilpatrick townsend & stockton llp

name : susan boyles
address : 5471 brookberry farm road 
city : winston salem
state : nc
zip : 27106
occupation : attorney
employer : kilpatrick townsend & stockton llp

14/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address), SimplePredicate: (doubleMetaphone, name))
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
name : carroll beckham
address : 103 aspen court 
city : pine knoll shores
state : nc
zip : 28512
occupation : teacher
employer : retired

name : carroll beckham
address : 103 aspen ct 
city : pine knoll shores
state : nc
zip : 28512
occupation : None
employer : None

15/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, employer), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, address), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:(PartialPredicate: (commonTwoTokens, address, StreetName), SimplePredicate: (twoGramFingerprint, name))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
name : charles johnson
address : 101 n tryon st ste 1900
city : charlotte
state : nc
zip : 28246
occupation : attorney
employer : robinson bradshaw

name : charles johnson
address : 4021 kingsgate pl apt b
city : charlotte
state : nc
zip : 28211-4542
occupation : attorney
employer : robinson bradshaw & hinson

16/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es

u


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, employer), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, address), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:(PartialPredicate: (commonTwoTokens, address, StreetName), SimplePredicate: (twoGramFingerprint, name))
INFO:dedupe.training:(PartialPredicate: (fingerprint, address, StreetName), SimplePredicate: (commonThreeTokens, city))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
name : jonathan a. berkelhammer
address : 1505 forest hill drive 
city : greensboro
state : nc
zip : 27410
occupation : attorney
employer : ellis and winters

name : jonathan berkelhammer
address : 333 n green st., ste 200 
city : greensboro
state : nc
zip : 27401
occu

u


name : barbara h voorhees
address : 1112 woodvale avenue 
city : gastonia
state : nc
zip : 28054
occupation : None
employer : None

name : barbara voorhees
address : 1112 woodvale avenue 
city : gastonia
state : nc
zip : 28054-5733
occupation : executive director
employer : the carrie e and lena v glenn

16/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : david forvendel
address : 109 canyon run 
city : cary
state : nc
zip : 27513-2833
occupation : research programmer analyst
employer : rti international

name : david g forvendel
address : 109 canyon run 
city : cary
state : nc
zip : 27513
occupation : None
employer : None

17/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, employer), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, address), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:(PartialPredicate: (commonTwoTokens, address, StreetName), SimplePredicate: (twoGramFingerprint, name))
INFO:dedupe.training:(PartialPredicate: (fingerprint, address, StreetName), SimplePredicate: (commonThreeTokens, city))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
name : joyce cotten
address : 1221 n pea ridge rd 
city : pittsboro
state : nc
zip : 27312
occupation : teacher retired
employer : lee county school system

name : joyce j cotton
address : 1221 n. pea ridge rd 
city : pittsboro
state : nc
zip : 27312
occupation : N

y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, employer), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, address), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:(PartialPredicate: (commonTwoTokens, address, StreetName), SimplePredicate: (twoGramFingerprint, name))
INFO:dedupe.training:(PartialPredicate: (fingerprint, address, StreetName), SimplePredicate: (commonThreeTokens, city))
INFO:dedupe.training:(PartialPredicate: (suffixArray, name, Surname), TfidfNGramCanopyPredicate: (0.8, address))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
name : vanessa harrison
address : 10516 hadleigh place 
city : charlotte
state : nc
zip : 28210
occupation : president
employer : at&t -- north carolina

y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (oneGramFingerprint, address))
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.6, address), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, city), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:(PartialPredicate: (commonTwoTokens, address, StreetName), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
name : damon seils
address : 601 jones ferry rd 
city : carrboro
state : nc
zip : 27510
occupation : None
employer : None

name : damon seils
address : 601 jones ferry rd apt b13
city : carrboro
state : nc
zip : 27510-2197
occupation : research manager
employer : duke university

20/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)re

y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialPredicate: (commonSixGram, name, Surname), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, address), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, employer), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(PartialPredicate: (commonTwoTokens, address, StreetName), SimplePredicate: (oneGramFingerprint, name))
INFO:dedupe.training:(PartialPredicate: (fingerprint, address, StreetName), SimplePredicate: (commonThreeTokens, city))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
name : danny caddell
address : pob 665 
city : laurinburg
state : nc
zip : 28352
occupation : insurance agent
employer : state farm insurance

name : danny caddell
address : po box 665 
city : laurinburg
state : nc
zip : 28352
occupation : insurance agebt
e

y


name : aqua america, inc. h20 pac
address : 762 w lancaster ave 
city : bryn mawr
state : pa
zip : 19010-3402
occupation : None
employer : None

name : aqua america, inc  h20  pac
address : 762 west lancaster ave 
city : bryn mawr
state : pa
zip : 19010
occupation : None
employer : None

22/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (3, name), SimplePredicate: (wholeFieldPredicate, address))
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, employer), SimplePredicate: (fingerprint, name))
INFO:dedupe.training:(PartialPredicate: (commonTwoTokens, address, StreetName), SimplePredicate: (twoGramFingerprint, name))
INFO:dedupe.training:(PartialPredicate: (fingerprint, address, StreetName), SimplePredicate: (commonThreeTokens, city))
name : little architecture
address : 5815 westpark drive 
city : charlotte
state : nc
zip : 28217
occupation : None
employer : None

name : w edwin mcmahan
address : 5815 westpark drive 
city : charlotte
state : nc
zip : 28217
occupation : retired
employer : little & associates

23/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)

n


name : committee to elect marcia morgan
address : 110 green turtle lane 
city : carolina beach
state : nc
zip : 28428
occupation : None
employer : None

name : marcia morgan
address : 110 green turtle lane 
city : carolina beach
state : nc
zip : 28428
occupation : us army
employer : retired

23/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : john kane
address : 1022 marlowe rd 
city : raleigh
state : nc
zip : 27609
occupation : developer
employer : kane reralty corporation

name : john kane
address : po box 19107 
city : raleigh
state : nc
zip : 27619
occupation : real estate
employer : kane realty coporation

23/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : william johnson
address : 617 downpatrick lane 
city : raleigh
state : nc
zip : 27615
occupation : ceo
employer : progress energy

name : william zeke creech
address : 2612 dover road 
city : raleigh
state : nc
zip : 27608
occupation : attorney
employer : zeke creech law

24/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.4, name), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, employer), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(PartialPredicate: (commonTwoTokens, address, StreetName), SimplePredicate: (oneGramFingerprint, name))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, address), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:(PartialPredicate: (fingerprint, address, StreetName), SimplePredicate: (commonThreeTokens, city))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, employer))
name : jill smith
address : po box 2503 
city : blowing rock
state : nc
zip : 28605
occupation : furniture rendering
employer : self

name : jill smith
address : unk

u


name : charles smith
address : None
city : None
state : nc
zip : None
occupation : None
employer : None

name : charles smith
address : 886 mccotters marina rd 
city : washington
state : nc
zip : 27889
occupation : minister
employer : retired

24/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : harrison locklear
address : None
city : None
state : None
zip : None
occupation : None
employer : None

name : harrison locklear
address : 1080 gene's road 
city : pembroke
state : nc
zip : 28372
occupation : None
employer : None

24/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : charles carter
address : 3322 waterford place 
city : burlington
state : nc
zip : 27215
occupation : None
employer : None

name : charles carter
address : 612 downpatrick ln 
city : raleigh
state : nc
zip : 27615
occupation : attorney
employer : burns day & presnell

24/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : victorystore.com
address : 5200 sw 30th st. 
city : davenprt
state : ia
zip : 52802
occupation : None
employer : None

name : victorystore.com business
address : 5200 sw 30th st. 
city : davenport
state : ia
zip : 52802
occupation : None
employer : None

24/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : ramdy anderson
address : 5524 captains ln 
city : wilmington
state : nc
zip : 28409
occupation : senior vp
employer : None

name : randy anderson
address : 5524 captains ln 
city : wilmington
state : nc
zip : 28409-3604
occupation : statistical scientist
employer : captains ventures, inc.

25/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address), SimplePredicate: (doubleMetaphone, name))
INFO:dedupe.training:(PartialPredicate: (commonSixGram, name, Surname), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, employer), SimplePredicate: (fingerprint, name))
INFO:dedupe.training:(PartialPredicate: (commonIntegerPredicate, address, StreetName), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(PartialPredicate: (fingerprint, address, StreetName), SimplePredicate: (commonThreeTokens, city))
INFO:dedupe.training:(PartialPredicate: (doubleMetaphone, address, StreetName), SimplePredicate: (hundredIntegerPredicate, name))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, employer))
INFO:dedupe.training:(Partia

y


name : amazon
address : 401 terry ave 
city : seattle
state : wa
zip : 98108
occupation : None
employer : None

name : amazon
address : 440 terry ave n 
city : seattle
state : wa
zip : 98109-5210
occupation : None
employer : None

27/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (3, name), SimplePredicate: (commonThreeTokens, address))
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, employer), SimplePredicate: (fingerprint, name))
INFO:dedupe.training:(PartialPredicate: (commonIntegerPredicate, address, StreetName), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(PartialPredicate: (fingerprint, address, StreetName), SimplePredicate: (commonThreeTokens, city))
INFO:dedupe.training:(PartialPredicate: (commonSixGram, address, StreetName), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(PartialPredicate: (suffixArray, name, Surname), TfidfNGramCanopyPredicate: (0.8, address))
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, employer))
name : 18 seaboard
address : 18 seaboard ave. #100
city 

y


name : bp north america
address : 501 westlake park blvd 
city : houston
state : tx
zip : 77079
occupation : None
employer : None

name : bp north america ee pac
address : 501 westlake park blvd 
city : houston
state : tx
zip : 77079
occupation : None
employer : None

29/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(PartialPredicate: (suffixArray, address, StreetName), SimplePredicate: (doubleMetaphone, name))
INFO:dedupe.training:(PartialPredicate: (commonSixGram, name, Surname), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, employer), SimplePredicate: (fingerprint, name))
INFO:dedupe.training:(PartialPredicate: (commonIntegerPredicate, address, StreetName), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.6, name, Surname), PartialPredicate: (nearIntegersPredicate, address, StreetName))
INFO:dedupe.training:(PartialPredicate: (doubleMetaphone, address, StreetName), SimplePredicate: (hundredIntegerPredicate, name))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, address), TfidfNGramCanopyPredicate: (0.8, employer))
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, employ

n


name : international paper pac
address : 1101 pennsylvania avenue nw ste 200
city : washington
state : dc
zip : 20004
occupation : None
employer : None

name : international paper pac
address : 1101 pennsylvania avnw ste 200 
city : washington
state : dc
zip : 20004
occupation : None
employer : None

30/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : nasif majeed
address : 5401 rupert lane 
city : charlotte
state : nc
zip : 28215
occupation : businessman
employer : self employed

name : nasif majeed campaign committee
address : 5401 rupert lane 
city : charlotte
state : nc
zip : 28215
occupation : None
employer : None

31/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (3, name), PartialPredicate: (tokenFieldPredicate, address, StreetName))
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.8, address, StreetName), TfidfNGramCanopyPredicate: (0.8, address))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, employer), SimplePredicate: (fingerprint, name))
INFO:dedupe.training:(PartialPredicate: (commonIntegerPredicate, address, StreetName), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(PartialIndexTfidfNGramCanopyPredicate: (0.4, name, CorporationName), PartialPredicate: (commonTwoTokens, address, StreetName))
INFO:dedupe.training:(SimplePredicate: (fingerprint, name), TfidfNGramCanopyPredicate: (0.6, employer))
name : ncsfaa pac 05-07 nc state farm agents & associates
address : p.o. box 1105 
city : raleigh
state : nc
zip : 27602
occupation : None
employer : None

name : ncsfaa pac 05-07 nc state farm agents & associates
a

f


Finished labeling
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 1.000000, score 0.47400298095185583
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, employer), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.8, address, StreetName), TfidfNGramCanopyPredicate: (0.8, address))
INFO:dedupe.training:(PartialPredicate: (commonTwoTokens, address, StreetName), TfidfNGramCanopyPredicate: (0.6, name))
INFO:dedupe.training:(PartialPredicate: (commonSixGram, address, StreetName), TfidfNGramCanopyPredicate: (0.8, name))
INFO:dedupe.training:(PartialPredicate: (commonIntegerPredicate, address, StreetName), TfidfNGramCanopyPredicate: (0.6, address))
INFO:dedupe.training:(PartialIndexTfidfTextCanopyPredicate: (0.6, name, Surna

In [8]:
# Commit whatever transaction is currently open on the write connection.
write_con.commit()

In [9]:
# Commit on the read connection so it carries no open transaction into the
# blocking step that follows.
read_con.commit()

In [10]:
    # ## Blocking
    print('blocking...')

    # Blocking over a data set this large goes through a dedicated table that
    # holds one row per (blocking key, record id) pair.
    print('creating blocking_map database')
    with write_con, write_con.cursor() as ddl_cur:
        ddl_cur.execute("DROP TABLE IF EXISTS blocking_map")
        ddl_cur.execute("CREATE TABLE blocking_map "
                        "(block_key text, canon_account_id INTEGER)")

    # Index predicates learned by dedupe need an index over the distinct
    # values of their field, so make one pass over the data per indexed field.
    print('creating inverted index')

    for field in deduper.fingerprinter.index_fields:
        distinct_query = "SELECT DISTINCT %s FROM processed_accounts" % field
        with read_con.cursor('field_values') as values_cur:
            values_cur.execute(distinct_query)
            distinct_values = (record[field] for record in values_cur)
            deduper.fingerprinter.index(distinct_values, field)

    # With the indices built, stream every record (keyed by account_id)
    # through the fingerprinter and bulk-load its output into blocking_map
    # (block_key, canon_account_id).
    print('writing blocking map')

    with read_con.cursor('donor_select') as select_cur:
        select_cur.execute(DONOR_SELECT)

        keyed_records = ((record['account_id'], record)
                         for record in select_cur)
        block_rows = Readable(deduper.fingerprinter(keyed_records))

        with write_con, write_con.cursor() as copy_cur:
            copy_cur.copy_expert('COPY blocking_map FROM STDIN WITH CSV',
                                 block_rows,
                                 size=10000)
    print("done")

blocking...
creating blocking_map database
creating inverted index


INFO:dedupe.canopy_index:Removing stop word 00
INFO:dedupe.canopy_index:Removing stop word 0 
INFO:dedupe.canopy_index:Removing stop word  s
INFO:dedupe.canopy_index:Removing stop word et
INFO:dedupe.canopy_index:Removing stop word or
INFO:dedupe.canopy_index:Removing stop word rt
INFO:dedupe.canopy_index:Removing stop word th
INFO:dedupe.canopy_index:Removing stop word  f
INFO:dedupe.canopy_index:Removing stop word 01
INFO:dedupe.canopy_index:Removing stop word 9 
INFO:dedupe.canopy_index:Removing stop word k 
INFO:dedupe.canopy_index:Removing stop word rd
INFO:dedupe.canopy_index:Removing stop word  l
INFO:dedupe.canopy_index:Removing stop word 1 
INFO:dedupe.canopy_index:Removing stop word r 
INFO:dedupe.canopy_index:Removing stop word  a
INFO:dedupe.canopy_index:Removing stop word 2 
INFO:dedupe.canopy_index:Removing stop word av
INFO:dedupe.canopy_index:Removing stop word ng
INFO:dedupe.canopy_index:Removing stop word t 
INFO:dedupe.canopy_index:Removing stop word ve
INFO:dedupe.c

INFO:dedupe.canopy_index:Removing stop word  o
INFO:dedupe.canopy_index:Removing stop word le
INFO:dedupe.canopy_index:Removing stop word  a
INFO:dedupe.canopy_index:Removing stop word  p
INFO:dedupe.canopy_index:Removing stop word f 
INFO:dedupe.canopy_index:Removing stop word ge
INFO:dedupe.canopy_index:Removing stop word ni
INFO:dedupe.canopy_index:Removing stop word pa
INFO:dedupe.canopy_index:Removing stop word  c
INFO:dedupe.canopy_index:Removing stop word a 
INFO:dedupe.canopy_index:Removing stop word ch
INFO:dedupe.canopy_index:Removing stop word li
INFO:dedupe.canopy_index:Removing stop word ol
INFO:dedupe.canopy_index:Removing stop word to
INFO:dedupe.canopy_index:Removing stop word  h
INFO:dedupe.canopy_index:Removing stop word ho
INFO:dedupe.canopy_index:Removing stop word is
INFO:dedupe.canopy_index:Removing stop word io
INFO:dedupe.canopy_index:Removing stop word ti
INFO:dedupe.canopy_index:Removing stop word ac
INFO:dedupe.canopy_index:Removing stop word oo
INFO:dedupe.c

writing blocking map


INFO:dedupe.blocking:10000, 79.3516132 seconds
INFO:dedupe.blocking:20000, 144.5594092 seconds
INFO:dedupe.blocking:30000, 215.0420042 seconds
INFO:dedupe.blocking:40000, 277.1376282 seconds
INFO:dedupe.blocking:50000, 337.4827112 seconds
INFO:dedupe.blocking:60000, 405.2434742 seconds
INFO:dedupe.blocking:70000, 470.1647352 seconds
INFO:dedupe.blocking:80000, 534.2992042 seconds
INFO:dedupe.blocking:90000, 603.8394422 seconds
INFO:dedupe.blocking:100000, 680.8980202 seconds
INFO:dedupe.blocking:110000, 758.0358542 seconds
INFO:dedupe.blocking:120000, 821.1857992 seconds
INFO:dedupe.blocking:130000, 884.7498652 seconds
INFO:dedupe.blocking:140000, 947.2686022 seconds
INFO:dedupe.blocking:150000, 1009.4639832 seconds
INFO:dedupe.blocking:160000, 1078.1432642 seconds
INFO:dedupe.blocking:170000, 1145.5570592 seconds
INFO:dedupe.blocking:180000, 1213.5080242 seconds
INFO:dedupe.blocking:190000, 1281.6245452 seconds
INFO:dedupe.blocking:200000, 1338.5887472 seconds
INFO:dedupe.blocking:210

done


In [11]:
    # The fingerprinter's in-memory indices are no longer needed once the
    # blocking map is written; reset them to free memory.
    deduper.fingerprinter.reset_indices()

    logging.info("indexing block_key")
    # Unique composite index over (block_key, canon_account_id).
    with write_con, write_con.cursor() as index_cur:
        index_cur.execute("CREATE UNIQUE INDEX ON blocking_map "
                          "(block_key text_pattern_ops, canon_account_id)")


INFO:root:indexing block_key


In [12]:
# Commit on both connections before the clustering step starts.
read_con.commit()
write_con.commit()

In [13]:
    # ## Clustering

    # Recreate entity_map from scratch: one row per original record id with
    # the canonical id of its cluster and a cluster score.
    with write_con, write_con.cursor() as ddl_cur:
        ddl_cur.execute("DROP TABLE IF EXISTS entity_map")

        print('creating entity_map database')
        ddl_cur.execute("CREATE TABLE entity_map "
                        "(original_id INTEGER, canon_id INTEGER, "
                        " cluster_score FLOAT, PRIMARY KEY(original_id))")

    # Candidate pairs are any two distinct records sharing a block_key (the
    # `<` comparison plus DISTINCT keeps each pair once), joined back to
    # processed_accounts so each side carries its fields as JSON.
    # The base psycopg2 cursor class is requested explicitly; record_pairs
    # unpacks each row positionally (presumably read_con defaults to a
    # dict-style cursor -- confirm where read_con is created).
    with read_con.cursor('pairs', cursor_factory=psycopg2.extensions.cursor) as pair_cur:
        pair_cur.execute("""
               select a.account_id,
                      row_to_json((select d from (select a.city,
                                                         a.name,
                                                         a.zip,
                                                         a.state,
                                                         a.address,
                                                         a.occupation,
                                                         a.employer) d)),
                      b.account_id,
                      row_to_json((select d from (select b.city,
                                                         b.name,
                                                         b.zip,
                                                         b.state,
                                                         b.address,
                                                         b.occupation,
                                                         b.employer) d))
               from (select DISTINCT l.canon_account_id as east, r.canon_account_id as west
                     from blocking_map as l
                     INNER JOIN blocking_map as r
                     using (block_key)
                     where l.canon_account_id < r.canon_account_id) ids
               INNER JOIN processed_accounts a on ids.east=a.account_id
               INNER JOIN processed_accounts b on ids.west=b.account_id""")

        print('clustering...')
        pair_scores = deduper.score(record_pairs(pair_cur))
        clustered_dupes = deduper.cluster(pair_scores, threshold=0.5)

        # ## Writing out results

        # clustered_dupes groups the account ids dedupe believes refer to the
        # same entity; cluster_ids flattens it into rows that are bulk-loaded
        # into entity_map.

        print('writing results')
        entity_rows = Readable(cluster_ids(clustered_dupes))
        with write_con, write_con.cursor() as copy_cur:
            copy_cur.copy_expert('COPY entity_map FROM STDIN WITH CSV',
                                 entity_rows,
                                 size=10000)

    # Index canon_id so rows can be looked up by cluster.
    with write_con, write_con.cursor() as index_cur:
        index_cur.execute("CREATE INDEX head_index ON entity_map (canon_id)")

    # NOTE: the number of duplicates found is not printed by this cell.

creating entity_map database
clustering...
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000

In [14]:
    # Report the elapsed wall-clock time of the pipeline so far.
    #
    # The connections are intentionally NOT closed here: the cells below still
    # call read_con.commit() and read_con.cursor(), which would fail on a
    # closed connection when the notebook is run top to bottom. The final
    # cell of the notebook closes both connections.
    print('ran in', time.time() - start_time, 'seconds')

ran in 15386.468050718307 seconds


In [None]:
 # An empty locale string selects the user's default locale from the environment.
 locale.setlocale(locale.LC_ALL, '')  # for pretty printing numbers

In [None]:
# Commit on the read connection before running the reporting queries below.
read_con.commit()

In [None]:
    # ## Reporting: top donors before and after deduplication
    #
    # NOTE(review): the queries below read the `donors` and `contributions`
    # tables and join on `donor_id`, while the entity_map created earlier in
    # this notebook has columns (original_id, canon_id, cluster_score).
    # Confirm the schema before running; the query text is left as written.

    # IF EXISTS keeps a first run (where e_map was never created) from
    # raising and aborting the transaction.
    with read_con.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS e_map")

    # Ten largest contribution totals per raw (un-deduplicated) donor.
    with read_con.cursor() as cur:
        cur.execute(
            "SELECT CONCAT_WS(' ', donors.name) as name, "
            "SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            "FROM donors INNER JOIN contributions "
            "USING (donor_id) "
            "GROUP BY (donor_id) "
            "ORDER BY totals DESC "
            "LIMIT 10"
        )

        print("Top Donors (raw)")
        for row in cur:
            # Rows are formatted by column name, so the cursor must yield
            # mapping-style rows.
            print('%(totals)20s: %(name)s' % row)

    # Same report after mapping each donor to its canonical id.
    with read_con.cursor() as cur:
        # Donors without an entity_map row fall back to their own donor_id
        # as canon_id (COALESCE over the RIGHT JOIN).
        cur.execute("CREATE TEMPORARY TABLE e_map "
                    "AS SELECT COALESCE(canon_id, donor_id) AS canon_id, donor_id "
                    "FROM entity_map "
                    "RIGHT JOIN donors USING(donor_id)")

        cur.execute(
            "SELECT donors.name AS name, "
            "donation_totals.totals AS totals "
            "FROM donors INNER JOIN "
            "(SELECT contributions.canon_id, SUM(CAST(amount AS FLOAT)) AS totals "
            " FROM contributions INNER JOIN e_map "
            " USING (donor_id) "
            " GROUP BY (contributions.canon_id) "
            " ORDER BY totals "
            " DESC LIMIT 10) "
            "AS donation_totals ON donors.donor_id=donation_totals.canon_id "
            "WHERE donors.donor_id = donation_totals.canon_id"
        )

        print("Top Donors (deduped)")
        for row in cur:
            print('%(totals)20s: %(name)s' % row)

        # NOTE(review): the result of this query is never fetched before the
        # cursor is closed.
        cur.execute("SELECT * FROM e_map")
   

In [None]:
    # Release both database connections, then report the total elapsed time.
    for connection in (read_con, write_con):
        connection.close()

    elapsed_seconds = time.time() - start_time
    print('ran in', elapsed_seconds, 'seconds')