# Deduping

## Imports

In [11]:
import recordlinkage
from recordlinkage.datasets import load_febrl1

## Load data

In [12]:
# Load the FEBRL1 sample dataset shipped with recordlinkage.datasets.
data = load_febrl1()

# Preview the first ten records (last expression -> rich display).
data.head(n=10)

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-223-org,,waller,6.0,tullaroop street,willaroo,st james,4011,wa,19081209.0,6988048
rec-122-org,lachlan,berry,69.0,giblin street,killarney,bittern,4814,qld,19990219.0,7364009
rec-373-org,deakin,sondergeld,48.0,goldfinch circuit,kooltuo,canterbury,2776,vic,19600210.0,2635962
rec-10-dup-0,kayla,harrington,,maltby circuit,coaling,coolaroo,3465,nsw,19150612.0,9004242
rec-227-org,luke,purdon,23.0,ramsay place,mirani,garbutt,2260,vic,19831024.0,8099933
rec-6-dup-0,,trevorrow,16.0,dumas street,2/98-latchford barracks,mount immaney,2281,wa,19530313.0,4811642
rec-190-dup-0,darcie,turtur,10.0,blacket street,eureka,beverly hills,2263,nsw,,2025650
rec-294-org,william,bishop,21.0,neworra place,apmnt 65,worongary,6225,qld,19490130.0,9773843
rec-206-dup-0,,lombardi,36.0,yerralee road,leisure living vlge,carlsruhe,3149,qld,19870919.0,1613132
rec-344-org,,julius,52.0,florey drive,north stirling downs,coolaroo,2259,qld,19630521.0,1797144


## Indexation step
First create an indexer and block on the field `given_name`: only records that share exactly the same given name are paired (true duplicates are assumed to agree on it).  
Then we can get the candidate links for deduping.

In [5]:
# Build an indexer whose only rule is blocking on given_name, so that just the
# records sharing the same given_name value are paired as candidate links.
idxr = recordlinkage.Index()
idxr.block('given_name')

# A single frame is passed, so the dataset is paired with itself (deduplication).
candidateLinks = idxr.index(data)
candidateLinks

MultiIndex([('rec-183-dup-0',   'rec-122-org'),
            (  'rec-248-org',   'rec-122-org'),
            (  'rec-248-org', 'rec-183-dup-0'),
            ('rec-122-dup-0',   'rec-122-org'),
            ('rec-122-dup-0', 'rec-183-dup-0'),
            ('rec-122-dup-0',   'rec-248-org'),
            (  'rec-469-org',   'rec-122-org'),
            (  'rec-469-org', 'rec-183-dup-0'),
            (  'rec-469-org',   'rec-248-org'),
            (  'rec-469-org', 'rec-122-dup-0'),
            ...
            ('rec-407-dup-0',   'rec-407-org'),
            ('rec-367-dup-0',   'rec-367-org'),
            ('rec-103-dup-0',   'rec-103-org'),
            ('rec-195-dup-0',   'rec-195-org'),
            ('rec-184-dup-0',   'rec-184-org'),
            (  'rec-252-org', 'rec-252-dup-0'),
            ( 'rec-48-dup-0',    'rec-48-org'),
            ('rec-298-dup-0',   'rec-298-org'),
            (  'rec-282-org', 'rec-282-dup-0'),
            (  'rec-327-org',   'rec-411-org')],
           names=['rec_

## Comparison step
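Compare every candidate pair field by field. Exact matching is used on `given_name`, `date_of_birth`, `suburb` and `state`; fuzzy string matching is used on `surname` (Jaro-Winkler) and `address_1`, where a similarity of at least 0.85 counts as agreement.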

In [6]:
# One comparison rule per field; each rule becomes a column of `features`
# (column order follows the order in which the rules are registered).
cmp = recordlinkage.Compare()

cmp.exact('given_name', 'given_name', label='given_name')
# Fuzzy rule: a similarity of at least 0.85 is reported as 1.0, anything below as 0.0.
cmp.string(
    'surname', 'surname',
    method='jarowinkler',
    threshold=0.85,
    label='surname',
)
cmp.exact('date_of_birth', 'date_of_birth', label='date_of_birth')
cmp.exact('suburb', 'suburb', label='suburb')
cmp.exact('state', 'state', label='state')
# 'levenshtein' is the library default for string comparisons; it is spelled
# out here so the difference from the surname rule is visible.
cmp.string(
    'address_1', 'address_1',
    method='levenshtein',
    threshold=0.85,
    label='address_1',
)

# Evaluate every rule on every candidate pair of the (single) dataset.
features = cmp.compute(candidateLinks, data)
features

Unnamed: 0_level_0,Unnamed: 1_level_0,given_name,surname,date_of_birth,suburb,state,address_1
rec_id_1,rec_id_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rec-183-dup-0,rec-122-org,1,0.0,0,0,0,0.0
rec-248-org,rec-122-org,1,0.0,0,0,1,0.0
rec-248-org,rec-183-dup-0,1,0.0,0,0,0,0.0
rec-122-dup-0,rec-122-org,1,1.0,1,1,1,1.0
rec-122-dup-0,rec-183-dup-0,1,0.0,0,0,0,0.0
...,...,...,...,...,...,...,...
rec-252-org,rec-252-dup-0,1,1.0,1,1,1,1.0
rec-48-dup-0,rec-48-org,1,1.0,1,1,1,1.0
rec-298-dup-0,rec-298-org,1,1.0,1,1,1,0.0
rec-282-org,rec-282-dup-0,1,1.0,1,1,1,0.0


## Classification step
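Classify a candidate pair as a match when the number of agreeing features reaches a threshold (4, 5 and 6 are tried here). Because the candidates were blocked on `given_name`, that feature always agrees, so a threshold of 4 means at least three of the other five fields agree.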

In [14]:
# Number of agreeing fields per candidate pair. Computed once and reused for
# every threshold instead of re-summing the feature table three times.
matchScore = features.sum(axis=1)

# Keep the pairs whose agreement count reaches each threshold; the three
# selections are nested (matches6 is a subset of matches5, which is a subset of matches4).
matches4 = features[matchScore >= 4]
matches5 = features[matchScore >= 5]
matches6 = features[matchScore >= 6]
matches4

Unnamed: 0_level_0,Unnamed: 1_level_0,given_name,surname,date_of_birth,suburb,state,address_1
rec_id_1,rec_id_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rec-122-dup-0,rec-122-org,1,1.0,1,1,1,1.0
rec-183-org,rec-183-dup-0,1,1.0,1,1,1,1.0
rec-248-dup-0,rec-248-org,1,1.0,1,1,1,1.0
rec-373-dup-0,rec-373-org,1,1.0,1,1,1,1.0
rec-10-org,rec-10-dup-0,1,1.0,1,1,1,1.0
...,...,...,...,...,...,...,...
rec-184-dup-0,rec-184-org,1,1.0,1,0,1,1.0
rec-252-org,rec-252-dup-0,1,1.0,1,1,1,1.0
rec-48-dup-0,rec-48-org,1,1.0,1,1,1,1.0
rec-298-dup-0,rec-298-org,1,1.0,1,1,1,0.0


## Results

In [15]:
# Report how many pairs survive each stage. "Matches 1/2/3" are the selections
# made with agreement thresholds 4, 5 and 6 respectively.
for label, links in (
    ('Candidate links : ', candidateLinks),
    ('Matches 1 : ', matches4),
    ('Matches 2 : ', matches5),
    ('Matches 3 : ', matches6),
):
    print(label, len(links))

Candidate links :  2082
Matches 1 :  317
Matches 2 :  287
Matches 3 :  142
