# Data Deduplication

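This notebook follows the data deduplication tutorial from the Python Record Linkage Toolkit documentation: https://recordlinkage.readthedocs.io/en/latest/notebooks/data_deduplication.html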

In [1]:
import recordlinkage
from recordlinkage.datasets import load_febrl1

In [2]:
# Load the FEBRL1 example dataset via recordlinkage.datasets.load_febrl1.
# The frame is indexed by rec_id; ids end in "-org" or "-dup-N"
# (presumably original records and their duplicates — confirm in the dataset docs).
dfA = load_febrl1()

# Preview the first rows to see the available columns.
dfA.head()

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-223-org,,waller,6.0,tullaroop street,willaroo,st james,4011,wa,19081209,6988048
rec-122-org,lachlan,berry,69.0,giblin street,killarney,bittern,4814,qld,19990219,7364009
rec-373-org,deakin,sondergeld,48.0,goldfinch circuit,kooltuo,canterbury,2776,vic,19600210,2635962
rec-10-dup-0,kayla,harrington,,maltby circuit,coaling,coolaroo,3465,nsw,19150612,9004242
rec-227-org,luke,purdon,23.0,ramsay place,mirani,garbutt,2260,vic,19831024,8099933


In [3]:
# Baseline: a full index pairs every record of dfA with every other record.
# Passing a single frame to index() means the frame is matched against itself
# (deduplication).
indexer = recordlinkage.Index()
indexer.full()
candidate_links = indexer.index(dfA)



In [4]:
# Sanity check: a full index over n records contains every unordered pair
# exactly once, i.e. n * (n - 1) / 2 pairs -> (1000*1000 - 1000) / 2 = 499500.
record_count = len(dfA)
pair_count = len(candidate_links)
print(record_count, pair_count)

1000 499500


In [5]:
# Blocking: only pair up records that share the same given_name, which cuts
# the number of candidate pairs drastically compared with the full index.
# This rebinds `indexer` and `candidate_links`; every later cell works on the
# blocked pairs, not on the full index.
# NOTE(review): records whose given_name is missing or misspelled presumably
# never become candidates under this blocking key — confirm against the
# recordlinkage blocking documentation.
indexer = recordlinkage.Index()
indexer.block('given_name')
candidate_links = indexer.index(dfA)

blocked_pair_count = len(candidate_links)
print(blocked_pair_count)

2082


In [6]:
# Create the comparison object. Nothing is compared at this point: the
# comparison rules are registered on it afterwards, and the pairwise work only
# happens when compare_cl.compute(...) is called.
compare_cl = recordlinkage.Compare()

In [8]:
# Start from a fresh Compare object so that this cell is idempotent: every
# exact()/string() call appends one rule to the object, so re-running the cell
# against an already-configured object would register each rule a second time.
compare_cl = recordlinkage.Compare()

# Exact rules: the resulting column is 1 when both records hold the same value
# and 0 otherwise.
# NOTE: the candidate pairs were produced by blocking on given_name, so this
# first column is 1 for every pair and adds a constant to any summed score.
compare_cl.exact('given_name', 'given_name', label='given_name')

# Fuzzy rule: Jaro-Winkler similarity on surname, binarised by the threshold
# (similarity at or above 0.85 counts as agreement).
compare_cl.string('surname', 'surname', method='jarowinkler', threshold=0.85, label='surname')

compare_cl.exact('date_of_birth', 'date_of_birth', label='date_of_birth')
compare_cl.exact('suburb', 'suburb', label='suburb')
compare_cl.exact('state', 'state', label='state')

# Fuzzy rule on the street address. No `method` is given, so the library's
# default string metric is used; the same 0.85 threshold binarises the result.
# Kept as the last expression so the cell still displays the Compare object.
compare_cl.string('address_1', 'address_1', threshold=0.85, label='address_1')

<Compare>

In [10]:
# Evaluate every registered comparison rule on each candidate pair of dfA.
# The result has one row per candidate pair (indexed by the two record ids)
# and one column per rule label.
# This is the step that can take some time to compute.
features = compare_cl.compute(candidate_links, dfA)

In [11]:
# Peek at the first ten comparison vectors (one row per pair of record ids).
features.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,given_name,surname,date_of_birth,suburb,state,address_1
rec_id_1,rec_id_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rec-183-dup-0,rec-122-org,1,0.0,0,0,0,0.0
rec-248-org,rec-122-org,1,0.0,0,0,1,0.0
rec-248-org,rec-183-dup-0,1,0.0,0,0,0,0.0
rec-122-dup-0,rec-122-org,1,1.0,1,1,1,1.0
rec-122-dup-0,rec-183-dup-0,1,0.0,0,0,0,0.0
rec-122-dup-0,rec-248-org,1,0.0,0,0,1,0.0
rec-469-org,rec-122-org,1,0.0,0,0,0,0.0
rec-469-org,rec-183-dup-0,1,0.0,0,0,1,0.0
rec-469-org,rec-248-org,1,0.0,0,0,0,0.0
rec-469-org,rec-122-dup-0,1,0.0,0,0,0,0.0


In [12]:
# Summary statistics per comparison column. All columns are 0/1 indicators, so
# each mean is the share of candidate pairs that agree on that field.
# given_name is constant (always 1) because the pairs come from blocking on it.
features.describe()

Unnamed: 0,given_name,surname,date_of_birth,suburb,state,address_1
count,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0
mean,1.0,0.144092,0.139289,0.108549,0.327089,0.133045
std,0.0,0.351268,0.346331,0.311148,0.469263,0.339705
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# Agreement score of a candidate pair = sum of its comparison columns.
# Tabulate how many pairs reach each score, highest score first.
pair_scores = features.sum(axis=1)
pair_scores.value_counts().sort_index(ascending=False)

6.0     142
5.0     145
4.0      30
3.0       9
2.0     376
1.0    1380
dtype: int64

In [14]:
# A candidate pair is classified as a match when its agreement score is
# strictly greater than the cutoff. given_name agrees for every candidate pair
# (it is the blocking key), so a score above 3 means at least three of the
# five remaining fields agree.
MATCH_SCORE_CUTOFF = 3
is_match = features.sum(axis=1) > MATCH_SCORE_CUTOFF
matches = features.loc[is_match]

print(len(matches))
matches.head(10)

317


Unnamed: 0_level_0,Unnamed: 1_level_0,given_name,surname,date_of_birth,suburb,state,address_1
rec_id_1,rec_id_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rec-122-dup-0,rec-122-org,1,1.0,1,1,1,1.0
rec-183-org,rec-183-dup-0,1,1.0,1,1,1,1.0
rec-248-dup-0,rec-248-org,1,1.0,1,1,1,1.0
rec-373-dup-0,rec-373-org,1,1.0,1,1,1,1.0
rec-10-org,rec-10-dup-0,1,1.0,1,1,1,1.0
rec-342-dup-0,rec-342-org,1,1.0,0,1,1,1.0
rec-397-org,rec-397-dup-0,1,1.0,1,1,1,0.0
rec-472-org,rec-472-dup-0,1,1.0,1,1,1,0.0
rec-330-org,rec-330-dup-0,1,0.0,1,1,1,0.0
rec-190-org,rec-190-dup-0,1,1.0,0,1,1,1.0
