# Record Linkage Template
- Clean
- Index
- Compare
- Classify
- Evaluate

In [150]:
import numpy as np
import pandas as pd
import recordlinkage
import math
from recordlinkage.datasets import load_febrl4

## Data Understanding and Cleaning
- Make the data consistent between the two datasets
- Use the same column names in both datasets

In [3]:
# Load the two FEBRL4 sample datasets that ship with recordlinkage.
# The outputs below show that both frames are 5000 x 10, are indexed by
# rec_id, and store every column as object dtype.
dfA, dfB = load_febrl4()

In [4]:
dfA.shape

(5000, 10)

In [5]:
dfA.head()

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-1070-org,michaela,neumann,8,stanley street,miami,winston hills,4223,nsw,19151111,5304218
rec-1016-org,courtney,painter,12,pinkerton circuit,bega flats,richlands,4560,vic,19161214,4066625
rec-4405-org,charles,green,38,salkauskas crescent,kela,dapto,4566,nsw,19480930,4365168
rec-1288-org,vanessa,parr,905,macquoid place,broadbridge manor,south grafton,2135,sa,19951119,9239102
rec-3585-org,mikayla,malloney,37,randwick road,avalind,hoppers crossing,4552,vic,19860208,7207688


In [6]:
dfA.dtypes

given_name       object
surname          object
street_number    object
address_1        object
address_2        object
suburb           object
postcode         object
state            object
date_of_birth    object
soc_sec_id       object
dtype: object

In [7]:
dfA.describe()

Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
count,4888,4952,4842,4902,4580,4945,5000,4950,4906,5000
unique,770,1827,450,2399,2575,1634,1419,8,4588,5000
top,emiily,white,1,forbes street,rowethorpe,toowoomba,4740,nsw,19950105,3742016
freq,85,151,161,18,52,40,30,1686,3,1


In [8]:
dfB.shape

(5000, 10)

In [9]:
dfB.head()

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-561-dup-0,elton,,3.0,light setreet,pinehill,windermere,3212,vic,19651013,1551941
rec-2642-dup-0,mitchell,maxon,47.0,edkins street,lochaoair,north ryde,3355,nsw,19390212,8859999
rec-608-dup-0,,white,72.0,lambrigg street,kelgoola,broadbeach waters,3159,vic,19620216,9731855
rec-3239-dup-0,elk i,menzies,1.0,lyster place,,northwood,2585,vic,19980624,4970481
rec-2886-dup-0,,garanggar,,may maxwell crescent,springettst arcade,forest hill,2342,vic,19921016,1366884


In [10]:
dfB.dtypes

given_name       object
surname          object
street_number    object
address_1        object
address_2        object
suburb           object
postcode         object
state            object
date_of_birth    object
soc_sec_id       object
dtype: object

In [11]:
dfB.describe()

Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
count,4766,4898,4713,4780,4149,4894,5000,4893,4801,5000
unique,1702,2633,453,3516,3306,2403,1691,50,4505,5000
top,emiily,white,1,badimara street,brentwood vlge,toowoomba,4740,nsw,19900531,3742016
freq,68,105,171,12,33,34,23,1637,3,1


## Indexing

### Full Indexing

In [12]:
# Full index: pair every record in dfA with every record in dfB, giving
# len(dfA) * len(dfB) candidate pairs -- only practical for small inputs.
# Built through Index().full(); the standalone FullIndex class was deprecated
# and later dropped from recordlinkage.
indexer = recordlinkage.Index()
indexer.full()
pairs = indexer.index(dfA, dfB)



In [13]:
# Record counts of both datasets followed by the number of candidate pairs.
print(*map(len, (dfA, dfB, pairs)))

5000 5000 25000000


### Blocking

In [14]:
# Blocking: only pair records whose given_name is identical in dfA and dfB,
# which shrinks the candidate set compared with the full index above.
# Built through Index().block(); the standalone BlockIndex class was
# deprecated and later dropped from recordlinkage.
indexer = recordlinkage.Index()
indexer.block('given_name')
pairs = indexer.index(dfA, dfB)

print(len(pairs))

77249


### Sorted Neighbourhood Indexing

## Comparing

In [101]:
# This cell can take some time to compute.
compare_cl = recordlinkage.Compare()

# (comparison method, column, extra options). The comparisons are registered
# in this order, which is also the column order of the feature table.
comparison_specs = [
    ('exact', 'given_name', {}),
    ('string', 'surname', {'method': 'jarowinkler', 'threshold': 0.85}),
    ('exact', 'date_of_birth', {}),
    ('exact', 'suburb', {}),
    ('exact', 'state', {}),
    ('string', 'address_1', {'threshold': 0.85}),
]
for method_name, column, options in comparison_specs:
    # The same column is compared on both sides and the feature is labelled
    # after that column.
    getattr(compare_cl, method_name)(column, column, label=column, **options)

# One row per candidate pair, one column per comparison registered above.
features = compare_cl.compute(pairs, dfA, dfB)

In [85]:
features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,given_name,surname,date_of_birth,suburb,state,address_1
rec_id,rec_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rec-1070-org,rec-3024-dup-0,1,0.0,0,0,1,0.0
rec-1070-org,rec-2371-dup-0,1,0.0,0,0,0,0.0
rec-1070-org,rec-4652-dup-0,1,0.0,0,0,0,0.0
rec-1070-org,rec-4795-dup-0,1,0.0,0,0,1,0.0
rec-1070-org,rec-1314-dup-0,1,0.0,0,0,1,0.0


In [35]:
features.describe()

Unnamed: 0,given_name,surname,date_of_birth,suburb,state,address_1
count,77249.0,77249.0,77249.0,77249.0,77249.0,77249.0
mean,1.0,0.044428,0.037929,0.032259,0.248767,0.0367
std,0.0,0.206045,0.191027,0.176689,0.432301,0.188024
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [36]:
# Add up the comparison results for each candidate pair, then count how many
# pairs land on each total (highest total first).
pair_totals = features.sum(axis=1)
match_grps = pair_totals.value_counts().sort_index(ascending=False)
match_grps

6.0     1566
5.0     1332
4.0      343
3.0      146
2.0    16427
1.0    57435
dtype: int64

In [27]:
# Candidate pairs whose comparison results add up to more than 3.
features.loc[features.sum(axis=1) > 3]

Unnamed: 0_level_0,Unnamed: 1_level_0,given_name,surname,date_of_birth,suburb,state,address_1
rec_id,rec_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rec-2371-org,rec-2371-dup-0,1,1.0,1,1,1,1.0
rec-3024-org,rec-3024-dup-0,1,1.0,1,0,1,0.0
rec-4652-org,rec-4652-dup-0,1,1.0,1,0,1,1.0
rec-4795-org,rec-4795-dup-0,1,1.0,1,1,1,1.0
rec-1016-org,rec-1016-dup-0,1,1.0,1,1,0,1.0
rec-2463-org,rec-2463-dup-0,1,1.0,0,1,1,1.0
rec-521-org,rec-521-dup-0,1,1.0,1,1,1,1.0
rec-786-org,rec-786-dup-0,1,1.0,1,1,1,1.0
rec-572-org,rec-572-dup-0,1,0.0,0,1,1,1.0
rec-2389-org,rec-2389-dup-0,1,1.0,1,1,1,1.0


### Sampling

In [173]:
# Sample size for estimating a proportion: n = z^2 * p * q / err^2
p = 0.5     # assumed proportion
q = 0.5     # complement of p
z = 1.96    # z-score (1.96 is the two-sided 95% value)
err = 0.1   # margin of error
numerator = p * q * (z**2)
denominator = err**2
sample_size = numerator / denominator
sample_size

96.03999999999998

In [148]:
# Calculating proportions: split a fixed sample budget across the
# agreement-total groups according to each group's share of all candidate pairs.
sample_size = 100  # replaces the 96.04 computed above with a round figure
group_share = match_grps / len(pairs)
match_prps = group_share * sample_size
match_prps

6.0     2.027211
5.0     1.724294
4.0     0.444019
3.0     0.188999
2.0    21.265000
1.0    74.350477
dtype: float64

In [143]:
def strat_sampling(data_df, match_prps, random_state=None):
    """Draw the sample for a single match_sum stratum.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Rows of one stratum. Must contain a 'match_sum' column; the value in
        the first row is taken as the stratum's key.
    match_prps : pandas.Series
        Target number of rows per stratum, indexed by match_sum value. The
        target is rounded to the nearest integer and never goes below 1.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to DataFrame.sample. The default (None) keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        The sampled rows. An empty input is returned unchanged.

    Raises
    ------
    KeyError
        If the stratum's match_sum value is missing from match_prps.
    """
    # Nothing to draw from (and no first row to read the stratum key from).
    if data_df.empty:
        return data_df

    match_sum_grp = data_df['match_sum'].iloc[0]
    target_rows = max(1, int(round(match_prps.loc[match_sum_grp])))
    # Sampling without replacement cannot return more rows than the stratum
    # holds, so cap the request instead of letting sample() raise ValueError.
    n_rows = min(target_rows, len(data_df))
    return data_df.sample(n=n_rows, random_state=random_state)

In [103]:
# Row-wise total of the comparison columns. An existing match_sum column is
# left out of the total, so re-running this cell does not fold the previous
# total into the new one.
comparison_cols = features.columns.drop('match_sum', errors='ignore')
features.loc[:, 'match_sum'] = features[comparison_cols].sum(axis=1)
features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,given_name,surname,date_of_birth,suburb,state,address_1,match_sum
rec_id,rec_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
rec-1070-org,rec-3024-dup-0,1,0.0,0,0,1,0.0,2.0
rec-1070-org,rec-2371-dup-0,1,0.0,0,0,0,0.0,1.0
rec-1070-org,rec-4652-dup-0,1,0.0,0,0,0,0.0,1.0
rec-1070-org,rec-4795-dup-0,1,0.0,0,0,1,0.0,2.0
rec-1070-org,rec-1314-dup-0,1,0.0,0,0,1,0.0,2.0


In [144]:
# Draw the stratified sample: one strat_sampling() call per match_sum group,
# with the per-group results stacked in ascending match_sum order.
# Seed NumPy's global RNG first -- DataFrame.sample() falls back to it when no
# random_state is passed -- so the same rows are drawn on every run.
np.random.seed(42)
# Iterate over the groups instead of using groupby.apply: newer pandas stops
# passing the grouping column to apply(), and strat_sampling() needs to read
# match_sum from each group.
strat_sample_feat = pd.concat(
    [strat_sampling(group_df, match_prps)
     for _, group_df in features.groupby('match_sum')]
)
strat_sample_feat

Unnamed: 0_level_0,Unnamed: 1_level_0,given_name,surname,date_of_birth,suburb,state,address_1,match_sum
rec_id,rec_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
rec-3034-org,rec-996-dup-0,1,0.0,0,0,0,0.0,1.0
rec-4040-org,rec-4171-dup-0,1,0.0,0,0,0,0.0,1.0
rec-2036-org,rec-3250-dup-0,1,0.0,0,0,0,0.0,1.0
rec-3591-org,rec-134-dup-0,1,0.0,0,0,0,0.0,1.0
rec-45-org,rec-2812-dup-0,1,0.0,0,0,0,0.0,1.0
rec-3299-org,rec-3722-dup-0,1,0.0,0,0,0,0.0,1.0
rec-337-org,rec-3247-dup-0,1,0.0,0,0,0,0.0,1.0
rec-4312-org,rec-2972-dup-0,1,0.0,0,0,0,0.0,1.0
rec-714-org,rec-4023-dup-0,1,0.0,0,0,0,0.0,1.0
rec-2559-org,rec-4353-dup-0,1,0.0,0,0,0,0.0,1.0


In [145]:
# Number of sampled rows per match_sum value, highest total first.
strat_sample_feat['match_sum'].value_counts().sort_index(ascending=False)

6.0     2
5.0     2
4.0     1
3.0     1
2.0    21
1.0    74
Name: match_sum, dtype: int64

## Classify 

## Evaluate