In [1]:
import recordlinkage as rl
import pandas as pd
import numpy as np
import scipy
import jellyfish
import numexpr

In [2]:
# Lift pandas' display truncation so wide/long frames render in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

1. DONE pull in correct variables for linked 1617 file - matching death and birth vars
2. DONE clean all variables
3. DONE block on dob m, d, y and index
4. DONE compare records on fname, lname, mfname, mlname, dadfname, dadlname, sex, dobm, dobd, doby
5. index true matches (linked 1617 file) on bsfn, dsfn
6. split bir - dth candidate pairs into train and test
7. compare indexes from true matches to train and test sets and label
8. logistic regression


Read in data sets: birth 2016-18, death 2016-18, linked infant death 2016-17.  

Birth and death data are then restricted to events that occurred in 2016 and 2017; I will use 2018 to validate the model after training and testing on 2016 and 2017 combined.

All data are limited to births or deaths that occurred in Washington State to in-state residents.

This step also keeps only the variables that are of interest for record linkage.

##### Death data

In [3]:
# Cleaned 2016-2018 death file. low_memory=False parses the file in one pass
# so column dtypes are inferred consistently.
d1618 = pd.read_csv(
    r'###\Py\Data\d1618_clean.csv',
    low_memory=False,
)

In [4]:
# Drop 2018 deaths (held out for validation) and keep only deaths whose
# death state is Washington.
not_2018_death = d1618['ddody'] != 2018
death_state_is_wa = d1618['ddthstatel'] == 'WASHINGTON'
d1617 = d1618[not_2018_death & death_state_is_wa]

In [5]:
# Keep only the death-record fields used for linkage: the record id, the
# decedent / mother / father names, sex, date-of-birth parts and residence.
death_linkage_cols = [
    'dsfn',
    'dfname', 'dlname',
    'dmom_fname', 'dmom_lname',
    'ddad_fname', 'ddad_lname',
    'dsex',
    'ddobm', 'ddobd', 'ddoby',
    'drescity', 'drescountyl', 'dreszip',
]
d1617 = d1617.loc[:, death_linkage_cols]

In [6]:
# Inspect row count, dtypes and per-column missingness of the trimmed death frame.
d1617.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109122 entries, 0 to 168701
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   dsfn         109122 non-null  int64 
 1   dfname       109120 non-null  object
 2   dlname       109120 non-null  object
 3   dmom_fname   109118 non-null  object
 4   dmom_lname   109111 non-null  object
 5   ddad_fname   109119 non-null  object
 6   ddad_lname   109119 non-null  object
 7   dsex         109122 non-null  object
 8   ddobm        109122 non-null  int64 
 9   ddobd        109122 non-null  int64 
 10  ddoby        109122 non-null  int64 
 11  drescity     109061 non-null  object
 12  drescountyl  109062 non-null  object
 13  dreszip      109122 non-null  int64 
dtypes: int64(5), object(9)
memory usage: 12.5+ MB


##### Birth data

In [7]:
# Cleaned 2016-2018 birth file. low_memory=False parses the file in one pass
# so column dtypes are inferred consistently.
b1618 = pd.read_csv(
    r'###\Py\Data\b1618_clean.csv',
    low_memory=False,
)

In [8]:
# Drop 2018 births; 2018 is held out for later validation.
born_before_2018 = b1618['bdoby'] != 2018
b1617 = b1618[born_before_2018]


In [9]:
# Sanity check: after the filter only birth years other than 2018 should remain.
b1617.bdoby.value_counts(dropna=False)

2016    89083
2017    86167
Name: bdoby, dtype: int64

In [10]:
# Sanity check on birth-place state; the notebook intro expects every record to be WA.
b1617.bbirplstatefips.value_counts(dropna=False)

WA    175250
Name: bbirplstatefips, dtype: int64

In [11]:
# Sanity check on mother's residence state; the notebook intro expects every record to be WA.
b1617.b_momresstatefips.value_counts(dropna=False)

WA    175250
Name: b_momresstatefips, dtype: int64

In [12]:
# Keep only the birth-record fields used for linkage: the record id, the
# infant / mother / father names, sex, date-of-birth parts and mother's residence.
birth_linkage_cols = [
    'bsfn',
    'bfname', 'blname',
    'bmom_fname', 'bmom_lname',
    'bdad_lname', 'bdad_fname',
    'bsex',
    'bdobm', 'bdobd', 'bdoby',
    'b_momrescity', 'b_momrescountyl', 'b_momreszip',
]
b1617 = b1617.loc[:, birth_linkage_cols]

In [13]:
# Inspect row count, dtypes and per-column missingness of the trimmed birth frame.
b1617.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175250 entries, 0 to 259908
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   bsfn             175250 non-null  int64 
 1   bfname           175246 non-null  object
 2   blname           175242 non-null  object
 3   bmom_fname       175184 non-null  object
 4   bmom_lname       174002 non-null  object
 5   bdad_lname       163274 non-null  object
 6   bdad_fname       169464 non-null  object
 7   bsex             175250 non-null  object
 8   bdobm            175250 non-null  int64 
 9   bdobd            175250 non-null  int64 
 10  bdoby            175250 non-null  int64 
 11  b_momrescity     175202 non-null  object
 12  b_momrescountyl  175202 non-null  object
 13  b_momreszip      175250 non-null  int64 
dtypes: int64(5), object(9)
memory usage: 20.1+ MB


##### Linked infant birth-death file 2016-2017

In [14]:
# Linked infant birth-death file for 2016-2017; its lbsfn/ldsfn columns are used
# further down as the known true birth-death pairs.
linked1617 = pd.read_csv(r'###\Py\Data\WA1617infantDeath_dthdata.csv')

In [15]:
# Inspect the full column list of the linked file before trimming it.
linked1617.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631 entries, 0 to 630
Data columns (total 54 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   lbsfn                631 non-null    int64  
 1   ldsfn                631 non-null    int64  
 2   dsfn                 631 non-null    int64  
 3   dbirsfn              631 non-null    object 
 4   dssn                 631 non-null    int64  
 5   dfname               631 non-null    object 
 6   dmname               531 non-null    object 
 7   dlname               631 non-null    object 
 8   dmom_fname           631 non-null    object 
 9   dmom_mname           375 non-null    object 
 10  dmom_lname           631 non-null    object 
 11  dsex                 631 non-null    object 
 12  dagetype             631 non-null    float64
 13  dage                 631 non-null    float64
 14  dageyrs              631 non-null    float64
 15  ddob                 631 non-null    obj

In [16]:
# Sanity check: death state of the linked records (same field filtered on for d1617).
linked1617.ddthstatel.value_counts(dropna=False)

WASHINGTON    631
Name: ddthstatel, dtype: int64

In [17]:
# Sanity check on the birth-place state recorded on the linked death records.
linked1617.dbirplstatefips.value_counts(dropna=False)

WA    631
Name: dbirplstatefips, dtype: int64

In [18]:
# Only the two id columns are needed: they define the known birth-death pairs.
linked_id_cols = ['lbsfn', 'ldsfn']
linked1617 = linked1617.loc[:, linked_id_cols]

In [19]:
# Confirm only the two id columns remain and neither has missing values.
linked1617.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631 entries, 0 to 630
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   lbsfn   631 non-null    int64
 1   ldsfn   631 non-null    int64
dtypes: int64(2)
memory usage: 10.0 KB


#### Data cleaning and standardizing

###### DEATH

In [20]:
from recordlinkage.preprocessing import clean

In [21]:
# Standardise `dfname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
d1617['dfname_clean'] = clean(
    d1617.dfname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

In [22]:
# Standardise `dlname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
d1617['dlname_clean'] = clean(
    d1617.dlname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

In [23]:
# Standardise `dmom_fname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
d1617['dmom_fname_clean'] = clean(
    d1617.dmom_fname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

In [24]:
# Standardise `dmom_lname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
d1617['dmom_lname_clean'] = clean(
    d1617.dmom_lname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

In [25]:
# Standardise `ddad_fname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
d1617['ddad_fname_clean'] = clean(
    d1617.ddad_fname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

In [26]:
# Standardise `ddad_lname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
d1617['ddad_lname_clean'] = clean(
    d1617.ddad_lname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

###### Create phonetic encoding of infant, mother, and father names

In [27]:
# Mother's cleaned first and last name joined by one space
# (the result is missing whenever either part is missing).
d1617['dmom_fullname'] = (
    d1617['dmom_fname_clean'] + " " + d1617['dmom_lname_clean']
)

In [28]:
# Father's cleaned first and last name joined by one space
# (the result is missing whenever either part is missing).
d1617['ddad_fullname'] = (
    d1617['ddad_fname_clean'] + " " + d1617['ddad_lname_clean']
)

In [29]:
# Decedent's cleaned first and last name joined by one space
# (the result is missing whenever either part is missing).
d1617['dinf_fullname'] = (
    d1617['dfname_clean'] + " " + d1617['dlname_clean']
)

In [30]:
# Metaphone-encode each full-name column on the death side.
death_phonetic_columns = (
    ('dmom_phon', 'dmom_fullname'),
    ('ddad_phon', 'ddad_fullname'),
    ('dinf_phon', 'dinf_fullname'),
)
for phon_col, name_col in death_phonetic_columns:
    d1617[phon_col] = rl.preprocessing.phonetic(d1617[name_col], method='metaphone')

In [31]:
# Spot-check the cleaned, concatenated and phonetic name columns.
# NOTE(review): the rendered output of this cell shows individuals' names and
# dates of birth -- clear the output before sharing or committing the notebook.
d1617.head()

Unnamed: 0,dsfn,dfname,dlname,dmom_fname,dmom_lname,ddad_fname,ddad_lname,dsex,ddobm,ddobd,ddoby,drescity,drescountyl,dreszip,dfname_clean,dlname_clean,dmom_fname_clean,dmom_lname_clean,ddad_fname_clean,ddad_lname_clean,dmom_fullname,ddad_fullname,dinf_fullname,dmom_phon,ddad_phon,dinf_phon
0,2017025187,CHARLES,BURNETT,JOHANNA,DEHOLLANDER,CHARLES,BURNETT,M,6,11,1932,WOODINVILLE,KING,98072,charles,burnett,johanna,dehollander,charles,burnett,johanna dehollander,charles burnett,charles burnett,JHNTHLNTR,XRLSBRNT,XRLSBRNT
1,2017025188,DOUGLAS,LEE,ROSE,SHEA,FRANK,LEE,M,9,29,1958,REDMOND,KING,98052,douglas,lee,rose,shea,frank,lee,rose shea,frank lee,douglas lee,RSX,FRNKL,TKLSL
2,2017025189,ARUNEE,TAOSAN,LUMDUAN,SRILAUN,UDON,TAOSAN,F,2,5,1947,LYNNWOOD,SNOHOMISH,98036,arunee,taosan,lumduan,srilaun,udon,taosan,lumduan srilaun,udon taosan,arunee taosan,LMTNSRLN,UTNTSN,ARNTSN
3,2017025190,ELIZABETH,CHAUSSEE,EDA,OIN,HARTLEY,CHAUSSEE,F,10,3,1931,BELLEVUE,KING,98007,elizabeth,chaussee,eda,oin,hartley,chaussee,eda oin,hartley chaussee,elizabeth chaussee,ETN,HRTLXS,ELSB0XS
4,2017025191,MERWIN,ADLER,ALBERTA,SCHNELL,JACOB,ADLER,M,9,11,1941,PALOUSE,WHITMAN,99161,merwin,adler,alberta,schnell,jacob,adler,alberta schnell,jacob adler,merwin adler,ALBRTSXNL,JKBTLR,MRWNTLR


##### BIRTH

In [32]:
# Standardise `bfname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
b1617['bfname_clean'] = clean(
    b1617.bfname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

In [33]:
# Standardise `blname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
b1617['blname_clean'] = clean(
    b1617.blname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

In [34]:
# Standardise `bmom_fname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
b1617['bmom_fname_clean'] = clean(
    b1617.bmom_fname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

In [35]:
# Standardise `bmom_lname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
b1617['bmom_lname_clean'] = clean(
    b1617.bmom_lname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

In [36]:
# Standardise `bdad_fname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
b1617['bdad_fname_clean'] = clean(
    b1617.bdad_fname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

In [37]:
# Standardise `bdad_lname`: lowercase, strip accents, and delete runs of
# whitespace/hyphens. The pattern is a raw string so `\s` is passed to the
# regex engine verbatim instead of relying on an invalid string escape.
b1617['bdad_lname_clean'] = clean(
    b1617.bdad_lname,
    lowercase=True,
    replace_by_none=r'[\s-]+',
    strip_accents='unicode',
)

In [38]:
# Build "first last" full-name columns for mother, father and infant
# (a full name is missing whenever either part is missing).
birth_fullname_parts = (
    ('bmom_fullname', 'bmom_fname_clean', 'bmom_lname_clean'),
    ('bdad_fullname', 'bdad_fname_clean', 'bdad_lname_clean'),
    ('binf_fullname', 'bfname_clean', 'blname_clean'),
)
for full_col, first_col, last_col in birth_fullname_parts:
    b1617[full_col] = b1617[first_col] + " " + b1617[last_col]

In [39]:
# Metaphone-encode each full-name column on the birth side.
birth_phonetic_columns = (
    ('bmom_phon', 'bmom_fullname'),
    ('bdad_phon', 'bdad_fullname'),
    ('binf_phon', 'binf_fullname'),
)
for phon_col, name_col in birth_phonetic_columns:
    b1617[phon_col] = rl.preprocessing.phonetic(b1617[name_col], method='metaphone')

### Reducing data set size: keep all linked records plus a random sample of unlinked records

In [41]:
# Death-record ids that appear in the linked (true match) file.
matched_dsfn = linked1617.loc[:, 'ldsfn'].tolist()
len(matched_dsfn)

631

In [42]:
# Birth-record ids that appear in the linked (true match) file.
matched_bsfn = linked1617.loc[:, 'lbsfn'].tolist()
len(matched_bsfn)

631

In [43]:
# Births whose id is absent from the linked file, i.e. not known to match a death.
birth_is_linked = b1617['bsfn'].isin(matched_bsfn)
b1617_not_linked = b1617[~birth_is_linked]

In [44]:
# Undersample the unlinked births to 1000 rows to keep the comparison space small;
# the fixed random_state makes the sample reproducible.
b1617_not_linked_undersample = b1617_not_linked.sample(n=1000, random_state=42)

In [45]:
# Births whose id is present in the linked file (the known true matches).
in_linked_births = b1617['bsfn'].isin(matched_bsfn)
b16_17_linked = b1617[in_linked_births]

In [75]:
# Final birth set: the sampled unlinked births stacked on top of all linked births.
birth_parts = [b1617_not_linked_undersample, b16_17_linked]
b1617_fin = pd.concat(birth_parts, axis=0)

In [78]:
# Offset birth ids by 10,000,000,000 so they cannot collide with death ids
# once both are used as index labels.
b1617_fin['bsfn'] = b1617_fin['bsfn'] + 10_000_000_000

In [80]:
# Use the (offset) birth id as the row index; the pair index built later
# refers to records by these labels.
b1617_fin = b1617_fin.set_index('bsfn')

In [None]:
# The original cell re-bound b1617_fin from `b16_17fin`, a name that is never
# defined in this notebook, so a fresh Restart & Run All stopped here with a
# NameError. b1617_fin is already fully built above; just verify it is indexed
# on the birth id as the linkage steps expect.
assert b1617_fin.index.name == 'bsfn', "b1617_fin must be indexed on bsfn"

In [68]:
# Deaths whose id is absent from the linked file, i.e. not known to match a birth.
death_is_linked = d1617['dsfn'].isin(matched_dsfn)
d1617_not_linked = d1617[~death_is_linked]

In [69]:
# Undersample the unlinked deaths to 1000 rows to keep the comparison space small;
# the fixed random_state makes the sample reproducible.
d1617_not_linked_undersample = d1617_not_linked.sample(n=1000, random_state=42)

In [70]:
# Deaths whose id is present in the linked file (the known true matches).
in_linked_deaths = d1617['dsfn'].isin(matched_dsfn)
d16_17_linked = d1617[in_linked_deaths]

In [83]:
# Final death set: the sampled unlinked deaths stacked on top of all linked deaths.
death_parts = [d1617_not_linked_undersample, d16_17_linked]
d1617_fin = pd.concat(death_parts)

In [85]:
# Offset death ids by 50,000,000,000 so they cannot collide with birth ids
# once both are used as index labels.
d1617_fin['dsfn'] = d1617_fin['dsfn'] + 50_000_000_000

In [86]:
# Use the (offset) death id as the row index; the pair index built later
# refers to records by these labels.
d1617_fin = d1617_fin.set_index('dsfn')

#### BLOCKING AND INDEXING CANDIDATE RECORD PAIRS

In [90]:
# Register four blocking rules: birth year, birth month, birth day and the
# mother's phonetic name (birth column on the left, death column on the right).
# NOTE(review): per the recordlinkage docs, separate block() calls on one Index
# are combined as a UNION -- a pair becomes a candidate if it agrees on ANY one
# of these keys, not on all of them. Confirm that is the intended blocking; to
# require agreement on several keys at once, pass lists to a single block() call.
indexer = rl.Index()
indexer.block(left_on = 'bdoby', right_on = 'ddoby')
indexer.block(left_on = 'bdobm', right_on = 'ddobm' )
indexer.block(left_on = 'bdobd', right_on = 'ddobd')
indexer.block(left_on = 'bmom_phon', right_on = 'dmom_phon')

<Index>

In [91]:
# Generate the candidate (birth id, death id) pairs from the blocking rules;
# births are the left frame and deaths the right frame.
candidate_pairs = indexer.index(b1617_fin, d1617_fin)

##### CANDIDATE PAIR COMPARISONS

In [92]:
# Comparison object that collects the per-field similarity features defined in the next cell.
c=rl.Compare()

In [93]:
# Register the comparison features in a fixed order: Jaro-Winkler similarity on
# names, linear similarity on the date-of-birth parts, exact agreement on sex
# and county of residence.
jarowinkler_pairs = (
    ('blname_clean', 'dlname_clean'),
    ('bmom_lname_clean', 'dmom_lname_clean'),
    ('bmom_phon', 'dmom_phon'),
    ('bdad_lname_clean', 'ddad_lname_clean'),
)
linear_pairs = (
    ('bdobm', 'ddobm'),
    ('bdoby', 'ddoby'),
    ('bdobd', 'ddobd'),
)
exact_pairs = (
    ('bsex', 'dsex'),
    ('b_momrescountyl', 'drescountyl'),
)

for birth_col, death_col in jarowinkler_pairs:
    c.string(birth_col, death_col, method='jarowinkler')
for birth_col, death_col in linear_pairs:
    c.numeric(birth_col, death_col, method='linear', scale=1)
for birth_col, death_col in exact_pairs:
    c.exact(birth_col, death_col)
c

<Compare>

In [94]:
# Compute the comparison features for every candidate pair; the result is
# indexed by (birth id, death id) with one column per registered comparison.
comp_scores = c.compute(candidate_pairs, b1617_fin, d1617_fin)

##### LOGISTIC REGRESSION CLASSIFIER

In [95]:
# The true matches come from the linked1617 data set. Its id columns need the
# same offsets that were applied to the birth and death frames (10000000000
# added to bsfn, 50000000000 added to dsfn) so the ids line up and birth and
# death index labels cannot overlap.

linked1617.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631 entries, 0 to 630
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   lbsfn   631 non-null    int64
 1   ldsfn   631 non-null    int64
dtypes: int64(2)
memory usage: 10.0 KB


In [96]:
# Apply the same id offsets used on the birth (10,000,000,000) and death
# (50,000,000,000) frames so the true-match ids line up with their index labels.
linked1617['lbsfn'] = linked1617['lbsfn'] + 10_000_000_000
linked1617['ldsfn'] = linked1617['ldsfn'] + 50_000_000_000

In [97]:
# Confirm the offsets were applied to both id columns.
linked1617.head()

Unnamed: 0,lbsfn,ldsfn
0,12016008461,52016032945
1,12016082743,52016048536
2,12016041884,52016028478
3,12016018813,52016010399
4,12016036019,52016022637


In [99]:
## turn the two linked1617 id columns into a single (lbsfn, ldsfn) MultiIndex,
## the form record linkage requires; these are the indexes of all true matches

true_pairs = linked1617.set_index(['lbsfn', 'ldsfn'])
matches = true_pairs.index

In [162]:
## create testing and training data sets

from sklearn.model_selection import train_test_split

# 70/30 split of the candidate-pair feature rows with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified; true matches are a tiny share of
# the candidate pairs, so consider stratifying on match status to guarantee
# both sets get a proportional number of matches.
train_data, test_data = train_test_split(comp_scores, test_size=0.30, random_state=42)

In [163]:
## for each set (train and test) identify the true match pairs

# Use Index.intersection() rather than `&`: the `&` operator on Index objects is
# deprecated as a set operation in pandas 1.x and is no longer a set operation
# from pandas 2.0, so it would stop producing the set of shared pairs.
train_match_index = train_data.index.intersection(matches)
test_match_index = test_data.index.intersection(matches)

In [179]:
#from sklearn.linear_model import LogisticRegression

In [184]:
# build the logreg model

lr_classifier = rl.LogisticRegressionClassifier()

# Supervised fit: training feature rows plus the index of the training pairs
# that are known true matches.
lr_classifier.fit(train_data, train_match_index)

In [185]:
# (number of training candidate pairs, number of those that are true matches)
len(train_data), len(train_match_index)

(532982, 436)

In [186]:
# (number of test candidate pairs, number of those that are true matches)
len(test_data), len(test_match_index)

(228422, 195)

In [187]:
## use classifier to predict using test set

# `predictions` holds the test pairs the classifier labels as matches.
predictions = lr_classifier.predict(test_data)

In [188]:
## confusion matrix

# The total number of test pairs is passed so true negatives can be counted.
n_test_pairs = len(test_data)
lr_conf_mat = rl.confusion_matrix(test_match_index, predictions, total=n_test_pairs)


In [189]:
## Layout of the confusion matrix shown below
## (rows = actual class, columns = predicted class):
##                Prediction
##               pos       neg
##        pos    TP        FN
## Actual
##        neg    FP        TN


lr_conf_mat

array([[   190,      5],
       [     4, 228223]])

In [190]:
## accuracy measures

# Each metric is computed from the same confusion matrix and printed with its label.
metric_reports = (
    ("Precision (TP/TP + FP): ", rl.precision),
    ("Recall (TP/TP + FN): ", rl.recall),
    ("F-score (harmonic mean of precision & recall): ", rl.fscore),
)
for metric_label, metric_fn in metric_reports:
    print(metric_label, metric_fn(lr_conf_mat))

Precision (TP/TP + FP):  0.979381443298969
Recall (TP/TP + FN):  0.9743589743589743
F-score (harmonic mean of precision & recall):  0.9768637532133676


In [193]:
# Specificity = TN / (TN + FP). The counts are read from the confusion matrix
# (second row is [FP, TN]) rather than typed in by hand, so the figure stays
# correct if the data, split or model changes.
false_pos, true_neg = (int(count) for count in lr_conf_mat[1])
specificity = true_neg / (true_neg + false_pos)
print("Specificity (TN/TN+FP): ", specificity)

Specificity (TN/TN+FP):  0.9999824735898908


In [201]:
# False positive rate is the complement of specificity; shown to 8 decimal places.
false_positive_rate = 1 - specificity
print("False positive rate: ", format(false_positive_rate, '.8f'))

False positive rate:  0.00001753
