# Source of Code:
This code reproduces the results of the “Statistical supervised meta-ensemble algorithm for medical record linkage” paper. The vast majority of this code was sourced from the original paper’s GitHub repository. The original code has been slightly modified and amended.

K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.

In [1]:
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
import recordlinkage as rl, pandas as pd, numpy as np
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.utils import shuffle
from recordlinkage.preprocessing import phonetic
from numpy.random import choice
import collections, numpy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split, KFold
from math import comb

# 1.0 FEBRL Results

In [2]:
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
def generate_true_links(df): 
    # although the match_id column is included in the original df to imply the true links,
    # this function will create the true_link object identical to the true_links properties
    # of recordlinkage toolkit, in order to exploit "Compare.compute()" from that toolkit
    # in extract_function() for extracting features quicker.
    # This process should be deprecated in the future release of the UNSW toolkit.
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    processed = 0
    for match_id in df["match_id"].unique():
        if match_id != -1:    
            processed = processed + 1
            # print("In routine generate_true_links(), count =", processed)
            # clear_output(wait=True)
            linkages = df.loc[df['match_id'] == match_id]
            for j in range(len(linkages)-1):
                for k in range(j+1, len(linkages)):
                    indices_1 = indices_1 + [linkages.iloc[j]["rec_id"]]
                    indices_2 = indices_2 + [linkages.iloc[k]["rec_id"]]    
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def generate_false_links(df, size):
    # A counterpart of generate_true_links(), with the purpose to generate random false pairs
    # for training. The number of false pairs in specified as "size".
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    unique_match_id = df["match_id"].unique()
    for j in range(size):
            false_pair_ids = choice(unique_match_id, 2)
            candidate_1_cluster = df.loc[df['match_id'] == false_pair_ids[0]]
            candidate_1 = candidate_1_cluster.iloc[choice(range(len(candidate_1_cluster)))]
            candidate_2_cluster = df.loc[df['match_id'] == false_pair_ids[1]]
            candidate_2 = candidate_2_cluster.iloc[choice(range(len(candidate_2_cluster)))]    
            indices_1 = indices_1 + [candidate_1["rec_id"]]
            indices_2 = indices_2 + [candidate_2["rec_id"]]  
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def swap_fields_flag(f11, f12, f21, f22):
    return int((f11 == f22) and (f12 == f21))

def extract_features(df, links):
    c = rl.Compare()
    c.string('given_name', 'given_name', method='jarowinkler', label='y_name')
    c.string('given_name_soundex', 'given_name_soundex', method='jarowinkler', label='y_name_soundex')
    c.string('given_name_nysiis', 'given_name_nysiis', method='jarowinkler', label='y_name_nysiis')
    c.string('surname', 'surname', method='jarowinkler', label='y_surname')
    c.string('surname_soundex', 'surname_soundex', method='jarowinkler', label='y_surname_soundex')
    c.string('surname_nysiis', 'surname_nysiis', method='jarowinkler', label='y_surname_nysiis')
    c.exact('street_number', 'street_number', label='y_street_number')
    c.string('address_1', 'address_1', method='levenshtein', threshold=0.7, label='y_address1')
    c.string('address_2', 'address_2', method='levenshtein', threshold=0.7, label='y_address2')
    c.exact('postcode', 'postcode', label='y_postcode')
    c.exact('day', 'day', label='y_day')
    c.exact('month', 'month', label='y_month')
    c.exact('year', 'year', label='y_year')
        
    # Build features
    feature_vectors = c.compute(links, df, df)
    return feature_vectors

def generate_train_X_y(df):
    # This routine is to generate the feature vector X and the corresponding labels y
    # with exactly equal number of samples for both classes to train the classifier.
    pos = extract_features(df, train_true_links)
    train_false_links = generate_false_links(df, len(train_true_links))    
    neg = extract_features(df, train_false_links)
    X = pos.values.tolist() + neg.values.tolist()
    y = [1]*len(pos)+[0]*len(neg)
    X, y = shuffle(X, y, random_state=0)
    X = np.array(X)
    y = np.array(y)
    return X, y

def train_model(modeltype, modelparam, train_vectors, train_labels, modeltype_2):
    if modeltype == 'svm': # Support Vector Machine
        model = svm.SVC(C = modelparam, kernel = modeltype_2)
        model.fit(train_vectors, train_labels) 
    elif modeltype == 'lg': # Logistic Regression
        model = LogisticRegression(C=modelparam, penalty = modeltype_2,class_weight=None, dual=False, fit_intercept=True, 
                                   intercept_scaling=1, max_iter=5000, multi_class='ovr', 
                                   n_jobs=1, random_state=None)
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nb': # Naive Bayes
        model = GaussianNB()
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nn': # Neural Network
        model = MLPClassifier(solver='lbfgs', alpha=modelparam, hidden_layer_sizes=(256, ), 
                              activation = modeltype_2,random_state=None, batch_size='auto', 
                              learning_rate='constant',  learning_rate_init=0.001, 
                              power_t=0.5, max_iter=10000, shuffle=True, 
                              tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                              nesterovs_momentum=True, early_stopping=False, 
                              validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model.fit(train_vectors, train_labels)
    return model

def classify(model, test_vectors):
    result = model.predict(test_vectors)
    return result

    
def evaluation(test_labels, result):
    true_pos = np.logical_and(test_labels, result)
    count_true_pos = np.sum(true_pos)
    true_neg = np.logical_and(np.logical_not(test_labels),np.logical_not(result))
    count_true_neg = np.sum(true_neg)
    false_pos = np.logical_and(np.logical_not(test_labels), result)
    count_false_pos = np.sum(false_pos)
    false_neg = np.logical_and(test_labels,np.logical_not(result))
    count_false_neg = np.sum(false_neg)
    precision = count_true_pos/(count_true_pos+count_false_pos)
    sensitivity = count_true_pos/(count_true_pos+count_false_neg) # sensitivity = recall
    confusion_matrix = [count_true_pos, count_false_pos, count_false_neg, count_true_neg]
    no_links_found = np.count_nonzero(result)
    no_false = count_false_pos + count_false_neg
    Fscore = 2*precision*sensitivity/(precision+sensitivity)
    metrics_result = {'no_false':no_false, 'confusion_matrix':confusion_matrix ,'precision':precision,
                     'sensitivity':sensitivity ,'no_links':no_links_found, 'F-score': Fscore}
    return metrics_result

def blocking_performance(candidates, true_links, df):
    count = 0
    for candi in candidates:
        if df.loc[candi[0]]["match_id"]==df.loc[candi[1]]["match_id"]:
            count = count + 1
    return count

In [3]:
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
trainset = 'febrl3_UNSW'
testset = 'febrl4_UNSW'

In [4]:
%%time
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
## TRAIN SET CONSTRUCTION

# Import
print("Import train set...")
df_train = pd.read_csv(trainset+".csv", index_col = "rec_id")
train_true_links = generate_true_links(df_train)
print("Train set size:", len(df_train), ", number of matched pairs: ", str(len(train_true_links)))

# Preprocess train set
df_train['postcode'] = df_train['postcode'].astype(str)
df_train['given_name_soundex'] = phonetic(df_train['given_name'], method='soundex')
df_train['given_name_nysiis'] = phonetic(df_train['given_name'], method='nysiis')
df_train['surname_soundex'] = phonetic(df_train['surname'], method='soundex')
df_train['surname_nysiis'] = phonetic(df_train['surname'], method='nysiis')

# Final train feature vectors and labels
X_train, y_train = generate_train_X_y(df_train)
print("Finished building X_train, y_train")

Import train set...
Train set size: 5000 , number of matched pairs:  1165


  s = s.str.replace(r"[\-\_\s]", "")
  s = s.str.replace(r"[\-\_\s]", "")
  s = s.str.replace(r"[\-\_\s]", "")
  s = s.str.replace(r"[\-\_\s]", "")


Finished building X_train, y_train
CPU times: user 834 ms, sys: 54.1 ms, total: 888 ms
Wall time: 888 ms


## 1.1 FEBRL Blocking Results

In [5]:
%%time
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.

Code has been modified to reproduce and print Table 4 of the paper.
'''
# Blocking Criteria: declare non-match of all of the below fields disagree
# Import
print("Import test set...")
FEBRL_blocking_results = []
df_test = pd.read_csv(testset+".csv", index_col = "rec_id")
test_true_links = generate_true_links(df_test)
leng_test_true_links = len(test_true_links)
print("Test set size:", len(df_test), ", number of matched pairs: ", str(leng_test_true_links))

total_possible_pairs = comb(len(df_test),2)
match_pairs = leng_test_true_links

print("BLOCKING PERFORMANCE:")
blocking_fields = ["given_name", "surname", "postcode"]
all_candidate_pairs = []
for field in blocking_fields:
    block_indexer = rl.BlockIndex(on=field)
    candidates = block_indexer.index(df_test)
    detects = blocking_performance(candidates, test_true_links, df_test)
    all_candidate_pairs = candidates.union(all_candidate_pairs)
    print("Number of pairs of matched "+ field +": "+str(len(candidates)), ", detected ",
         detects,'/'+ str(leng_test_true_links) + " true matched pairs, missed " + 
          str(leng_test_true_links-detects) )
    # row 1
    row = []
    row.append(field)
    row.append('nc')
    nc = len(candidates)
    row.append(nc)
    FEBRL_blocking_results.append(row)
    
    # row 2 
    row = []
    row.append(field)
    row.append('pc')
    pc = round(detects/match_pairs*100.0, 2)
    row.append(pc)
    FEBRL_blocking_results.append(row)
    
    # row 3
    row = []
    row.append(field)
    row.append('rr')
    rr = round((1-(len(candidates)/1.0/total_possible_pairs))*100, 2)
    row.append(rr)
    FEBRL_blocking_results.append(row)
    
detects = blocking_performance(all_candidate_pairs, test_true_links, df_test)
print("Number of pairs of at least 1 field matched: " + str(len(all_candidate_pairs)), ", detected ",
     detects,'/'+ str(leng_test_true_links) + " true matched pairs, missed " + 
          str(leng_test_true_links-detects) )

#Reproducing Table 4
# row 1
row_all = []
row_all.append('All')
row_all.append('nc')
nc = len(all_candidate_pairs)
row_all.append(nc)
FEBRL_blocking_results.append(row_all)

# row 2
row_all = []
row_all.append('All')
row_all.append('pc')
pc = round(detects/match_pairs*100.0, 2)
row_all.append(pc)
FEBRL_blocking_results.append(row_all)

# row 3
row_all = []
row_all.append('All')
row_all.append('rr')
rr = round((1-(len(candidates)/1.0/total_possible_pairs))*100, 2)
row_all.append(rr)
FEBRL_blocking_results.append(row_all)

Import test set...
Test set size: 10000 , number of matched pairs:  5000
BLOCKING PERFORMANCE:
Number of pairs of matched given_name: 154898 , detected  3287 /5000 true matched pairs, missed 1713
Number of pairs of matched surname: 170843 , detected  3325 /5000 true matched pairs, missed 1675
Number of pairs of matched postcode: 53197 , detected  4219 /5000 true matched pairs, missed 781
Number of pairs of at least 1 field matched: 372073 , detected  4894 /5000 true matched pairs, missed 106
CPU times: user 56.5 s, sys: 227 ms, total: 56.8 s
Wall time: 56.9 s


## 1.2 FEBRL Classification Performance Results

### Base Learners Performance

In [6]:
%%time
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
## TEST SET CONSTRUCTION

# Preprocess test set
print("Processing test set...")
print("Preprocess...")
df_test['postcode'] = df_test['postcode'].astype(str)
df_test['given_name_soundex'] = phonetic(df_test['given_name'], method='soundex')
df_test['given_name_nysiis'] = phonetic(df_test['given_name'], method='nysiis')
df_test['surname_soundex'] = phonetic(df_test['surname'], method='soundex')
df_test['surname_nysiis'] = phonetic(df_test['surname'], method='nysiis')

# Test feature vectors and labels construction
print("Extract feature vectors...")
df_X_test = extract_features(df_test, all_candidate_pairs)
vectors = df_X_test.values.tolist()
labels = [0]*len(vectors)
feature_index = df_X_test.index
for i in range(0, len(feature_index)):
    if df_test.loc[feature_index[i][0]]["match_id"]==df_test.loc[feature_index[i][1]]["match_id"]:
        labels[i] = 1
X_test, y_test = shuffle(vectors, labels, random_state=0)
X_test = np.array(X_test)
y_test = np.array(y_test)
print("Count labels of y_test:",collections.Counter(y_test))
print("Finished building X_test, y_test")

Processing test set...
Preprocess...
Extract feature vectors...


  s = s.str.replace(r"[\-\_\s]", "")
  s = s.str.replace(r"[\-\_\s]", "")
  s = s.str.replace(r"[\-\_\s]", "")
  s = s.str.replace(r"[\-\_\s]", "")


Count labels of y_test: Counter({0: 367179, 1: 4894})
Finished building X_test, y_test
CPU times: user 36.8 s, sys: 239 ms, total: 37.1 s
Wall time: 37.1 s


In [7]:
%%time
'''
Modifying the code provided by the authors to produce the results in Table 6 of the paper. 
Used the hyperparameters as specified by Table 5 of the paper to build the models.

Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''

FEBRL_classification_results = [] 

## BASE LEARNERS CLASSIFICATION AND EVALUATION
################# SVM ########################
'''
Table 5 Hyperparameters for SVM on the FEBRL dataset
1. Linear kernel
2. C = 0.005
'''
modeltype = 'svm' # choose between 'svm', 'lg', 'nn'
modeltype_2 = 'linear'  # 'linear' or 'rbf' for svm, 'l1' or 'l2' for lg, 'relu' or 'logistic' for nn
modelparam = 0.005

md = train_model(modeltype, modelparam, X_train, y_train, modeltype_2)
final_result = classify(md, X_test)
final_eval = evaluation(y_test, final_result)
precision = final_eval['precision']
sensitivity = final_eval['sensitivity']
Fscore = final_eval['F-score']
nb_false  = final_eval['no_false']

row = []
row.append('SVM')
row.append(round(precision*100,2))
row.append(round(sensitivity*100, 2))
row.append(round(Fscore*100, 2))
row.append(nb_false)
FEBRL_classification_results.append(row)

## BASE LEARNERS CLASSIFICATION AND EVALUATION
################# NN ########################
'''
Table 5 Hyperparameters for NN on the FEBRL dataset
1. ReLu activation with a = 100
'''
modeltype = 'nn' # choose between 'svm', 'lg', 'nn'
modeltype_2 = 'relu'  # 'linear' or 'rbf' for svm, 'l1' or 'l2' for lg, 'relu' or 'logistic' for nn
modelparam = 100

md = train_model(modeltype, modelparam, X_train, y_train, modeltype_2)
final_result = classify(md, X_test)
final_eval = evaluation(y_test, final_result)
precision = final_eval['precision']
sensitivity = final_eval['sensitivity']
Fscore = final_eval['F-score']
nb_false = final_eval['no_false']

row = []
row.append('NN')
row.append(round(precision*100, 2))
row.append(round(sensitivity*100, 2))
row.append(round(Fscore*100, 2))
row.append(nb_false)
FEBRL_classification_results.append(row)

## BASE LEARNERS CLASSIFICATION AND EVALUATION
################# LR ########################
'''
Table 5 Hyperparameters for NN on the FEBRL dataset
1. Regularization I2
2. C = 0.2
'''
modeltype = 'lg' # choose between 'svm', 'lg', 'nn'
modeltype_2 = 'l2'  # 'linear' or 'rbf' for svm, 'l1' or 'l2' for lg, 'relu' or 'logistic' for nn
modelparam = 0.2

md = train_model(modeltype, modelparam, X_train, y_train, modeltype_2)
final_result = classify(md, X_test)
final_eval = evaluation(y_test, final_result)
precision = final_eval['precision']
sensitivity = final_eval['sensitivity']
Fscore = final_eval['F-score']
nb_false = final_eval['no_false']

row = []
row.append('LR')
row.append(round(precision*100, 2))
row.append(round(sensitivity*100, 2))
row.append(round(Fscore*100, 2))
row.append(nb_false)
FEBRL_classification_results.append(row)

CPU times: user 4.91 s, sys: 1.75 s, total: 6.66 s
Wall time: 2.9 s


In [8]:
%%time
'''
Modifying the code provided by the authors to produce the results in Table 6 of the paper. 
Used the hyperparameters as specified by Table 5 of the paper to build the models.

Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
## ENSEMBLE CLASSIFICATION AND EVALUATION

# print("BAGGING PERFORMANCE:\n")
modeltypes = ['svm', 'nn', 'lg'] 
modeltypes_2 = ['linear', 'relu', 'l2']
modelparams = [0.005, 100, 0.2]
nFold = 10
kf = KFold(n_splits=nFold)
model_raw_score = [0]*3
model_binary_score = [0]*3
model_i = 0
for model_i in range(3):
    modeltype = modeltypes[model_i]
    modeltype_2 = modeltypes_2[model_i]
    modelparam = modelparams[model_i]
    # print(modeltype, "per fold:")
    iFold = 0
    result_fold = [0]*nFold
    final_eval_fold = [0]*nFold
    for train_index, valid_index in kf.split(X_train):
        X_train_fold = X_train[train_index]
        y_train_fold = y_train[train_index]
        md =  train_model(modeltype, modelparam, X_train_fold, y_train_fold, modeltype_2)
        result_fold[iFold] = classify(md, X_test)
        final_eval_fold[iFold] = evaluation(y_test, result_fold[iFold])
        # print("Fold", str(iFold), final_eval_fold[iFold])
        iFold = iFold + 1
    bagging_raw_score = np.average(result_fold, axis=0)
    bagging_binary_score  = np.copy(bagging_raw_score)
    bagging_binary_score[bagging_binary_score > 0.5] = 1
    bagging_binary_score[bagging_binary_score <= 0.5] = 0
    bagging_eval = evaluation(y_test, bagging_binary_score)
    # print(modeltype, "bagging:", bagging_eval)
    # print('')

    if modeltype == 'svm':
        row = []
        row.append('SVM-bag')
        row.append(round(bagging_eval['precision']*100, 2))
        row.append(round(bagging_eval['sensitivity']*100, 2))
        row.append(round(bagging_eval['F-score']*100, 2))
        row.append(bagging_eval['no_false'])
        FEBRL_classification_results.append(row)
    elif modeltype == 'nn':
        row = []
        row.append('NN-bag')
        row.append(round(bagging_eval['precision']*100, 2))
        row.append(round(bagging_eval['sensitivity']*100, 2))
        row.append(round(bagging_eval['F-score']*100, 2))
        row.append(bagging_eval['no_false'])
        FEBRL_classification_results.append(row)
    else:
        row = []
        row.append('LR-bag')
        row.append(round(bagging_eval['precision']*100, 2))
        row.append(round(bagging_eval['sensitivity']*100, 2))
        row.append(round(bagging_eval['F-score']*100, 2))
        row.append(bagging_eval['no_false'])
        FEBRL_classification_results.append(row)
    
    model_raw_score[model_i] = bagging_raw_score
    model_binary_score[model_i] = bagging_binary_score

CPU times: user 59 s, sys: 18.3 s, total: 1min 17s
Wall time: 29.4 s


### Ensemble Model Performance

In [9]:
%%time
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
thres = .99

print("STACKING PERFORMANCE:\n")
stack_raw_score = np.average(model_raw_score, axis=0)
stack_binary_score = np.copy(stack_raw_score)
stack_binary_score[stack_binary_score > thres] = 1
stack_binary_score[stack_binary_score <= thres] = 0
stacking_eval = evaluation(y_test, stack_binary_score)

STACKING PERFORMANCE:

CPU times: user 20.6 ms, sys: 19.2 ms, total: 39.9 ms
Wall time: 8.05 ms


In [10]:
row = []
row.append('Stack+Bag')
row.append(round(stacking_eval['precision']*100, 2))
row.append(round(stacking_eval['sensitivity']*100, 2))
row.append(round(stacking_eval['F-score']*100, 2))
row.append(stacking_eval['no_false'])
FEBRL_classification_results.append(row)

# 2.0 ePBRN Results

In [11]:
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
trainset = 'ePBRN_F_dup' 
testset = 'ePBRN_D_dup'

import recordlinkage as rl, pandas as pd, numpy as np
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.utils import shuffle
from recordlinkage.preprocessing import phonetic
from numpy.random import choice
import collections, numpy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split, KFold


def generate_true_links(df): 
    # although the match_id column is included in the original df to imply the true links,
    # this function will create the true_link object identical to the true_links properties
    # of recordlinkage toolkit, in order to exploit "Compare.compute()" from that toolkit
    # in extract_function() for extracting features quicker.
    # This process should be deprecated in the future release of the UNSW toolkit.
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    processed = 0
    for match_id in df["match_id"].unique():
        if match_id != -1:    
            processed = processed + 1
            # print("In routine generate_true_links(), count =", processed)
            # clear_output(wait=True)
            linkages = df.loc[df['match_id'] == match_id]
            for j in range(len(linkages)-1):
                for k in range(j+1, len(linkages)):
                    indices_1 = indices_1 + [linkages.iloc[j]["rec_id"]]
                    indices_2 = indices_2 + [linkages.iloc[k]["rec_id"]]    
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def generate_false_links(df, size):
    # A counterpart of generate_true_links(), with the purpose to generate random false pairs
    # for training. The number of false pairs in specified as "size".
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    unique_match_id = df["match_id"].unique()
    unique_match_id = unique_match_id[~np.isnan(unique_match_id)] # remove nan values
    for j in range(size):
            false_pair_ids = choice(unique_match_id, 2)
            candidate_1_cluster = df.loc[df['match_id'] == false_pair_ids[0]]
            candidate_1 = candidate_1_cluster.iloc[choice(range(len(candidate_1_cluster)))]
            candidate_2_cluster = df.loc[df['match_id'] == false_pair_ids[1]]
            candidate_2 = candidate_2_cluster.iloc[choice(range(len(candidate_2_cluster)))]    
            indices_1 = indices_1 + [candidate_1["rec_id"]]
            indices_2 = indices_2 + [candidate_2["rec_id"]]  
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def swap_fields_flag(f11, f12, f21, f22):
    return ((f11 == f22) & (f12 == f21)).astype(float)

def join_names_space(f11, f12, f21, f22):
    return ((f11+" "+f12 == f21) | (f11+" "+f12 == f22)| (f21+" "+f22 == f11)| (f21+" "+f22 == f12)).astype(float)

def join_names_dash(f11, f12, f21, f22):
    return ((f11+"-"+f12 == f21) | (f11+"-"+f12 == f22)| (f21+"-"+f22 == f11)| (f21+"-"+f22 == f12)).astype(float)

def abb_surname(f1, f2):
    return ((f1[0]==f2) | (f1==f2[0])).astype(float)

def reset_day(f11, f12, f21, f22):
    return (((f11 == 1) & (f12 == 1))|((f21 == 1) & (f22 == 1))).astype(float)

def extract_features(df, links):
    c = rl.Compare()
    c.string('given_name', 'given_name', method='levenshtein', label='y_name_leven')
    c.string('surname', 'surname', method='levenshtein', label='y_surname_leven')  
    c.string('given_name', 'given_name', method='jarowinkler', label='y_name_jaro')
    c.string('surname', 'surname', method='jarowinkler', label='y_surname_jaro')  
    c.string('postcode', 'postcode', method='jarowinkler', label='y_postcode')      
    exact_fields = ['postcode', 'address_1', 'address_2', 'street_number']
    for field in exact_fields:
        c.exact(field, field, label='y_'+field+'_exact')
    c.compare_vectorized(reset_day,('day', 'month'), ('day', 'month'),label='reset_day_flag')    
    c.compare_vectorized(swap_fields_flag,('day', 'month'), ('day', 'month'),label='swap_day_month')    
    c.compare_vectorized(swap_fields_flag,('surname', 'given_name'), ('surname', 'given_name'),label='swap_names')    
    c.compare_vectorized(join_names_space,('surname', 'given_name'), ('surname', 'given_name'),label='join_names_space')
    c.compare_vectorized(join_names_dash,('surname', 'given_name'), ('surname', 'given_name'),label='join_names_dash')
    c.compare_vectorized(abb_surname,'surname', 'surname',label='abb_surname')
    # Build features
    feature_vectors = c.compute(links, df, df)
    return feature_vectors

def generate_train_X_y(df):
    # This routine is to generate the feature vector X and the corresponding labels y
    # with exactly equal number of samples for both classes to train the classifier.
    pos = extract_features(df, train_true_links)
    train_false_links = generate_false_links(df, len(train_true_links))    
    neg = extract_features(df, train_false_links)
    X = pos.values.tolist() + neg.values.tolist()
    y = [1]*len(pos)+[0]*len(neg)
    X, y = shuffle(X, y, random_state=0)
    X = np.array(X)
    y = np.array(y)
    return X, y

def train_model(modeltype, modelparam, train_vectors, train_labels, modeltype_2):
    if modeltype == 'svm': # Support Vector Machine
        model = svm.SVC(C = modelparam, kernel = modeltype_2)
        model.fit(train_vectors, train_labels) 
    elif modeltype == 'lg': # Logistic Regression
        model = LogisticRegression(C=modelparam, penalty = modeltype_2,class_weight=None, dual=False, fit_intercept=True, 
                                   intercept_scaling=1, max_iter=5000, multi_class='ovr', 
                                   n_jobs=1, random_state=None)
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nb': # Naive Bayes
        model = GaussianNB()
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nn': # Neural Network
        model = MLPClassifier(solver='lbfgs', alpha=modelparam, hidden_layer_sizes=(256, ), 
                              activation = modeltype_2,random_state=None, batch_size='auto', 
                              learning_rate='constant',  learning_rate_init=0.001, 
                              power_t=0.5, max_iter=30000, shuffle=True, 
                              tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                              nesterovs_momentum=True, early_stopping=False, 
                              validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model.fit(train_vectors, train_labels)
    return model

def classify(model, test_vectors):
    result = model.predict(test_vectors)
    return result

    
def evaluation(test_labels, result):
    true_pos = np.logical_and(test_labels, result)
    count_true_pos = np.sum(true_pos)
    true_neg = np.logical_and(np.logical_not(test_labels),np.logical_not(result))
    count_true_neg = np.sum(true_neg)
    false_pos = np.logical_and(np.logical_not(test_labels), result)
    count_false_pos = np.sum(false_pos)
    false_neg = np.logical_and(test_labels,np.logical_not(result))
    count_false_neg = np.sum(false_neg)
    precision = count_true_pos/(count_true_pos+count_false_pos)
    sensitivity = count_true_pos/(count_true_pos+count_false_neg) # sensitivity = recall
    confusion_matrix = [count_true_pos, count_false_pos, count_false_neg, count_true_neg]
    no_links_found = np.count_nonzero(result)
    no_false = count_false_pos + count_false_neg
    Fscore = 2*precision*sensitivity/(precision+sensitivity)
    metrics_result = {'no_false':no_false, 'confusion_matrix':confusion_matrix ,'precision':precision,
                     'sensitivity':sensitivity ,'no_links':no_links_found, 'F-score': Fscore}
    return metrics_result

def blocking_performance(candidates, true_links, df):
    count = 0
    for candi in candidates:
        if df.loc[candi[0]]["match_id"]==df.loc[candi[1]]["match_id"]:
            count = count + 1
    return count

In [12]:
%%time
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
## TRAIN SET CONSTRUCTION

# Import
print("Import train set...")
df_train = pd.read_csv("Data_to_produce_ePBRN_dataset/"+trainset+".csv", index_col = "rec_id")
train_true_links = generate_true_links(df_train)
print("Train set size:", len(df_train), ", number of matched pairs: ", str(len(train_true_links)))

# Preprocess train set
df_train['postcode'] = df_train['postcode'].astype(str)

# Final train feature vectors and labels
X_train, y_train = generate_train_X_y(df_train)
print("Finished building X_train, y_train")

Import train set...
Train set size: 14093 , number of matched pairs:  3220
Finished building X_train, y_train
CPU times: user 2.3 s, sys: 48.7 ms, total: 2.35 s
Wall time: 2.29 s


## 2.1 ePBRN Blocking Results

In [13]:
%%time
'''
Modifying the code provided by the authors to produce the results in Table 4 of the paper. 

Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
# Blocking Criteria: declare non-match of all of the below fields disagree
# Import
ePBRN_blocking_results = []
df_test = pd.read_csv("Data_to_produce_ePBRN_dataset/"+testset+".csv", index_col = "rec_id")
test_true_links = generate_true_links(df_test)
leng_test_true_links = len(test_true_links)
total_possible_pairs = comb(len(df_test),2)
match_pairs = leng_test_true_links

blocking_fields = ["given_name", "surname", "postcode"]
all_candidate_pairs = []
for field in blocking_fields:
    block_indexer = rl.BlockIndex(on=field)
    candidates = block_indexer.index(df_test)
    detects = blocking_performance(candidates, test_true_links, df_test)
    all_candidate_pairs = candidates.union(all_candidate_pairs)
    
    # row 1
    row = []
    row.append(field)
    row.append('nc')
    nc = len(candidates)
    row.append(nc)
    ePBRN_blocking_results.append(row)
    
    # row 2 
    row = []
    row.append(field)
    row.append('pc')
    pc = round(detects/match_pairs*100.0, 2)
    row.append(pc)
    ePBRN_blocking_results.append(row)
    
    # row 3
    row = []
    row.append(field)
    row.append('rr')
    rr = round((1-(len(candidates)/1.0/total_possible_pairs))*100, 2)
    row.append(rr)
    ePBRN_blocking_results.append(row)

detects = blocking_performance(all_candidate_pairs, test_true_links, df_test)

# row 1
row_all = []
row_all.append('All')
row_all.append('nc')
nc = len(all_candidate_pairs)
row_all.append(nc)
ePBRN_blocking_results.append(row_all)

# row 2
row_all = []
row_all.append('All')
row_all.append('pc')
pc = round(detects/match_pairs*100.0, 2)
row_all.append(pc)
ePBRN_blocking_results.append(row_all)

# row 3
row_all = []
row_all.append('All')
row_all.append('rr')
rr = round((1-(len(candidates)/1.0/total_possible_pairs))*100, 2)
row_all.append(rr)
ePBRN_blocking_results.append(row_all)

CPU times: user 51.5 s, sys: 120 ms, total: 51.6 s
Wall time: 51.6 s


## 2.2 ePBRN Classification Performance Results

### Base Learners Performance

In [14]:
%%time
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
## TEST SET CONSTRUCTION

# Preprocess test set
print("Processing test set...")
print("Preprocess...")
df_test['postcode'] = df_test['postcode'].astype(str)

# Test feature vectors and labels construction
print("Extract feature vectors...")
df_X_test = extract_features(df_test, all_candidate_pairs)
vectors = df_X_test.values.tolist()
labels = [0]*len(vectors)
feature_index = df_X_test.index
for i in range(0, len(feature_index)):
    if df_test.loc[feature_index[i][0]]["match_id"]==df_test.loc[feature_index[i][1]]["match_id"]:
        labels[i] = 1
X_test, y_test = shuffle(vectors, labels, random_state=0)
X_test = np.array(X_test)
y_test = np.array(y_test)
print("Count labels of y_test:",collections.Counter(y_test))
print("Finished building X_test, y_test")

Processing test set...
Preprocess...
Extract feature vectors...
Count labels of y_test: Counter({0: 357301, 1: 2614})
Finished building X_test, y_test
CPU times: user 32.3 s, sys: 209 ms, total: 32.5 s
Wall time: 32.5 s


In [15]:
%%time
'''
Modifying the code provided by the authors to produce the results in Table 6 of the paper. 
Used the hyperparameters as specified by Table 5 of the paper to build the models.

Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''

ePBRN_classification_results = [] 

## BASE LEARNERS CLASSIFICATION AND EVALUATION
################# SVM ########################
'''
Table 5 Hyperparameters for SVM on the FEBRL dataset
1. RBF kernel
2. C = 0.001
'''
modeltype = 'svm' # choose between 'svm', 'lg', 'nn'
modeltype_2 = 'rbf'  # 'linear' or 'rbf' for svm, 'l1' or 'l2' for lg, 'relu' or 'logistic' for nn
modelparam = 0.001

md = train_model(modeltype, modelparam, X_train, y_train, modeltype_2)
final_result = classify(md, X_test)
final_eval = evaluation(y_test, final_result)
precision = final_eval['precision']
sensitivity = final_eval['sensitivity']
Fscore = final_eval['F-score']
nb_false  = final_eval['no_false']

row = []
row.append('SVM')
row.append(round(precision*100, 2))
row.append(round(sensitivity*100, 2))
row.append(round(Fscore*100, 2))
row.append(nb_false)
ePBRN_classification_results.append(row)

## BASE LEARNERS CLASSIFICATION AND EVALUATION
################# NN ########################
'''
Table 5 Hyperparameters for NN on the FEBRL dataset
1. ReLu activation with a = 2000
'''
modeltype = 'nn' # choose between 'svm', 'lg', 'nn'
modeltype_2 = 'relu'  # 'linear' or 'rbf' for svm, 'l1' or 'l2' for lg, 'relu' or 'logistic' for nn
modelparam = 2000

md = train_model(modeltype, modelparam, X_train, y_train, modeltype_2)
final_result = classify(md, X_test)
final_eval = evaluation(y_test, final_result)
precision = final_eval['precision']
sensitivity = final_eval['sensitivity']
Fscore = final_eval['F-score']
nb_false = final_eval['no_false']

row = []
row.append('NN')
row.append(round(precision*100,2))
row.append(round(sensitivity*100, 2))
row.append(round(Fscore*100, 2))
row.append(nb_false)
ePBRN_classification_results.append(row)

## BASE LEARNERS CLASSIFICATION AND EVALUATION
################# LR ########################
'''
Table 5 Hyperparameters for NN on the FEBRL dataset
1. Regularization I2
2. C = 0.005
'''
modeltype = 'lg' # choose between 'svm', 'lg', 'nn'
modeltype_2 = 'l2'  # 'linear' or 'rbf' for svm, 'l1' or 'l2' for lg, 'relu' or 'logistic' for nn
modelparam = 0.005

md = train_model(modeltype, modelparam, X_train, y_train, modeltype_2)
final_result = classify(md, X_test)
final_eval = evaluation(y_test, final_result)
precision = final_eval['precision']
sensitivity = final_eval['sensitivity']
Fscore = final_eval['F-score']
nb_false = final_eval['no_false']

row = []
row.append('LR')
row.append(round(precision*100, 2))
row.append(round(sensitivity*100, 2))
row.append(round(Fscore*100, 2))
row.append(nb_false)
ePBRN_classification_results.append(row)


CPU times: user 1min 9s, sys: 2.54 s, total: 1min 12s
Wall time: 1min 5s


In [16]:
%%time
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''

## ENSEMBLE CLASSIFICATION AND EVALUATION

modeltypes = ['svm', 'nn', 'lg'] 
modeltypes_2 = ['rbf', 'relu', 'l2']
modelparams = [0.001, 2000, 0.005]
nFold = 10
kf = KFold(n_splits=nFold)
model_raw_score = [0]*3
model_binary_score = [0]*3
model_i = 0
for model_i in range(3):
    modeltype = modeltypes[model_i]
    modeltype_2 = modeltypes_2[model_i]
    modelparam = modelparams[model_i]
    iFold = 0
    result_fold = [0]*nFold
    final_eval_fold = [0]*nFold
    for train_index, valid_index in kf.split(X_train):
        X_train_fold = X_train[train_index]
        y_train_fold = y_train[train_index]
        md =  train_model(modeltype, modelparam, X_train_fold, y_train_fold, modeltype_2)
        result_fold[iFold] = classify(md, X_test)
        final_eval_fold[iFold] = evaluation(y_test, result_fold[iFold])
        iFold = iFold + 1
    bagging_raw_score = np.average(result_fold, axis=0)
    bagging_binary_score  = np.copy(bagging_raw_score)
    bagging_binary_score[bagging_binary_score > 0.5] = 1
    bagging_binary_score[bagging_binary_score <= 0.5] = 0
    bagging_eval = evaluation(y_test, bagging_binary_score)
    if modeltype == 'svm':
        row = []
        row.append('SVM-bag')
        row.append(round(bagging_eval['precision']*100, 2))
        row.append(round(bagging_eval['sensitivity']*100, 2))
        row.append(round(bagging_eval['F-score']*100, 2))
        row.append(bagging_eval['no_false'])
        ePBRN_classification_results.append(row)
    elif modeltype == 'nn':
        row = []
        row.append('NN-bag')
        row.append(round(bagging_eval['precision']*100, 2))
        row.append(round(bagging_eval['sensitivity']*100, 2))
        row.append(round(bagging_eval['F-score']*100, 2))
        row.append(bagging_eval['no_false'])
        ePBRN_classification_results.append(row)
    else:
        row = []
        row.append('LR-bag')
        row.append(round(bagging_eval['precision']*100, 2))
        row.append(round(bagging_eval['sensitivity']*100, 2))
        row.append(round(bagging_eval['F-score']*100, 2))
        row.append(bagging_eval['no_false'])
        ePBRN_classification_results.append(row)
    model_raw_score[model_i] = bagging_raw_score
    model_binary_score[model_i] = bagging_binary_score
    

CPU times: user 10min 55s, sys: 21.9 s, total: 11min 17s
Wall time: 10min 30s


### Ensemble Model Performance

In [17]:
%%time
'''
Source: 
K. Vo, J. Jonnagaddala and S.-T. Liaw, "Medical-Record-Linkage-Ensemble," 16 February 2019. [Online]. 
Available: https://github.com/ePBRN/Medical-Record-Linkage-Ensemble/.
'''
thres = .99
stack_raw_score = np.average(model_raw_score, axis=0)
stack_binary_score = np.copy(stack_raw_score)
stack_binary_score[stack_binary_score > thres] = 1
stack_binary_score[stack_binary_score <= thres] = 0
stacking_eval = evaluation(y_test, stack_binary_score)

CPU times: user 20.9 ms, sys: 16.8 ms, total: 37.6 ms
Wall time: 6.48 ms


In [18]:
%%time
row = []
row.append('Stack+Bag')
row.append(round(stacking_eval['precision']*100, 2))
row.append(round(stacking_eval['sensitivity']*100, 2))
row.append(round(stacking_eval['F-score']*100, 2))
row.append(stacking_eval['no_false'])
ePBRN_classification_results.append(row)

CPU times: user 156 µs, sys: 133 µs, total: 289 µs
Wall time: 42.9 µs


# 3.0 Creating the Paper’s Table 4

In [19]:
blocking_results = pd.DataFrame(FEBRL_blocking_results, 
                                columns=['Blocking Criterion', 'Measure', 'FEBRL'])

In [20]:
blocking_results_ePBRN = pd.DataFrame(ePBRN_blocking_results, 
                                      columns=['Blocking Criterion', 'Measure', 'ePBRN'])

In [21]:
blocking_results = blocking_results.merge(blocking_results_ePBRN, 
                                          left_on=['Blocking Criterion', 'Measure'], 
                                          right_on=['Blocking Criterion', 'Measure'])

In [22]:
blocking_results

Unnamed: 0,Blocking Criterion,Measure,FEBRL,ePBRN
0,given_name,nc,154898.0,250888.0
1,given_name,pc,65.74,58.41
2,given_name,rr,99.69,99.64
3,surname,nc,170843.0,32425.0
4,surname,pc,66.5,55.61
5,surname,rr,99.66,99.95
6,postcode,nc,53197.0,80049.0
7,postcode,pc,84.38,93.9
8,postcode,rr,99.89,99.88
9,All,nc,372073.0,359915.0


# 4.0 Creating the Paper’s Table 6

In [23]:
classification_results_FEBRL = pd.DataFrame(FEBRL_classification_results, 
                                            columns=['Model', 'pr(%)', 're(%)', 'fs(%)', 'fc'])
ePBRN_classification_results = pd.DataFrame(ePBRN_classification_results, 
                                            columns=['Model', 'pr(%)', 're(%)', 'fs(%)', 'fc'])

In [24]:
classification_results_FEBRL

Unnamed: 0,Model,pr(%),re(%),fs(%),fc
0,SVM,94.8,99.73,97.2,281
1,NN,96.71,99.63,98.15,184
2,LR,86.58,99.82,92.73,766
3,SVM-bag,96.23,99.65,97.91,208
4,NN-bag,96.96,99.63,98.28,171
5,LR-bag,87.34,99.82,93.16,717
6,Stack+Bag,97.7,99.61,98.64,134


In [25]:
ePBRN_classification_results

Unnamed: 0,Model,pr(%),re(%),fs(%),fc
0,SVM,32.5,99.16,48.95,5406
1,NN,69.82,97.36,81.32,1169
2,LR,60.78,97.82,74.97,1707
3,SVM-bag,38.16,98.78,55.05,4216
4,NN-bag,70.61,97.32,81.84,1129
5,LR-bag,61.41,97.82,75.45,1664
6,Stack+Bag,74.1,97.32,84.14,959
