In [6]:
# !pip install recordlinkage

In [7]:
import pandas as pd
from collections import defaultdict
import recordlinkage
import recordlinkage.datasets as rl_data
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score
from deepRL import DeepRL

In [8]:
# Source data set (Harvard Dataverse):
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/JKBULA
names = ['id', 'first', 'last', 'address', 'city']
start_index = 0
df_org_source = pd.read_csv('Q.txt', delimiter=',', names=names)
df_dup_source = pd.read_csv('A.txt', delimiter=',', encoding="ISO-8859-1", names=names)
# keep every 10th row of A.txt, counting from start_index
df_dup_source = df_dup_source.iloc[np.arange(start_index, df_dup_source.shape[0], 10)]
# after subsampling, both frames must have the same shape
assert df_org_source.shape == df_dup_source.shape

In [9]:
# create dict of true links
def def_value():
    """Default factory for the `links` defaultdict: always returns False."""
    return False
      
# Maps each original-record id to the id of its duplicate; keys that were never
# stored fall back to def_value().
links = defaultdict(def_value)

# Walk the two id columns in lockstep rather than materialising a row Series
# with .iloc[i] on every step; the positional pairing is unchanged.
for org_id, dup_id in zip(df_org_source['id'], df_dup_source['id']):
  # sanity check: the ids must agree once the first character of each and the
  # last two characters of the duplicate id are ignored
  assert org_id[1:] == dup_id[1:-2], (org_id, dup_id)
  links[org_id] = dup_id


def true_class(org, dup):
  '''Return True if (org, dup) is a known true pair, False otherwise.

  org: id looked up as a key in `links`
  dup: id compared against the value stored for `org`

  `links` is a defaultdict, so `links[org]` would insert a new False entry
  for every unknown id. Using .get() keeps the lookup free of side effects
  while returning the same answer.
  '''
  return bool(links.get(org, False) == dup)

In [122]:
# no stored link may hold the False default value
assert not any(dup_id == False for dup_id in links.values())

In [11]:
# index both source frames by their 'id' column
df_org_source, df_dup_source = (
    frame.set_index('id') for frame in (df_org_source, df_dup_source)
)

In [12]:
# Normalise both source frames in place: lower-case every column, then split
# the address into a street-number column and a street column.
for df in [df_org_source, df_dup_source]:
  for col in df.columns:
    df[col] = df[col].str.lower()
  # text before the first space -> 'addr num', everything after it -> 'addr'
  address_parts = df['address'].str.partition(' ')
  df['addr num'] = address_parts[0]
  df['addr'] = address_parts[2]
  # The 'address' column is kept on purpose. The previous
  # `df = df.drop('address', axis=1)` only rebound the loop variable and never
  # changed either frame, and the candidate-pair indexer sorts on 'address'.

In [13]:
# create candidate pairs for source
# Sorted-neighbourhood indexing on the 'address' column with a window of 11.
# Built through recordlinkage.Index(), the same entry point used for the target
# data, instead of the older SortedNeighbourhoodIndex class name.
indexer = recordlinkage.Index()
indexer.sortedneighbourhood('address', window=11)

candidate_pairs_source = indexer.index(df_org_source, df_dup_source)

In [14]:
# add missing true matches to source because we know all labels for source data
# A local, seeded generator keeps the insertion positions (and so the order of
# candidate_pairs_source) reproducible after a kernel restart, without touching
# NumPy's global random state.
pair_rng = np.random.default_rng(0)
for id1, id2 in links.items():
  if (id1, id2) not in candidate_pairs_source:
    # place the known match at a random position instead of appending it
    insert_at = int(pair_rng.integers(len(candidate_pairs_source)))
    candidate_pairs_source = candidate_pairs_source.insert(insert_at, (id1, id2))

# check for False in links dict and candidate_pairs_source
assert not any(id2 == False for id1, id2 in links.items())
assert not any(id2 == False for id1, id2 in candidate_pairs_source)

In [15]:
# every known link has to be present among the candidate pairs
assert np.mean([pair in candidate_pairs_source for pair in links.items()]) == 1
# fraction of candidate pairs that are true matches
print('class balance: ', np.mean([links[org_id] == dup_id for org_id, dup_id in candidate_pairs_source]))

class balance:  0.1549714852467146


In [16]:
# true-match label for every source candidate pair, in candidate order
y_source_true = [true_class(*pair) for pair in candidate_pairs_source]

In [17]:
# target data: load_febrl4 with return_links=True; the first three items of the
# result are used as the original frame, the duplicate frame and the true links
data = rl_data.load_febrl4(return_links=True)
df_org_target = data[0]
df_dup_target = data[1]
df_links = data[2]

In [18]:
# candidate pairs for transfer learning
# Blocking on 'postcode' in both frames. Note that this rebinds `indexer`,
# which previously held the source-data indexer.
indexer = recordlinkage.Index()
indexer.block(left_on='postcode', right_on='postcode')
# pairs are generated between the original and the duplicate target frames
candidate_pairs_target = indexer.index(df_org_target, df_dup_target)

# create dict of true links for the target data
# NOTE: this rebinds def_value, replacing the False-returning version defined
# earlier for the source data.
def def_value():
    return "Not Present"

links_target = defaultdict(def_value)
# df_links yields (org, dup) id pairs; store each as org -> dup
links_target.update((org, dup) for org, dup in df_links)

def true_class_target(org, dup):
  '''Return 1 if (org, dup) is a true pair in the target data, 0 otherwise.

  org: id looked up as a key in `links_target`
  dup: id compared against the value stored for `org`

  `links_target` is a defaultdict, so `links_target[org]` would insert a
  "Not Present" entry for every unknown id. Using .get() with the same default
  returns the same answer without mutating the dict.
  '''
  return int(links_target.get(org, "Not Present") == dup)
     
# true-match label (1/0) for every target candidate pair, in candidate order
y_target_true = [true_class_target(*pair) for pair in candidate_pairs_target]

In [19]:
# check for missing-link markers in links_target and candidate_pairs_target
# links_target falls back to "Not Present" (not False) for unknown ids, so a
# comparison against False alone could never fail; test for both markers.
assert not any(id2 in (False, "Not Present") for id1, id2 in links_target.items())
assert not any(id2 in (False, "Not Present") for id1, id2 in candidate_pairs_target)

# Initialize DeepRL class and train embeddings

In [32]:
# Positional arguments, in order: source originals, source duplicates, source
# labels, source candidate pairs, target originals, target duplicates, target
# candidate pairs, and the integer 50.
# NOTE(review): the trailing 50 is presumably the embedding size (the model
# summary printed later shows a last dimension of 50) — confirm against DeepRL.
DL = DeepRL(df_org_source, df_dup_source, y_source_true, candidate_pairs_source, df_org_target, df_dup_target, 
            candidate_pairs_target, 50)

training embeddings complete
processing source data


100%|██████████| 64528/64528 [01:21<00:00, 790.56it/s]


processing source data complete
processing target data


100%|██████████| 28609/28609 [00:33<00:00, 847.18it/s]


processing target data complete


# Build source model and train

The source model is trained on the source data using full labels where the match/non-match status is known. We can transfer these parameters over to the target model and fine-tune.

In [33]:
# universal: bool selecting the type of BiGRU distance measure
# summary:   bool, display the model summary when True
DL.build_source_model(universal=False, summary=True)

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
org_input (InputLayer)          [(None, 10, 10, 50)] 0                                            
__________________________________________________________________________________________________
dup_input (InputLayer)          [(None, 10, 10, 50)] 0                                            
__________________________________________________________________________________________________
tf.compat.v1.transpose_6 (TFOpL (10, None, 10, 50)   0           org_input[0][0]                  
__________________________________________________________________________________________________
tf.compat.v1.transpose_7 (TFOpL (10, None, 10, 50)   0           dup_input[0][0]                  
____________________________________________________________________________________________

None

the model is trained using the following hyperparameters

   lr = 0.001, epochs = 10, batch_size = 32



In [34]:
# positional parameters, in order: lr, epochs, batch size
DL.train_source_model(0.001, 10, 32)

Epoch 1/10
 — val_f1: 0.8575525812619503 — val_precision: 0.8229357798165138 — val_recall 0.8952095808383234
Epoch 2/10
 — val_f1: 0.8164878599661208 — val_precision: 0.9401820546163849 — val_recall 0.7215568862275449
Epoch 3/10
 — val_f1: 0.8904440154440154 — val_precision: 0.8621495327102804 — val_recall 0.9206586826347305
Epoch 4/10
 — val_f1: 0.901324006994754 — val_precision: 0.9024512256128064 — val_recall 0.9001996007984032
Epoch 5/10
 — val_f1: 0.9106614017769002 — val_precision: 0.90087890625 — val_recall 0.9206586826347305
Epoch 6/10
 — val_f1: 0.9173658784642766 — val_precision: 0.9351995852773458 — val_recall 0.9001996007984032
Epoch 7/10
 — val_f1: 0.918918918918919 — val_precision: 0.8972895863052782 — val_recall 0.9416167664670658
Epoch 8/10
 — val_f1: 0.9156445556946182 — val_precision: 0.9186338523355098 — val_recall 0.9126746506986028
Epoch 9/10
 — val_f1: 0.9274432379072063 — val_precision: 0.91748046875 — val_recall 0.937624750499002
Epoch 10/10
 — val_f1: 0.9225806

# Build adaptation model and train

The parameters from the source model are transferred over to the adaptation model. For training we combine data from the source and target datasets and train on two target variables: one is the match/non-match status and the other is which dataset (source or target) a pair comes from. Because the match/non-match status of the target data is not known, we disregard the match/non-match loss for these pairs. We add a gradient reversal layer between the dataset classifier and the rest of the model. This layer makes it so that the BiGRU is trained to trick the dataset classifier. Even though we don't have labels for the target dataset, if the outputs of the BiGRU are indistinguishable between the source and target and the match/non-match classifier is performing well on source data, we have reason to believe that the performance on the target dataset will be reasonable.

In [35]:
# the single positional True requests the model summary to be displayed
DL.build_adaptation_model(True)

It is important to tune the match_weight and dataset_weight so that the dataset classifier cannot easily distinguish between source and target.

In [50]:
# positional parameters, in order: lr, epochs, batch_size, match_weight, dataset_weight
# NOTE(review): match_weight / dataset_weight presumably weight the match and
# dataset-classifier losses — confirm in DeepRL.train_adaptation_model.
DL.train_adaptation_model(0.001, 10, 32, 1.0, 0.1)
# trained with: lr = 0.001, epochs = 10, batch_size = 32, match_weight = 1.0, dataset_weight = 0.1

class balance on y_dataset:  0.5186749217672196
if dataset classifier in not "winning" the dataset accuracy should be close to class balance
Epoch 1/10
 — val_f1: 0.928529238038984 — val_precision: 0.9538834951456311 — val_recall 0.904487917146145
Epoch 2/10
 — val_f1: 0.9293044469783353 — val_precision: 0.9209039548022598 — val_recall 0.9378596087456847
Epoch 3/10
 — val_f1: 0.9329446064139941 — val_precision: 0.9456264775413712 — val_recall 0.9205983889528193
Epoch 4/10
 — val_f1: 0.9294947121034077 — val_precision: 0.9495798319327731 — val_recall 0.9102416570771001
Epoch 5/10
 — val_f1: 0.9267441860465117 — val_precision: 0.936545240893067 — val_recall 0.9171461449942463
Epoch 6/10
 — val_f1: 0.9174208144796381 — val_precision: 0.9021134593993326 — val_recall 0.9332566168009206
Epoch 7/10
 — val_f1: 0.9131156613171001 — val_precision: 0.8795309168443497 — val_recall 0.9493670886075949
Epoch 8/10
 — val_f1: 0.9177177177177179 — val_precision: 0.9597989949748744 — val_recall 0.8791714

# Build target model

The dataset adaptation model is trained to make predictions on two target variables. The build_target_model() method takes the parameters from the BiGRU and the match/non-match classifier from the adaptation model and creates a new model that just predicts the match/non-match status. 

In [51]:
# transfer:  True = transfer learning, False = build the model from scratch
# universal: bool selecting the kind of BiGRU
# summary:   bool, display the model summary when True
DL.build_target_model(transfer=True, universal=False, summary=False) 

# Active / Self Learning


The active_self_learning() method pulls out the n_certain_false highest confidence non-match and n_certain_true highest confidence true match pairs, automatically labels them and adds them to the labeled target data. The method also pulls out the n_uncertain lowest confidence pairs and we label them by hand using the clerical review function. This method uses the target model to predict on unlabeled data, so the build_target_model() method must be run before the active_self_learning() method can work properly.

In [123]:
# positional parameters, in order: n_certain_false, n_certain_true, n_uncertain
# (here 1000 confident non-matches, 100 confident matches, 10 uncertain pairs)
DL.active_self_learning(1000, 100, 10)

# Clerical Review

The clerical_review() method allows the user to label the low confidence pairs by hand and add them to the set of labeled target data.

In [117]:
# interactive: shows each pair and prompts "Enter label (True/False)" for it
# (see the captured output below)
DL.clerical_review()

Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3013-org,george,mccarthy,35,clubbe crescent,yakilli park,mentone,2540,vic,19350704,9460223
rec-3013-dup-0,georg e,mccarethy,35,clubbe crescent,yakill i park,mentone,2540,vic,19350704,9460223


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3369-org,callum,lowry,52,hallen close,ruglen,picnic point,2000,vic,19971123,3987913
rec-3369-dup-0,callum,lowry,52,ruglen,hallen close,picnic point,2000,vic,19971123,3987913


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4496-org,hari,warnock,8.0,jansz crescent,brentwood vlge,broadmeadows,2486,qld,19350219,7539077
rec-4496-dup-0,warnock,hark,,jansz cr escent,brentwood vlge,broadmeadows,2486,,19350219,7539077


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3057-org,sophie,paine,1,yambina crescent,sefton park,ruse,6210,nsw,19440101,3325291
rec-3057-dup-0,sophie,noble,1,yambina crescent,sefton park,rues,6210,nsw,19440101,3325291


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1703-org,jett,white,37,marshall street,avalind,greenacre,3500,wa,19141102,4115932
rec-2455-dup-0,talia,green,8,groom street,ikllke,medina,3500,wa,19800109,7656892


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4120-org,casey,flatman,3,wyselaskie circuit,,mentone,7250,vic,19931002,9564208
rec-822-dup-0,zachary,goode,39,alroy c ircuit,,broadmeadows,7250,nsw,19900208,3817430


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4514-org,hamish,beams,102,mackellar crescent,rosedale,,3340,,19670422,5928661
rec-4533-dup-0,lily,berry,77,crofts crescent,railway cttage,rose bay,3340,sa,19590415,6015647


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2317-org,lydia,heiland,12,roope close,garden est,yass,2454,nsw,19130103,6703072
rec-2317-dup-0,lydia,hindmarch,12,roope close,garden est,yass,2454,nsw,19130103,6703072


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2880-org,emiily,paterson,7,gruner street,platina,seaforth,5157,vic,19840318,7177837
rec-2880-dup-0,paterson,emiily,7,gruner street,plat ina,seaforth,5157,vic,19840318,7177837


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4253-org,harley,mclaine,419,john young crescent,,hamilton,2261,qld,19380511,5510205
rec-1025-dup-0,grace,maynard,2,woodgatestreet,,greenwood,2261,qld,19860314,4508059


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2028-org,zachary,minter,2,miller street,,port adelaide,2300,nsw,19580309,7193282
rec-2028-dup-0,zachary,minter,2,miller street,,graceville,2300,nsw,19580309,7193282


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-74-org,sonja,nguyen,2.0,eyre street,,lake clarendon,5023,nt,19591021,6742110
rec-1445-dup-0,paige,webb,,arthaldo court,,cardkff,5023,vic,19341104,6388140


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4555-org,caleb,purbell,53,banksia street,rosevillea,rye,4740,act,19490425,8230410
rec-4555-dup-0,caleb,,53,banksia street,rosevillea,rye,4740,act,19490425,8230410


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1938-org,noah,bishop,23,shannon circuit,eriswell park,south perth,4551,nsw,19210214,5100494
rec-1938-dup-0,noah,bishop,325,shannon circuit,eriswell park,south pefth,4551,nsw,19210214,5100494


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4166-org,ruby,mason,18,hurley street,,samford valley,3805,qld,19420728,1047066
rec-4166-dup-0,ruby,masoin,18,hurleystreet,openshaw,samford valley,3805,qld,19420728,1047066


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3010-org,thomas,quast,6,fiveash street,durraween,st clair,4565,vic,19730804,9009452
rec-3010-dup-0,thomas,quas,6,fiveash sreet,,st clair,4565,vic,19730804,9009452


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4313-org,lauren,mcveigh,,bremer street,werribee plaza,cranbourne north,2039,act,19591216,4278721
rec-4313-dup-0,mcveigh,lauren,,bremers treet,werribee lpaza,cranbourne north,2039,,19261230,4278721


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4546-org,arren,wiseman,305,morant circuit,the village condo 7,st albans,2027,vic,19660719,5442212
rec-4546-dup-0,etha,wiskeman,305,morant cifcuit,the village condo 7,st albans,2027,vic,19660719,5442212


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4798-org,bethany,crook,,rumker place,villa 115,thornbury,4380,wa,19701023,3361106
rec-386-dup-0,bradley,crook,140.0,jacka tcrescent,locn 1969,south bribane,4380,nsw,19480319,4421582


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2010-org,layla,donaldson,71,mannheim street,sandalford homestead,urangan,2713,wa,19320608,5103641
rec-2010-dup-0,,donaldson,71,mannheim street,sandalford komestead,urangan,2713,nsw,19320608,5103641


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4260-org,phoebe,pinny,5,beaurepaire crescent,pine hill,stoneville,2110,wa,19730518,1310519
rec-4260-dup-0,mia,neeb,5,,pine hill,stonevlile,2110,wa,19730518,1310519


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1542-org,kira,mac an,5,groom place,doctors flat,wategos beach,4207,vic,19341108,7247594
rec-2780-dup-0,,kothe,12,breona olace,lakes retirement estate,highton,4207,nsw,19620821,7592380


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3456-org,lucy,purdon,1,beauchamp street,,maida vale,3153,sa,19510318,1098632
rec-3456-dup-0,blake,purdvon,1,beauchamp street,francis chambers,maida vale,3153,sa,19510318,1098632


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3583-org,isabella,buchhorn,28,officer crescent,,oaklands park,5043,vic,19930522,7645759
rec-2158-dup-0,jenna,neumann,22,percy cresscent,,lennox head,5043,qld,19821024,7587719


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2627-org,seth,sherriff,1329,bean crescent,braeburn,the entrance,2153,vic,19320310.0,1727482
rec-3970-dup-0,,camp,28,sharwood crescent,,laverton,2153,vic,,8240507


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2931-org,,campton,31,kennewell place,palms lodge,coolbellup,4171,nsw,19030416,2809212
rec-1722-dup-0,zachary,mccury,9,blandon place,dina,mirrabooka,4171,nsw,19771202,3404376


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-405-org,jazz,dallas,82,wattle street,glenefer garden,clapham,3788,nsw,19780723,4927681
rec-405-dup-0,dallas,jazz,82,wattle street,glenefer garden,clapham,3788,nsw,19780723,4927681


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1148-org,emiily,reid,32,ina gregory circuit,,camira,3172,nsw,19041113,8734967
rec-1148-dup-0,,reid,32,ina gregory circuit,john flynn medical centre,camira,3172,nsw,19041113,8734967


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1282-org,,teteris,76.0,kellermann close,bondo,brighton,3030,nsw,19381004,5797955
rec-1282-dup-0,,teteirs,,kellermann close,b oneo,brighton,3030,nsw,19381004,5797955


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-192-org,madeline,huxley,7,murrell place,westbury,orange,3939,nsw,19850421,5740778
rec-192-dup-0,ryley,marzec,7,murrellplace,,orange,3939,nsw,19850421,5740778


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2168-org,chloe,white,10,madigan street,woodbine homestead,barraba,3055,nsw,19620821,4067329
rec-2168-dup-0,alicia,whiet,10,madigan dtreet,,barraba,3055,nsw,19620821,4067329


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1044-org,riley,lamprey,22,embley street,,hinchinbrook,3199,vic,19851029,8710851
rec-862-dup-0,riley,green,20,riorda n sreet,tillside,burwood,3199,vic,19531216,5368190


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3633-org,ryleh,sleath,9,archdall street,boxwood hill,sandy bay,2484,nsw,19550126.0,1791300
rec-3633-dup-0,ryleh,everett,9,,,sandy bay,2484,nsw,,1791300


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3841-org,esme,leslie,89,galleghan circuit,five oaks,brighton,6007,tas,19520521,7091838
rec-3841-dup-0,leslie,esme,3,galleghan circuit,five aoks,brighton,6007,tas,19520521,7091838


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1473-org,,leslie,925.0,carpenter close,,canterbury,2340,vic,19950608,2438058
rec-4658-dup-0,jack,danbty,,thompson polace,woorabinda,norwood,2340,vic,19991118,1030908


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4269-org,rebecca,coppock,3.0,homeleigh wallaroo road,tinaroo falls,embleton,6152,vic,19440414,2971980
rec-4393-dup-0,andtrea,coffey,,dampier cerscent,katimba,ashfield,6152,vic,19300416,7093245


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2432-org,isaiah,warde,61,mcphail place,sunbury,bonnyrigg,3929,vic,19090111,8475133
rec-2432-dup-0,warde,isaiah,14,mcphail place,subbury,bonnykrgg,3929,vic,19090111,8475133


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3610-org,zac,mashberg,32,barraclough crescent,mowbray,ballina,3184,nsw,19680330,8938791
rec-4318-dup-0,timothy,mathews,2,blandon place,harrowvsle,mill park,3184,nsw,19560930,6083907


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4157-org,william,demarco,21,shenton crescent,,south perth,3840,nsw,19170802,3018112
rec-1889-dup-0,matthew,nurse,64,cygnet rescent,,portarlington,3840,nsw,19670101,6591571


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4103-org,jack,badman,8,kardang street,gowrie,point samson,4670,nsw,19070912,1172421
rec-4103-dup-0,jack,drozdowski,8,kardang sgreet,gowrie,point samson,4670,nsw,19070912,1172421


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3113-org,bailey,pascoe,311,tillyard drive,lindenleigh,hamilton,4030,nsw,19960928,2933911
rec-3113-dup-0,liam,pascoe,311,tillyadr drive,lindenleigh,hamilton,4030,nsw,19960928,2933911


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3593-org,marleigh,sideris,42,henry melville crescent,bocking,morpeth,3039,sa,19251227,6987405
rec-1781-dup-0,jessica,lomman,28,wallis place,glengariffe,north sydney,3039,sa,19801217,6284781


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1385-org,joel,bishop,10,french street,cedarview,orange,3223,nt,,1324854
rec-1385-dup-0,elton,bishop,10,french street,,orange,3223,nt,,1324854


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-182-org,holly,webb,40,kirkland circuit,meemar,wahroonga,3149,sa,19790526,5682368
rec-182-dup-0,haklra,hokly,40,kirklandc ircuit,meear,wahroonga,3149,sa,19721209,5682368


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1898-org,william,croker,42,belmore garden,norellen,stephens,4207,qld,19190108,9397232
rec-1898-dup-0,croker,willism,42,belmore garden,norellen,stephens,4207,qld,19190108,9397232


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2506-org,teagan,wreford,17,preston street,wollartukkee,forest hill,4413,sa,19410923,2436763
rec-2506-dup-0,teagan,galbraith,17,preston street,wollartukkee,forest hill,4413,sa,19410923,2436763


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4997-org,kane,george,351,gosse street,balala station,minnamoolka,4061,vic,19571204,9672746
rec-4997-dup-0,kabe,george,351,gosse street,,minnamoolka,4061,vic,19571204,9672476


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2714-org,taylor,,17,kerkeri close,,south perth,4650,nt,19740805,4419884
rec-3302-dup-0,blaize,koopman,17,allison place,aldersydeestate,balwyn north,4650,nsw,19110608,7823755


Enter label (True/False) :False


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4136-org,,vincent,47,kent street,blue hills,forest hill,4655,vic,19190304,5807175
rec-4136-dup-0,,vincent,47,kentstreet,,forest hlil,4655,vic,19190304,5807175


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4552-org,sophie,mason,22,sturgess place,acadia,pacific paradise,3215,nsw,19790301,2531438
rec-3801-dup-0,sophie,hage,18,riley clwose,rocklea,seaforvh,3215,nsw,19771204,9725410


Enter label (True/False) :False
all data labeled


# Training target model

The train_target_model() method can be used to train the target model using only the labeled target data that was either supplied when the DeepRL class was initialized, acquired through self/active learning, or a combination of the two. This method should only be used after a sufficient amount of labeled target data is available. If there is minimal training data and this method is run, the model can easily overfit and perform poorly on the unlabeled data.

In [119]:
# positional parameters, in order: lr, epochs, batch_size
# optional further arguments: y_labels, indices for those labels in candidate pairs
DL.train_target_model(0.0001, 1, 32)

 — val_f1: 0.9928057553956835 — val_precision: 0.9857142857142858 — val_recall 1.0


# Notes on overall training process

The DeepRL class can be used in many different ways. If there is sufficient labeled target data the user can just initialize the class and immediately train on the target data. In situations where there is a limited amount of labeled target data, dataset adaptation/self/active learning should be used. In this case it might be wise to initially iterate between the train_adaptation_model() and active_self_learning() methods. As labeled target data is accumulated it is added to the training data for the dataset adaptation model, so during each additional round of self/active learning more labeled target data will be available during training with the train_adaptation_model() method. We need to be careful not to switch to the target model for self/active learning too early in the process. If the target model is trained when there is only a small amount of labeled target data, the model can perform very poorly. If this model has poor performance and it is used for self learning, the data that is automatically labeled can be labeled incorrectly. This will lead to training on incorrectly labeled data, which will be very detrimental to model performance. In preliminary experimentation it seems around 4000 labeled target pairs leads to reasonable performance when training the target model. After there is sufficient labeled target data the user can switch over to iterating between train_target_model() and active_self_learning().

In [118]:
# In this contrived scenario the true match status of the target data is known,
# so we can check whether any pair added to the labeled target data was
# labeled incorrectly.
# we can see below we have accuracy, precision and recall all equal to 1
# In a real-world scenario this step would not be possible.
y_true_f = [true_class_target(*candidate_pairs_target[i]) for i in DL.y_target_indices]
y_pred_f = DL.y_target

for metric_name, metric_fn in (('accuracy', accuracy_score),
                               ('recall', recall_score),
                               ('precision', precision_score)):
  print(metric_name, metric_fn(y_true_f, y_pred_f))

accuracy 1.0
recall 1.0
precision 1.0


In [107]:
# check accuracy, recall and precision for non-labeled target data
labeled_idx = DL.y_target_indices
# drop the already-labeled rows from both embedding arrays (originals first, then duplicates)
unlabeled_inputs = [np.delete(embed, labeled_idx, axis=0)
                    for embed in (DL.org_target_embed, DL.dup_target_embed)]
# threshold the model output at 0.5 to obtain boolean predictions
y_target_pred = DL.model_target.predict(x=unlabeled_inputs) > 0.5
y_target_true_delete = np.delete(y_target_true, labeled_idx, axis=0)
for metric_name, metric_fn in (('accuracy', accuracy_score),
                               ('recall', recall_score),
                               ('precision', precision_score)):
  print(metric_name, metric_fn(y_target_true_delete, y_target_pred))

accuracy 0.9767716324231532
recall 0.9275715155203895
precision 0.9398704902867715


In [121]:
# the results shown here are after 4 self/active learning training iterations;
# the last of these iterations consisted only of hand labeling 50 pairs