In [1]:
import os
import json
from pathlib import Path
import pandas as pd
import dedupe
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv())

INFO:root:Generating grammar tables from /usr/lib/python3.6/lib2to3/Grammar.txt
INFO:root:Generating grammar tables from /usr/lib/python3.6/lib2to3/PatternGrammar.txt


True

In [2]:
# Translation table from the field names used in the dedupe training data /
# data model (keys) to the field names the rest of this notebook works with
# (values). Entry order is kept stable because the renaming loops below
# re-insert keys in this order.
DEDUPE_2_ERS_DATAFIELD_MAPPING = {
    # company master data
    'abs_name': 'name',
    'abs_legal_form': 'legal_form',
    'abs_register_number': 'register_number',
    'abs_hq_email': 'email',
    'abs_website': 'website',
    'abs_hq_phone': 'phone_number',
    'abs_taxid': 'vat_id',
    # headquarter address, flattened into dotted "address.*" names
    'abs_hq_street': 'address.street',
    'abs_hq_zip_code': 'address.postal_code',
    'abs_hq_city': 'address.city',
    'abs_hq_country': 'address.country',
    # identifier is passed through unchanged
    'record_id': 'record_id',
}

# First model

In [30]:
# Train the first gazetteer model from the labelled pairs exported on 2018-07-03.

# --- load labelled pairs and the field definitions -------------------------
trainset_filepath = str(Path(os.getenv('DATA_PATH')) / 'raw' / '2018-07-03-trainings_data_from_dedupe.json')
with open(trainset_filepath) as file:
    trainset = json.load(file)

with open(str(Path(os.getenv('MODELS_PATH')) / '2018-06-26-dedupeio_datamodel.json')) as file:
    fields = json.load(file)

# --- rename fields according to DEDUPE_2_ERS_DATAFIELD_MAPPING -------------
for field in fields:
    field['field'] = DEDUPE_2_ERS_DATAFIELD_MAPPING[field['field']]

# Every record of every labelled pair gets its keys renamed in place.
# A missing source key raises KeyError, exactly as before.
for label in ('match', 'distinct'):
    for pair in trainset[label]:
        for record in pair:
            for old_key, new_key in DEDUPE_2_ERS_DATAFIELD_MAPPING.items():
                record[new_key] = record.pop(old_key)

# --- split: first pairs feed gazetteer.sample, the rest is training data ---
# The first record of each of the first `n_sampling_pairs` pairs is used to
# call gazetteer.sample(); all remaining pairs are the actual training data.
n_sampling_pairs = 100
messy = {'d' + str(k): pair[0]
         for k, pair in enumerate(trainset['distinct'][:n_sampling_pairs])}
canonical = {'m' + str(k): pair[0]
             for k, pair in enumerate(trainset['match'][:n_sampling_pairs])}

# Start exactly where the sampling slice ends. The previous `[101:]` skipped
# the pair at index 100, which was then used for neither sampling nor training.
new_trainset = {'match': trainset['match'][n_sampling_pairs:],
                'distinct': trainset['distinct'][n_sampling_pairs:]}
new_trainset_filepath = str(Path(os.getenv('DATA_PATH')) / 'raw' / 'new_trainset.json')
with open(new_trainset_filepath, 'w') as fp:
    json.dump(new_trainset, fp)

# --- train and persist ------------------------------------------------------
gazetteer = dedupe.Gazetteer(fields)
gazetteer.sample(messy, canonical)

# readTraining expects a file object, hence the round trip through JSON on disk.
with open(new_trainset_filepath) as tf:
    gazetteer.readTraining(tf)

gazetteer.train()

model_output_filepath = str(Path(os.getenv('MODELS_PATH')) / 'very_first.model')
with open(model_output_filepath, 'wb') as sf:
    gazetteer.writeSettings(sf, index=True)

  % (sample_size, len(blocked_sample)))
INFO:dedupe.api:reading training from file
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, abs_name), SimplePredicate: (sameThreeCharStartPredicate, abs_name))
INFO:dedupe.training:(SimplePredicate: (commonSixGram, abs_hq_city), SimplePredicate: (commonTwoTokens, abs_hq_street))
INFO:dedupe.training:(SimplePredicate: (sameThreeCharStartPredicate, abs_name), SimplePredicate: (twoGramFingerprint, abs_hq_country))
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.100000, score 0.6938146892556963
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameThreeCharStartPredicate, abs_name), SimplePredicate: (suffixArray, abs_hq_zip_code))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, abs_name), SimplePredicate: (sameThreeCharStartPredicate, abs_name))
INFO:dedupe.training:(SimplePredicate:

## Model #2

In [4]:
# Train model #2: same data as the first model, smaller sampling slice,
# and a low recall setting for training.

# --- load labelled pairs and the field definitions -------------------------
trainset_filepath = str(Path(os.getenv('DATA_PATH')) / 'raw' / '2018-07-03-trainings_data_from_dedupe.json')
with open(trainset_filepath) as file:
    trainset = json.load(file)

with open(str(Path(os.getenv('MODELS_PATH')) / '2018-06-26-dedupeio_datamodel.json')) as file:
    fields = json.load(file)

# --- rename fields according to DEDUPE_2_ERS_DATAFIELD_MAPPING -------------
for field in fields:
    field['field'] = DEDUPE_2_ERS_DATAFIELD_MAPPING[field['field']]

# Every record of every labelled pair gets its keys renamed in place.
for label in ('match', 'distinct'):
    for pair in trainset[label]:
        for record in pair:
            for old_key, new_key in DEDUPE_2_ERS_DATAFIELD_MAPPING.items():
                record[new_key] = record.pop(old_key)

# --- split: first pairs feed gazetteer.sample, the rest is training data ---
n_sampling_pairs = 50
messy = {'d' + str(k): pair[0]
         for k, pair in enumerate(trainset['distinct'][:n_sampling_pairs])}
canonical = {'m' + str(k): pair[0]
             for k, pair in enumerate(trainset['match'][:n_sampling_pairs])}

# Start exactly where the sampling slice ends. The previous `[51:]` skipped
# the pair at index 50, which was then used for neither sampling nor training.
new_trainset = {'match': trainset['match'][n_sampling_pairs:],
                'distinct': trainset['distinct'][n_sampling_pairs:]}
new_trainset_filepath = str(Path(os.getenv('DATA_PATH')) / 'raw' / 'snd_new_trainset.json')
with open(new_trainset_filepath, 'w') as fp:
    json.dump(new_trainset, fp)

# --- train and persist ------------------------------------------------------
gazetteer = dedupe.Gazetteer(fields)
gazetteer.sample(messy, canonical)

# readTraining expects a file object, hence the round trip through JSON on disk.
with open(new_trainset_filepath) as tf:
    gazetteer.readTraining(tf)

# NOTE(review): per the dedupe docs, `recall` is the share of labelled matches
# the learned blocking rules must cover - confirm that 5% is intended here.
gazetteer.train(recall=.05)

model_output_filepath = str(Path(os.getenv('MODELS_PATH')) / 'second.model')
with open(model_output_filepath, 'wb') as sf:
    gazetteer.writeSettings(sf, index=True)

  % (sample_size, len(blocked_sample)))
INFO:dedupe.api:reading training from file
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, name), SimplePredicate: (sameThreeCharStartPredicate, name))
INFO:dedupe.training:(SimplePredicate: (commonFourGram, address.street), SimplePredicate: (nearIntegersPredicate, address.postal_code))
INFO:dedupe.training:(SimplePredicate: (fingerprint, address.country), SimplePredicate: (sameThreeCharStartPredicate, name))
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.100000, score 0.6073378014501305
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, name), SimplePredicate: (sameThreeCharStartPredicate, name))


## Model #3

In [3]:
# Train model #3: identical setup to model #2 except for a high recall setting.

# --- load labelled pairs and the field definitions -------------------------
trainset_filepath = str(Path(os.getenv('DATA_PATH')) / 'raw' / '2018-07-03-trainings_data_from_dedupe.json')
with open(trainset_filepath) as file:
    trainset = json.load(file)

with open(str(Path(os.getenv('MODELS_PATH')) / '2018-06-26-dedupeio_datamodel.json')) as file:
    fields = json.load(file)

# --- rename fields according to DEDUPE_2_ERS_DATAFIELD_MAPPING -------------
for field in fields:
    field['field'] = DEDUPE_2_ERS_DATAFIELD_MAPPING[field['field']]

# Every record of every labelled pair gets its keys renamed in place.
for label in ('match', 'distinct'):
    for pair in trainset[label]:
        for record in pair:
            for old_key, new_key in DEDUPE_2_ERS_DATAFIELD_MAPPING.items():
                record[new_key] = record.pop(old_key)

# --- split: first pairs feed gazetteer.sample, the rest is training data ---
n_sampling_pairs = 50
messy = {'d' + str(k): pair[0]
         for k, pair in enumerate(trainset['distinct'][:n_sampling_pairs])}
canonical = {'m' + str(k): pair[0]
             for k, pair in enumerate(trainset['match'][:n_sampling_pairs])}

# Start exactly where the sampling slice ends. The previous `[51:]` skipped
# the pair at index 50, which was then used for neither sampling nor training.
new_trainset = {'match': trainset['match'][n_sampling_pairs:],
                'distinct': trainset['distinct'][n_sampling_pairs:]}
# NOTE(review): this writes to the same 'snd_new_trainset.json' file that the
# model #2 cell uses. The content is the same split, so it is harmless, but a
# dedicated file name would make the provenance clearer.
new_trainset_filepath = str(Path(os.getenv('DATA_PATH')) / 'raw' / 'snd_new_trainset.json')
with open(new_trainset_filepath, 'w') as fp:
    json.dump(new_trainset, fp)

# --- train and persist ------------------------------------------------------
gazetteer = dedupe.Gazetteer(fields)
gazetteer.sample(messy, canonical)

# readTraining expects a file object, hence the round trip through JSON on disk.
with open(new_trainset_filepath) as tf:
    gazetteer.readTraining(tf)

# NOTE(review): per the dedupe docs, `recall` is the share of labelled matches
# the learned blocking rules must cover - here 95%.
gazetteer.train(recall=.95)

model_output_filepath = str(Path(os.getenv('MODELS_PATH')) / 'third.model')
with open(model_output_filepath, 'wb') as sf:
    gazetteer.writeSettings(sf, index=True)

  % (sample_size, len(blocked_sample)))
INFO:dedupe.api:reading training from file
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, name), SimplePredicate: (sameThreeCharStartPredicate, name))
INFO:dedupe.training:(SimplePredicate: (commonFourGram, address.street), SimplePredicate: (nearIntegersPredicate, address.postal_code))
INFO:dedupe.training:(SimplePredicate: (sameThreeCharStartPredicate, name), SimplePredicate: (sortedAcronym, address.country))
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.100000, score 0.5819708628105468
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, name), SimplePredicate: (sameThreeCharStartPredicate, name))
INFO:dedupe.training:(SimplePredicate: (commonFourGram, address.street), SimplePredicate: (nearIntegersPredicate, address.postal_code))


# Model trained on big data set

In [4]:
# Load the large labelled training set and the field definitions, then rename
# all field names according to DEDUPE_2_ERS_DATAFIELD_MAPPING.

# Path is built with pathlib, like every other path in this notebook, instead
# of string concatenation.
with open(str(Path(os.getenv('DATA_PATH')) / 'training-data-from-kantwert-buergle' / 'training_data.json')) as file:
    trainset = json.load(file)

with open(str(Path(os.getenv('MODELS_PATH')) / '2018-06-26-dedupeio_datamodel.json')) as file:
    fields = json.load(file)

for field in fields:
    field['field'] = DEDUPE_2_ERS_DATAFIELD_MAPPING[field['field']]

# Every record of every labelled pair gets its keys renamed in place.
# A missing source key raises KeyError, exactly as before.
for label in ('match', 'distinct'):
    for pair in trainset[label]:
        for record in pair:
            for old_key, new_key in DEDUPE_2_ERS_DATAFIELD_MAPPING.items():
                record[new_key] = record.pop(old_key)

In [5]:
# Number of labelled "match" pairs in the loaded training set.
len(trainset['match'])

167589

In [6]:
# The labelled pairs are split into two disjoint parts: the first
# NUM_FOR_SAMPLING pairs provide the records passed to gazetteer.sample(),
# the following NUM_FOR_TRAINING pairs are the actual training data.

NUM_FOR_SAMPLING = 1000
NUM_FOR_TRAINING = 5000

# Only the first record of each pair is used for sampling.
messy = {'d' + str(k): pair[0]
         for k, pair in enumerate(trainset['distinct'][:NUM_FOR_SAMPLING])}
canonical = {'m' + str(k): pair[0]
             for k, pair in enumerate(trainset['match'][:NUM_FOR_SAMPLING])}

# Start exactly where the sampling slice ends and take NUM_FOR_TRAINING pairs.
# The previous `NUM_FOR_SAMPLING+1` start skipped one pair and produced only
# NUM_FOR_TRAINING - 1 pairs, contradicting the "T{NUM_FOR_TRAINING}" tag in
# the model file name.
training_slice = slice(NUM_FOR_SAMPLING, NUM_FOR_SAMPLING + NUM_FOR_TRAINING)
new_trainset = {'match': trainset['match'][training_slice],
                'distinct': trainset['distinct'][training_slice]}
new_trainset_filepath = str(Path(os.getenv('DATA_PATH')) / 'raw' / 'tmp_trainset.json')
with open(new_trainset_filepath, 'w') as fp:
    json.dump(new_trainset, fp)

In [7]:
# Build a gazetteer from the renamed field definitions and hand it the records
# reserved for sampling.
gazetteer = dedupe.Gazetteer(fields)
gazetteer.sample(messy, canonical)

# Feed the labelled pairs written to disk by the split cell.
with open(new_trainset_filepath) as training_file:
    gazetteer.readTraining(training_file)

gazetteer.train()

# The file name records how many pairs were used for sampling (S) and training (T).
model_filename = f'201904120940_big_kantwert_S{NUM_FOR_SAMPLING}_T{NUM_FOR_TRAINING}.model'
model_output_filepath = str(Path(os.getenv('MODELS_PATH')) / model_filename)
with open(model_output_filepath, 'wb') as settings_file:
    gazetteer.writeSettings(settings_file, index=True)

INFO:dedupe.api:reading training from file
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, name), SimplePredicate: (sameSevenCharStartPredicate, address.city))
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (sameSevenCharStartPredicate, address.city))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, address.street), SimplePredicate: (sameSevenCharStartPredicate, address.city))
INFO:dedupe.training:(SimplePredicate: (sameSevenCharStartPredicate, address.city), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, address.postal_code), SimplePredicate: (suffixArray, name))
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.001000, score 0.9934036894499885
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePred