#### Always execute the following cells first to initialize the notebook:

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
from pathlib import Path
import math
import pandas as pd
import numpy as np
from dotenv import find_dotenv, load_dotenv
import requests
# import enforce
import logging
import uuid

import backtester
from ers_experimenter import ERSExperiment
import utils

pd.set_option('display.max_colwidth', 500)

def human_readable_number(num):
    """Format a number as a short, human-readable string.

    Rules, checked in this order on the absolute value of ``num``:

    * ``None`` or a float NaN -> ``"NaN"``
    * more than 1e9           -> billions, e.g. ``"2.50 bn"``
    * more than 1e6           -> millions, e.g. ``"3.00 m"``
    * more than 1e4           -> thousands, e.g. ``"20.00 k"``
    * at most 1.0             -> treated as a fraction and shown as a
      percentage, e.g. ``0.999`` -> ``"99.9 %"``
    * anything else           -> ``str(num)``

    The sign is preserved, so ``-15000`` becomes ``"-15.00 k"``.
    Values that are not numeric (``abs`` is not supported) are returned as
    ``str(num)`` instead of raising.

    NOTE(review): integers 0 and 1 also fall into the percentage branch
    (``1`` -> ``"100.0 %"``); confirm that callers never pass small counts.

    :param num: the value to format; may be ``None``.
    :return: the formatted string.
    """
    if num is None:
        return "NaN"
    try:
        magnitude = abs(num)
    except TypeError:
        # Non-numeric values (e.g. strings) are shown verbatim instead of crashing.
        return str(num)
    if magnitude != magnitude:
        # NaN is the only value that is not equal to itself; show it like None.
        return "NaN"
    # Compare the magnitude so negative values are abbreviated like positive ones,
    # but format the signed value so the sign survives.
    if magnitude > 1e9:
        return "%.2f bn" % (num/1e9)
    if magnitude > 1e6:
        return "%.2f m" % (num/1e6)
    if magnitude > 1e4:
        return "%.2f k" % (num/1e3)
    if magnitude <= 1.0:
        return "%.1f %%" % (100.0*num)
    return str(num)

def pp(dct: dict):
    """Print each entry of ``dct`` on its own line as ``key: value``.

    Every value is passed through ``human_readable_number`` before printing.
    """
    for label, value in dct.items():
        print(f'{label}: {human_readable_number(value)}')
        
        
def check_results_of_experiment(experiment_name):
    """Print the stats of a previously run experiment and return its confusion matrix.

    A fresh ``ERSExperiment`` is created, ``load_results(experiment_name)`` is
    called on it, its ``stats()`` are printed via ``pp``, and finally its
    ``confusion_matrix`` attribute is returned.

    :param experiment_name: identifier handed to ``ERSExperiment.load_results``.
    :return: the ``confusion_matrix`` of the loaded experiment.
    """
    stored_run = ERSExperiment()
    stored_run.load_results(experiment_name)
    summary = stored_run.stats()
    pp(summary)
    return stored_run.confusion_matrix

In [3]:
# Raise the level of the HTTP client loggers to WARNING to keep the cell output readable.
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
# NOTE(review): presumably read by the project modules to configure their logging — confirm.
os.environ['PYTHON_LOG_LEVEL'] = 'info'
# Path.home() resolves the home directory even when HOME is unset,
# whereas Path(os.environ.get('HOME')) raised a TypeError in that case.
base_path_to_data = Path.home() / 'host' / 'data' / 'processed'

In [4]:
cd ../..

/home/datascientist/host


In [4]:
# for connecting to the Kubernetes cluster:
# os.environ['ERS_BASEURI'] = 'http://localhost:8001/api/v1/namespaces/valdon-csf-staging/services/entity-recognition-service:80/proxy/ers_service'

# Run experiments

In [5]:
os.environ['ERS_BASEURI']

'http://ers:8080'

In [5]:
# Ask the ERS for its build/model info. The timeout keeps this cell from
# hanging indefinitely when the service is unreachable.
response = requests.get(os.environ['ERS_BASEURI'] + '/aux_service/info', timeout=30)
response.text

'{"git-revision":"23399d7158ca2e8e7d1f8058e0d9b61eea98e769","dedupe-infos":"dedupe_model_filepath=/usr/local/lib/entity-recognition-service/python/resources/latest.model;DEDUPE_MIN_SIMILARITY_FOR_A_MATCH=.5","environment-variables":{"DEDUPE_SEARCH_MAX_THRESHOLD":"5","DEDUPE_MODEL":"latest.model"}}'

In [7]:
# list available data sets:
list(base_path_to_data.glob('*.pkl'))

[PosixPath('/home/datascientist/host/data/processed/2018-07-03-big_set_with_limited_address_and_manipulated_information_golden_data.pkl'),
 PosixPath('/home/datascientist/host/data/processed/2018-07-03-big_set_with_limited_address_information_golden_data.pkl'),
 PosixPath('/home/datascientist/host/data/processed/2018-07-03-big_set_with_full_but_manipulated_information_golden_data.pkl'),
 PosixPath('/home/datascientist/host/data/processed/2018-07-03-big_set_with_full_information_golden_data.pkl'),
 PosixPath('/home/datascientist/host/data/processed/2018-07-03-for_unit_tests_evaluation_data.pkl'),
 PosixPath('/home/datascientist/host/data/processed/2018-07-03-for_unit_tests_golden_data.pkl'),
 PosixPath('/home/datascientist/host/data/processed/2018-07-03-small_set_with_full_information_golden_data.pkl'),
 PosixPath('/home/datascientist/host/data/processed/2018-07-03-small_set_with_limited_address_information_evaluation_data.pkl'),
 PosixPath('/home/datascientist/host/data/processed/2018-

### Unit Testing

In [33]:
os.environ['EXPERIMENT_ID'] = 'unit-tests'
%time experiment = ERSExperiment(base_path_to_data / '2018-07-03-for_unit_tests_golden_data.pkl')

15:01:17.013 - INFO - Starting to populate ERS
15:01:18.013 - INFO - Calling ERS on http://192.168.16.1:8080/ers_service with 11 records for the golden data set.
15:01:19.013 - INFO - The ERS was successfully populated with 100.0 % of 11 the loaded records.
15:01:19.013 - INFO - Upload of 0 records failed.
15:01:19.013 - INFO - Processing the golden data set took 0:00:02.512708 time.
CPU times: user 70 ms, sys: 20 ms, total: 90 ms
Wall time: 2.55 s


In [35]:
exp_id, _ = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-for_unit_tests_evaluation_data.pkl')

15:02:29.013 - INFO - Calling ERS to match http://192.168.16.1:8080/ers_service with 10 records from the evaluation data set against the golden data.
15:02:29.013 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/unit-tests.
15:02:31.013 - INFO - Final result:
15:02:31.013 - INFO - Accuracy is 90.91 %
15:02:31.013 - INFO - Processing the evaluation data set took 0:00:01.653439 time.


In [20]:
experiment.true_and_predicted_ids

Unnamed: 0,evaluation_row_id,true,predicted,similarity
0,0,889d8c59-4a69-4bc1-a5b0-5f5b2c571a51,889d8c59-4a69-4bc1-a5b0-5f5b2c571a51,0.897808
1,1,,,
2,2,,,
3,3,94d8923f-ca3b-484e-b1bb-682d1fbcd7a7,badbad_duplicate,0.922307
4,3,94d8923f-ca3b-484e-b1bb-682d1fbcd7a7,94d8923f-ca3b-484e-b1bb-682d1fbcd7a7,0.922307
5,4,30f9196c-cf14-491f-a4b3-b5f3ee77cea3,30f9196c-cf14-491f-a4b3-b5f3ee77cea3,0.921539
6,5,bcc4bfe5-3fc6-466a-ac8d-859050c33c98,bcc4bfe5-3fc6-466a-ac8d-859050c33c98,0.894242
7,6,0bd395f6-b73e-11e8-a6b9-0242c0a81002,0bd395f6-b73e-11e8-a6b9-0242c0a81002,0.895375
8,7,,,
9,8,,,


## Small full information

In [6]:
%time experiment = ERSExperiment(base_path_to_data / '2018-07-03-small_set_with_full_information_golden_data.pkl')

13:06:13.019 - INFO - Starting to populate ERS
13:06:13.019 - INFO - Calling ERS on http://ers:8080 with 998 records for the golden data set.
13:06:20.019 - INFO - At step 100 avg processing time per company entry was 0.0708 ms
13:06:25.019 - INFO - At step 200 avg processing time per company entry was 0.0516 ms
13:06:30.019 - INFO - At step 300 avg processing time per company entry was 0.0460 ms
13:06:34.019 - INFO - At step 400 avg processing time per company entry was 0.0441 ms
13:06:38.019 - INFO - At step 500 avg processing time per company entry was 0.0393 ms
13:06:43.019 - INFO - At step 600 avg processing time per company entry was 0.0440 ms
13:06:48.019 - INFO - At step 700 avg processing time per company entry was 0.0500 ms
13:06:52.019 - INFO - At step 800 avg processing time per company entry was 0.0428 ms
13:06:56.019 - INFO - At step 900 avg processing time per company entry was 0.0458 ms
13:07:00.019 - INFO - The ERS was successfully populated with 100.0 % of 998 the loa

In [20]:
# experiment = ERSExperiment(base_path_to_data / '2018-07-03-small_golden_data_full_information.pkl', skip_upload_to_ers=True)

In [7]:
os.environ['EXPERIMENT_ID'] = 'kantwert-S1000_T10000-on-small-full-infos'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_information_evaluation_data.pkl')

10:10:31.017 - INFO - Calling ERS to match http://ers:8080 with 1011 records from the evaluation data set against the golden data.
10:10:31.017 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/kantwert-S1000_T10000-on-small-full-infos/results.
10:10:46.017 - INFO - Matched 100 of 1011 9.89 % entries.
10:10:46.017 - INFO - Accuracy is 100.00 %
10:11:01.017 - INFO - Matched 200 of 1011 19.78 % entries.
10:11:01.017 - INFO - Accuracy is 100.00 %
10:11:17.017 - INFO - Matched 300 of 1011 29.67 % entries.
10:11:17.017 - INFO - Accuracy is 100.00 %
10:11:31.017 - INFO - Matched 400 of 1011 39.56 % entries.
10:11:31.017 - INFO - Accuracy is 100.00 %
10:11:48.017 - INFO - Matched 500 of 1011 49.46 % entries.
10:11:48.017 - INFO - Accuracy is 100.00 %
10:12:03.017 - INFO - Matched 600 of 1011 59.35 % entries.
10:12:03.017 - INFO - Accuracy is 100.00 %
10:12:17.017 - INFO - Matched 700 of 1011 69.24 % entries.
10:12:17.017 - INFO - Accuracy is 100.00 

In [20]:
check_results_of_experiment('kantwert-S1000_T10000-on-small-full-infos')

# golden samples: 998
# evaluation samples: 1011
accuracy: 99.9 %
recall: 100.0 %
precision: 99.8 %
f-score: 99.9 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,509,1
found_no_match,0,501


In [7]:
os.environ['EXPERIMENT_ID'] = 'kantwert-S10000_T1000-on-small-full-infos'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_information_evaluation_data.pkl')
pp(experiment.stats())

13:07:01.019 - INFO - Calling ERS to match http://ers:8080 with 1011 records from the evaluation data set against the golden data.
13:07:01.019 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/kantwert-S10000_T1000-on-small-full-infos/results.
13:07:15.019 - INFO - Matched 100 of 1011 9.89 % entries.
13:07:15.019 - INFO - Accuracy is 99.01 %
13:07:27.019 - INFO - Matched 200 of 1011 19.78 % entries.
13:07:27.019 - INFO - Accuracy is 99.50 %
13:07:42.019 - INFO - Matched 300 of 1011 29.67 % entries.
13:07:42.019 - INFO - Accuracy is 99.67 %
13:07:55.019 - INFO - Matched 400 of 1011 39.56 % entries.
13:07:55.019 - INFO - Accuracy is 99.75 %
13:08:11.019 - INFO - Matched 500 of 1011 49.46 % entries.
13:08:11.019 - INFO - Accuracy is 99.80 %
13:08:26.019 - INFO - Matched 600 of 1011 59.35 % entries.
13:08:26.019 - INFO - Accuracy is 99.83 %
13:08:39.019 - INFO - Matched 700 of 1011 69.24 % entries.
13:08:39.019 - INFO - Accuracy is 99.86 %
13:08

In [21]:
check_results_of_experiment('kantwert-S10000_T1000-on-small-full-infos')

# golden samples: 998
# evaluation samples: 1011
accuracy: 99.9 %
recall: 100.0 %
precision: 99.8 %
f-score: 99.9 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,509,1
found_no_match,0,502


In [9]:
os.environ['EXPERIMENT_ID'] = 'small-full-infos'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_information_evaluation_data.pkl')

10:19:08.022 - INFO - Calling ERS to match http://172.18.0.1:8080/ers_service with 1011 records from the evaluation data set against the golden data.
10:19:08.022 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/small-full-infos.
10:19:46.022 - INFO - Matched 100 of 1011 9.89 % entries.
10:19:46.022 - INFO - Accuracy is: 100.00 %
10:20:20.022 - INFO - Matched 200 of 1011 19.78 % entries.
10:20:20.022 - INFO - Accuracy is: 100.00 %
10:21:10.022 - INFO - Matched 300 of 1011 29.67 % entries.
10:21:10.022 - INFO - Accuracy is: 100.00 %
10:21:57.022 - INFO - Matched 400 of 1011 39.56 % entries.
10:21:57.022 - INFO - Accuracy is: 100.00 %
10:22:56.022 - INFO - Matched 500 of 1011 49.46 % entries.
10:22:56.022 - INFO - Accuracy is: 100.00 %
10:23:55.022 - INFO - Matched 600 of 1011 59.35 % entries.
10:23:55.022 - INFO - Accuracy is: 100.00 %
10:24:51.022 - INFO - Matched 700 of 1011 69.24 % entries.
10:24:51.022 - INFO - Accuracy is: 100.00 %
10:25

In [11]:
exp_id, results_path

('small-full-infos',
 PosixPath('/home/datascientist/host/experiments/small-full-infos'))

In [10]:
experiment.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,509,0
found_no_match,0,502


In [19]:
os.environ['EXPERIMENT_ID'] = 'small-full-infos-third-model'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_information_evaluation_data.pkl')

15:57:28.003 - INFO - Calling ERS to match http://ers:8080/ers_service with 1011 records from the evaluation data set against the golden data.
15:57:28.003 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/small-full-infos-third-model.
15:58:07.003 - INFO - Matched 100 of 1011 9.89 % entries.
15:58:07.003 - INFO - Accuracy is: 100.00 %
15:58:42.003 - INFO - Matched 200 of 1011 19.78 % entries.
15:58:42.003 - INFO - Accuracy is: 100.00 %
15:59:25.003 - INFO - Matched 300 of 1011 29.67 % entries.
15:59:25.003 - INFO - Accuracy is: 100.00 %
15:59:58.003 - INFO - Matched 400 of 1011 39.56 % entries.
15:59:58.003 - INFO - Accuracy is: 100.00 %
16:00:41.003 - INFO - Matched 500 of 1011 49.46 % entries.
16:00:41.003 - INFO - Accuracy is: 100.00 %
16:01:20.003 - INFO - Matched 600 of 1011 59.35 % entries.
16:01:20.003 - INFO - Accuracy is: 100.00 %
16:01:57.003 - INFO - Matched 700 of 1011 69.24 % entries.
16:01:57.003 - INFO - Accuracy is: 100.00 %


In [20]:
experiment.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,509,0
found_no_match,0,502


## Small limited address information

In [21]:
%time experiment = ERSExperiment(base_path_to_data / "2018-07-03-small_set_with_limited_address_information_golden_data.pkl")

16:08:03.003 - INFO - Starting to populate ERS
16:08:03.003 - INFO - Calling ERS on http://ers:8080/ers_service with 998 records for the golden data set.
16:08:49.003 - INFO - The ERS was successfully populated with 100.0 % of 998 the loaded records.
16:08:49.003 - INFO - Upload of 0 records failed.
16:08:49.003 - INFO - Processing the golden data set took 0:00:45.800147 time.
CPU times: user 4.55 s, sys: 520 ms, total: 5.07 s
Wall time: 45.8 s


In [9]:
os.environ['EXPERIMENT_ID'] = 'small-limited-address-infos'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_limited_address_information_evaluation_data.pkl')

11:14:00.022 - INFO - Calling ERS to match http://172.18.0.1:8080/ers_service with 1011 records from the evaluation data set against the golden data.
11:14:00.022 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/small-limited-address-infos.
11:14:30.022 - INFO - Matched 100 of 1011 9.89 % entries.
11:14:30.022 - INFO - Accuracy is: 100.00 %
11:14:58.022 - INFO - Matched 200 of 1011 19.78 % entries.
11:14:58.022 - INFO - Accuracy is: 99.50 %
11:15:36.022 - INFO - Matched 300 of 1011 29.67 % entries.
11:15:36.022 - INFO - Accuracy is: 99.67 %
11:16:16.022 - INFO - Matched 400 of 1011 39.56 % entries.
11:16:16.022 - INFO - Accuracy is: 99.75 %
11:17:08.022 - INFO - Matched 500 of 1011 49.46 % entries.
11:17:08.022 - INFO - Accuracy is: 99.80 %
11:17:58.022 - INFO - Matched 600 of 1011 59.35 % entries.
11:17:58.022 - INFO - Accuracy is: 99.83 %
11:18:47.022 - INFO - Matched 700 of 1011 69.24 % entries.
11:18:47.022 - INFO - Accuracy is: 99.57 %


In [10]:
experiment.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,508,3
found_no_match,1,499


In [20]:
experiment.false_positives

Unnamed: 0_level_0,Unnamed: 1_level_0,founded,searched
Match-Id,Attribute,Unnamed: 2_level_1,Unnamed: 3_level_1
620,ersId,4b887052-9c8d-11e8-9a84-0242ac120002,
620,name,A E R O -Verpackungsgesellschaft m.b.H.,A E R O Verpackungsgesellschaft mbH
620,email,geldern@aeropack.de,info@aeropack.de
620,phoneNumber,+496359937070,+496359937070
620,website,http://http//www.aeropack.de,http://http//www.aeropack.de
620,address.street,Zeppelinstr. 16,
620,address.city,Geldern,
678,ersId,4b4e21b8-9c8d-11e8-9a84-0242ac120002,
678,name,A. Fischer GmbH,A. Fischer & Söhne GmbH
678,email,info@fischer-kronach.de,bochum@fischersoehne.de


In [22]:
os.environ['EXPERIMENT_ID'] = 'small-limited-address-infos-third-model'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_limited_address_information_evaluation_data.pkl')

16:10:02.003 - INFO - Calling ERS to match http://ers:8080/ers_service with 1011 records from the evaluation data set against the golden data.
16:10:02.003 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/small-limited-address-infos-third-model.
16:10:39.003 - INFO - Matched 100 of 1011 9.89 % entries.
16:10:39.003 - INFO - Accuracy is: 98.00 %
16:11:11.003 - INFO - Matched 200 of 1011 19.78 % entries.
16:11:11.003 - INFO - Accuracy is: 98.50 %
16:11:51.003 - INFO - Matched 300 of 1011 29.67 % entries.
16:11:51.003 - INFO - Accuracy is: 98.67 %
16:12:23.003 - INFO - Matched 400 of 1011 39.56 % entries.
16:12:23.003 - INFO - Accuracy is: 99.00 %
16:13:04.003 - INFO - Matched 500 of 1011 49.46 % entries.
16:13:04.003 - INFO - Accuracy is: 99.20 %
16:13:38.003 - INFO - Matched 600 of 1011 59.35 % entries.
16:13:38.003 - INFO - Accuracy is: 99.33 %
16:14:13.003 - INFO - Matched 700 of 1011 69.24 % entries.
16:14:13.003 - INFO - Accuracy is: 99.0

In [23]:
check_results_of_experiment('small-limited-address-infos-third-model')

# golden samples: 998
# evaluation samples: 1011
accuracy: 99.2 %
recall: 100.0 %
precision: 98.5 %
f-score: 99.2 %
number of too ambiguous matches: NaN


Unnamed: 0,was_a_match,was_no_match
found_a_match,509,8
found_no_match,0,494


In [25]:
experiment.false_positives

Unnamed: 0_level_0,Unnamed: 1_level_0,founded,searched
Match-Id,Attribute,Unnamed: 2_level_1,Unnamed: 3_level_1
3,ersId,f48fb1e0-af63-11e8-b965-0242ac130003,
3,name,ALPHA LASER GmbH,ALPHA CERAMICS GmbH
3,email,info@alphalaser.de,info@alpha-ceramics.de
3,phoneNumber,+49898902370,+49241160030
3,website,http://http//www.alphalaser.de,http://http//www.alpha-ceramics.de
39,ersId,f486deda-af63-11e8-b965-0242ac130003,
39,name,APART GmbH,APART Fashion GmbH
39,email,info@apart.de,service@apart-fashion.de
39,phoneNumber,+49632797480,+494065033940
39,website,http://http//www.apart.de,http://http//www.apart-fashion.de


## Small full but manipulated information

In [5]:
%time experiment = ERSExperiment(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_golden_data.pkl')

20:51:33.020 - INFO - Starting to populate ERS
20:51:34.020 - INFO - Calling ERS on http://ers:8080 with 998 records for the golden data set.
20:51:40.020 - INFO - At step 100 avg processing time per company entry was 0.0675 ms
20:51:45.020 - INFO - At step 200 avg processing time per company entry was 0.0458 ms
20:51:49.020 - INFO - At step 300 avg processing time per company entry was 0.0408 ms
20:51:53.020 - INFO - At step 400 avg processing time per company entry was 0.0383 ms
20:51:56.020 - INFO - At step 500 avg processing time per company entry was 0.0348 ms
20:52:00.020 - INFO - At step 600 avg processing time per company entry was 0.0352 ms
20:52:04.020 - INFO - At step 700 avg processing time per company entry was 0.0400 ms
20:52:07.020 - INFO - At step 800 avg processing time per company entry was 0.0353 ms
20:52:11.020 - INFO - At step 900 avg processing time per company entry was 0.0333 ms
20:52:14.020 - INFO - The ERS was successfully populated with 100.0 % of 998 the loa

In [10]:
os.environ['EXPERIMENT_ID'] = 'kantwert-S1000_T10000-on-small-full-but-manipulated'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_evaluation_data.pkl')

10:14:30.017 - INFO - Calling ERS to match http://ers:8080 with 1011 records from the evaluation data set against the golden data.
10:14:30.017 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/kantwert-S1000_T10000-on-small-full-but-manipulated/results.
10:14:40.017 - INFO - Matched 100 of 1011 9.89 % entries.
10:14:40.017 - INFO - Accuracy is 81.00 %
10:14:49.017 - INFO - Matched 200 of 1011 19.78 % entries.
10:14:49.017 - INFO - Accuracy is 83.00 %
10:15:00.017 - INFO - Matched 300 of 1011 29.67 % entries.
10:15:00.017 - INFO - Accuracy is 82.33 %
10:15:09.017 - INFO - Matched 400 of 1011 39.56 % entries.
10:15:09.017 - INFO - Accuracy is 82.75 %
10:15:22.017 - INFO - Matched 500 of 1011 49.46 % entries.
10:15:22.017 - INFO - Accuracy is 83.20 %
10:15:32.017 - INFO - Matched 600 of 1011 59.35 % entries.
10:15:32.017 - INFO - Accuracy is 82.83 %
10:15:42.017 - INFO - Matched 700 of 1011 69.24 % entries.
10:15:42.017 - INFO - Accuracy is 83.

In [24]:
check_results_of_experiment('kantwert-S1000_T10000-on-small-full-but-manipulated')

# golden samples: 998
# evaluation samples: 1011
accuracy: 83.0 %
recall: 66.2 %
precision: 100.0 %
f-score: 79.7 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,337,0
found_no_match,172,502


In [6]:
check_results_of_experiment('2019-04-19T234325_kantwert-S1000-T10000-sanitized')

# golden samples: 998
# evaluation samples: 1011
accuracy: 55.0 %
recall: 10.6 %
precision: 100.0 %
f-score: 19.2 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,54,0
found_no_match,455,502


In [5]:
check_results_of_experiment('2019-04-20T213552_kantwert-S1000-T10000-sanitized_again')

# golden samples: 998
# evaluation samples: 1011
accuracy: 60.1 %
recall: 20.8 %
precision: 100.0 %
f-score: 34.5 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,106,0
found_no_match,403,502


In [5]:
# BEST:
check_results_of_experiment('2019-04-20T215328_kantwert-S1000-T10000')

# golden samples: 998
# evaluation samples: 1011
accuracy: 83.4 %
recall: 67.0 %
precision: 100.0 %
f-score: 80.2 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,341,0
found_no_match,168,502


In [6]:
os.environ['EXPERIMENT_ID'] = 'kantwert-S1000_T10000-on-small-full-but-manipulated-xcheck-model-from-2019-04-20T215328'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_evaluation_data.pkl')

20:53:47.020 - INFO - Calling ERS to match http://ers:8080 with 1011 records from the evaluation data set against the golden data.
20:53:47.020 - INFO - Going to store all results of the experiment at data/experiments/kantwert-S1000_T10000-on-small-full-but-manipulated-xcheck-model-from-2019-04-20T215328/results.
20:53:51.020 - INFO - Matched 100 of 1011 9.89 % entries.
20:53:51.020 - INFO - Accuracy is 56.00 %
20:53:55.020 - INFO - Matched 200 of 1011 19.78 % entries.
20:53:56.020 - INFO - Accuracy is 62.50 %
20:54:00.020 - INFO - Matched 300 of 1011 29.67 % entries.
20:54:00.020 - INFO - Accuracy is 61.00 %
20:54:05.020 - INFO - Matched 400 of 1011 39.56 % entries.
20:54:05.020 - INFO - Accuracy is 62.75 %
20:54:10.020 - INFO - Matched 500 of 1011 49.46 % entries.
20:54:10.020 - INFO - Accuracy is 60.80 %
20:54:14.020 - INFO - Matched 600 of 1011 59.35 % entries.
20:54:14.020 - INFO - Accuracy is 60.50 %
20:54:17.020 - INFO - Matched 700 of 1011 69.24 % entries.
20:54:17.020 - INFO -

In [7]:
check_results_of_experiment('kantwert-S1000_T10000-on-small-full-but-manipulated-xcheck-model-from-2019-04-20T215328')

# golden samples: 998
# evaluation samples: 1011
accuracy: 60.1 %
recall: 20.8 %
precision: 100.0 %
f-score: 34.5 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,106,0
found_no_match,403,502


In [8]:
os.environ['EXPERIMENT_ID'] = 'kantwert-S1000_T10000-on-small-full-but-manipulated-xcheck-model-from-2019-04-20T215328-2nd'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_evaluation_data.pkl')

20:55:47.020 - INFO - Calling ERS to match http://ers:8080 with 1011 records from the evaluation data set against the golden data.
20:55:47.020 - INFO - Going to store all results of the experiment at data/experiments/kantwert-S1000_T10000-on-small-full-but-manipulated-xcheck-model-from-2019-04-20T215328-2nd/results.
20:55:51.020 - INFO - Matched 100 of 1011 9.89 % entries.
20:55:51.020 - INFO - Accuracy is 56.00 %
20:55:55.020 - INFO - Matched 200 of 1011 19.78 % entries.
20:55:55.020 - INFO - Accuracy is 62.50 %
20:55:59.020 - INFO - Matched 300 of 1011 29.67 % entries.
20:55:59.020 - INFO - Accuracy is 61.00 %
20:56:04.020 - INFO - Matched 400 of 1011 39.56 % entries.
20:56:04.020 - INFO - Accuracy is 62.75 %
20:56:08.020 - INFO - Matched 500 of 1011 49.46 % entries.
20:56:08.020 - INFO - Accuracy is 60.80 %
20:56:12.020 - INFO - Matched 600 of 1011 59.35 % entries.
20:56:12.020 - INFO - Accuracy is 60.50 %
20:56:15.020 - INFO - Matched 700 of 1011 69.24 % entries.
20:56:15.020 - IN

In [9]:
check_results_of_experiment('kantwert-S1000_T10000-on-small-full-but-manipulated-xcheck-model-from-2019-04-20T215328-2nd')

# golden samples: 998
# evaluation samples: 1011
accuracy: 60.1 %
recall: 20.8 %
precision: 100.0 %
f-score: 34.5 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,106,0
found_no_match,403,502


In [5]:
check_results_of_experiment('2019-04-20T221226_kantwert-S1000-T10000-new-model')

# golden samples: 998
# evaluation samples: 1011
accuracy: 57.0 %
recall: 14.5 %
precision: 100.0 %
f-score: 25.4 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,74,0
found_no_match,435,502


In [5]:
check_results_of_experiment('2019-04-20T222133_kantwert-S1000-T10000-2nd')

# golden samples: 998
# evaluation samples: 1011
accuracy: 56.0 %
recall: 12.8 %
precision: 98.5 %
f-score: 22.6 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,65,1
found_no_match,444,502


In [5]:
check_results_of_experiment('2019-04-20T223105_kantwert-S1000-T10000_3rd')

# golden samples: 998
# evaluation samples: 1011
accuracy: 72.7 %
recall: 47.0 %
precision: 96.8 %
f-score: 63.2 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,239,8
found_no_match,270,500


In [5]:
os.environ['EXPERIMENT_ID'] = 'explicit-sanaticed-kantwert-ST5000-on-small-full-but-manipulated'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_evaluation_data.pkl')
print("")
pp(experiment.stats())

20:19:06.019 - INFO - Calling ERS to match http://ers:8080 with 1011 records from the evaluation data set against the golden data.
20:19:06.019 - INFO - Going to store all results of the experiment at data/experiments/explicit-sanaticed-kantwert-ST5000-on-small-full-but-manipulated/results.
20:19:15.019 - INFO - Matched 100 of 1011 9.89 % entries.
20:19:15.019 - INFO - Accuracy is 76.00 %
20:19:22.019 - INFO - Matched 200 of 1011 19.78 % entries.
20:19:22.019 - INFO - Accuracy is 78.50 %
20:19:31.019 - INFO - Matched 300 of 1011 29.67 % entries.
20:19:31.019 - INFO - Accuracy is 77.00 %
20:19:41.019 - INFO - Matched 400 of 1011 39.56 % entries.
20:19:41.019 - INFO - Accuracy is 78.30 %
20:19:50.019 - INFO - Matched 500 of 1011 49.46 % entries.
20:19:50.019 - INFO - Accuracy is 76.85 %
20:19:57.019 - INFO - Matched 600 of 1011 59.35 % entries.
20:19:57.019 - INFO - Accuracy is 75.58 %
20:20:03.019 - INFO - Matched 700 of 1011 69.24 % entries.
20:20:03.019 - INFO - Accuracy is 75.36 %
20

In [7]:
check_results_of_experiment('explicit-sanaticed-kantwert-ST5000-on-small-full-but-manipulated')

# golden samples: 998
# evaluation samples: 1011
accuracy: 74.5 %
recall: 50.5 %
precision: 97.3 %
f-score: 66.5 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,257,7
found_no_match,252,500


In [6]:
os.environ['EXPERIMENT_ID'] = 'explicit-sanaticed-kantwert-S5000_T5000-on-small-full-but-manipulated'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_evaluation_data.pkl')
print("")
pp(experiment.stats())

19:41:11.019 - INFO - Calling ERS to match http://ers:8080 with 1011 records from the evaluation data set against the golden data.
19:41:11.019 - INFO - Going to store all results of the experiment at data/experiments/explicit-sanaticed-kantwert-S5000_T5000-on-small-full-but-manipulated/results.
19:41:14.019 - INFO - Matched 100 of 1011 9.89 % entries.
19:41:15.019 - INFO - Accuracy is 54.00 %
19:41:17.019 - INFO - Matched 200 of 1011 19.78 % entries.
19:41:17.019 - INFO - Accuracy is 58.00 %
19:41:19.019 - INFO - Matched 300 of 1011 29.67 % entries.
19:41:20.019 - INFO - Accuracy is 56.33 %
19:41:22.019 - INFO - Matched 400 of 1011 39.56 % entries.
19:41:22.019 - INFO - Accuracy is 57.25 %
19:41:26.019 - INFO - Matched 500 of 1011 49.46 % entries.
19:41:26.019 - INFO - Accuracy is 56.20 %
19:41:29.019 - INFO - Matched 600 of 1011 59.35 % entries.
19:41:29.019 - INFO - Accuracy is 56.67 %
19:41:32.019 - INFO - Matched 700 of 1011 69.24 % entries.
19:41:32.019 - INFO - Accuracy is 57.00

In [7]:
experiment.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,65,1
found_no_match,444,502


In [27]:
os.environ['EXPERIMENT_ID'] = 'kantwert-S5000_T5000-on-small-full-but-manipulated'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_evaluation_data.pkl')
print("")
pp(experiment.stats())

13:36:23.019 - INFO - Calling ERS to match http://ers:8080 with 1011 records from the evaluation data set against the golden data.
13:36:23.019 - INFO - Going to store all results of the experiment at data/experiments/kantwert-S5000_T5000-on-small-full-but-manipulated/results.
13:36:25.019 - INFO - Matched 100 of 1011 9.89 % entries.
13:36:25.019 - INFO - Accuracy is 53.00 %
13:36:27.019 - INFO - Matched 200 of 1011 19.78 % entries.
13:36:27.019 - INFO - Accuracy is 57.00 %
13:36:29.019 - INFO - Matched 300 of 1011 29.67 % entries.
13:36:29.019 - INFO - Accuracy is 55.00 %
13:36:31.019 - INFO - Matched 400 of 1011 39.56 % entries.
13:36:31.019 - INFO - Accuracy is 56.00 %
13:36:35.019 - INFO - Matched 500 of 1011 49.46 % entries.
13:36:35.019 - INFO - Accuracy is 55.00 %
13:36:38.019 - INFO - Matched 600 of 1011 59.35 % entries.
13:36:38.019 - INFO - Accuracy is 55.67 %
13:36:41.019 - INFO - Matched 700 of 1011 69.24 % entries.
13:36:41.019 - INFO - Accuracy is 56.14 %
13:36:44.019 - I

In [28]:
check_results_of_experiment('kantwert-S5000_T5000-on-small-full-but-manipulated')

# golden samples: 998
# evaluation samples: 1011
accuracy: 55.0 %
recall: 10.6 %
precision: 100.0 %
f-score: 19.2 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,54,0
found_no_match,455,502


In [12]:
os.environ['EXPERIMENT_ID'] = 'kantwert-S10000_T1000-on-small-full-but-manipulated'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_evaluation_data.pkl')
print("")
pp(experiment.stats())

13:12:01.019 - INFO - Calling ERS to match http://ers:8080 with 1011 records from the evaluation data set against the golden data.
13:12:01.019 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/kantwert-S10000_T1000-on-small-full-but-manipulated/results.
13:12:09.019 - INFO - Matched 100 of 1011 9.89 % entries.
13:12:09.019 - INFO - Accuracy is 73.00 %
13:12:17.019 - INFO - Matched 200 of 1011 19.78 % entries.
13:12:17.019 - INFO - Accuracy is 78.00 %
13:12:26.019 - INFO - Matched 300 of 1011 29.67 % entries.
13:12:26.019 - INFO - Accuracy is 75.33 %
13:12:35.019 - INFO - Matched 400 of 1011 39.56 % entries.
13:12:35.019 - INFO - Accuracy is 75.00 %
13:12:43.019 - INFO - Matched 500 of 1011 49.46 % entries.
13:12:44.019 - INFO - Accuracy is 74.60 %
13:12:52.019 - INFO - Matched 600 of 1011 59.35 % entries.
13:12:52.019 - INFO - Accuracy is 75.00 %
13:12:58.019 - INFO - Matched 700 of 1011 69.24 % entries.
13:12:58.019 - INFO - Accuracy is 75.

In [29]:
check_results_of_experiment('kantwert-S10000_T1000-on-small-full-but-manipulated')

# golden samples: 998
# evaluation samples: 1011
accuracy: 75.3 %
recall: 50.9 %
precision: 100.0 %
f-score: 67.4 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,259,0
found_no_match,250,502


In [9]:
os.environ['EXPERIMENT_ID'] = 'small-full-but-manipulated'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_evaluation_data.pkl')

07:37:38.030 - INFO - Calling ERS to match http://172.18.0.1:8080/ers_service with 1011 records from the evaluation data set against the golden data.
07:37:38.030 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/919e2b64-ac27-11e8-8288-0242ac120003.
07:37:55.030 - INFO - Matched 100 of 1011 9.89 % entries.
07:37:55.030 - INFO - Accuracy is: 72.00 %
07:38:10.030 - INFO - Matched 200 of 1011 19.78 % entries.
07:38:10.030 - INFO - Accuracy is: 74.00 %
07:38:30.030 - INFO - Matched 300 of 1011 29.67 % entries.
07:38:30.030 - INFO - Accuracy is: 73.00 %
07:38:48.030 - INFO - Matched 400 of 1011 39.56 % entries.
07:38:48.030 - INFO - Accuracy is: 74.00 %
07:39:09.030 - INFO - Matched 500 of 1011 49.46 % entries.
07:39:09.030 - INFO - Accuracy is: 72.20 %
07:39:33.030 - INFO - Matched 600 of 1011 59.35 % entries.
07:39:33.030 - INFO - Accuracy is: 72.33 %
07:39:58.030 - INFO - Matched 700 of 1011 69.24 % entries.
07:39:58.030 - INFO - Accuracy is: 

In [10]:
# Confusion matrix of the run currently held in the kernel-level `experiment`.
experiment.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,54,0
found_no_match,455,502


In [6]:
# Reload and summarise the stored third-model run on the small set.
# NOTE(review): in notebook order, the cell that sets EXPERIMENT_ID to this same
# name and produces the results appears further down; on a fresh
# "Restart & Run All" without previously stored results this lookup would
# presumably fail -- confirm and consider moving this cell below that run.
check_results_of_experiment('small-full-but-manipulated-third-model')

# golden samples: 998
# evaluation samples: 1011
accuracy: 82.8 %
recall: 65.8 %
precision: 100.0 %
f-score: 79.4 %
number of too ambiguous matches: NaN


Unnamed: 0,was_a_match,was_no_match
found_a_match,335,0
found_no_match,174,502


In [5]:
# Label this run as the "second model" variant. Presumably EXPERIMENT_ID is read
# by process_evaluation_dataset to name the results directory -- TODO confirm.
os.environ['EXPERIMENT_ID'] = 'small-full-but-manipulated-second-model'
# Evaluate the small "full but manipulated" set; %time reports the duration.
# Relies on `experiment` having been created by an earlier cell.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_evaluation_data.pkl')

14:25:37.030 - INFO - Calling ERS to match http://ers:8080/ers_service with 1011 records from the evaluation data set against the golden data.
14:25:37.030 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/small-full-but-manipulated-second-model.
14:25:45.030 - INFO - Matched 100 of 1011 9.89 % entries.
14:25:45.030 - INFO - Accuracy is: 70.00 %
14:25:53.030 - INFO - Matched 200 of 1011 19.78 % entries.
14:25:53.030 - INFO - Accuracy is: 75.50 %
14:26:02.030 - INFO - Matched 300 of 1011 29.67 % entries.
14:26:02.030 - INFO - Accuracy is: 74.33 %
14:26:09.030 - INFO - Matched 400 of 1011 39.56 % entries.
14:26:09.030 - INFO - Accuracy is: 75.25 %
14:26:20.030 - INFO - Matched 500 of 1011 49.46 % entries.
14:26:20.030 - INFO - Accuracy is: 75.60 %
14:26:29.030 - INFO - Matched 600 of 1011 59.35 % entries.
14:26:29.030 - INFO - Accuracy is: 76.00 %
14:26:37.030 - INFO - Matched 700 of 1011 69.24 % entries.
14:26:37.030 - INFO - Accuracy is: 76.4

In [5]:
# Label this run as the "third model" variant. Presumably EXPERIMENT_ID is read
# by process_evaluation_dataset to name the results directory -- TODO confirm.
os.environ['EXPERIMENT_ID'] = 'small-full-but-manipulated-third-model'
# Evaluate the small "full but manipulated" set; %time reports the duration.
# Relies on `experiment` having been created by an earlier cell.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_evaluation_data.pkl')

14:39:01.030 - INFO - Calling ERS to match http://ers:8080/ers_service with 1011 records from the evaluation data set against the golden data.
14:39:01.030 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/small-full-but-manipulated-third-model.
14:39:11.030 - INFO - Matched 100 of 1011 9.89 % entries.
14:39:11.030 - INFO - Accuracy is: 81.00 %
14:39:21.030 - INFO - Matched 200 of 1011 19.78 % entries.
14:39:21.030 - INFO - Accuracy is: 83.00 %
14:39:33.030 - INFO - Matched 300 of 1011 29.67 % entries.
14:39:33.030 - INFO - Accuracy is: 82.00 %
14:39:41.030 - INFO - Matched 400 of 1011 39.56 % entries.
14:39:41.030 - INFO - Accuracy is: 82.50 %
14:39:55.030 - INFO - Matched 500 of 1011 49.46 % entries.
14:39:55.030 - INFO - Accuracy is: 83.00 %
14:40:04.030 - INFO - Matched 600 of 1011 59.35 % entries.
14:40:04.030 - INFO - Accuracy is: 82.67 %
14:40:15.030 - INFO - Matched 700 of 1011 69.24 % entries.
14:40:15.030 - INFO - Accuracy is: 83.43

In [32]:
# Reload the stored third-model run on the small set, print its summary
# statistics and show the returned confusion matrix as the cell output.
check_results_of_experiment('small-full-but-manipulated-third-model')

# golden samples: 998
# evaluation samples: 1011
accuracy: 82.8 %
recall: 65.8 %
precision: 100.0 %
f-score: 79.4 %
number of too ambiguous matches: NaN


Unnamed: 0,was_a_match,was_no_match
found_a_match,335,0
found_no_match,174,502


## Small limited address and manipulated information

In [6]:
# Create the experiment from the small limited-address golden data set. Per the
# log output of this cell, construction uploads the golden records to ERS.
# Rebinds the kernel-level `experiment` used by the following cells.
%time experiment = ERSExperiment(base_path_to_data / '2018-07-03-small_set_with_limited_address_and_manipulated_information_golden_data.pkl')


11:52:28.008 - INFO - Starting to populate ERS
11:52:28.008 - INFO - Calling ERS on http://ers:8080 with 998 records for the golden data set.
11:52:33.008 - INFO - At step 100 avg processing time per company entry was 0.0547 ms
11:52:37.008 - INFO - At step 200 avg processing time per company entry was 0.0327 ms
11:52:40.008 - INFO - At step 300 avg processing time per company entry was 0.0308 ms
11:52:43.008 - INFO - At step 400 avg processing time per company entry was 0.0283 ms
11:52:45.008 - INFO - At step 500 avg processing time per company entry was 0.0271 ms
11:52:48.008 - INFO - At step 600 avg processing time per company entry was 0.0259 ms
11:52:51.008 - INFO - At step 700 avg processing time per company entry was 0.0304 ms
11:52:54.008 - INFO - At step 800 avg processing time per company entry was 0.0262 ms
11:52:56.008 - INFO - At step 900 avg processing time per company entry was 0.0242 ms
11:52:59.008 - INFO - The ERS was successfully populated with 100.0 % of 998 the loa

In [14]:
# Label this run. Presumably EXPERIMENT_ID is read by process_evaluation_dataset
# to name the results directory -- TODO confirm in ers_experimenter.
os.environ['EXPERIMENT_ID'] = 'kantwert-S1000_T10000-on-small-limited-address-and-manipulated-infos'
# Match the small limited-address evaluation set against the golden data.
# Relies on `experiment` having been created by an earlier cell.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_limited_address_and_manipulated_information_evaluation_data.pkl')

11:54:32.019 - INFO - Calling ERS to match http://ers:8080 with 1011 records from the evaluation data set against the golden data.
11:54:32.019 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/kantwert-S1000_T10000-on-small-limited-address-and-manipulated-infos/results.
11:54:39.019 - INFO - Matched 100 of 1011 9.89 % entries.
11:54:40.019 - INFO - Accuracy is 67.33 %
11:54:48.019 - INFO - Matched 200 of 1011 19.78 % entries.
11:54:48.019 - INFO - Accuracy is 74.13 %
11:54:59.019 - INFO - Matched 300 of 1011 29.67 % entries.
11:54:59.019 - INFO - Accuracy is 76.41 %
11:55:07.019 - INFO - Matched 400 of 1011 39.56 % entries.
11:55:07.019 - INFO - Accuracy is 76.87 %
11:55:18.019 - INFO - Matched 500 of 1011 49.46 % entries.
11:55:18.019 - INFO - Accuracy is 76.69 %
11:55:27.019 - INFO - Matched 600 of 1011 59.35 % entries.
11:55:27.019 - INFO - Accuracy is 75.25 %
11:55:36.019 - INFO - Matched 700 of 1011 69.24 % entries.
11:55:36.019 - INFO 

In [11]:
# Reload a stored run by name instead of re-running it. This rebinds the
# kernel-level `experiment`, so later cells inspect the reloaded results.
experiment = ERSExperiment()
experiment.load_results('kantwert-S1000_T10000-on-small-limited-address-and-manipulated-infos')
# Print the summary statistics in human-readable form.
pp(experiment.stats())

# golden samples: 998
# evaluation samples: 1011
accuracy: 74.0 %
recall: 51.8 %
precision: 92.3 %
f-score: 66.3 %
number of too ambiguous matches: 0.0 %


In [17]:
# Confusion matrix of the run currently held in the kernel-level `experiment`.
experiment.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,263,22
found_no_match,245,495


In [14]:
# Label this run. Presumably EXPERIMENT_ID is read by process_evaluation_dataset
# to name the results directory -- TODO confirm in ers_experimenter.
os.environ['EXPERIMENT_ID'] = 'small-limited-address-and-manipulated-infos'
# Match the small limited-address evaluation set against the golden data.
# Relies on `experiment` having been created by an earlier cell.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_limited_address_and_manipulated_information_evaluation_data.pkl')

11:31:03.022 - INFO - Calling ERS to match http://172.18.0.1:8080/ers_service with 1011 records from the evaluation data set against the golden data.
11:31:03.022 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/small-limited-address-and-manipulated-infos.
11:31:31.022 - INFO - Matched 100 of 1011 9.89 % entries.
11:31:31.022 - INFO - Accuracy is: 65.00 %
11:31:52.022 - INFO - Matched 200 of 1011 19.78 % entries.
11:31:52.022 - INFO - Accuracy is: 68.00 %
11:32:31.022 - INFO - Matched 300 of 1011 29.67 % entries.
11:32:31.022 - INFO - Accuracy is: 68.67 %
11:32:58.022 - INFO - Matched 400 of 1011 39.56 % entries.
11:32:58.022 - INFO - Accuracy is: 68.75 %
11:33:35.022 - INFO - Matched 500 of 1011 49.46 % entries.
11:33:35.022 - INFO - Accuracy is: 67.60 %
11:34:05.022 - INFO - Matched 600 of 1011 59.35 % entries.
11:34:05.022 - INFO - Accuracy is: 67.50 %
11:34:43.022 - INFO - Matched 700 of 1011 69.24 % entries.
11:34:43.022 - INFO - Accura

In [15]:
# Confusion matrix of the run currently held in the kernel-level `experiment`.
experiment.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,181,1
found_no_match,328,501


In [21]:
# Inspect the false positives of the current run. The table below lists, per
# match id, the attributes of the found record next to the searched one.
experiment.false_positives

Unnamed: 0_level_0,Unnamed: 1_level_0,founded,searched
Match-Id,Attribute,Unnamed: 2_level_1,Unnamed: 3_level_1
620,ersId,4b887052-9c8d-11e8-9a84-0242ac120002,
620,name,A E R O -Verpackungsgesellschaft m.b.H.,A E R O Verpackungsgesellschaft mHb
620,email,geldern@aeropack.de,info@aeropack.de
620,phoneNumber,+496359937070,+496359937070
620,website,http://http//www.aeropack.de,http://http//www.aeropack.de
620,address.street,Zeppelinstr. 16,
620,address.city,Geldern,


In [4]:
# Re-create the experiment from the small limited-address golden data set. Per
# the log output of this cell, construction uploads the golden records to ERS.
# Rebinds the kernel-level `experiment` used by the following cells.
%time experiment = ERSExperiment(base_path_to_data / '2018-07-03-small_set_with_limited_address_and_manipulated_information_golden_data.pkl')

10:19:31.003 - INFO - Starting to populate ERS
10:19:31.003 - INFO - Calling ERS on http://ers:8080/ers_service with 998 records for the golden data set.
10:20:04.003 - INFO - The ERS was successfully populated with 100.0 % of 998 the loaded records.
10:20:04.003 - INFO - Upload of 0 records failed.
10:20:04.003 - INFO - Processing the golden data set took 0:00:32.590908 time.
CPU times: user 4.39 s, sys: 440 ms, total: 4.83 s
Wall time: 32.6 s


In [5]:
# Label this run as the "third model" variant. Presumably EXPERIMENT_ID is read
# by process_evaluation_dataset to name the results directory -- TODO confirm.
os.environ['EXPERIMENT_ID'] = 'small-limited-address-and-manipulated-infos-third-model'
# Match the small limited-address evaluation set against the golden data.
# Relies on `experiment` having been created by an earlier cell.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_limited_address_and_manipulated_information_evaluation_data.pkl')

10:21:41.003 - INFO - Calling ERS to match http://ers:8080/ers_service with 1011 records from the evaluation data set against the golden data.
10:21:41.003 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/small-limited-address-and-manipulated-infos-third-model.
10:21:48.003 - INFO - Matched 100 of 1011 9.89 % entries.
10:21:48.003 - INFO - Accuracy is: 68.00 %
10:21:56.003 - INFO - Matched 200 of 1011 19.78 % entries.
10:21:56.003 - INFO - Accuracy is: 74.50 %
10:22:06.003 - INFO - Matched 300 of 1011 29.67 % entries.
10:22:06.003 - INFO - Accuracy is: 76.67 %
10:22:14.003 - INFO - Matched 400 of 1011 39.56 % entries.
10:22:14.003 - INFO - Accuracy is: 77.25 %
10:22:24.003 - INFO - Matched 500 of 1011 49.46 % entries.
10:22:24.003 - INFO - Accuracy is: 77.00 %
10:22:31.003 - INFO - Matched 600 of 1011 59.35 % entries.
10:22:31.003 - INFO - Accuracy is: 76.33 %
10:22:39.003 - INFO - Matched 700 of 1011 69.24 % entries.
10:22:39.003 - INFO - A

In [30]:
# Self-contained run: create the experiment from the small limited-address
# golden set, label the run, then evaluate. Each step is timed separately.
%time experiment = ERSExperiment(base_path_to_data / '2018-07-03-small_set_with_limited_address_and_manipulated_information_golden_data.pkl')
# Presumably EXPERIMENT_ID is read by process_evaluation_dataset to name the
# results directory -- TODO confirm in ers_experimenter.
os.environ['EXPERIMENT_ID'] = 'small-limited-address-and-manipulated-infos-third-model-with-confidences-and-multiple-matches'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_limited_address_and_manipulated_information_evaluation_data.pkl')

14:57:27.013 - INFO - Calling ERS to match http://192.168.16.1:8080/ers_service with 1011 records from the evaluation data set against the golden data.
14:57:27.013 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/small-limited-address-and-manipulated-infos-third-model-with-confidences-and-multiple-matches.
14:57:34.013 - INFO - Matched 100 of 1011 9.89 % entries.
14:57:34.013 - INFO - Accuracy is 67.33 %
14:57:40.013 - INFO - Matched 200 of 1011 19.78 % entries.
14:57:40.013 - INFO - Accuracy is 74.13 %
14:57:49.013 - INFO - Matched 300 of 1011 29.67 % entries.
14:57:49.013 - INFO - Accuracy is 76.41 %
14:57:55.013 - INFO - Matched 400 of 1011 39.56 % entries.
14:57:55.013 - INFO - Accuracy is 77.06 %
14:58:04.013 - INFO - Matched 500 of 1011 49.46 % entries.
14:58:04.013 - INFO - Accuracy is 76.85 %
14:58:10.013 - INFO - Matched 600 of 1011 59.35 % entries.
14:58:11.013 - INFO - Accuracy is 75.91 %
14:58:17.013 - INFO - Matched 700 of 1011

## Medium sized full but manipulated information

In [33]:
# Create the experiment from the medium "full but manipulated" golden data set.
# Per the log output of this cell, construction uploads the golden records to
# ERS. Rebinds the kernel-level `experiment` used by the following cells.
%time experiment = ERSExperiment(base_path_to_data / '2018-07-03-medium_set_with_full_but_manipulated_information_golden_data.pkl')

14:05:10.019 - INFO - Starting to populate ERS
14:05:12.019 - INFO - Calling ERS on http://ers:8080 with 9926 records for the golden data set.
14:05:15.019 - INFO - At step 100 avg processing time per company entry was 0.0317 ms
14:05:18.019 - INFO - At step 200 avg processing time per company entry was 0.0288 ms
14:05:21.019 - INFO - At step 300 avg processing time per company entry was 0.0307 ms
14:05:25.019 - INFO - At step 400 avg processing time per company entry was 0.0401 ms
14:05:28.019 - INFO - At step 500 avg processing time per company entry was 0.0340 ms
14:05:31.019 - INFO - At step 600 avg processing time per company entry was 0.0316 ms
14:05:35.019 - INFO - At step 700 avg processing time per company entry was 0.0338 ms
14:05:38.019 - INFO - At step 800 avg processing time per company entry was 0.0278 ms
14:05:41.019 - INFO - At step 900 avg processing time per company entry was 0.0289 ms
14:05:43.019 - INFO - At step 1000 avg processing time per company entry was 0.0294

14:11:25.019 - INFO - At step 9300 avg processing time per company entry was 0.0515 ms
14:11:30.019 - INFO - At step 9400 avg processing time per company entry was 0.0504 ms
14:11:35.019 - INFO - At step 9500 avg processing time per company entry was 0.0499 ms
14:11:40.019 - INFO - At step 9600 avg processing time per company entry was 0.0527 ms
14:11:45.019 - INFO - At step 9700 avg processing time per company entry was 0.0519 ms
14:11:50.019 - INFO - At step 9800 avg processing time per company entry was 0.0532 ms
14:11:56.019 - INFO - At step 9900 avg processing time per company entry was 0.0543 ms
14:11:57.019 - INFO - The ERS was successfully populated with 100.0 % of 9926 the loaded records.
14:11:57.019 - INFO - Upload of 0 records failed.
14:11:57.019 - INFO - Processing the golden data set took 0:06:47.213539 time.
CPU times: user 25.7 s, sys: 7.11 s, total: 32.8 s
Wall time: 6min 47s


In [16]:
# Label this run.
# NOTE(review): the ID says "limited-address" but the evaluated file below is
# the medium "full_but_manipulated" set. A later cell reloads results under this
# exact name, so do not rename it without also renaming the stored results.
os.environ['EXPERIMENT_ID'] = 'kantwert-S1000_T10000-on-medium-limited-address-and-manipulated-infos'
# Match the medium evaluation set against the golden data.
# Relies on `experiment` having been created by an earlier cell.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-medium_set_with_full_but_manipulated_information_evaluation_data.pkl')

12:04:30.019 - INFO - Calling ERS to match http://ers:8080 with 10059 records from the evaluation data set against the golden data.
12:04:30.019 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/kantwert-S1000_T10000-on-medium-limited-address-and-manipulated-infos/results.
12:04:47.019 - INFO - Matched 100 of 10059 0.99 % entries.
12:04:47.019 - INFO - Accuracy is 84.00 %
12:05:03.019 - INFO - Matched 200 of 10059 1.99 % entries.
12:05:03.019 - INFO - Accuracy is 84.00 %
12:05:17.019 - INFO - Matched 300 of 10059 2.98 % entries.
12:05:17.019 - INFO - Accuracy is 83.33 %
12:05:35.019 - INFO - Matched 400 of 10059 3.98 % entries.
12:05:35.019 - INFO - Accuracy is 84.50 %
12:05:51.019 - INFO - Matched 500 of 10059 4.97 % entries.
12:05:51.019 - INFO - Accuracy is 84.40 %
12:06:08.019 - INFO - Matched 600 of 10059 5.96 % entries.
12:06:08.019 - INFO - Accuracy is 84.50 %
12:06:25.019 - INFO - Matched 700 of 10059 6.96 % entries.
12:06:25.019 - IN

12:25:35.019 - INFO - Matched 7800 of 10059 77.54 % entries.
12:25:35.019 - INFO - Accuracy is 84.26 %
12:25:53.019 - INFO - Matched 7900 of 10059 78.54 % entries.
12:25:53.019 - INFO - Accuracy is 84.34 %
12:26:12.019 - INFO - Matched 8000 of 10059 79.53 % entries.
12:26:12.019 - INFO - Accuracy is 84.29 %
12:26:30.019 - INFO - Matched 8100 of 10059 80.52 % entries.
12:26:30.019 - INFO - Accuracy is 84.32 %
12:26:45.019 - INFO - Matched 8200 of 10059 81.52 % entries.
12:26:45.019 - INFO - Accuracy is 84.35 %
12:27:01.019 - INFO - Matched 8300 of 10059 82.51 % entries.
12:27:01.019 - INFO - Accuracy is 84.35 %
12:27:17.019 - INFO - Matched 8400 of 10059 83.51 % entries.
12:27:17.019 - INFO - Accuracy is 84.32 %
12:27:37.019 - INFO - Matched 8500 of 10059 84.50 % entries.
12:27:37.019 - INFO - Accuracy is 84.28 %
12:27:51.019 - INFO - Matched 8600 of 10059 85.50 % entries.
12:27:51.019 - INFO - Accuracy is 84.24 %
12:28:08.019 - INFO - Matched 8700 of 10059 86.49 % entries.
12:28:08.019

In [38]:
# Reload the stored medium-set run, print its summary statistics and show the
# returned confusion matrix as the cell output.
check_results_of_experiment('kantwert-S1000_T10000-on-medium-limited-address-and-manipulated-infos')

# golden samples: 9926
# evaluation samples: 10.06 k
accuracy: 84.1 %
recall: 67.9 %
precision: 100.0 %
f-score: 80.9 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,3384,1
found_no_match,1601,5073


In [35]:
# Label this run.
# NOTE(review): the ID says "limited-address" but the evaluated file below is
# the medium "full_but_manipulated" set. A later cell reloads results under this
# exact name, so do not rename it without also renaming the stored results.
os.environ['EXPERIMENT_ID'] = 'kantwert-S5000_T5000-on-medium-limited-address-and-manipulated-infos'
# Match the medium evaluation set against the golden data.
# Relies on `experiment` having been created by an earlier cell.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-medium_set_with_full_but_manipulated_information_evaluation_data.pkl')

14:14:22.019 - INFO - Calling ERS to match http://ers:8080 with 10059 records from the evaluation data set against the golden data.
14:14:22.019 - INFO - Going to store all results of the experiment at data/experiments/kantwert-S5000_T5000-on-medium-limited-address-and-manipulated-infos/results.
14:14:25.019 - INFO - Matched 100 of 10059 0.99 % entries.
14:14:25.019 - INFO - Accuracy is 49.00 %
14:14:28.019 - INFO - Matched 200 of 10059 1.99 % entries.
14:14:28.019 - INFO - Accuracy is 53.50 %
14:14:30.019 - INFO - Matched 300 of 10059 2.98 % entries.
14:14:30.019 - INFO - Accuracy is 53.00 %
14:14:33.019 - INFO - Matched 400 of 10059 3.98 % entries.
14:14:33.019 - INFO - Accuracy is 52.75 %
14:14:37.019 - INFO - Matched 500 of 10059 4.97 % entries.
14:14:37.019 - INFO - Accuracy is 54.18 %
14:14:40.019 - INFO - Matched 600 of 10059 5.96 % entries.
14:14:40.019 - INFO - Accuracy is 53.65 %
14:14:42.019 - INFO - Matched 700 of 10059 6.96 % entries.
14:14:42.019 - INFO - Accuracy is 54.9

14:18:15.019 - INFO - Matched 7800 of 10059 77.54 % entries.
14:18:15.019 - INFO - Accuracy is 55.98 %
14:18:19.019 - INFO - Matched 7900 of 10059 78.54 % entries.
14:18:19.019 - INFO - Accuracy is 56.06 %
14:18:23.019 - INFO - Matched 8000 of 10059 79.53 % entries.
14:18:23.019 - INFO - Accuracy is 56.03 %
14:18:26.019 - INFO - Matched 8100 of 10059 80.52 % entries.
14:18:26.019 - INFO - Accuracy is 55.96 %
14:18:30.019 - INFO - Matched 8200 of 10059 81.52 % entries.
14:18:30.019 - INFO - Accuracy is 56.06 %
14:18:32.019 - INFO - Matched 8300 of 10059 82.51 % entries.
14:18:32.019 - INFO - Accuracy is 56.07 %
14:18:34.019 - INFO - Matched 8400 of 10059 83.51 % entries.
14:18:34.019 - INFO - Accuracy is 55.99 %
14:18:37.019 - INFO - Matched 8500 of 10059 84.50 % entries.
14:18:37.019 - INFO - Accuracy is 55.92 %
14:18:40.019 - INFO - Matched 8600 of 10059 85.50 % entries.
14:18:40.019 - INFO - Accuracy is 55.95 %
14:18:42.019 - INFO - Matched 8700 of 10059 86.49 % entries.
14:18:42.019

In [37]:
# Reload the stored medium-set run, print its summary statistics and show the
# returned confusion matrix as the cell output.
check_results_of_experiment('kantwert-S5000_T5000-on-medium-limited-address-and-manipulated-infos')

# golden samples: 9926
# evaluation samples: 10.06 k
accuracy: 55.9 %
recall: 11.3 %
precision: 97.6 %
f-score: 20.2 %
number of too ambiguous matches: 100.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,562,14
found_no_match,4423,5068


In [9]:
# Label this run as the "third model" variant.
# NOTE(review): the ID says "limited-address" but the evaluated file below is
# the medium "full_but_manipulated" set -- confirm which one is intended.
os.environ['EXPERIMENT_ID'] = 'medium-limited-address-and-manipulated-infos-third-model'
# Match the medium evaluation set against the golden data.
# Relies on `experiment` having been created by an earlier cell.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-medium_set_with_full_but_manipulated_information_evaluation_data.pkl')

10:38:50.003 - INFO - Calling ERS to match http://ers:8080/ers_service with 10059 records from the evaluation data set against the golden data.
10:38:50.003 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/medium-limited-address-and-manipulated-infos-third-model.
10:39:08.003 - INFO - Matched 100 of 10059 0.99 % entries.
10:39:08.003 - INFO - Accuracy is: 89.00 %
10:39:22.003 - INFO - Matched 200 of 10059 1.99 % entries.
10:39:22.003 - INFO - Accuracy is: 84.50 %
10:39:35.003 - INFO - Matched 300 of 10059 2.98 % entries.
10:39:35.003 - INFO - Accuracy is: 82.33 %
10:39:53.003 - INFO - Matched 400 of 10059 3.98 % entries.
10:39:53.003 - INFO - Accuracy is: 83.25 %
10:40:08.003 - INFO - Matched 500 of 10059 4.97 % entries.
10:40:08.003 - INFO - Accuracy is: 83.80 %
10:40:23.003 - INFO - Matched 600 of 10059 5.96 % entries.
10:40:23.003 - INFO - Accuracy is: 83.33 %
10:40:38.003 - INFO - Matched 700 of 10059 6.96 % entries.
10:40:38.003 - INFO 

10:59:13.003 - INFO - Matched 7700 of 10059 76.55 % entries.
10:59:13.003 - INFO - Accuracy is: 83.84 %
10:59:33.003 - INFO - Matched 7800 of 10059 77.54 % entries.
10:59:33.003 - INFO - Accuracy is: 83.87 %
10:59:49.003 - INFO - Matched 7900 of 10059 78.54 % entries.
10:59:49.003 - INFO - Accuracy is: 83.82 %
11:00:08.003 - INFO - Matched 8000 of 10059 79.53 % entries.
11:00:08.003 - INFO - Accuracy is: 83.76 %
11:00:23.003 - INFO - Matched 8100 of 10059 80.52 % entries.
11:00:23.003 - INFO - Accuracy is: 83.60 %
11:00:39.003 - INFO - Matched 8200 of 10059 81.52 % entries.
11:00:39.003 - INFO - Accuracy is: 83.62 %
11:00:57.003 - INFO - Matched 8300 of 10059 82.51 % entries.
11:00:57.003 - INFO - Accuracy is: 83.70 %
11:01:16.003 - INFO - Matched 8400 of 10059 83.51 % entries.
11:01:16.003 - INFO - Accuracy is: 83.74 %
11:01:35.003 - INFO - Matched 8500 of 10059 84.50 % entries.
11:01:35.003 - INFO - Accuracy is: 83.72 %
11:01:53.003 - INFO - Matched 8600 of 10059 85.50 % entries.
11:

In [12]:
# Label this run as the "third model with confidences" variant.
# NOTE(review): the ID says "limited-address" but the evaluated file below is
# the medium "full_but_manipulated" set. A later cell reloads results under this
# exact name, so do not rename it without also renaming the stored results.
os.environ['EXPERIMENT_ID'] = 'medium-limited-address-and-manipulated-infos-third-model-with-confidences'
# Match the medium evaluation set against the golden data.
# Relies on `experiment` having been created by an earlier cell.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-medium_set_with_full_but_manipulated_information_evaluation_data.pkl')

09:47:44.011 - INFO - Calling ERS to match http://ers:8080/ers_service with 10059 records from the evaluation data set against the golden data.
09:47:44.011 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/medium-limited-address-and-manipulated-infos-third-model-with-confidences.
09:48:06.011 - INFO - Matched 100 of 10059 0.99 % entries.
09:48:06.011 - INFO - Accuracy is: 89.00 %
09:48:22.011 - INFO - Matched 200 of 10059 1.99 % entries.
09:48:22.011 - INFO - Accuracy is: 84.50 %
09:48:38.011 - INFO - Matched 300 of 10059 2.98 % entries.
09:48:38.011 - INFO - Accuracy is: 82.33 %
09:48:59.011 - INFO - Matched 400 of 10059 3.98 % entries.
09:48:59.011 - INFO - Accuracy is: 83.25 %
09:49:17.011 - INFO - Matched 500 of 10059 4.97 % entries.
09:49:17.011 - INFO - Accuracy is: 83.80 %
09:49:35.011 - INFO - Matched 600 of 10059 5.96 % entries.
09:49:35.011 - INFO - Accuracy is: 83.33 %
09:49:53.011 - INFO - Matched 700 of 10059 6.96 % entries.
09:

10:12:11.011 - INFO - Matched 7700 of 10059 76.55 % entries.
10:12:11.011 - INFO - Accuracy is: 83.84 %
10:12:35.011 - INFO - Matched 7800 of 10059 77.54 % entries.
10:12:35.011 - INFO - Accuracy is: 83.87 %
10:12:54.011 - INFO - Matched 7900 of 10059 78.54 % entries.
10:12:54.011 - INFO - Accuracy is: 83.82 %
10:13:17.011 - INFO - Matched 8000 of 10059 79.53 % entries.
10:13:17.011 - INFO - Accuracy is: 83.76 %
10:13:35.011 - INFO - Matched 8100 of 10059 80.52 % entries.
10:13:35.011 - INFO - Accuracy is: 83.60 %
10:13:55.011 - INFO - Matched 8200 of 10059 81.52 % entries.
10:13:55.011 - INFO - Accuracy is: 83.62 %
10:14:16.011 - INFO - Matched 8300 of 10059 82.51 % entries.
10:14:16.011 - INFO - Accuracy is: 83.70 %
10:14:38.011 - INFO - Matched 8400 of 10059 83.51 % entries.
10:14:38.011 - INFO - Accuracy is: 83.74 %
10:15:01.011 - INFO - Matched 8500 of 10059 84.50 % entries.
10:15:01.011 - INFO - Accuracy is: 83.72 %
10:15:22.011 - INFO - Matched 8600 of 10059 85.50 % entries.
10:

In [37]:
# Reload a stored run by name instead of re-running it. This rebinds the
# kernel-level `experiment`, so later cells inspect the reloaded results.
experiment = ERSExperiment()
experiment.load_results('medium-limited-address-and-manipulated-infos-third-model-with-confidences')
# Print the summary statistics in human-readable form.
pp(experiment.stats())

# golden samples: 9926
# evaluation samples: 10.06 k
accuracy: 83.7 %
recall: 67.1 %
precision: 100.0 %
f-score: 80.3 %
number of too ambiguous matches: NaN


In [30]:
# Confusion matrix of the run currently held in the kernel-level `experiment`.
experiment.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,3343,0
found_no_match,1642,5074


In [31]:
# Descriptive statistics of the `similarity` column of the current run's
# true-vs-predicted id table.
experiment.true_and_predicted_ids.similarity.describe()

count    3343.000000
mean        0.858038
std         0.087581
min         0.507606
25%         0.821255
50%         0.881558
75%         0.919811
max         0.984899
Name: similarity, dtype: float64

In [5]:
# Self-contained run: create the experiment from the medium golden set, label
# the run, then evaluate. Each step is timed separately.
%time experiment = ERSExperiment(base_path_to_data / '2018-07-03-medium_set_with_full_but_manipulated_information_golden_data.pkl')
# NOTE(review): the ID says "limited-address" but both data files in this cell
# are the medium "full_but_manipulated" set -- confirm which one is intended.
os.environ['EXPERIMENT_ID'] = 'medium-limited-address-and-manipulated-infos-third-model-with-confidences-and-multiple-matches'
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-medium_set_with_full_but_manipulated_information_evaluation_data.pkl')

16:18:25.013 - INFO - Starting to populate ERS
16:18:27.013 - INFO - Calling ERS on http://ers:8080/ers_service with 9926 records for the golden data set.
16:26:32.013 - INFO - The ERS was successfully populated with 100.0 % of 9926 the loaded records.
16:26:32.013 - INFO - Upload of 0 records failed.
16:26:32.013 - INFO - Processing the golden data set took 0:08:07.699403 time.
CPU times: user 44.9 s, sys: 4.41 s, total: 49.3 s
Wall time: 8min 7s
16:26:32.013 - INFO - Calling ERS to match http://ers:8080/ers_service with 10059 records from the evaluation data set against the golden data.
16:26:32.013 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/medium-limited-address-and-manipulated-infos-third-model-with-confidences-and-multiple-matches.
16:27:04.013 - INFO - Matched 100 of 10059 0.99 % entries.
16:27:04.013 - INFO - Accuracy is 89.00 %
16:27:28.013 - INFO - Matched 200 of 10059 1.99 % entries.
16:27:28.013 - INFO - Accuracy is 84.50 %

16:58:24.013 - INFO - Accuracy is 83.81 %
16:58:53.013 - INFO - Matched 7300 of 10059 72.57 % entries.
16:58:53.013 - INFO - Accuracy is 83.80 %
16:59:14.013 - INFO - Matched 7400 of 10059 73.57 % entries.
16:59:14.013 - INFO - Accuracy is 83.84 %
16:59:38.013 - INFO - Matched 7500 of 10059 74.56 % entries.
16:59:38.013 - INFO - Accuracy is 83.78 %
17:00:08.013 - INFO - Matched 7600 of 10059 75.55 % entries.
17:00:08.013 - INFO - Accuracy is 83.82 %
17:00:36.013 - INFO - Matched 7700 of 10059 76.55 % entries.
17:00:36.013 - INFO - Accuracy is 83.83 %
17:01:09.013 - INFO - Matched 7800 of 10059 77.54 % entries.
17:01:09.013 - INFO - Accuracy is 83.86 %
17:01:35.013 - INFO - Matched 7900 of 10059 78.54 % entries.
17:01:35.013 - INFO - Accuracy is 83.81 %
17:02:07.013 - INFO - Matched 8000 of 10059 79.53 % entries.
17:02:07.013 - INFO - Accuracy is 83.75 %
17:02:31.013 - INFO - Matched 8100 of 10059 80.52 % entries.
17:02:31.013 - INFO - Accuracy is 83.59 %
17:02:58.013 - INFO - Matched 8

In [6]:
# Confusion matrix of the run currently held in the kernel-level `experiment`.
experiment.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,3343,1
found_no_match,1642,5074


In [15]:
# Descriptive statistics of the `similarity` column of the current run's
# true-vs-predicted id table.
experiment.true_and_predicted_ids.similarity.describe()

count    3343.000000
mean        0.858038
std         0.087581
min         0.507606
25%         0.821255
50%         0.881558
75%         0.919811
max         0.984899
Name: similarity, dtype: float64

In [7]:
# Label this re-run of the small limited-address experiment.
os.environ['EXPERIMENT_ID'] = 'redo-by-jupyter-re-redo-on-small-limited-address-and-manipulated-infos-wo-sanitizer'
# NOTE(review): this evaluates the SMALL limited-address set, but in notebook
# order the last cell that rebinds `experiment` populated ERS with the MEDIUM
# golden set. Presumably this was executed right after populating the small
# limited-address golden data -- confirm, and re-create `experiment` from that
# golden file before re-running top to bottom.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_limited_address_and_manipulated_information_evaluation_data.pkl')

11:54:25.008 - INFO - Calling ERS to match http://ers:8080 with 1011 records from the evaluation data set against the golden data.
11:54:25.008 - INFO - Going to store all results of the experiment at data/experiments/redo-by-jupyter-re-redo-on-small-limited-address-and-manipulated-infos-wo-sanitizer/results.
11:54:32.008 - INFO - Matched 100 of 1011 9.89 % entries.
11:54:32.008 - INFO - Accuracy is 67.33 %
11:54:41.008 - INFO - Matched 200 of 1011 19.78 % entries.
11:54:41.008 - INFO - Accuracy is 74.13 %
11:54:52.008 - INFO - Matched 300 of 1011 29.67 % entries.
11:54:52.008 - INFO - Accuracy is 76.41 %
11:55:00.008 - INFO - Matched 400 of 1011 39.56 % entries.
11:55:00.008 - INFO - Accuracy is 76.87 %
11:55:12.008 - INFO - Matched 500 of 1011 49.46 % entries.
11:55:12.008 - INFO - Accuracy is 76.69 %
11:55:21.008 - INFO - Matched 600 of 1011 59.35 % entries.
11:55:22.008 - INFO - Accuracy is 74.79 %
11:55:31.008 - INFO - Matched 700 of 1011 69.24 % entries.
11:55:31.008 - INFO - Acc

In [9]:
# Reload a stored run by name instead of re-running it. This rebinds the
# kernel-level `experiment`, so later cells inspect the reloaded results.
experiment = ERSExperiment()
experiment.load_results('redo-by-jupyter-re-redo-on-small-limited-address-and-manipulated-infos-wo-sanitizer')
# Print the summary statistics in human-readable form.
pp(experiment.stats())

# golden samples: 998
# evaluation samples: 1011
accuracy: 73.5 %
recall: 51.8 %
precision: 90.4 %
f-score: 65.8 %
number of too ambiguous matches: 0.0 %


In [10]:
# Reload a stored run by name for comparison with the run loaded in the
# previous cell. Rebinds the kernel-level `experiment`.
# NOTE(review): no neighbouring cell produces results under this timestamped
# name; presumably the run was made outside this notebook -- confirm it exists.
experiment = ERSExperiment()
experiment.load_results('2019-05-08T134216_re-redo-small_set_with_limited_address_and_manipulated-wo-sanitizer')
# Print the summary statistics in human-readable form.
pp(experiment.stats())

# golden samples: 998
# evaluation samples: 1011
accuracy: 73.4 %
recall: 51.8 %
precision: 90.1 %
f-score: 65.8 %
number of too ambiguous matches: 0.0 %


## Big full but manipulated information

In [10]:
# Create the experiment from the big "full but manipulated" golden data set.
# Per the log output of this cell, construction uploads the golden records to
# ERS and took about 45 minutes. Rebinds the kernel-level `experiment`.
%time experiment = ERSExperiment(base_path_to_data / '2018-07-03-big_set_with_full_but_manipulated_information_golden_data.pkl')

12:07:24.003 - INFO - Starting to populate ERS
12:07:26.003 - INFO - Calling ERS on http://ers:8080/ers_service with 31152 records for the golden data set.
12:52:49.003 - INFO - The ERS was successfully populated with 100.0 % of 31152 the loaded records.
12:52:49.003 - INFO - Upload of 0 records failed.
12:52:49.003 - INFO - Processing the golden data set took 0:45:25.093377 time.
CPU times: user 2min 18s, sys: 13.9 s, total: 2min 32s
Wall time: 45min 25s


In [11]:
# Label this run as the "third model" variant on the big set.
# NOTE(review): the ID says "limited-address" but the evaluated file below is
# the big "full_but_manipulated" set -- confirm which one is intended.
os.environ['EXPERIMENT_ID'] = 'big-limited-address-and-manipulated-infos-third-model'
# Match the big evaluation set against the golden data.
# Relies on `experiment` having been created by an earlier cell.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-big_set_with_full_but_manipulated_information_evaluation_data.pkl')

13:06:26.003 - INFO - Calling ERS to match http://ers:8080/ers_service with 31115 records from the evaluation data set against the golden data.
13:06:26.003 - INFO - Going to store all results of the experiment at /home/datascientist/host/experiments/big-limited-address-and-manipulated-infos-third-model.
13:06:52.003 - INFO - Matched 100 of 31115 0.32 % entries.
13:06:52.003 - INFO - Accuracy is: 89.00 %
13:07:17.003 - INFO - Matched 200 of 31115 0.64 % entries.
13:07:17.003 - INFO - Accuracy is: 87.00 %
13:07:41.003 - INFO - Matched 300 of 31115 0.96 % entries.
13:07:41.003 - INFO - Accuracy is: 84.67 %
13:08:05.003 - INFO - Matched 400 of 31115 1.29 % entries.
13:08:05.003 - INFO - Accuracy is: 84.75 %
13:08:29.003 - INFO - Matched 500 of 31115 1.61 % entries.
13:08:29.003 - INFO - Accuracy is: 84.20 %
13:08:52.003 - INFO - Matched 600 of 31115 1.93 % entries.
13:08:52.003 - INFO - Accuracy is: 84.33 %
13:09:16.003 - INFO - Matched 700 of 31115 2.25 % entries.
13:09:16.003 - INFO - A

13:38:34.003 - INFO - Accuracy is: 84.40 %
13:39:02.003 - INFO - Matched 7800 of 31115 25.07 % entries.
13:39:02.003 - INFO - Accuracy is: 84.44 %
13:39:28.003 - INFO - Matched 7900 of 31115 25.39 % entries.
13:39:28.003 - INFO - Accuracy is: 84.35 %
13:39:54.003 - INFO - Matched 8000 of 31115 25.71 % entries.
13:39:54.003 - INFO - Accuracy is: 84.29 %
13:40:21.003 - INFO - Matched 8100 of 31115 26.03 % entries.
13:40:21.003 - INFO - Accuracy is: 84.28 %
13:40:47.003 - INFO - Matched 8200 of 31115 26.35 % entries.
13:40:47.003 - INFO - Accuracy is: 84.30 %
13:41:15.003 - INFO - Matched 8300 of 31115 26.68 % entries.
13:41:15.003 - INFO - Accuracy is: 84.30 %
13:41:36.003 - INFO - Matched 8400 of 31115 27.00 % entries.
13:41:36.003 - INFO - Accuracy is: 84.21 %
13:42:06.003 - INFO - Matched 8500 of 31115 27.32 % entries.
13:42:06.003 - INFO - Accuracy is: 84.26 %
13:42:32.003 - INFO - Matched 8600 of 31115 27.64 % entries.
13:42:32.003 - INFO - Accuracy is: 84.28 %
13:42:59.003 - INFO -

14:14:55.003 - INFO - Matched 15600 of 31115 50.14 % entries.
14:14:55.003 - INFO - Accuracy is: 83.85 %
14:15:24.003 - INFO - Matched 15700 of 31115 50.46 % entries.
14:15:24.003 - INFO - Accuracy is: 83.85 %
14:15:50.003 - INFO - Matched 15800 of 31115 50.78 % entries.
14:15:50.003 - INFO - Accuracy is: 83.84 %
14:16:19.003 - INFO - Matched 15900 of 31115 51.10 % entries.
14:16:19.003 - INFO - Accuracy is: 83.84 %
14:16:49.003 - INFO - Matched 16000 of 31115 51.42 % entries.
14:16:49.003 - INFO - Accuracy is: 83.84 %
14:17:19.003 - INFO - Matched 16100 of 31115 51.74 % entries.
14:17:19.003 - INFO - Accuracy is: 83.86 %
14:17:48.003 - INFO - Matched 16200 of 31115 52.06 % entries.
14:17:48.003 - INFO - Accuracy is: 83.83 %
14:18:15.003 - INFO - Matched 16300 of 31115 52.39 % entries.
14:18:15.003 - INFO - Accuracy is: 83.81 %
14:18:41.003 - INFO - Matched 16400 of 31115 52.71 % entries.
14:18:41.003 - INFO - Accuracy is: 83.81 %
14:19:11.003 - INFO - Matched 16500 of 31115 53.03 % en

14:54:56.003 - INFO - Accuracy is: 83.57 %
14:55:27.003 - INFO - Matched 23500 of 31115 75.53 % entries.
14:55:27.003 - INFO - Accuracy is: 83.57 %
14:56:03.003 - INFO - Matched 23600 of 31115 75.85 % entries.
14:56:03.003 - INFO - Accuracy is: 83.56 %
14:56:36.003 - INFO - Matched 23700 of 31115 76.17 % entries.
14:56:36.003 - INFO - Accuracy is: 83.57 %
14:57:07.003 - INFO - Matched 23800 of 31115 76.49 % entries.
14:57:07.003 - INFO - Accuracy is: 83.55 %
14:57:40.003 - INFO - Matched 23900 of 31115 76.81 % entries.
14:57:40.003 - INFO - Accuracy is: 83.54 %
14:58:17.003 - INFO - Matched 24000 of 31115 77.13 % entries.
14:58:17.003 - INFO - Accuracy is: 83.55 %
14:58:49.003 - INFO - Matched 24100 of 31115 77.45 % entries.
14:58:49.003 - INFO - Accuracy is: 83.56 %
14:59:18.003 - INFO - Matched 24200 of 31115 77.78 % entries.
14:59:18.003 - INFO - Accuracy is: 83.57 %
14:59:52.003 - INFO - Matched 24300 of 31115 78.10 % entries.
14:59:52.003 - INFO - Accuracy is: 83.57 %
15:00:27.003

CPU times: user 3min 17s, sys: 10.5 s, total: 3min 27s
Wall time: 2h 33min 16s


In [12]:
experiment.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,10419,6
found_no_match,5086,15604


In [13]:
experiment.stats()

{'recall': 0.671976781683328,
 'selectivity': 0.9996156310057656,
 'precision': 0.9994244604316547,
 'f-score': 0.8036251446201311}

## Upload big data set to staging environment

In [6]:
# make sure that you have `kubectl proxy` running to get access to the cluster
# Point the client at the staging ERS, reached through the local proxy on localhost:8001.
os.environ['ERS_BASEURI'] = 'http://localhost:8001/api/v1/namespaces/valdon-csf-staging/services/entity-recognition-service:80/proxy/ers_service'
# Constructing ERSExperiment with a golden data file uploads its records to ERS
# (see "Starting to populate ERS" in the log below); the recorded upload took about 2h 47min.
%time experiment = ERSExperiment(base_path_to_data / '2018-07-03-big_set_with_full_information_golden_data.pkl')

13:19:46.001 - INFO - Starting to populate ERS
13:19:48.001 - INFO - Calling ERS on http://localhost:8001/api/v1/namespaces/valdon-csf-staging/services/entity-recognition-service:80/proxy/ers_service with 31152 records for the golden data set.
16:06:35.001 - INFO - The ERS was successfully populated with 100.0 % of 31152 the loaded records.
16:06:35.001 - INFO - Upload of 0 records failed.
16:06:35.001 - INFO - Processing the golden data set took 2:46:49.031056 time.
CPU times: user 2min 33s, sys: 9.2 s, total: 2min 43s
Wall time: 2h 46min 49s


### To Be Checked

In [73]:
# Load the stored results of the 'small-limited-address-infos' run and list its false
# negatives: the searched record next to the record that should have been found.
# NOTE(review): in the row displayed below both sides are identical, which is presumably
# why this case sits under "To Be Checked" — TODO confirm.
e = ERSExperiment()
e.load_results('small-limited-address-infos')
e.false_negatives

Unnamed: 0_level_0,Unnamed: 1_level_0,searched,should_have_founded
Match-Id,Attribute,Unnamed: 2_level_1,Unnamed: 3_level_1
187,ersId,cd06e5a5-5b08-4118-ac60-61f07288e0ef,cd06e5a5-5b08-4118-ac60-61f07288e0ef
187,name,AAD-Trading,AAD-Trading
187,website,http://www.aad-trading.de,http://www.aad-trading.de
187,registerNumber,HRB 18964,HRB 18964
187,commercialRegister,Dresden,Dresden


In [24]:
# Directory that holds the stored experiments; EXPERIMENTS_PATH overrides the default location.
base_path_to_results = Path(os.getenv('EXPERIMENTS_PATH', '/home/datascientist/host/experiments'))
# Display every entry found there (hidden files such as .gitkeep included).
[entry for entry in base_path_to_results.glob("*")]

[PosixPath('/home/datascientist/host/experiments/kantwert-S1000_T10000-on-small-full-infos'),
 PosixPath('/home/datascientist/host/experiments/medium-limited-address-and-manipulated-infos-third-model'),
 PosixPath('/home/datascientist/host/experiments/.gitkeep'),
 PosixPath('/home/datascientist/host/experiments/kantwert-S1000_T10000-on-small-limited-address-and-manipulated-infos'),
 PosixPath('/home/datascientist/host/experiments/medium-limited-address-and-manipulated-infos-third-model-with-confidences-and-multiple-matches'),
 PosixPath('/home/datascientist/host/experiments/kantwert-S1000_T10000-on-small-full-but-manipulated'),
 PosixPath('/home/datascientist/host/experiments/medium-limited-address-and-manipulated-infos-third-model-with-confidences'),
 PosixPath('/home/datascientist/host/experiments/kantwert-S1000_T10000-on-medium-limited-address-and-manipulated-infos')]

# 2018-09-04 Experiment results:

In [8]:
e = ERSExperiment()
e.load_results('small-full-infos')
e.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,509,0
found_no_match,0,502


In [13]:
e = ERSExperiment()
e.load_results('small-limited-address-infos')
e.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,508,3
found_no_match,1,499


### Accuracy improvements:

In [66]:
e = ERSExperiment()
e.load_results('small-limited-address-and-manipulated-infos')
e.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,181,1
found_no_match,328,501


\begin{align}
confusion\ matrix = 
\begin{bmatrix}
TP & FP \\
FN & TN
\end{bmatrix} 
\end{align}

\begin{align}
recall = \frac{TP}{TP+FN} \\\\
precision = \frac{TP}{TP+FP} 
\end{align}

In [67]:
pp(e.stats())

# golden samples: 998
# evaluation samples: 1011
accuracy: 67.5 %
recall: 35.6 %
precision: 99.5 %
f-score: 52.4 %


In [69]:
e = ERSExperiment()
e.load_results('small-limited-address-and-manipulated-infos-third-model')
pp(e.stats())

# golden samples: 998
# evaluation samples: 1011
accuracy: 75.4 %
recall: 52.0 %
precision: 98.1 %
f-score: 68.0 %


### Bigger data sets:

In [36]:
# NOTE(review): relies on whatever `e` refers to in the running kernel (execution count 36).
# The printed figures equal those of the
# '2019-03-22T124550_medium_set_with_limited_and_manipulated_information' run shown further
# down — TODO confirm and load that experiment explicitly in this cell.
pp(e.stats())

# golden samples: 9926
# evaluation samples: 10.06 k
accuracy: 75.7 %
recall: 51.1 %
precision: 99.8 %
f-score: 67.6 %
number of too ambiguous matches: 0.0 %


In [70]:
e = ERSExperiment()
e.load_results('medium-limited-address-and-manipulated-infos-third-model')
pp(e.stats())

# golden samples: 9926
# evaluation samples: 10.06 k
accuracy: 83.7 %
recall: 67.1 %
precision: 100.0 %
f-score: 80.3 %


In [71]:
e = ERSExperiment()
e.load_results('big-limited-address-and-manipulated-infos-third-model')
pp(e.stats())

# golden samples: 31.15 k
# evaluation samples: 31.11 k
accuracy: 83.6 %
recall: 67.2 %
precision: 99.9 %
f-score: 80.4 %


In [72]:
# Show the confusion matrix with abbreviated counts, formatting each cell with
# human_readable_number from the init cell.
# NOTE: that helper renders any value with abs(num) <= 1 as a percentage, so a count of
# 0 or 1 would be displayed as "0.0 %" / "100.0 %" rather than as a plain number.
e.confusion_matrix.applymap(human_readable_number)

Unnamed: 0,was_a_match,was_no_match
found_a_match,10.42 k,6
found_no_match,5086,15.60 k


### What sorts of errors occur:

In [4]:
ls ../../experiments/

[0m[01;34m2019-03-22T124550_medium_set_with_limited_and_manipulated_information[0m/
[01;34m2019-03-22T144658_medium_set_with_limited_and_manipulated_information_without_sanitizer[0m[K/
[01;34m2019-03-28T094145_no_exact_phone_matching_model_and_medium_set_with_limited_and_manipulated_information_with_sanitizer[0m[K/


In [21]:
e = ERSExperiment()
e.load_results('2019-03-22T124550_medium_set_with_limited_and_manipulated_information')

In [6]:
pp(e.stats())

# golden samples: 9926
# evaluation samples: 10.06 k
accuracy: 75.7 %
recall: 51.1 %
precision: 99.8 %
f-score: 67.6 %
number of too ambiguous matches: 0.0 %


In [7]:
e.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,2547,5
found_no_match,2438,5071


In [63]:
# Look at one false positive in detail: every attribute of the searched record next to the
# record that was returned (5689 is presumably the Match-Id index level — TODO confirm).
e.false_positives.loc[(5689,)]

Unnamed: 0_level_0,searched,founded
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1
ersId,,98beeb80-e0df-11e8-8531-0242ac110002
name,esserec gmbh,esser gmbh
legalForm,,
email,infoessertecde,vertriebessernachrichtentechnikde
phoneNumber,492131180,49261942960
website,essertecde,essernachrichtentechnikde
vatID,de203274366,
registerNumber,,
commercialRegister,,
address.street,,


In [22]:
e.highlight_false_positive_errors()

Unnamed: 0_level_0,Unnamed: 1_level_0,searched,founded
Match-Id,Attribute,Unnamed: 2_level_1,Unnamed: 3_level_1
696,ersId,b4052e3c-5397-46c2-b5f4-360bb5019465,98f95202-e0df-11e8-8531-0242ac110002
696,name,alfred hrnig gmhb co kg,alfred renn gmbh co kg
696,email,,infospeditionrennde
696,phoneNumber,49238144901,49742295460
696,website,,speditionrennde
696,vatID,,de142646845
1915,ersId,98ac02cc-e0df-11e8-8531-0242ac110002,251b6abb-c91a-4293-bf32-3e54d537c2b1
1915,name,agrargenossnschaft bucha eg,agrargenossenschaft forst eg
3256,ersId,,98b04530-e0df-11e8-8531-0242ac110002
3256,name,fruchtexpress ts gmbh,frucht express gmbh


In [23]:
e.highlight_false_negative_errors()

Unnamed: 0_level_0,Unnamed: 1_level_0,searched,should_have_founded
Match-Id,Attribute,Unnamed: 2_level_1,Unnamed: 3_level_1
1,name,c s gbmh consulting udn servicse,c s gmbh consulting services
4,name,gg preißer gmbh kartonagefnabrikation udn großhanedl,gg preißer gmbh kartonagenfabrikation großhandel
4,phoneNumber,496331508040,4963315080240
18,name,acomplast gbmh,accomplast gmbh
18,phoneNumber,4937263010,49372263010
22,name,briigtte wäschevertribesgesellschaft mti beschrnäkter hfatung,brigitte wäschevertriebsgesellschaft beschränkter haftung
24,name,drukchaus anderas abrahma gbmh co kg,druckhaus andreas abraham gmbh co kg
27,name,einirchtungen scharfesteinschmidt gmbh,einrichtungen scharfensteinschmidt gmbh
27,phoneNumber,49172867291,491728647291
32,name,craoline van aree gmbh getränkevertrieb,caroline van laere gmbh getränkevertrieb


In [26]:
# Which attributes differ in the false-negative pairs? Turn the index levels into columns
# and list the distinct attribute names.
e.highlight_false_negative_errors().reset_index()['Attribute'].unique()

array(['name', 'phoneNumber'], dtype=object)

In [18]:
e = ERSExperiment()
e.load_results('2019-03-22T144658_medium_set_with_limited_and_manipulated_information_without_sanitizer')

In [10]:
pp(e.stats())

# golden samples: 9926
# evaluation samples: 10.06 k
accuracy: 71.7 %
recall: 55.4 %
precision: 77.8 %
f-score: 64.8 %
number of too ambiguous matches: 89


In [19]:
e.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,2731,778
found_no_match,2195,4795


In [20]:
e.highlight_false_negative_errors()

Unnamed: 0_level_0,Unnamed: 1_level_0,searched,should_have_founded
Match-Id,Attribute,Unnamed: 2_level_1,Unnamed: 3_level_1
1,name,C & S GbmH Consulting udn Servicse,C & S GmbH Consulting und Services
18,name,ACOMPLAST GbmH,ACCOMPLAST GmbH
18,phoneNumber,+4937263010,+49372263010
22,name,Briigtte Wäschevertribes-Gesellschaft mti beschrnäkter Hfatung,Brigitte Wäschevertriebs-Gesellschaft mit beschränkter Haftung
24,name,Drukchaus Anderas Abrahma GbmH & C.o KG.,Druckhaus Andreas Abraham GmbH & Co. KG.
27,name,Einirchtungen Scharfestein-Schmidt GmbH,Einrichtungen Scharfenstein-Schmidt GmbH
27,phoneNumber,+49172867291,+491728647291
32,name,Craoline Van aree GmbH Getränkevertrieb,Caroline Van Laere GmbH Getränkevertrieb
35,name,Care Deutschland GmbH,Carel Deutschland GmbH
35,phoneNumber,+49605196209,+49605196290


In [5]:
e = ERSExperiment()
e.load_results('2019-03-28T094145_no_exact_phone_matching_model_and_medium_set_with_limited_and_manipulated_information_with_sanitizer')

In [7]:
pp(e.stats())

# golden samples: 9926
# evaluation samples: 10.06 k
accuracy: 76.5 %
recall: 52.7 %
precision: 99.7 %
f-score: 68.9 %
number of too ambiguous matches: 0.0 %


In [6]:
e.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,2627,9
found_no_match,2358,5070


In [8]:
e.highlight_false_negative_errors()

Unnamed: 0_level_0,Unnamed: 1_level_0,searched,should_have_founded
Match-Id,Attribute,Unnamed: 2_level_1,Unnamed: 3_level_1
1,name,c s gbmh consulting udn servicse,c s gmbh consulting services
4,name,gg preißer gmbh kartonagefnabrikation udn großhanedl,gg preißer gmbh kartonagenfabrikation großhandel
4,phoneNumber,496331508040,4963315080240
18,name,acomplast gbmh,accomplast gmbh
18,phoneNumber,4937263010,49372263010
22,name,briigtte wäschevertribesgesellschaft mti beschrnäkter hfatung,brigitte wäschevertriebsgesellschaft beschränkter haftung
24,name,drukchaus anderas abrahma gbmh co kg,druckhaus andreas abraham gmbh co kg
27,name,einirchtungen scharfesteinschmidt gmbh,einrichtungen scharfensteinschmidt gmbh
27,phoneNumber,49172867291,491728647291
32,name,craoline van aree gmbh getränkevertrieb,caroline van laere gmbh getränkevertrieb


In [9]:
e.experiment_infos

{'evaluation_dataset_filepath': 'experiments/2019-03-28T094145_no_exact_phone_matching_model_and_medium_set_with_limited_and_manipulated_information_with_sanitizer/data/evaluation.pkl',
 'golden_dataset_filepath': 'experiments/2019-03-28T094145_no_exact_phone_matching_model_and_medium_set_with_limited_and_manipulated_information_with_sanitizer/data/golden.pkl',
 'experiment_id': '2019-03-28T094145_no_exact_phone_matching_model_and_medium_set_with_limited_and_manipulated_information_with_sanitizer',
 'date': '2019-03-28 09:11:54.197638',
 'number_of_too_ambiguous_matches': 0}

In [9]:
check_results_of_experiment('2019-04-19T234325_kantwert-S1000-T10000-sanitized')

# golden samples: 998
# evaluation samples: 1011
accuracy: 55.0 %
recall: 10.6 %
precision: 100.0 %
f-score: 19.2 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,54,0
found_no_match,455,502


In [8]:
check_results_of_experiment('2019-04-20T213552_kantwert-S1000-T10000-sanitized_again')

# golden samples: 998
# evaluation samples: 1011
accuracy: 60.1 %
recall: 20.8 %
precision: 100.0 %
f-score: 34.5 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,106,0
found_no_match,403,502


In [15]:
# NOTE(review): shows the metadata of whatever `experiment` currently refers to in the kernel.
# The output below belongs to the '2019-04-20T213552_kantwert-S1000-T10000-sanitized_again'
# run, which was presumably set up in a cell that is not part of this section — TODO confirm.
experiment.experiment_infos

{'evaluation_dataset_filepath': 'data/experiments/2019-04-20T213552_kantwert-S1000-T10000-sanitized_again/data/evaluation.pkl',
 'golden_dataset_filepath': 'data/experiments/2019-04-20T213552_kantwert-S1000-T10000-sanitized_again/data/golden.pkl',
 'experiment_id': '2019-04-20T213552_kantwert-S1000-T10000-sanitized_again',
 'date': '2019-04-20 19:43:22.541339',
 'number_of_too_ambiguous_matches': 0}

In [5]:
# Repeat the '2019-04-20T213552_kantwert-S1000-T10000-sanitized_again' experiment from the
# notebook. Step 1: populate ERS with the small golden set (see the log below).
%time experiment = ERSExperiment(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_golden_data.pkl')
# Step 2: give the repetition its own id; the following cells look the results up under it.
os.environ['EXPERIMENT_ID'] = 'redo-by-jupyter-2019-04-20T213552_kantwert-S1000-T10000-sanitized_again'
# Step 3: match the small manipulated evaluation set against the freshly populated ERS.
%time exp_id, results_path = experiment.process_evaluation_dataset(base_path_to_data / '2018-07-03-small_set_with_full_but_manipulated_information_evaluation_data.pkl')

12:38:54.008 - INFO - Starting to populate ERS
12:38:55.008 - INFO - Calling ERS on http://ers:8080 with 998 records for the golden data set.
12:39:01.008 - INFO - At step 100 avg processing time per company entry was 0.0583 ms
12:39:04.008 - INFO - At step 200 avg processing time per company entry was 0.0348 ms
12:39:08.008 - INFO - At step 300 avg processing time per company entry was 0.0359 ms
12:39:11.008 - INFO - At step 400 avg processing time per company entry was 0.0358 ms
12:39:14.008 - INFO - At step 500 avg processing time per company entry was 0.0299 ms
12:39:17.008 - INFO - At step 600 avg processing time per company entry was 0.0295 ms
12:39:21.008 - INFO - At step 700 avg processing time per company entry was 0.0336 ms
12:39:24.008 - INFO - At step 800 avg processing time per company entry was 0.0302 ms
12:39:27.008 - INFO - At step 900 avg processing time per company entry was 0.0295 ms
12:39:30.008 - INFO - The ERS was successfully populated with 100.0 % of 998 the loa

In [6]:
pp(experiment.stats())

# golden samples: 998
# evaluation samples: 1011
accuracy: 83.4 %
recall: 67.0 %
precision: 100.0 %
f-score: 80.2 %
number of too ambiguous matches: 0.0 %


In [5]:
check_results_of_experiment('redo-by-jupyter-2019-04-20T213552_kantwert-S1000-T10000-sanitized_again')

# golden samples: 998
# evaluation samples: 1011
accuracy: 83.4 %
recall: 67.0 %
precision: 100.0 %
f-score: 80.2 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,341,0
found_no_match,168,502


In [6]:
check_results_of_experiment('2019-05-08T145457_kantwert-S1000-T10000-sanitized_again_and_again')

# golden samples: 998
# evaluation samples: 1011
accuracy: 83.4 %
recall: 67.0 %
precision: 100.0 %
f-score: 80.2 %
number of too ambiguous matches: 0.0 %


Unnamed: 0,was_a_match,was_no_match
found_a_match,341,0
found_no_match,168,502


## Experiments with Kantwert data

In [31]:
# Load the Kantwert golden/evaluation frames and, as a reference for the expected layout,
# the backtester "small set with full information" frames.
# All paths are anchored at base_path_to_data (defined in the init cell) instead of the
# current working directory, so this cell no longer depends on the earlier `cd ../..`.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
kantwert_data_path = base_path_to_data.parent / 'training-data-from-kantwert-buergle'
golden = pd.read_pickle(kantwert_data_path / 'golden_data.pkl')
evaluation = pd.read_pickle(kantwert_data_path / 'evaluation_data.pkl')
backtester_golden = pd.read_pickle(base_path_to_data / '2018-07-03-small_set_with_full_information_golden_data.pkl')
backtester_evaluation = pd.read_pickle(base_path_to_data / '2018-07-03-small_set_with_full_information_evaluation_data.pkl')

In [29]:
golden.columns

Index(['abs_hq_city', 'abs_hq_country', 'abs_hq_email', 'abs_hq_phone',
       'abs_hq_street', 'abs_hq_zip_code', 'abs_legal_form', 'abs_name',
       'abs_register_number', 'abs_taxid', 'abs_website', 'record_id'],
      dtype='object')

In [32]:
backtester_golden.columns

Index(['ersId', 'crmId', 'buergelId', 'name', 'legalForm', 'email',
       'phoneNumber', 'website', 'vatID', 'registerNumber',
       'commercialRegister', 'address.street', 'address.postalCode',
       'address.city', 'address.country'],
      dtype='object')

In [34]:
# Kantwert column name -> ERS field name, as used by the backtester data sets.
KANTWERT_2_ERS_DATAFIELD_MAPPING = {
    'abs_name': 'name',
    'abs_legal_form': 'legalForm',
    'abs_register_number': 'registerNumber',
    'abs_hq_email': 'email',
    'abs_website': 'website',
    'abs_hq_phone': 'phoneNumber',
    'abs_taxid': 'vatID',
    'abs_hq_street': 'address.street',
    'abs_hq_zip_code': 'address.postalCode',
    'abs_hq_city': 'address.city',
    'abs_hq_country': 'address.country',
    'record_id': 'ersId',
}

# Rename the columns of both frames in place so the following cells see the ERS names.
for kantwert_frame in (golden, evaluation):
    kantwert_frame.rename(columns=KANTWERT_2_ERS_DATAFIELD_MAPPING, inplace=True)

In [37]:
set(golden.columns) - set(backtester_golden.columns)

set()

In [38]:
set(backtester_golden.columns) - set(golden.columns)

{'buergelId', 'commercialRegister', 'crmId'}

In [39]:
# The Kantwert frames lack three columns that the backtester layout has (see the set
# difference above); add them as empty placeholders to both frames.
for kantwert_frame in (golden, evaluation):
    for missing_column in ('buergelId', 'commercialRegister', 'crmId'):
        kantwert_frame[missing_column] = None

In [44]:
# Reference value from the backtester data: number of distinct ersIds shared by the golden
# and evaluation sets, relative to the number of golden records.
len(set(backtester_golden.ersId).intersection(set(backtester_evaluation.ersId)))/len(backtester_golden)

0.5100200400801603

In [42]:
len(set(golden.ersId).intersection(set(evaluation.ersId)))/len(golden)

0.5913254576505624

In [46]:
backtester_golden.ersId.isna().sum()/len(backtester_golden)

0.0

In [47]:
golden.ersId.isna().sum()/len(golden)

0.0

In [48]:
backtester_evaluation.ersId.isna().sum()/len(backtester_evaluation)

0.49653808110781406

In [49]:
evaluation.ersId.isna().sum()/len(evaluation)

0.0

In [54]:
# Reference value from the backtester data: fraction of evaluation records whose ersId
# occurs in the golden set (presumably the records that are expected to match).
backtester_evaluation.ersId.isin(backtester_golden.ersId).sum()/len(backtester_evaluation)

0.503461918892186

In [50]:
# Blank the ersId of every evaluation record that has no counterpart in the golden set.
# This mirrors the backtester evaluation set, where roughly half of the ersIds are missing
# (presumably marking the expected "no match" cases — TODO confirm).
has_golden_counterpart = evaluation.ersId.isin(golden.ersId)
evaluation.loc[~has_golden_counterpart, 'ersId'] = None

In [51]:
evaluation.ersId.isna().sum()/len(evaluation)

0.35774837888798616

In [52]:
evaluation.ersId.isin(golden.ersId).sum()/len(evaluation)

0.6422516211120138

In [55]:
backtester_golden.ersId.isin(backtester_evaluation.ersId).sum()/len(backtester_golden)

0.5100200400801603

In [56]:
golden.ersId.isin(evaluation.ersId).sum()/len(golden)

0.5913254576505624

In [57]:
# Persist the converted Kantwert frames alongside the other processed data sets.
# Written via base_path_to_data (defined in the init cell) — the same location the
# experiment cells read these files from — rather than a path relative to the current
# working directory, so the cell does not depend on the earlier `cd ../..`.
golden.to_pickle(base_path_to_data / '2019-05-10-kantwert_golden_data.pkl')
evaluation.to_pickle(base_path_to_data / '2019-05-10-kantwert_evaluation_data.pkl')

In [5]:
base_path_to_data

PosixPath('/home/datascientist/host/data/processed')

In [6]:
# Name the experiment, then populate ERS with the converted Kantwert golden set
# (70851 records according to the log below).
# NOTE(review): the per-entry times in the log are labelled "ms", but 100 entries take
# several seconds, so the figures look like seconds — TODO confirm in the logging code.
# The per-entry time also grows steadily as more records are uploaded.
os.environ['EXPERIMENT_ID'] = 'standard-crm-model-on-kantwert'
%time experiment = ERSExperiment(base_path_to_data / '2019-05-10-kantwert_golden_data.pkl')

09:00:14.010 - INFO - Starting to populate ERS
09:00:15.010 - INFO - Calling ERS on http://ers:8080 with 70851 records for the golden data set.
09:00:23.010 - INFO - At step 100 avg processing time per company entry was 0.0768 ms
09:00:26.010 - INFO - At step 200 avg processing time per company entry was 0.0372 ms
09:00:30.010 - INFO - At step 300 avg processing time per company entry was 0.0376 ms
09:00:34.010 - INFO - At step 400 avg processing time per company entry was 0.0340 ms
09:00:37.010 - INFO - At step 500 avg processing time per company entry was 0.0304 ms
09:00:40.010 - INFO - At step 600 avg processing time per company entry was 0.0299 ms
09:00:43.010 - INFO - At step 700 avg processing time per company entry was 0.0366 ms
09:00:46.010 - INFO - At step 800 avg processing time per company entry was 0.0304 ms
09:00:49.010 - INFO - At step 900 avg processing time per company entry was 0.0313 ms
09:00:53.010 - INFO - At step 1000 avg processing time per company entry was 0.031

09:06:06.010 - INFO - At step 9300 avg processing time per company entry was 0.0441 ms
09:06:10.010 - INFO - At step 9400 avg processing time per company entry was 0.0439 ms
09:06:14.010 - INFO - At step 9500 avg processing time per company entry was 0.0448 ms
09:06:19.010 - INFO - At step 9600 avg processing time per company entry was 0.0446 ms
09:06:24.010 - INFO - At step 9700 avg processing time per company entry was 0.0458 ms
09:06:28.010 - INFO - At step 9800 avg processing time per company entry was 0.0448 ms
09:06:33.010 - INFO - At step 9900 avg processing time per company entry was 0.0458 ms
09:06:38.010 - INFO - At step 10000 avg processing time per company entry was 0.0490 ms
09:06:42.010 - INFO - At step 10100 avg processing time per company entry was 0.0468 ms
09:06:48.010 - INFO - At step 10200 avg processing time per company entry was 0.0588 ms
09:06:54.010 - INFO - At step 10300 avg processing time per company entry was 0.0579 ms
09:06:58.010 - INFO - At step 10400 avg

09:14:56.010 - INFO - At step 18700 avg processing time per company entry was 0.0660 ms
09:15:03.010 - INFO - At step 18800 avg processing time per company entry was 0.0657 ms
09:15:10.010 - INFO - At step 18900 avg processing time per company entry was 0.0652 ms
09:15:16.010 - INFO - At step 19000 avg processing time per company entry was 0.0693 ms
09:15:23.010 - INFO - At step 19100 avg processing time per company entry was 0.0656 ms
09:15:31.010 - INFO - At step 19200 avg processing time per company entry was 0.0780 ms
09:15:38.010 - INFO - At step 19300 avg processing time per company entry was 0.0676 ms
09:15:44.010 - INFO - At step 19400 avg processing time per company entry was 0.0683 ms
09:15:51.010 - INFO - At step 19500 avg processing time per company entry was 0.0673 ms
09:15:58.010 - INFO - At step 19600 avg processing time per company entry was 0.0664 ms
09:16:05.010 - INFO - At step 19700 avg processing time per company entry was 0.0674 ms
09:16:11.010 - INFO - At step 19

09:27:00.010 - INFO - At step 28100 avg processing time per company entry was 0.0867 ms
09:27:08.010 - INFO - At step 28200 avg processing time per company entry was 0.0858 ms
09:27:17.010 - INFO - At step 28300 avg processing time per company entry was 0.0872 ms
09:27:26.010 - INFO - At step 28400 avg processing time per company entry was 0.0884 ms
09:27:35.010 - INFO - At step 28500 avg processing time per company entry was 0.0897 ms
09:27:44.010 - INFO - At step 28600 avg processing time per company entry was 0.0897 ms
09:27:52.010 - INFO - At step 28700 avg processing time per company entry was 0.0869 ms
09:28:01.010 - INFO - At step 28800 avg processing time per company entry was 0.0881 ms
09:28:10.010 - INFO - At step 28900 avg processing time per company entry was 0.0912 ms
09:28:19.010 - INFO - At step 29000 avg processing time per company entry was 0.0901 ms
09:28:28.010 - INFO - At step 29100 avg processing time per company entry was 0.0909 ms
09:28:38.010 - INFO - At step 29

09:43:01.010 - INFO - At step 37500 avg processing time per company entry was 0.1065 ms
09:43:12.010 - INFO - At step 37600 avg processing time per company entry was 0.1105 ms
09:43:22.010 - INFO - At step 37700 avg processing time per company entry was 0.1059 ms
09:43:34.010 - INFO - At step 37800 avg processing time per company entry was 0.1130 ms
09:43:45.010 - INFO - At step 37900 avg processing time per company entry was 0.1096 ms
09:43:56.010 - INFO - At step 38000 avg processing time per company entry was 0.1080 ms
09:44:06.010 - INFO - At step 38100 avg processing time per company entry was 0.1068 ms
09:44:20.010 - INFO - At step 38200 avg processing time per company entry was 0.1420 ms
09:44:37.010 - INFO - At step 38300 avg processing time per company entry was 0.1639 ms
09:44:49.010 - INFO - At step 38400 avg processing time per company entry was 0.1188 ms
09:45:00.010 - INFO - At step 38500 avg processing time per company entry was 0.1093 ms
09:45:11.010 - INFO - At step 38

10:01:52.010 - INFO - At step 46900 avg processing time per company entry was 0.1240 ms
10:02:04.010 - INFO - At step 47000 avg processing time per company entry was 0.1221 ms
10:02:16.010 - INFO - At step 47100 avg processing time per company entry was 0.1238 ms
10:02:29.010 - INFO - At step 47200 avg processing time per company entry was 0.1226 ms
10:02:41.010 - INFO - At step 47300 avg processing time per company entry was 0.1255 ms
10:02:54.010 - INFO - At step 47400 avg processing time per company entry was 0.1241 ms
10:03:06.010 - INFO - At step 47500 avg processing time per company entry was 0.1234 ms
10:03:20.010 - INFO - At step 47600 avg processing time per company entry was 0.1405 ms
10:03:33.010 - INFO - At step 47700 avg processing time per company entry was 0.1304 ms
10:03:46.010 - INFO - At step 47800 avg processing time per company entry was 0.1306 ms
10:03:59.010 - INFO - At step 47900 avg processing time per company entry was 0.1319 ms
10:04:12.010 - INFO - At step 48

10:23:26.010 - INFO - At step 56300 avg processing time per company entry was 0.1469 ms
10:23:41.010 - INFO - At step 56400 avg processing time per company entry was 0.1488 ms
10:23:55.010 - INFO - At step 56500 avg processing time per company entry was 0.1437 ms
10:24:10.010 - INFO - At step 56600 avg processing time per company entry was 0.1438 ms
10:24:24.010 - INFO - At step 56700 avg processing time per company entry was 0.1438 ms
10:24:39.010 - INFO - At step 56800 avg processing time per company entry was 0.1468 ms
10:24:54.010 - INFO - At step 56900 avg processing time per company entry was 0.1490 ms
10:25:08.010 - INFO - At step 57000 avg processing time per company entry was 0.1464 ms
10:25:23.010 - INFO - At step 57100 avg processing time per company entry was 0.1466 ms
10:25:38.010 - INFO - At step 57200 avg processing time per company entry was 0.1462 ms
10:25:52.010 - INFO - At step 57300 avg processing time per company entry was 0.1458 ms
10:26:07.010 - INFO - At step 57

10:48:04.010 - INFO - At step 65700 avg processing time per company entry was 0.1613 ms
10:48:20.010 - INFO - At step 65800 avg processing time per company entry was 0.1603 ms
10:48:36.010 - INFO - At step 65900 avg processing time per company entry was 0.1600 ms
10:48:52.010 - INFO - At step 66000 avg processing time per company entry was 0.1595 ms
10:49:08.010 - INFO - At step 66100 avg processing time per company entry was 0.1599 ms
10:49:24.010 - INFO - At step 66200 avg processing time per company entry was 0.1596 ms
10:49:40.010 - INFO - At step 66300 avg processing time per company entry was 0.1630 ms
10:49:56.010 - INFO - At step 66400 avg processing time per company entry was 0.1599 ms
10:50:12.010 - INFO - At step 66500 avg processing time per company entry was 0.1597 ms
10:50:28.010 - INFO - At step 66600 avg processing time per company entry was 0.1600 ms
10:50:44.010 - INFO - At step 66700 avg processing time per company entry was 0.1618 ms
10:51:00.010 - INFO - At step 66

In [9]:
# Match the Kantwert evaluation set against the populated ERS; only the first element of the
# result (the experiment id) is kept, the second is discarded.
# Very slow: the visible log starts at 12:30, runs apparently into the next morning and has
# reached only about 62 % at 06:24.
%time exp_id, _ = experiment.process_evaluation_dataset(base_path_to_data / '2019-05-10-kantwert_evaluation_data.pkl')

12:30:37.010 - INFO - Calling ERS to match http://ers:8080 with 65233 records from the evaluation data set against the golden data.
12:30:37.010 - INFO - Going to store all results of the experiment at data/experiments/standard-crm-model-on-kantwert/results.
12:31:28.010 - INFO - Matched 100 of 65233 0.15 % entries.
12:31:28.010 - INFO - Accuracy is 91.74 %
12:32:13.010 - INFO - Matched 200 of 65233 0.31 % entries.
12:32:13.010 - INFO - Accuracy is 91.98 %
12:33:01.010 - INFO - Matched 300 of 65233 0.46 % entries.
12:33:01.010 - INFO - Accuracy is 92.68 %
12:33:48.010 - INFO - Matched 400 of 65233 0.61 % entries.
12:33:48.010 - INFO - Accuracy is 93.67 %
12:34:38.010 - INFO - Matched 500 of 65233 0.77 % entries.
12:34:38.010 - INFO - Accuracy is 93.59 %
12:36:16.010 - INFO - Matched 700 of 65233 1.07 % entries.
12:36:16.010 - INFO - Accuracy is 93.88 %
12:37:06.010 - INFO - Matched 800 of 65233 1.23 % entries.
12:37:06.010 - INFO - Accuracy is 94.12 %
12:37:56.010 - INFO - Matched 900 

15:09:50.010 - INFO - Matched 8100 of 65233 12.42 % entries.
15:09:50.010 - INFO - Accuracy is 92.47 %
15:10:54.010 - INFO - Matched 8200 of 65233 12.57 % entries.
15:10:54.010 - INFO - Accuracy is 92.50 %
15:12:00.010 - INFO - Matched 8300 of 65233 12.72 % entries.
15:12:00.010 - INFO - Accuracy is 92.49 %
15:13:09.010 - INFO - Matched 8400 of 65233 12.88 % entries.
15:13:10.010 - INFO - Accuracy is 92.44 %
15:15:23.010 - INFO - Matched 8500 of 65233 13.03 % entries.
15:15:23.010 - INFO - Accuracy is 92.42 %
15:16:30.010 - INFO - Matched 8600 of 65233 13.18 % entries.
15:16:30.010 - INFO - Accuracy is 92.34 %
15:17:39.010 - INFO - Matched 8700 of 65233 13.34 % entries.
15:17:39.010 - INFO - Accuracy is 92.28 %
15:18:46.010 - INFO - Matched 8800 of 65233 13.49 % entries.
15:18:46.010 - INFO - Accuracy is 92.28 %
15:19:53.010 - INFO - Matched 8900 of 65233 13.64 % entries.
15:19:53.010 - INFO - Accuracy is 92.26 %
15:21:01.010 - INFO - Matched 9000 of 65233 13.80 % entries.
15:21:01.010

22:46:27.010 - INFO - Matched 16200 of 65233 24.83 % entries.
22:46:27.010 - INFO - Accuracy is 92.64 %
22:47:49.010 - INFO - Matched 16300 of 65233 24.99 % entries.
22:47:49.010 - INFO - Accuracy is 92.65 %
22:49:11.010 - INFO - Matched 16400 of 65233 25.14 % entries.
22:49:11.010 - INFO - Accuracy is 92.66 %
22:50:34.010 - INFO - Matched 16500 of 65233 25.29 % entries.
22:50:34.010 - INFO - Accuracy is 92.66 %
22:51:57.010 - INFO - Matched 16600 of 65233 25.45 % entries.
22:51:57.010 - INFO - Accuracy is 92.68 %
22:53:24.010 - INFO - Matched 16700 of 65233 25.60 % entries.
22:53:24.010 - INFO - Accuracy is 92.68 %
22:54:47.010 - INFO - Matched 16800 of 65233 25.75 % entries.
22:54:47.010 - INFO - Accuracy is 92.69 %
22:56:10.010 - INFO - Matched 16900 of 65233 25.91 % entries.
22:56:10.010 - INFO - Accuracy is 92.71 %
22:57:34.010 - INFO - Matched 17000 of 65233 26.06 % entries.
22:57:34.010 - INFO - Accuracy is 92.71 %
22:58:58.010 - INFO - Matched 17100 of 65233 26.21 % entries.
22

00:47:39.011 - INFO - Matched 24200 of 65233 37.10 % entries.
00:47:39.011 - INFO - Accuracy is 92.61 %
00:49:18.011 - INFO - Matched 24300 of 65233 37.25 % entries.
00:49:18.011 - INFO - Accuracy is 92.59 %
00:50:59.011 - INFO - Matched 24400 of 65233 37.40 % entries.
00:50:59.011 - INFO - Accuracy is 92.59 %
00:52:43.011 - INFO - Matched 24500 of 65233 37.56 % entries.
00:52:43.011 - INFO - Accuracy is 92.60 %
00:54:23.011 - INFO - Matched 24600 of 65233 37.71 % entries.
00:54:23.011 - INFO - Accuracy is 92.61 %
00:56:04.011 - INFO - Matched 24700 of 65233 37.86 % entries.
00:56:04.011 - INFO - Accuracy is 92.61 %
00:57:45.011 - INFO - Matched 24800 of 65233 38.02 % entries.
00:57:45.011 - INFO - Accuracy is 92.58 %
00:59:26.011 - INFO - Matched 24900 of 65233 38.17 % entries.
00:59:26.011 - INFO - Accuracy is 92.59 %
01:01:06.011 - INFO - Matched 25000 of 65233 38.32 % entries.
01:01:06.011 - INFO - Accuracy is 92.59 %
01:02:53.011 - INFO - Matched 25100 of 65233 38.48 % entries.
01

03:15:34.011 - INFO - Matched 32200 of 65233 49.36 % entries.
03:15:34.011 - INFO - Accuracy is 92.44 %
03:17:37.011 - INFO - Matched 32300 of 65233 49.51 % entries.
03:17:37.011 - INFO - Accuracy is 92.43 %
03:19:38.011 - INFO - Matched 32400 of 65233 49.67 % entries.
03:19:38.011 - INFO - Accuracy is 92.44 %
03:21:40.011 - INFO - Matched 32500 of 65233 49.82 % entries.
03:21:40.011 - INFO - Accuracy is 92.44 %
03:23:43.011 - INFO - Matched 32600 of 65233 49.97 % entries.
03:23:43.011 - INFO - Accuracy is 92.44 %
03:25:54.011 - INFO - Matched 32700 of 65233 50.13 % entries.
03:25:55.011 - INFO - Accuracy is 92.44 %
03:27:56.011 - INFO - Matched 32800 of 65233 50.28 % entries.
03:27:56.011 - INFO - Accuracy is 92.44 %
03:29:58.011 - INFO - Matched 32900 of 65233 50.43 % entries.
03:29:59.011 - INFO - Accuracy is 92.43 %
03:32:00.011 - INFO - Matched 33000 of 65233 50.59 % entries.
03:32:00.011 - INFO - Accuracy is 92.44 %
03:34:09.011 - INFO - Matched 33100 of 65233 50.74 % entries.
03

06:14:57.011 - INFO - Matched 40100 of 65233 61.47 % entries.
06:14:57.011 - INFO - Accuracy is 92.28 %
06:17:30.011 - INFO - Matched 40200 of 65233 61.63 % entries.
06:17:30.011 - INFO - Accuracy is 92.27 %
06:20:01.011 - INFO - Matched 40300 of 65233 61.78 % entries.
06:20:01.011 - INFO - Accuracy is 92.28 %
06:22:26.011 - INFO - Matched 40400 of 65233 61.93 % entries.
06:22:26.011 - INFO - Accuracy is 92.27 %
06:24:52.011 - INFO - Matched 40500 of 65233 62.09 % entries.
06:24:52.011 - INFO - Accuracy is 92.27 %
06:27:18.011 - INFO - Matched 40600 of 65233 62.24 % entries.
06:27:18.011 - INFO - Accuracy is 92.29 %
06:29:41.011 - INFO - Matched 40700 of 65233 62.39 % entries.
06:29:41.011 - INFO - Accuracy is 92.29 %
06:32:19.011 - INFO - Matched 40800 of 65233 62.55 % entries.
06:32:19.011 - INFO - Accuracy is 92.29 %
06:34:53.011 - INFO - Matched 40900 of 65233 62.70 % entries.
06:34:53.011 - INFO - Accuracy is 92.27 %
06:37:22.011 - INFO - Matched 41000 of 65233 62.85 % entries.
06

10:00:47.011 - INFO - Matched 48300 of 65233 74.04 % entries.
10:00:47.011 - INFO - Accuracy is 89.40 %
10:03:46.011 - INFO - Matched 48400 of 65233 74.20 % entries.
10:03:46.011 - INFO - Accuracy is 89.38 %
10:07:01.011 - INFO - Matched 48500 of 65233 74.35 % entries.
10:07:01.011 - INFO - Accuracy is 89.34 %
10:10:07.011 - INFO - Matched 48600 of 65233 74.50 % entries.
10:10:07.011 - INFO - Accuracy is 89.33 %
10:13:07.011 - INFO - Matched 48700 of 65233 74.66 % entries.
10:13:07.011 - INFO - Accuracy is 89.31 %
10:16:17.011 - INFO - Matched 48800 of 65233 74.81 % entries.
10:16:17.011 - INFO - Accuracy is 89.28 %
10:19:40.011 - INFO - Matched 48900 of 65233 74.96 % entries.
10:19:40.011 - INFO - Accuracy is 89.27 %
10:22:51.011 - INFO - Matched 49000 of 65233 75.12 % entries.
10:22:51.011 - INFO - Accuracy is 89.25 %
10:25:50.011 - INFO - Matched 49100 of 65233 75.27 % entries.
10:25:50.011 - INFO - Accuracy is 89.23 %
10:29:13.011 - INFO - Matched 49200 of 65233 75.42 % entries.
10

14:56:31.011 - INFO - Matched 56600 of 65233 86.77 % entries.
14:56:31.011 - INFO - Accuracy is 87.78 %
15:00:23.011 - INFO - Matched 56700 of 65233 86.92 % entries.
15:00:23.011 - INFO - Accuracy is 87.75 %
15:04:12.011 - INFO - Matched 56800 of 65233 87.07 % entries.
15:04:13.011 - INFO - Accuracy is 87.73 %
15:07:59.011 - INFO - Matched 56900 of 65233 87.23 % entries.
15:07:59.011 - INFO - Accuracy is 87.74 %
15:11:57.011 - INFO - Matched 57000 of 65233 87.38 % entries.
15:11:57.011 - INFO - Accuracy is 87.73 %
15:15:48.011 - INFO - Matched 57100 of 65233 87.53 % entries.
15:15:48.011 - INFO - Accuracy is 87.72 %
15:19:38.011 - INFO - Matched 57200 of 65233 87.69 % entries.
15:19:38.011 - INFO - Accuracy is 87.71 %
15:23:29.011 - INFO - Matched 57300 of 65233 87.84 % entries.
15:23:29.011 - INFO - Accuracy is 87.70 %
15:27:30.011 - INFO - Matched 57400 of 65233 87.99 % entries.
15:27:30.011 - INFO - Accuracy is 87.68 %
15:31:27.011 - INFO - Matched 57500 of 65233 88.15 % entries.
15

20:46:20.011 - INFO - Matched 64900 of 65233 99.49 % entries.
20:46:20.011 - INFO - Accuracy is 86.85 %
20:50:43.011 - INFO - Matched 65000 of 65233 99.64 % entries.
20:50:43.011 - INFO - Accuracy is 86.85 %
20:55:26.011 - INFO - Matched 65100 of 65233 99.80 % entries.
20:55:26.011 - INFO - Accuracy is 86.84 %
20:59:58.011 - INFO - Matched 65200 of 65233 99.95 % entries.
20:59:58.011 - INFO - Accuracy is 86.82 %
21:02:42.011 - INFO - 1507 out of 65233 (2.31 %) were too ambiguous to find a valid match.
21:02:42.011 - INFO - Final result:
21:02:42.011 - INFO - Accuracy is 86.82 %
21:04:18.011 - INFO - Processing the evaluation data set took 1 day, 8:33:40.944316 time.
CPU times: user 6h 41min 59s, sys: 3min 20s, total: 6h 45min 20s
Wall time: 1d 8h 33min 40s


In [11]:
# Print every entry of the experiment's stats dict as a "key: value" line.
# Values go through human_readable_number (defined in the init cell): numbers above
# 1e4 are abbreviated (k / m / bn) and values within [-1, 1] are shown as percentages.
# NOTE(review): `experiment` is created in an earlier cell - run that cell first.
pp(experiment.stats())

# golden samples: 70.85 k
# evaluation samples: 65.23 k
accuracy: 86.8 %
recall: 97.8 %
precision: 83.4 %
f-score: 90.0 %
number of too ambiguous matches: 1507


In [12]:
# Display the experiment's confusion matrix as the cell result
# (rows: found_a_match / found_no_match, columns: was_a_match / was_no_match).
experiment.confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,40564,8071
found_no_match,902,18519


In [13]:
# Re-check a run by name without re-running it: the helper from the init cell builds a
# fresh ERSExperiment, calls load_results() with this name, prints the stats via pp()
# and returns the confusion matrix, which is displayed as the cell result.
check_results_of_experiment('standard-crm-model-on-kantwert')

# golden samples: 70.85 k
# evaluation samples: 65.23 k
accuracy: 86.8 %
recall: 97.8 %
precision: 83.4 %
f-score: 90.0 %
number of too ambiguous matches: 1507


Unnamed: 0,was_a_match,was_no_match
found_a_match,40564,8071
found_no_match,902,18519
