In [1]:
import os
import logging
from pathlib import Path
from abc import ABC, abstractmethod
import pandas as pd
import enforce
import dedupe
import pickle
from nltk.corpus import stopwords

INFO:root:Generating grammar tables from /usr/lib/python3.6/lib2to3/Grammar.txt
INFO:root:Generating grammar tables from /usr/lib/python3.6/lib2to3/PatternGrammar.txt


In [2]:
class LegalEntityMatchingModel(ABC):
    """Base class of all models that match entities (i.e. companies and persons of the entity knowledge graph).

    It maintains a Pandas DataFrame with all known entities: `self.known_entities`.
    Every row holds one entity dict as it was passed to `add_entity`; the
    `ers_id` column is used as the identifier of an entity.
    """

    def __init__(self):
        super(LegalEntityMatchingModel, self).__init__()
        self.known_entities = pd.DataFrame()

    def _rows_with_ers_id(self, ers_id):
        """Return a boolean Series marking the rows of `known_entities` that carry `ers_id`.

        A freshly created frame has no `ers_id` column yet; in that case an
        all-False mask is returned instead of failing on the missing column.
        """
        if 'ers_id' not in self.known_entities.columns:
            return pd.Series(False, index=self.known_entities.index, dtype=bool)
        return self.known_entities['ers_id'] == ers_id

    @enforce.runtime_validation
    def add_entity(self, entity: dict):
        """Append `entity` as a new row.

        Raises:
            AssertionError: if an entity with the same `ers_id` is already known.
            KeyError: if `entity` has no `ers_id` key.
        """
        assert not self._rows_with_ers_id(entity['ers_id']).any(), f'Cannot add entity with ERS-Id {entity["ers_id"]} a second time.'
        new_row = pd.DataFrame([entity])
        if len(self.known_entities) == 0:
            self.known_entities = new_row
        else:
            # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
            self.known_entities = pd.concat([self.known_entities, new_row], ignore_index=True)
        logging.debug(f"{entity['ers_id']} was added to known entities (# {len(self.known_entities)})")

    @enforce.runtime_validation
    def update_entity(self, entity: dict):
        """Replace the stored row of `entity['ers_id']` by `entity`.

        Nothing is changed when the stored row already equals `entity`. An entity
        without `ers_id` is only reported via a warning. An entity whose id is not
        known yet is added (with a warning) instead of raising an IndexError.
        """
        ers_id = entity.get('ers_id')
        if ers_id is None:
            logging.warning(f'Cant update the entity {entity}. No ERS-Id is available for it although it should be. Check the calling code again!.')
            return

        matching_rows = self._rows_with_ers_id(ers_id)
        n_matching = int(matching_rows.sum())
        if n_matching == 0:
            logging.warning(f"Requested an update for {ers_id} which is not a known entity. Adding it instead.")
            self.add_entity(entity)
            return
        if n_matching > 1:
            logging.warning(f"Found more than one entry in 'known_entities' for {entity['ers_id']}. Might be a problem.")
            logging.warning(f"{matching_rows}")

        # 'records' is the documented spelling; the abbreviation 'record' is rejected by newer pandas.
        old = self.known_entities[matching_rows].to_dict(orient='records')[0]
        # Symmetric difference of the (key, value) pairs is empty iff both dicts are equal.
        if len(entity.items() ^ old.items()) == 0:
            logging.debug(f"Although requested nothing to update for {entity['ers_id']}.")
            return
        self.remove_entity(ers_id)
        self.add_entity(entity)

    @enforce.runtime_validation
    def remove_entity(self, ers_id: str):
        """Drop every row whose `ers_id` equals the given id (no error if there is none)."""
        self.known_entities = self.known_entities[~self._rows_with_ers_id(ers_id)]
        logging.debug(f"{ers_id} was removed from known entities (# {len(self.known_entities)})")

    def remove_all_entities(self):
        """Forget all known entities."""
        self.known_entities = pd.DataFrame()

    @abstractmethod
    @enforce.runtime_validation
    def match_entity(self, entity: dict) -> []:
        """This method is the essential part of the whole class and should return a list of matches
        for the given entity. The entity will be a dict containing all data fields known about an entity.
        The method has to return a (maybe empty) list of (str, float) tuples where the first element is the
        ERS-Id and the second the confidence in the match (aka similarity to the found matching entry).
        The entries in the list have to be ordered by descending confidence: the first entry should be
        the best match.
        """
        pass

    @abstractmethod
    @enforce.runtime_validation
    def get_infos(self) -> str:
        """Should return some string that captures how the model is actually configured (e.g. the
        filename of the trained dedupe model). The output will be added to the info endpoint of the
        ERS to log all the details about the whole setup of the service. This is e.g. requested
        by the ER backtester when an experiment is executed.
        """
        pass


In [3]:
def jaro_similarity(s1, s2):
    """Return the Jaro similarity of the two sequences `s1` and `s2`.

    The result is 0 when the sequences share no matching character and 1 for
    identical, non-empty sequences. Characters match when they are equal and
    their positions differ by at most `max(len(s1), len(s2)) // 2 - 1`.
    """
    len_s1, len_s2 = len(s1), len(s2)
    if len_s1 == 0 or len_s2 == 0:
        return 0.0

    # Maximum index distance for two equal characters to count as a match.
    # Clamped at 0: for length-1 inputs the raw formula yields -1, which made
    # even identical single-character strings score 0.
    match_bound = max(0, max(len_s1, len_s2) // 2 - 1)

    s2_taken = [False] * len_s2  # positions of s2 already matched (O(1) lookup)
    matched_in_s1 = []           # positions of s1 that found a partner in s2

    for i, char in enumerate(s1):
        lowerbound = max(0, i - match_bound)
        upperbound = min(i + match_bound, len_s2 - 1)
        for j in range(lowerbound, upperbound + 1):
            if not s2_taken[j] and char == s2[j]:
                s2_taken[j] = True
                matched_in_s1.append(i)
                break

    matches = len(matched_in_s1)
    if matches == 0:
        return 0.0

    # Compare the matched characters in their order of appearance; every
    # position where they differ is half a transposition.
    matched_in_s2 = [j for j, taken in enumerate(s2_taken) if taken]
    transpositions = sum(1 for i, j in zip(matched_in_s1, matched_in_s2) if s1[i] != s2[j])

    return (
        1
        / 3
        * (
            matches / len_s1
            + matches / len_s2
            + (matches - transpositions // 2) / matches
        )
    )
def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4):
    """Return the Jaro-Winkler similarity of `s1` and `s2`.

    The Jaro similarity is boosted for strings sharing a common prefix:
    `jaro + l * p * (1 - jaro)` where `l` is the length of the common prefix,
    counted up to `max_l` characters, and `p` is the weight per prefix character.

    A warning is emitted when `max_l * p` lies outside [0, 1] because the
    result may then leave the [0, 1] range.
    """
    # Imported locally: the notebook's import cell does not import `warnings`,
    # which made this guard raise a NameError instead of warning.
    import warnings

    if not 0 <= max_l * p <= 1:
        warnings.warn(
            str(
                "The product  `max_l * p` might not fall between [0,1]."
                "Jaro-Winkler similarity might not be between 0 and 1."
            )
        )

    jaro_sim = jaro_similarity(s1, s2)

    # Length of the common prefix, capped at max_l.
    # zip() stops at the end of the shorter string.
    l = 0
    for s1_i, s2_i in zip(s1, s2):
        if s1_i != s2_i:
            break
        l += 1
        if l == max_l:
            break

    return jaro_sim + (l * p * (1 - jaro_sim))

def city_similarity(companyA, companyB):
    """Return 1 if both companies have the same city (first 6 characters, case-insensitive), else 0.

    Missing cities (None/NaN) are compared by their string representation, the
    same way `street_similarity` treats missing streets; previously a missing
    city raised an AttributeError on `.lower()`.
    NOTE(review): as a consequence two missing cities compare as equal - confirm
    that this is the intended feature value.
    """
    def _city_key(company):
        city = company['address.city']
        if pd.isnull(city):
            return str(city).lower()
        return str(city)[:6].lower()

    return 1 if _city_key(companyA) == _city_key(companyB) else 0
def street_similarity(companyA, companyB):
    """Return 1 if the streets of both companies agree on their first 6 characters
    (case-insensitive), else 0. Missing values are compared via their string form.
    """
    def _street_key(company):
        street = company['address.street']
        if pd.isnull(street) is not True:
            street = street[:6]
        return str(street).lower()

    key_a = _street_key(companyA)
    key_b = _street_key(companyB)
    return 1 if key_a == key_b else 0
def phone_similarity(companyA, companyB):
    """Return 1 if both phone numbers share their first 7 characters, else 0.

    Missing values are compared as they are, so NaN never equals NaN while
    None equals None.
    """
    def _phone_prefix(company):
        number = company['phone_number']
        return number if pd.isnull(number) is True else number[:7]

    prefix_a = _phone_prefix(companyA)
    prefix_b = _phone_prefix(companyB)
    return 1 if prefix_a == prefix_b else 0
    
def name_similarity(companyA, companyB):
    """Return the Jaro-Winkler similarity of the `name` fields of both companies."""
    return jaro_winkler_similarity(companyA['name'], companyB['name'])

def postcode_similarity(companyA, companyB):
    """Return 1 if the `address.postal_code` values of both companies are equal, else 0."""
    same_code = companyA['address.postal_code'] == companyB['address.postal_code']
    return 1 if same_code else 0
    
def legalform_similarity(companyA, companyB):
    """Return 1 if the `legal_form` values of both companies are equal when
    compared as lower-cased strings, else 0.
    """
    form_a = str(companyA['legal_form']).lower()
    form_b = str(companyB['legal_form']).lower()
    return 1 if form_a == form_b else 0
    
def pairwise_similarity(entity_ref: dict, entity_eval: dict) -> dict:
    """Compute the per-field similarity features of two entities.

    Returns a dict with one entry per compared field: `name` (Jaro-Winkler
    similarity) and `legal_form`, `address.postal_code`, `phone_number`,
    `address.street` (each 0 or 1).

    The return annotation used to be `[]` although a dict is returned.
    NOTE(review): the insertion order of the keys becomes the column order of
    the feature frame handed to the model - presumably it has to match the
    order used at training time, so do not reorder.
    """
    return {
        'name': name_similarity(entity_ref, entity_eval),
        'legal_form': legalform_similarity(entity_ref, entity_eval),
        'address.postal_code': postcode_similarity(entity_ref, entity_eval),
        'phone_number': phone_similarity(entity_ref, entity_eval),
        'address.street': street_similarity(entity_ref, entity_eval),
    }

In [4]:
import re

# Pre-compiled patterns, one per legal form, used by legalFormDetectorRaw to recognise
# the different spellings (abbreviated, dotted, written out, upper/lower case) of a
# legal form inside a company name.
# NOTE(review): some patterns contain an empty alternative, e.g. `(|,|\.|^)` in rgxEV
# and rgxGBR or `(|,|\.|$)` in rgxEG. This looks like a missing space, i.e. `( |,|\.|^)`
# may have been intended - confirm before changing.
# NOTE(review): the first alternative of rgxKG also accepts a trailing `aA`, so it
# matches " KGaA" as well; callers must test rgxKGAA before rgxKG.

# "B.V." / "Besloten Vennootschap (met beperkte aansprakelijkheid)"
rgxBV  = re.compile(r"( B\.?V\.?( | ?&|,|-|$))|((B|b)(esloten|ESLOTEN)(V|v)((ennootschap|ENNOOTSCHAP) (MET|met) (beperkte|BEPERKTE) (A|a)(ansprakelijkheid|ANSPRAKELIJKHEID))?)")
# "e.V." / "eingetragener Verein"
rgxEV  = re.compile(r"( |,|\.|^)(E|e)\.?(V|v)( |,|\.|$)|(|,|\.|^)(E|e)(INGETRAGENER|ingetragener) (V|v)(EREIN|erein)")
# "V.a.G."
rgxVAG = re.compile(r"( |,|\.|^)(V|v)\.?(A|a)\.?(G)( |,|\.|$)")
# "e.G." / "eG" / "(eingetragene) Genossenschaft"
rgxEG  = re.compile(r"(G|g)(ENOSSENSCHAFT|genossenschaft)|( |,|\.|^)(E|e)(INGETRAGENE|ingetragene) (G|g)(EN|en)\.?(OSSENSCHAFT|ossenschaft)?|( |,|\.|^)(E|e)\.(G|g)(|,|\.|$)|eG( |,|\.|$)")
# "GbR" / "... bürgerlichen Rechts" / "Inh." ("Inhaber", "Inhaberin")
rgxGBR = re.compile(r"(G|g)?(E|e)?(S|s)?\.?(ELL|ell)?\.?(SCH|sch)?\.?(AFT|aft)?(B|b)(Ü|ü|UE|ue)(RGERLICHEN|rgerlichen) (R|r)(ECHTS|echts)|(|,|\.|^)(Gbr|GbR|GBR)( |,|$)|(|,|\.)(Inh\.?(aber)?(in)?|INH\.?(ABER)?(IN)?)")
# The connecting "& Co." / "+ Co" / "und Co" / "u. Co" / "& Cie" part of combined forms
rgxUCO = re.compile(r"(&|\+| UND | und | (U|u)(\.| )) ?(C|c)(O|o|IE|Ie|ie)( |,|-|\.|$|\)?)")
# "AG" / "Aktiengesellschaft"
rgxAG  = re.compile(r"(( |-)A(G|g)( | ?&|,|-|$))|A(C|c|K|k)(TIEN|tien)-?(GES|ges)\.?(ELLSCHAFT|ellschaft|ELLSCH|ellsch)?\.?")
# "SE" / "Societas Europ..."
rgxSE  = re.compile(r"(( |-)SE( | ?&|,|-|$))|S(OCIETAS|ocietas) ?(E|e)(UROP|urop)")
# "UG" / "Unternehmergesellschaft", optionally followed by "(haftungsbeschränkt)"
rgxUG  = re.compile(r"((( |-)U((G|G-)|g)( |,|$))|(U(NTERN|ntern)\.?-?(EHMER|ehmer)? ?(GES|ges)\.?(ELLSCHAFT|ellschaft|ELLSCH|ellsch)?\.?))(\(?haftungsbeschr\.?(ä|ae)?(nkt)?\)?)?")
# "Ltd" / "UK Limited"
rgxLTD = re.compile(r"(U(K|k))?( |,|\.|^)(LTD|Ltd)(-| |,|\.|$)|UK (Limited|LIMITED)")
# "GmbH" / "Gesellschaft mit beschränkter Haftung" and abbreviated variants
rgxGMBH = re.compile(r"(G|g)?(E|e)?(S|s)?\.?(ELL|ell)?\.?(SCH|sch)?\.?(AFT|aft)? ?(M|m)\.?(IT|it)? ?(B|b)(ESCHR|eschr)?\.?(Ä|ä)?(AE?|ae?)?(NKTER|nkter)? ?(H|h)( |,|\.|$|AFTUNG|aftung)")
# "KG" / "Kommanditgesellschaft" and abbreviated variants
rgxKG = re.compile(r"(( |-)(K|k)\.?(G|g)( |&|,|-|\.|$|aA))|K(O|o)(M|m)(M|m)(A|a)(N|n)(D|d)(I|i)(T|t)\.?(G|g)?\.?(E|e)?(S|s)?\.?(E|e)?(L|l)?(L|l)?\.?(S|s)?(C|c)?(H|h)?\.?(A|a)?(F|f)?(T|t)?")
# "OHG" / "Offene Handelsgesellschaft" and abbreviated variants
rgxOHG = re.compile(r"( O\.?H\.?G\.?)( |,|$)|(O|o)(FF|ff)\.? ?(ENE|ene)?\.? ?(H|h)(ANDELS|andels)( |-)? ?(G|g)\.?(E|e)(S|s)\.?(E|e)?(L|l)?(L|l)?\.?(S|s)?(C|c)?(H|h)?\.?(A|a)?(F|f)?(T|t)?")
# "KGaA" / "Kommandit... auf Aktien"
rgxKGAA = re.compile(r"(( |-)(K|k)(G|g)(a|A)A)|K(O|o)(M|m)(M|m)(A|a)(N|n)(D|d)(I|i)(T|t).*((A|a)uf A(ktien|KTIEN))")

def legalFormDetector(s):
    """Detect the legal form of a company from the string `s`.

    The detection applies a set of regular expressions covering different
    spellings of legal forms, mainly those relevant for the DACH region.

    Returns: a tuple (detected legal form, normalized company name). In the
        normalized name the legal form is replaced by its canonical spelling,
        runs of spaces are collapsed and surrounding whitespace is removed.
    """
    detected_form, raw_name = legalFormDetectorRaw(s)
    single_spaced = re.sub(' +', ' ', raw_name)
    return detected_form, single_spaced.strip()

def legalFormDetectorRaw(s):
    """Detect the legal form contained in `s` without cleaning up whitespace.

    Combined forms of the shape "<partner> & Co. <partnership>" (e.g.
    "GmbH & Co. KG") are tried first, then the simple forms. The first hit wins.

    Returns: a tuple (canonical legal form, `s` with every recognised spelling
        replaced by the canonical one). ("NA", s) if no legal form is found.
    """
    # Corporate forms that can act as the partner in an "<X> & Co. <Y>" construct.
    partner_forms = (
        (rgxAG, "AG"),
        (rgxBV, "BV"),
        (rgxGMBH, "GmbH"),
        (rgxLTD, "Ltd."),
        (rgxSE, "SE"),
        (rgxUG, "UG"),
    )
    # KGaA has to be tested before KG: rgxKG also matches " KGaA" (and the
    # written-out "Kommandit..."), which used to turn every
    # "... & Co. KGaA" into "... & Co. KG".
    partnership_forms = (
        (rgxKGAA, "KGaA"),
        (rgxKG, "KG"),
        (rgxOHG, "OHG"),
    )
    # Simple forms in order of precedence (again KGaA before KG).
    simple_forms = (
        (rgxGMBH, "GmbH"),
        (rgxLTD, "Ltd."),
        (rgxBV, "BV"),
        (rgxUG, "UG"),
        (rgxEG, "e.G."),
        (rgxKGAA, "KGaA"),
        (rgxKG, "KG"),
        (rgxOHG, "OHG"),
        (rgxAG, "AG"),
        (rgxSE, "SE"),
        (rgxEV, "e.V."),
        (rgxGBR, "GbR"),
        (rgxVAG, "V.a.G."),
    )

    # Every combined form needs the "& Co." connector, so test it only once.
    if rgxUCO.search(s):
        for partner_rgx, partner in partner_forms:
            if not partner_rgx.search(s):
                continue
            for partnership_rgx, partnership in partnership_forms:
                if partnership_rgx.search(s):
                    # Same substitution order as before: partnership, connector, partner.
                    # The replacements are now spelled consistently with the returned form
                    # (previously "KgaA", " LTD " and "&  Co. " slipped in).
                    name = partnership_rgx.sub(partnership, s)
                    name = rgxUCO.sub(" & Co. ", name)
                    name = partner_rgx.sub(f" {partner} ", name)
                    return f"{partner} & Co. {partnership}", name

    for form_rgx, form in simple_forms:
        if form_rgx.search(s):
            return form, form_rgx.sub(f" {form} ", s)

    return "NA", s

In [5]:
# German stop words, loaded from the NLTK corpus on first use only.
_GERMAN_STOPWORDS = None


def _german_stopwords():
    """Return the German NLTK stop words, reading the corpus only once."""
    global _GERMAN_STOPWORDS
    if _GERMAN_STOPWORDS is None:
        _GERMAN_STOPWORDS = tuple(stopwords.words('german'))
    return _GERMAN_STOPWORDS


def sanitize(s):
    """Normalize a free-text field for comparison.

    The value is converted to a lower-case string, the fragments "http",
    "https" and "www" as well as the punctuation characters
    ``! : " # . % & @ ( , - / ) +`` are removed, German stop words standing
    between two spaces are dropped and surrounding whitespace is stripped.

    `None` is returned unchanged; every other value (including NaN) is
    stringified first.
    """
    if s is None:
        return s

    # Lower-case first so that "HTTP"/"WWW" are removed like their lower-case forms.
    processed = str(s).lower()
    # "https?" removes the scheme completely; stripping "http" first left a stray "s" behind.
    processed = re.sub(r'https?', '', processed)
    processed = re.sub(r'www', '', processed)
    processed = re.sub(r'[!:"#.%&@(,\-/)+]', '', processed)

    for word in _german_stopwords():
        processed = processed.replace(" " + word + " ", " ")
    return processed.strip()
def sanitize_entity(entity):
    """Normalize the name, legal form, street and phone number of `entity` in place.

    The legal form spelling inside the name is canonicalized, `legal_form` is
    replaced by the form detected in its own text (if any), and `name`,
    `address.street` and `phone_number` are passed through `sanitize`.
    Returns the same (mutated) dict.
    """
    _, normalized_name = legalFormDetector(entity['name'])
    entity['name'] = normalized_name

    detected_form, _ = legalFormDetector(str(entity['legal_form']))
    if detected_form != 'NA':
        entity['legal_form'] = detected_form

    for field in ('name', 'address.street', 'phone_number'):
        entity[field] = sanitize(entity[field])
    return entity

In [6]:
class BundesbankModel(LegalEntityMatchingModel):
    """Entity matcher that scores candidate pairs with a pickled classifier.

    Unlike the base class, `self.known_entities` is a dict here: it maps the
    lower-cased first character of an entity's sanitized name to the list of
    entity dicts starting with that character. Only entities from the same
    bucket are compared when matching.

    NOTE(review): the inherited `update_entity` accesses `known_entities` as a
    DataFrame and therefore does not work with this dict-based index.
    """

    def __init__(self):
        super(BundesbankModel, self).__init__()

        model_name = os.environ.get('DEDUPE_MODEL', 'randomforest_bestrandom.sav')
        candidates = list(Path.cwd().glob(f"**/{model_name}"))
        assert len(candidates) == 1, f'Dont know what file to load as dedupe model. Expected exactly one file named {model_name!r} below {Path.cwd()} but found {len(candidates)}: {candidates}.'
        model_filepath = candidates[0]

        # NOTE(security): pickle.load executes arbitrary code contained in the file;
        # only load model files from a trusted source.
        with open(model_filepath, 'rb') as model_file:
            # Read from the handle managed by `with`; a second open() leaked a file handle.
            self.dedupe_model = pickle.load(model_file)

        logging.info(f'Loaded bundesbank model {model_filepath} successfully.')

        self.infos = f'dedupe_model_filepath={model_filepath}'
        self.known_entities = {}

    @staticmethod
    def _bucket_key(entity: dict) -> str:
        """Return the index key of `entity`: the lower-cased first character of its name ('' if the name is empty)."""
        name = entity['name']
        return name[0].lower() if name else ''

    def _count_entities(self) -> int:
        """Return the total number of known entities over all buckets."""
        return sum(len(bucket) for bucket in self.known_entities.values())

    def _pair_features(self, candidates, entity):
        """Build one feature row per candidate (candidate vs. `entity`) as a DataFrame."""
        return pd.DataFrame([pairwise_similarity(base, entity) for base in candidates])

    def transform_entity_dict_to_sanitized_dict(self, entity):
        """Return a sanitized copy of `entity` with its keys renamed via DEDUPE_2_ERS_DATAFIELD_MAPPING.

        The input dict is left untouched (it used to be modified in place, so
        adding the same source records a second time failed with a KeyError).

        Raises:
            KeyError: if `entity` lacks one of the source fields of the mapping.
        """
        entity = dict(entity)
        for source_field, target_field in DEDUPE_2_ERS_DATAFIELD_MAPPING.items():
            entity[target_field] = entity.pop(source_field)
        return sanitize_entity(entity)

    def index(self, entity: dict):
        """Store the (already sanitized) `entity` in the bucket of its name's first character."""
        self.known_entities.setdefault(self._bucket_key(entity), []).append(entity)

    def add_entity(self, entity: dict):
        """Sanitize `entity` and add it to the index.

        Raises:
            AssertionError: if the bucket of the entity already contains its `ers_id`.
        """
        entity = self.transform_entity_dict_to_sanitized_dict(entity)
        bucket = self.known_entities.get(self._bucket_key(entity), [])
        assert not any(known['ers_id'] == entity['ers_id'] for known in bucket), f'Cannot add entity with ERS-Id {entity["ers_id"]} a second time.'
        self.index(entity)
        logging.debug(f"{entity['ers_id']} was added to known entities (# {self._count_entities()})")

    def remove_entity(self, ers_id: str):
        """Remove the entity with the given `ers_id` from the index (no error if it is unknown).

        The previous implementation referenced an undefined `entity`, the notebook
        global `D4_evaluation` and a hard-coded id, so it could not work.
        """
        for key, bucket in list(self.known_entities.items()):
            remaining = [known for known in bucket if known.get('ers_id') != ers_id]
            if len(remaining) == len(bucket):
                continue
            if remaining:
                self.known_entities[key] = remaining
            else:
                del self.known_entities[key]
            logging.debug(f"{ers_id} was removed from known entities (# {self._count_entities()})")
            return
        logging.debug(f"{ers_id} is not a known entity. Nothing was removed.")

    def remove_all_entities(self):
        """Forget all known entities."""
        # Do not delegate to the base class: it would replace the dict index by a DataFrame.
        self.known_entities = {}

    def match(self, entity: dict, threshold, n_matches):
        """Return up to `n_matches` `[known_entity, probability]` pairs whose match
        probability exceeds `threshold`, best match first.

        `entity` has to be sanitized already. The probability is column 1 of
        the model's `predict_proba` output.
        """
        candidates = self.known_entities.get(self._bucket_key(entity), [])
        if not candidates:
            return []
        # One model call for the whole bucket instead of one call per candidate.
        # NOTE(review): assumes the model scores every row independently
        # (presumably a scikit-learn estimator) - confirm.
        probabilities = self.dedupe_model.predict_proba(self._pair_features(candidates, entity))
        matches = [[base, row[1]] for base, row in zip(candidates, probabilities) if row[1] > threshold]
        matches.sort(key=lambda item: float(item[1]), reverse=True)
        return matches[:n_matches]

    def match2(self, entity: dict, threshold, n_matches):
        """Return up to `n_matches` one-element lists `[known_entity]` for which the
        model predicts class 1.

        `threshold` is accepted for signature compatibility with `match` but is not used.
        """
        candidates = self.known_entities.get(self._bucket_key(entity), [])
        if not candidates:
            return []
        predictions = self.dedupe_model.predict(self._pair_features(candidates, entity))
        matches = [[base] for base, label in zip(candidates, predictions) if label == 1]
        return matches[:n_matches]

    def match_entity(self, entity: dict) -> list:
        """Return the matches for `entity` as a list of `(known_entity_dict, 1)` tuples.

        At most one match is returned (the first known entity the model
        classifies as a match); the confidence is always reported as 1. An empty
        list means that no match was found.
        """
        similarities = []
        try:
            entity = entity.copy()
            entity['ers_id'] = 'unknown'
            dedupe_dict = self.transform_entity_dict_to_sanitized_dict(entity)
            logging.debug(f'Calling dedupe model with this information: {dedupe_dict}')
            # The threshold is ignored by match2; n_matches=1 keeps only the first hit.
            matches = self.match2(dedupe_dict, threshold=0.85, n_matches=1)
            # DO NOT log `matches` in the productive service: it will break the Jenkins release pipeline for UNKNOWN reasons!!!
            similarities = [(match[0], 1) for match in matches]
        # dedupe strangely and only sometimes indicates that no match could be found with a ValueError:
        except ValueError as error:
            logging.debug(f'Matching raised a ValueError which is treated as "no match": {error}')
        return similarities

    def get_infos(self):
        """Return the description of the loaded model (its file path)."""
        return self.infos


In [7]:
# Create the matcher; the constructor searches the working directory tree for the
# model file and unpickles it (see BundesbankModel.__init__).
bundesbank = BundesbankModel()

INFO:root:Loaded bundesbank model /home/datascientist/host/src/notebooks/randomforest_bestrandom.sav successfully.


In [8]:
# Load the golden data (later added to the matcher as known entities) and the
# evaluation data (later matched against them) from $HOME/host/data/processed.
# NOTE(review): os.environ.get('HOME') returns None when HOME is unset, which makes
# Path(...) raise a TypeError. pd.read_pickle must only be used on trusted files.
base_path_to_data = Path(os.environ.get('HOME')) / 'host' / 'data' / 'processed'
D4_golden=pd.read_pickle(base_path_to_data / '2018-07-03-small_set_with_limited_address_and_manipulated_information_golden_data.pkl')
D4_evaluation=pd.read_pickle(base_path_to_data / '2018-07-03-small_set_with_limited_address_and_manipulated_information_evaluation_data.pkl')

In [9]:
# Convert both data sets into lists with one dict per row.
# NOTE(review): the names are rebound to a different type, so this cell fails when
# it is run a second time without reloading the data first.
D4_golden = D4_golden.to_dict(orient='records')
D4_evaluation = D4_evaluation.to_dict(orient='records')

In [10]:
# Maps the field names of the source records (keys) to the field names expected by
# the similarity functions (values). BundesbankModel reads this module-level name
# when an entity is added or matched, so it has to be defined before those calls.
DEDUPE_2_ERS_DATAFIELD_MAPPING = {'legalForm':'legal_form',
                                'registerNumber':'register_number', 
                                 'address.postalCode': 'address.postal_code', 
                                 'phoneNumber': "phone_number", 
                                 'ersId': 'ers_id'}

In [12]:
# Register every golden record as a known entity of the matcher.
for entity in D4_golden:
    bundesbank.add_entity(entity)

In [13]:
# Show one raw evaluation record (fields still carry the source names, e.g. 'ersId').
D4_evaluation[90]

{'ersId': '195d45a8-3b57-47c3-a620-689dbcc6d99e',
 'crmId': nan,
 'buergelId': nan,
 'name': 'Agb GmbH',
 'legalForm': nan,
 'email': 'info@abg-tagungszentrum.de',
 'phoneNumber': '+498416500',
 'website': 'http://www.abg-tagungszentrum.de',
 'vatID': 'DE129513052',
 'registerNumber': nan,
 'commercialRegister': nan,
 'address.street': nan,
 'address.postalCode': nan,
 'address.city': nan,
 'address.country': nan}

In [14]:
# Match that record: the result is a list of (known entity, confidence) tuples.
bundesbank.match_entity(D4_evaluation[90])

[({'crmId': nan,
   'buergelId': nan,
   'name': 'avg gmbh',
   'email': 'info@albers.de',
   'website': 'http://http//www.albers.de',
   'vatID': nan,
   'commercialRegister': nan,
   'address.street': 'nan',
   'address.city': nan,
   'address.country': nan,
   'legal_form': nan,
   'register_number': nan,
   'address.postal_code': nan,
   'phone_number': '49593184910',
   'ers_id': '7a12921a-3ac9-11e9-ae97-0242ac120003'},
  1)]

In [15]:
def process_evaluation_dataset(bundesbank, evaluation_dataset, verbose=True):
    """Match every record of `evaluation_dataset` and summarize the outcome.

    Args:
        bundesbank: object with a `match_entity(record)` method returning a list
            of `(known_entity_dict, confidence)` tuples.
        evaluation_dataset: iterable of record dicts; `record['ersId']` holds the
            id of the true match, or a null value if the record has no match.
        verbose: print the progress and every false negative / false positive
            (the previous, unconditional behaviour). Pass False to silence it.

    Returns:
        (confusion_matrix, result): a 2x2 DataFrame (rows: found_a_match /
        found_no_match, columns: was_a_match / was_no_match) and a dict with
        accuracy, recall, precision, selectivity and f-score. A metric whose
        denominator is zero is NaN.

    Every returned match is counted on its own: a match with the true id is a
    true positive, any other match a false positive (also when the record does
    have a true match).
    """
    def _ratio(numerator, denominator):
        # NaN instead of a numpy divide-by-zero warning when nothing was counted.
        return numerator / denominator if denominator else float('nan')

    true_positives = []
    true_negatives = []
    false_positives = []
    false_negatives = []

    for i, company in enumerate(evaluation_dataset):
        if verbose:
            print('---')
            print(i)
        true_ers_id = company['ersId']
        match_result = bundesbank.match_entity(company)

        if len(match_result) == 0:
            if pd.isnull(true_ers_id):
                true_negatives.append(i)
            else:
                if verbose:
                    print('FN')
                    print(true_ers_id)
                false_negatives.append(i)
        else:
            for match in match_result:
                if true_ers_id == match[0]['ers_id']:
                    true_positives.append(i)
                else:
                    if verbose:
                        print('FP')
                        print(true_ers_id)
                        print(match)
                    false_positives.append((i, match))
        if verbose:
            print(len(false_negatives))
            print(len(false_positives))

    n_tp, n_fp = len(true_positives), len(false_positives)
    n_fn, n_tn = len(false_negatives), len(true_negatives)

    confusion_matrix = pd.DataFrame([[n_tp, n_fp],
                                     [n_fn, n_tn]],
                                    columns=['was_a_match', 'was_no_match'],
                                    index=['found_a_match', 'found_no_match'])
    result = {}
    result['accuracy'] = _ratio(n_tp + n_tn, n_tp + n_fp + n_fn + n_tn)
    result['recall'] = _ratio(n_tp, n_tp + n_fn)
    result['precision'] = _ratio(n_tp, n_tp + n_fp)
    result['selectivity'] = _ratio(n_tn, n_fp + n_tn)
    result['f-score'] = _ratio(2 * result['precision'] * result['recall'],
                               result['precision'] + result['recall'])
    return confusion_matrix, result
    
    
    

In [18]:
# Evaluate the matcher on the whole evaluation data set.
# NOTE(review): the recorded output of this cell ends in a KeyboardInterrupt, i.e.
# this particular run did not finish and assigned neither name.
confusion_matrix, result  = process_evaluation_dataset(bundesbank, D4_evaluation)

---
0
0
0
---
1
FN
05e9cf2b-033d-4192-9365-ef6b08f737b0
1
0
---
2
1
0
---
3
FP
nan
({'crmId': nan, 'buergelId': nan, 'name': 'alpha consulting gmbh', 'email': nan, 'website': nan, 'vatID': nan, 'commercialRegister': nan, 'address.street': 'nan', 'address.city': nan, 'address.country': nan, 'legal_form': nan, 'register_number': nan, 'address.postal_code': nan, 'phone_number': '493716665840', 'ers_id': '79c95686-3ac9-11e9-ae97-0242ac120003'}, 1)
1
1
---
4
1
1
---
5
1
1
---
6
1
1
---
7
1
1
---
8
FN
c90c1ba3-5543-493b-86ab-667316234553
2
1
---
9
2
1
---
10
2
1
---
11
2
1
---
12
FP
nan
({'crmId': nan, 'buergelId': nan, 'name': 'agravis technik raiffeisen gmbh', 'email': 'agravis-technik-raiffeisen@agravis.de', 'website': 'http://http//www.agravis-technik-raiffeisen.de', 'vatID': nan, 'commercialRegister': nan, 'address.street': 'nan', 'address.city': nan, 'address.country': nan, 'legal_form': nan, 'register_number': nan, 'address.postal_code': nan, 'phone_number': '49555160080', 'ers_id': '

7
16
---
132
FP
d00b0464-d4fe-432a-9c01-2cf4327592e6
({'crmId': nan, 'buergelId': nan, 'name': 'apart gmbh', 'email': 'info@apart.de', 'website': 'http://http//www.apart.de', 'vatID': nan, 'commercialRegister': nan, 'address.street': 'nan', 'address.city': nan, 'address.country': nan, 'legal_form': nan, 'register_number': nan, 'address.postal_code': nan, 'phone_number': '49632797480', 'ers_id': '798ce05c-3ac9-11e9-ae97-0242ac120003'}, 1)
7
17
---
133
7
17
---
134
7
17
---
135
7
17
---
136
FP
nan
({'crmId': nan, 'buergelId': nan, 'name': 'aws systemtechnik gmbh', 'email': 'info@aws-systemtechnik.de', 'website': 'http://http//www.aws-systemtechnik.de', 'vatID': nan, 'commercialRegister': nan, 'address.street': 'nan', 'address.city': nan, 'address.country': nan, 'legal_form': nan, 'register_number': nan, 'address.postal_code': nan, 'phone_number': '498752865630', 'ers_id': '79d13c48-3ac9-11e9-ae97-0242ac120003'}, 1)
7
18
---
137
FP
nan
({'crmId': nan, 'buergelId': nan, 'name': 'abwasserge

20
29
---
300
20
29
---
301
20
29
---
302
20
29
---
303
20
29
---
304
20
29
---
305
20
29
---
306
20
29
---
307
20
29
---
308
20
29
---
309
20
29
---
310
20
29
---
311
20
29
---
312
20
29
---
313
20
29
---
314
FN
79aeba74-3ac9-11e9-ae97-0242ac120003
21
29
---
315
21
29
---
316
21
29
---
317
21
29
---
318
21
29
---
319
FP
7a08d31a-3ac9-11e9-ae97-0242ac120003
({'crmId': nan, 'buergelId': nan, 'name': 'abwassergesellschaft magdebur gmbh', 'email': 'info@agm-magdeburg.de', 'website': 'http://www.agm-magdeburg.de', 'vatID': 'DE231047913', 'commercialRegister': nan, 'address.street': 'nan', 'address.city': nan, 'address.country': nan, 'legal_form': nan, 'register_number': nan, 'address.postal_code': nan, 'phone_number': '493915870', 'ers_id': '7a19df02-3ac9-11e9-ae97-0242ac120003'}, 1)
21
30
---
320
21
30
---
321
21
30
---
322
21
30
---
323
21
30
---
324
21
30
---
325
21
30
---
326
21
30
---
327
21
30
---
328
21
30
---
329
21
30
---
330
21
30
---
331
21
30
---
332
21
30
---
333
21
30
---
334

KeyboardInterrupt: 

In [169]:
# NOTE(review): the execution count of this cell (169) is out of sequence with the
# surrounding cells, so the table shown does not stem from the interrupted run above.
confusion_matrix

Unnamed: 0,was_a_match,was_no_match
found_a_match,386,199
found_no_match,105,441


In [170]:
# NOTE(review): execution count 170 is out of sequence as well; re-run the
# evaluation to completion before relying on these metrics.
result

{'accuracy': 0.731211317418214,
 'recall': 0.7861507128309573,
 'precision': 0.6598290598290598,
 'selectivity': 0.6890625,
 'f-score': 0.7174721189591077}

## New evaluation dataset

In [19]:
# Start over with a fresh matcher (empty index) for the second data set.
bundesbank = BundesbankModel()

INFO:root:Loaded bundesbank model /home/datascientist/host/src/notebooks/randomforest_bestrandom.sav successfully.


In [20]:
# Load the second golden / evaluation data set from $HOME/host/data/ers_data.
# NOTE(review): this rebinds base_path_to_data, D4_golden and D4_evaluation from the
# first experiment; cells further up now see different data when they are re-run.
base_path_to_data = Path(os.environ.get('HOME')) / 'host' / 'data' / 'ers_data'
D4_golden=pd.read_pickle(base_path_to_data / 'golden_data.pkl')
D4_evaluation=pd.read_pickle(base_path_to_data / 'evaluation_data.pkl')

In [21]:
# Convert both data sets into lists with one dict per row (not re-runnable without
# reloading, because the names are rebound to lists).
D4_golden = D4_golden.to_dict(orient='records')
D4_evaluation = D4_evaluation.to_dict(orient='records')

In [22]:
# Field mapping for the second data set, whose records use 'abs_*' / 'record_id'
# field names. This replaces the mapping of the first experiment; BundesbankModel
# reads the module-level name at call time.
DEDUPE_2_ERS_DATAFIELD_MAPPING = {'abs_name':'name', 'abs_legal_form':'legal_form',
                                'abs_register_number':'register_number', 'abs_hq_email':'email', 'abs_website':'website',
                                'abs_hq_phone':'phone_number', 'abs_taxid':'vat_id',
                                'abs_hq_street':'address.street', 'abs_hq_zip_code':'address.postal_code',
                                'abs_hq_city':'address.city', 'abs_hq_country':'address.country',
                                'record_id':'ers_id'}

In [23]:
# Show the first golden record of the second data set.
D4_golden[0]

{'abs_hq_city': 'Düsseldorf',
 'abs_hq_country': 'DE',
 'abs_hq_email': None,
 'abs_hq_phone': '0211/866600',
 'abs_hq_street': 'Schadowplatz 18',
 'abs_hq_zip_code': '40212',
 'abs_legal_form': 'Einzelfirma',
 'abs_name': 'Schadow-Apotheke Pharmacie Internationale J. Müller- Behrendt e.K.',
 'abs_register_number': '767',
 'abs_taxid': None,
 'abs_website': None,
 'record_id': '7aa9c2e0-d3d8-11e4-b3ef-53b449010d08'}

In [24]:
# Register every golden record of the second data set as a known entity.
for entity in D4_golden:
    bundesbank.add_entity(entity)

In [30]:
# Number of golden records.
# NOTE(review): execution count 30 is higher than that of the following cell (25),
# i.e. the cells were not executed in document order.
len(D4_golden)

75212

In [25]:
def process_evaluation_dataset2(bundesbank, evaluation_dataset, verbose=True):
    """Match every evaluation record against the model and score the outcome.

    For each record ``bundesbank.match_entity`` is called and its result is
    compared with the ground-truth id stored under the record's
    ``'record_id'`` key:

    * nothing returned and the ground-truth id is null -> true negative
    * nothing returned and the ground-truth id is set  -> false negative
    * a returned match whose ``match[0]['ers_id']`` equals the id -> true positive
    * any other returned match -> false positive

    Counting happens per returned match, so a record for which several
    candidates are returned contributes several entries to the matrix.

    Args:
        bundesbank: Object exposing ``match_entity(company)``. Each returned
            match is indexed as ``match[0]['ers_id']``; ``None`` or any empty
            iterable is treated as "no match found".
        evaluation_dataset: Iterable of dict-like records, each with a
            ``'record_id'`` entry (null when the record has no true match).
        verbose: When true (the default) progress and every FN/FP are printed,
            one value per line. Pass ``False`` to keep the output quiet.

    Returns:
        Tuple ``(confusion_matrix, result)``. ``confusion_matrix`` is a 2x2
        DataFrame with rows ``found_a_match``/``found_no_match`` and columns
        ``was_a_match``/``was_no_match``. ``result`` is a dict with the keys
        ``accuracy``, ``recall``, ``precision``, ``selectivity`` and
        ``f-score``. Zero denominators are not guarded against here.
    """
    true_positives = []
    true_negatives = []
    false_positives = []
    false_negatives = []

    def _report(*values):
        # Print each value on its own line, but only in verbose mode.
        if verbose:
            for value in values:
                print(value)

    for i, company in enumerate(evaluation_dataset):
        _report('---', i)
        true_ers_id = company['record_id']
        match_result = bundesbank.match_entity(company)
        # Normalise the result so that None, [] and () all mean "no match".
        # A plain `== []` comparison would let None crash the loop below and
        # would leave an empty tuple uncounted.
        matches = [] if match_result is None else list(match_result)

        if not matches:
            if pd.isnull(true_ers_id):
                true_negatives.append(i)
            else:
                _report('FN', true_ers_id)
                false_negatives.append(i)
        else:
            for match in matches:
                if true_ers_id == match[0]['ers_id']:
                    true_positives.append(i)
                else:
                    _report('FP', true_ers_id, match)
                    false_positives.append((i, match))
        # Running totals of the two error types.
        _report(len(false_negatives), len(false_positives))

    confusion_matrix = pd.DataFrame([[len(true_positives), len(false_positives)],
                                     [len(false_negatives), len(true_negatives)]],
                                    columns=['was_a_match', 'was_no_match'],
                                    index=['found_a_match', 'found_no_match'])

    hits = confusion_matrix.loc['found_a_match', 'was_a_match']
    correct_rejections = confusion_matrix.loc['found_no_match', 'was_no_match']

    result = {}
    result['accuracy'] = (hits + correct_rejections) / confusion_matrix.sum().sum()
    result['recall'] = hits / confusion_matrix.was_a_match.sum()
    result['precision'] = hits / confusion_matrix.loc['found_a_match'].sum()
    result['selectivity'] = correct_rejections / confusion_matrix.was_no_match.sum()
    result['f-score'] = 2 * result['precision'] * result['recall'] / (result['precision'] + result['recall'])
    return confusion_matrix, result
    
    
    

In [26]:
# Inspect the evaluation records.
# NOTE(review): this echoes the whole list; consider a slice such as
# D4_evaluation[:3] to keep the notebook output small.
D4_evaluation

[{'abs_hq_city': 'Düsseldorf',
  'abs_hq_country': 'DE',
  'abs_hq_email': None,
  'abs_hq_phone': None,
  'abs_hq_street': 'Schadowpl. 18',
  'abs_hq_zip_code': '40212',
  'abs_legal_form': 'e.K.',
  'abs_name': 'Schadow-Apotheke Pharmacie Internationale J. Müller-Behrendt e.K.',
  'abs_register_number': None,
  'abs_taxid': None,
  'abs_website': 'http://www.schadowapotheke.de/',
  'record_id': '7aa9c2e0-d3d8-11e4-b3ef-53b449010d08'},
 {'abs_hq_city': 'Schemmerhofen',
  'abs_hq_country': 'DE',
  'abs_hq_email': None,
  'abs_hq_phone': None,
  'abs_hq_street': 'Ferdinand Dünkel Str. 5',
  'abs_hq_zip_code': '88433',
  'abs_legal_form': 'GmbH & Co. KG',
  'abs_name': 'A.I. Neue-Mitte-Salem-Gesellschaft mbH & Co. KG',
  'abs_register_number': None,
  'abs_taxid': None,
  'abs_website': None,
  'record_id': 'd7c0ba50-d3de-11e4-b3ef-53b449010d08'},
 {'abs_hq_city': 'München',
  'abs_hq_country': 'DE',
  'abs_hq_email': None,
  'abs_hq_phone': None,
  'abs_hq_street': 'Widenmayerstr. 36',


In [None]:
# Run the evaluation over all evaluation records; the function prints its
# progress for every record, which produces the long output below.
confusion_matrix, result  = process_evaluation_dataset2(bundesbank, D4_evaluation)

---
0
0
0
---
1
0
0
---
2
0
0
---
3
0
0
---
4
0
0
---
5
0
0
---
6
0
0
---
7
0
0
---
8
0
0
---
9
0
0
---
10
FN
90cefbc0-ee92-11e8-863a-4d2d59d73547
1
0
---
11
1
0
---
12
1
0
---
13
1
0
---
14
1
0
---
15
1
0
---
16
1
0
---
17
1
0
---
18
1
0
---
19
1
0
---
20
1
0
---
21
1
0
---
22
1
0
---
23
1
0
---
24
1
0
---
25
1
0
---
26
1
0
---
27
1
0
---
28
1
0
---
29
1
0
---
30
1
0
---
31
FN
be8a6bd0-d3ed-11e4-b3ef-53b449010d08
2
0
---
32
2
0
---
33
2
0
---
34
2
0
---
35
2
0
---
36
2
0
---
37
2
0
---
38
2
0
---
39
2
0
---
40
2
0
---
41
2
0
---
42
2
0
---
43
2
0
---
44
2
0
---
45
2
0
---
46
2
0
---
47
2
0
---
48
2
0
---
49
2
0
---
50
FN
88c25270-0358-11e5-83ff-53b449010d08
3
0
---
51
FN
b2795480-dd22-11e4-b967-53b449010d08
4
0
---
52
4
0
---
53
4
0
---
54
4
0
---
55
4
0
---
56
4
0
---
57
4
0
---
58
4
0
---
59
4
0
---
60
4
0
---
61
4
0
---
62
4
0
---
63
4
0
---
64
4
0
---
65
4
0
---
66
4
0
---
67
4
0
---
68
4
0
---
69
4
0
---
70
4
0
---
71
4
0
---
72
4
0
---
73
4
0
---
74
4
0
---
75
4
0
---
76
4
0
---

22
0
---
587
22
0
---
588
22
0
---
589
22
0
---
590
22
0
---
591
22
0
---
592
22
0
---
593
22
0
---
594
22
0
---
595
22
0
---
596
22
0
---
597
22
0
---
598
22
0
---
599
22
0
---
600
22
0
---
601
22
0
---
602
22
0
---
603
22
0
---
604
22
0
---
605
22
0
---
606
22
0
---
607
22
0
---
608
22
0
---
609
22
0
---
610
22
0
---
611
22
0
---
612
22
0
---
613
22
0
---
614
22
0
---
615
22
0
---
616
22
0
---
617
FN
658aa720-d3e3-11e4-b3ef-53b449010d08
23
0
---
618
23
0
---
619
23
0
---
620
23
0
---
621
23
0
---
622
23
0
---
623
23
0
---
624
23
0
---
625
23
0
---
626
23
0
---
627
23
0
---
628
23
0
---
629
23
0
---
630
23
0
---
631
23
0
---
632
23
0
---
633
FN
3b3ce8a0-d3e0-11e4-b3ef-53b449010d08
24
0
---
634
24
0
---
635
24
0
---
636
24
0
---
637
24
0
---
638
24
0
---
639
24
0
---
640
24
0
---
641
24
0
---
642
24
0
---
643
24
0
---
644
24
0
---
645
24
0
---
646
24
0
---
647
24
0
---
648
24
0
---
649
24
0
---
650
24
0
---
651
24
0
---
652
24
0
---
653
24
0
---
654
24
0
---
655
24
0
---
656
24
0
---
6

42
2
---
1095
42
2
---
1096
42
2
---
1097
42
2
---
1098
FN
bcea2520-6670-11e5-82bb-299f943b296d
43
2
---
1099
43
2
---
1100
43
2
---
1101
43
2
---
1102
43
2
---
1103
FN
f12b1ee0-d3d4-11e4-b3ef-53b449010d08
44
2
---
1104
44
2
---
1105
44
2
---
1106
44
2
---
1107
44
2
---
1108
FN
fac88590-dec0-11e4-9dbc-53b449010d08
45
2
---
1109
45
2
---
1110
45
2
---
1111
45
2
---
1112
45
2
---
1113
45
2
---
1114
45
2
---
1115
45
2
---
1116
45
2
---
1117
45
2
---
1118
45
2
---
1119
45
2
---
1120
45
2
---
1121
45
2
---
1122
45
2
---
1123
45
2
---
1124
45
2
---
1125
45
2
---
1126
45
2
---
1127
45
2
---
1128
45
2
---
1129
45
2
---
1130
45
2
---
1131
45
2
---
1132
45
2
---
1133
45
2
---
1134
45
2
---
1135
45
2
---
1136
45
2
---
1137
45
2
---
1138
45
2
---
1139
45
2
---
1140
45
2
---
1141
45
2
---
1142
45
2
---
1143
45
2
---
1144
45
2
---
1145
45
2
---
1146
45
2
---
1147
45
2
---
1148
45
2
---
1149
45
2
---
1150
45
2
---
1151
45
2
---
1152
45
2
---
1153
45
2
---
1154
45
2
---
1155
45
2
---
1156
45
2
---
115

66
4
---
1558
66
4
---
1559
66
4
---
1560
66
4
---
1561
66
4
---
1562
66
4
---
1563
66
4
---
1564
66
4
---
1565
66
4
---
1566
66
4
---
1567
66
4
---
1568
66
4
---
1569
66
4
---
1570
FN
2d9931e0-d3ea-11e4-b3ef-53b449010d08
67
4
---
1571
67
4
---
1572
67
4
---
1573
67
4
---
1574
67
4
---
1575
67
4
---
1576
67
4
---
1577
67
4
---
1578
67
4
---
1579
67
4
---
1580
67
4
---
1581
67
4
---
1582
67
4
---
1583
67
4
---
1584
67
4
---
1585
67
4
---
1586
67
4
---
1587
67
4
---
1588
67
4
---
1589
67
4
---
1590
67
4
---
1591
67
4
---
1592
67
4
---
1593
67
4
---
1594
67
4
---
1595
67
4
---
1596
67
4
---
1597
67
4
---
1598
67
4
---
1599
67
4
---
1600
67
4
---
1601
67
4
---
1602
67
4
---
1603
67
4
---
1604
67
4
---
1605
67
4
---
1606
67
4
---
1607
67
4
---
1608
67
4
---
1609
67
4
---
1610
67
4
---
1611
67
4
---
1612
67
4
---
1613
67
4
---
1614
67
4
---
1615
67
4
---
1616
67
4
---
1617
67
4
---
1618
67
4
---
1619
67
4
---
1620
67
4
---
1621
67
4
---
1622
67
4
---
1623
67
4
---
1624
67
4
---
1625
67
4
---