#### Execute always the following cells to init the notebook:

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
from pathlib import Path
import math
import re
import pandas as pd
import numpy as np
import random as rnd
import dedupe
from dotenv import find_dotenv, load_dotenv
import dotenv
import subprocess
import requests
from urllib.parse import urljoin
import enforce
import uuid
from unidecode import unidecode
import nltk

import backtester
import utils

INFO:root:Generating grammar tables from /usr/lib/python3.6/lib2to3/Grammar.txt
INFO:root:Generating grammar tables from /usr/lib/python3.6/lib2to3/PatternGrammar.txt


In [3]:
def transform_crm_record_to_company_object(crm_record: dict, information_source: str = None) -> dict:
    company = utils.convert_flat_dict_to_nested_dicts(crm_record)
    company = utils.remove_bad_values(company)

    # company['ersId'] = 'DUMMY_COMPANY_ID'
    company['informationSources'] = information_source
    company['address']['informationSources'] = information_source
    # company['address']['ersId'] = 'DUMMY_ADDRESS_RELATIONSHIP_ID'
    company['residentAddressSet'] = [
        {'classType': 'ResidenceRelationship', 'type': 'MAIN_ADDRESS',  # 'ersId': 'DUMMY_ADDRESS_ID',
         'informationSources': information_source, 'address': company['address']}]
    company.pop('address', None)

    return company

In [4]:
load_dotenv(find_dotenv())

True

# Check data from CRM

In [322]:
MAP_CRM_TO_ERS_COLUMNS = {'id':'crmId', 'buergel_id':'buergelId', 'name':'name', 'legal_form':'legalForm',
                               'hq_email':'email', 'hq_phone':'phoneNumber', 'website':'website',
                               'taxid':'vatID', 'register_number':'registerNumber', 'local_court':'commercialRegister',
                               'hq_street':'address.street', 'hq_zip_code':'address.postalCode',
                               'hq_city':'address.city', 'hq_country':'address.country'}

In [323]:
df = pd.read_csv(Path(os.environ.get('DATA_PATH'))/'raw'/'2018-06-08-data_for_dedupe.csv', dtype={'buergel_id':str, 12:str, 24:str, 33:str})
df = df[list(MAP_CRM_TO_ERS_COLUMNS.keys())]
df.rename(columns=MAP_CRM_TO_ERS_COLUMNS, inplace=True)
df[:5]

Unnamed: 0,crmId,buergelId,name,legalForm,email,phoneNumber,website,vatID,registerNumber,commercialRegister,address.street,address.postalCode,address.city,address.country
0,1,55013451.0,Bio River Life Science im Rheinland e.V.,,,,http://www.bioriver.de,,,,Merowinger Platz 1a,40225,Düsseldorf,DE
1,2,,Die Schuhleister GmbH & Co. KG,,,,http://www.die-schuhleister.de,,,,Eifelplatz 1-3,50677,Köln,DE
2,3,58088981.0,evopark GmbH,,,,http://www.evopark.de,,,,Sedanstraße 31-33,50668,Köln,DE
3,4,,Stadtsparkasse Düsseldorf,,,,http://www.sskduesseldorf.de,,,,Berliner Allee 33,40212,Düsseldorf,DE
4,5,,Verivox Versicherungsvergleich GmbH,,,,http://www.verivox.de,,,,Am Taubenfeld 10,69123,Heidelberg,DE


In [196]:
len(df)

71649

# Check data from dedupe

In [48]:
COLUMNS_IMPORTED_INTO_NEO4J = ['cluster_id', 'id', 'buergel_id', 'name', 'legal_form', 'hq_email', 'hq_phone', 'website',
                               'hq_street', 'hq_zip_code','hq_city', 'hq_country', 'taxid', 'register_number', 'local_court']

df = pd.read_csv(Path(os.environ.get('DATA_PATH'))/'raw'/'2018-07-03-data_from_dedupe.csv', dtype={0:str, 'buergel_id':str, 14:str, 16:str})
df = df[COLUMNS_IMPORTED_INTO_NEO4J]

In [24]:
duplicates = df.groupby("cluster_id").filter(lambda x: len(x) > 1)
duplicates.to_excel(Path(os.getenv('DATA_PATH'))/'processed'/'2018-07-03-duplicates_from_dedupe.xlsx')

In [26]:
sum(duplicates.cluster_id.value_counts()>2)

36

In [23]:
df[df.cluster_id=='f9bb27bc-8ceb-4676-b3a5-3d0a61c8b94e']

Unnamed: 0,cluster_id,id,buergel_id,name,legal_form,hq_email,hq_phone,website,hq_street,hq_zip_code,hq_city,hq_country,taxid,register_number,local_court
27859,f9bb27bc-8ceb-4676-b3a5-3d0a61c8b94e,82184,,H & T Feinkost GmbH,,vetrieb@ht-feinkost.de,4952412229184.0,http://http//www.ht-feinkost.de,Im Krupploch 19,33334,Gütersloh,,,,
27860,f9bb27bc-8ceb-4676-b3a5-3d0a61c8b94e,63114,,H&T Feinkost GmbH,,,,,Im Krupploch 19,33334,Gütersloh,,,,


In [12]:
len((df.cluster_id.value_counts()>1).index.values)

28169

In [36]:
sum(df.cluster_id.value_counts()==1)/len(df)

0.38818406397856214

In [37]:
sum(df.cluster_id.isna())/len(df)

0.6012644977599129

# Select and save entries for golden and evaluation data sets
Before executing this, download from Valdons 
[OneDrive](https://valdon-my.sharepoint.de/:f:/g/personal/jochen_krause_valdon_biz1/EgqfqvAhgwJHjXpl0u_3fVwBDRxAy7AGiUiuU50mfTzd9g?e=hBuPzr) 
the file `2018-07-03-data_from_dedupe.csv` to `./data/raw`.

In [4]:
load_dotenv(find_dotenv())
MAP_CRM_TO_ERS_COLUMNS = {'cluster_id':'ersId','id':'crmId', 'buergel_id':'buergelId', 'name':'name', 'legal_form':'legalForm',
                               'hq_email':'email', 'hq_phone':'phoneNumber', 'website':'website',
                               'taxid':'vatID', 'register_number':'registerNumber', 'local_court':'commercialRegister',
                               'hq_street':'address.street', 'hq_zip_code':'address.postalCode',
                               'hq_city':'address.city', 'hq_country':'address.country'}

def prepare_and_save_data_sets(golden_data, evaluation_data, filemarker):
    golden_ids = set(golden_data.ersId)
    eval_ids = set(evaluation_data.ersId)
    unknow_ids = eval_ids - golden_ids
    evaluation_data.loc[evaluation_data.ersId.isin(unknow_ids),'ersId'] = np.nan
    golden_data.drop(columns='randomId').to_pickle(Path(os.environ.get('DATA_PATH')) / 'processed' / f'{filemarker}_golden_data.pkl')
    evaluation_data.drop(columns='randomId').sample(frac=1.0, random_state=1).to_pickle(Path(os.environ.get('DATA_PATH')) / 'processed' / f'{filemarker}_evaluation_data.pkl')

def manipulate_data(data):
    columns_to_manipulate = ['name', 'phoneNumber', 'address.street', 'address.city', 'address.postalCode']
    for col in columns_to_manipulate:
        data.loc[:,col] = data.loc[:,col].apply(lambda x: utils.swap_letters_in_sentence(x, likelihood_to_swap_letters_per_word=.5))
    for col in columns_to_manipulate:
        data.loc[:,col] = data.loc[:,col].apply(lambda x: utils.drop_letter(x, likelihood_to_drop_letter=.5))
    

df = pd.read_csv(Path(os.environ.get('DATA_PATH'))/'raw'/'2018-07-03-data_from_dedupe.csv', dtype={0:str, 'buergel_id':str, 14:str, 16:str})
# fix cluster_ids: create a UUID where none is set:
df.cluster_id.fillna(value='UUID', inplace=True)
df.cluster_id = df.cluster_id.apply(lambda x:x if x != 'UUID' else str(uuid.uuid1()))

# replace all nan with '':
# df.fillna(value='', inplace=True)

# select and rename needed columns:
df = df[list(MAP_CRM_TO_ERS_COLUMNS.keys())]
df.rename(columns=MAP_CRM_TO_ERS_COLUMNS, inplace=True)
print(f'{len(df)} entries are available')

essential_columns = ['ersId','buergelId','name','address.street','address.postalCode','address.city']
df.dropna(subset=essential_columns, inplace=True)
print(f'{len(df)} entries remaining after dropping rows with nan values in the most important columns.')

# # and only of there is exactly one record per Bürgel Id:
df = df[df.buergelId.isin(df.buergelId.value_counts()[df.buergelId.value_counts()==1].index.values)]
print(f'{len(df)} entries remaining after taking only unique entries of Bürgel Ids.')

df = df[df.ersId.isin(df.ersId.value_counts()[df.ersId.value_counts()==1].index.values)]
print(f'{len(df)} entries remaining after taking only unique entries of ERS aka dedupe cluster Ids.')

df = df[df.vatID.isna() | df.vatID.isin(df.vatID.value_counts()[df.vatID.value_counts()==1].index.values)]
print(f'{len(df)} entries remaining after taking only unique entries of VAT Ids.')

df.drop_duplicates(subset=essential_columns, inplace=True)
print(f'{len(df)} entries remaining after dropping duplicates in the most important columns.')

df.sort_values(by='name', inplace=True)
np.random.seed(111)
rnd.seed(222)
df['randomId'] = np.random.rand(len(df))

# remove Buergel and CRM Ids:
df.loc[:,['buergelId', 'crmId']] = np.nan


# first data set is just to test the backtester itself:
golden_data = df[:10].copy()
duplicate = golden_data.iloc[9].copy()
duplicate['ersId'] = 'badbad_duplicate'
golden_data = golden_data.append(duplicate)
evaluation_data = df[5:15].copy()
evaluation_data.loc[evaluation_data.iloc[:2].index,'buergelId'] = np.nan
prepare_and_save_data_sets(golden_data, evaluation_data, '2018-07-03-for_unit_tests')

# first real data sets are created to get a baseline.
# these data sets contain all the available informations:
golden_data = df[df.randomId < 2/3].copy()
evaluation_data = df[df.randomId > 1/3].copy()
prepare_and_save_data_sets(golden_data, evaluation_data, '2018-07-03-big_set_with_full_information')

# save once more same data but manipulated:
manipulated_evaluation_data = evaluation_data.copy() 
manipulate_data(manipulated_evaluation_data)
prepare_and_save_data_sets(golden_data, manipulated_evaluation_data, '2018-07-03-big_set_with_full_but_manipulated_information')

# and remove some address data:
addressColumns = ['address.street', 'address.postalCode','address.city', 'address.country']
golden_data.loc[~((golden_data.randomId<.1) | golden_data.randomId.between(.45,.55)), addressColumns] = np.nan
evaluation_data.loc[~((evaluation_data.randomId>.9) | evaluation_data.randomId.between(.45,.55)), addressColumns] = np.nan
golden_data = golden_data.apply(lambda row:utils.drop_some_columns(row, addressColumns), axis=1)
evaluation_data = evaluation_data.apply(lambda row:utils.drop_some_columns(row, addressColumns), axis=1)
golden_data.loc[golden_data.randomId.between(.45,.55), addressColumns] = np.nan
evaluation_data.loc[evaluation_data.randomId.between(.45,.55), addressColumns] = np.nan
prepare_and_save_data_sets(golden_data, evaluation_data, '2018-07-03-big_set_with_limited_address_information')

# save once more same data but manipulated:
manipulated_evaluation_data = evaluation_data.copy() 
manipulate_data(manipulated_evaluation_data)
prepare_and_save_data_sets(golden_data, manipulated_evaluation_data, '2018-07-03-big_set_with_limited_address_and_manipulated_information')


# the same data sets as the one above with just less samples
golden_data = df[:1500][df.randomId < 2/3].copy()
evaluation_data = df[:1500][df.randomId > 1/3].copy()
prepare_and_save_data_sets(golden_data, evaluation_data, '2018-07-03-small_set_with_full_information')

# save once more same data but manipulated:
manipulated_evaluation_data = evaluation_data.copy() 
manipulate_data(manipulated_evaluation_data)
prepare_and_save_data_sets(golden_data, manipulated_evaluation_data, '2018-07-03-small_set_with_full_but_manipulated_information')

# remove some address data:
addressColumns = ['address.street', 'address.postalCode','address.city', 'address.country']
golden_data.loc[~((golden_data.randomId<.1) | golden_data.randomId.between(.45,.55)), addressColumns] = np.nan
evaluation_data.loc[~((evaluation_data.randomId>.9) | evaluation_data.randomId.between(.45,.55)), addressColumns] = np.nan
golden_data = golden_data.apply(lambda row:utils.drop_some_columns(row, addressColumns), axis=1)
evaluation_data = evaluation_data.apply(lambda row:utils.drop_some_columns(row, addressColumns), axis=1)
golden_data.loc[golden_data.randomId.between(.45,.55), addressColumns] = np.nan
evaluation_data.loc[evaluation_data.randomId.between(.45,.55), addressColumns] = np.nan
prepare_and_save_data_sets(golden_data, evaluation_data, '2018-07-03-small_set_with_limited_address_information')

# save once more same data but manipulated:
manipulated_evaluation_data = evaluation_data.copy() 
manipulate_data(manipulated_evaluation_data)
prepare_and_save_data_sets(golden_data, manipulated_evaluation_data, '2018-07-03-small_set_with_limited_address_and_manipulated_information')


# the same data sets as the ones above with medium numbers of samples:
golden_data = df[:15000][df.randomId < 2/3].copy()
evaluation_data = df[:15000][df.randomId > 1/3].copy()
prepare_and_save_data_sets(golden_data, evaluation_data, '2018-07-03-medium_set_with_full_information')

# save once more same data but manipulated:
manipulated_evaluation_data = evaluation_data.copy() 
manipulate_data(manipulated_evaluation_data)
prepare_and_save_data_sets(golden_data, manipulated_evaluation_data, '2018-07-03-medium_set_with_full_but_manipulated_information')

# remove some address data:
addressColumns = ['address.street', 'address.postalCode','address.city', 'address.country']
golden_data.loc[~((golden_data.randomId<.1) | golden_data.randomId.between(.45,.55)), addressColumns] = np.nan
evaluation_data.loc[~((evaluation_data.randomId>.9) | evaluation_data.randomId.between(.45,.55)), addressColumns] = np.nan
golden_data = golden_data.apply(lambda row:utils.drop_some_columns(row, addressColumns), axis=1)
evaluation_data = evaluation_data.apply(lambda row:utils.drop_some_columns(row, addressColumns), axis=1)
golden_data.loc[golden_data.randomId.between(.45,.55), addressColumns] = np.nan
evaluation_data.loc[evaluation_data.randomId.between(.45,.55), addressColumns] = np.nan
prepare_and_save_data_sets(golden_data, evaluation_data, '2018-07-03-medium_set_with_limited_address_information')

# save once more same data but manipulated:
manipulated_evaluation_data = evaluation_data.copy() 
manipulate_data(manipulated_evaluation_data)
prepare_and_save_data_sets(golden_data, manipulated_evaluation_data, '2018-07-03-medium_set_with_limited_and_manipulated_information')

print('Done')

71649 entries are available
49132 entries remaining after dropping rows with nan values in the most important columns.
46819 entries remaining after taking only unique entries of Bürgel Ids.
46815 entries remaining after taking only unique entries of ERS aka dedupe cluster Ids.
46762 entries remaining after taking only unique entries of VAT Ids.
46762 entries remaining after dropping duplicates in the most important columns.




Done


### Notes
- check names containing " e.g. '" Finalin " GmbH'
- ckeck Müller & Schmidt Pfeilringwerk GmbH & Co. KG. it has two entries with the same Buergel ID

## Data Exploration

Notes:
- names contain: double white spaces, 

In [33]:
text_of_all_names = "".join(df.name.values)

In [109]:
all_special_characters = list(set([c for c in re.sub('[a-zA-Z\d\s:]', "", text_of_all_names)]))
dict(zip(all_special_characters, [unidecode(c) for c in all_special_characters]))

{'ç': 'c',
 '-': '-',
 'á': 'a',
 '?': '?',
 '·': '*',
 ',': ',',
 'à': 'a',
 '∙': '[?]',
 '*': '*',
 'é': 'e',
 'ü': 'u',
 '>': '>',
 '[': '[',
 '³': '3',
 '~': '~',
 '&': '&',
 '@': '@',
 'Ç': 'C',
 '<': '<',
 ';': ';',
 '\\': '\\',
 'ß': 'ss',
 'ä': 'a',
 'ó': 'o',
 'Ö': 'O',
 '²': '2',
 '´': "'",
 '–': '-',
 'É': 'E',
 '/': '/',
 'ô': 'o',
 '°': 'deg',
 '.': '.',
 'è': 'e',
 '’': "'",
 ']': ']',
 '!': '!',
 '`': '`',
 '®': '(r)',
 'Ü': 'U',
 '"': '"',
 'Ä': 'A',
 '•': '*',
 '(': '(',
 ')': ')',
 '+': '+',
 '̈': '',
 'ö': 'o',
 'ù': 'u',
 '|': '|',
 "'": "'"}

In [73]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/datascientist/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [99]:
# txt = unidecode(text_of_all_names.lower())
text_of_all_names = " ".join(df.name.values)
txt = unidecode(text_of_all_names)

tokens = pd.Series(nltk.word_tokenize(txt))
wt = tokens.value_counts()[:50]

tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
tokens = pd.Series(tokenizer.tokenize(txt))
pt = tokens.value_counts()[:50]

tokenizer = nltk.tokenize.WordPunctTokenizer()
tokens = pd.Series(tokenizer.tokenize(txt))
wpt = tokens.value_counts()[:50]

tokens = pd.DataFrame({'word_tokenizer':wt.index.values, 
                       'punkt_tokenizer':pt.index.values,
                       'wordpunkt_tokenizer':wpt.index.values})
del txt, pt, wt, wpt
tokens

Unnamed: 0,punkt_tokenizer,word_tokenizer,wordpunkt_tokenizer
0,KG.,GmbH,GmbH
1,GmbH & Co.,&,.
2,& Co.,KG,-
3,KG Gebr.,Co.,&
4,Inh.,und,Co
5,Ing.,.,KG
6,KG Joh.,mbH,und
7,KG Wilh.,-,mbH
8,Wilh.,Gesellschaft,Gesellschaft
9,Dipl.-Ing.,Haftung,K


In [105]:
df[df.name.str.contains('KG\.')][:5]

Unnamed: 0,ersId,crmId,buergelId,name,legalForm,email,phoneNumber,website,vatID,registerNumber,commercialRegister,address.street,address.postalCode,address.city,address.country,randomId
35205,e6def564-9bb9-11e8-9a84-0242ac120002,18629,3739628,A. Kempf GmbH & Co. KG. Uniformmützen,,info@kempf-muetzen.de,499655252.0,http://http//www.kempf-muetzen.de,,,,Zeinrieder Str. 7,92552,Teunz,,0.552642
18764,a8059c34-07ab-4e1f-9cc5-c94629b834a5,6997,3522150,A. u. K. Müller GmbH & Co KG.,,info@akmueller.de,4921173910.0,http://http//www.akmueller.de,,,,Dresdener Str. 162,40595,Düsseldorf.,,0.345009
32236,e6d58998-9bb9-11e8-9a84-0242ac120002,19954,6054627,AWK Verschlüsse GmbH & Co. KG.,,kontakt@awk-verschluesse.de,49923199505.0,http://http//www.awk-verschluesse.de,,,,Thölauer Str. 12,95615,Marktredwitz,,0.417684
71129,e7564f06-9bb9-11e8-9a84-0242ac120002,50866,5447566,Adolf Sauter GmbH & Co KG.,,info@adolf-sauter.de,,http://http//www.adolf-sauter.de,,,,Ludwigstr. 4,73054,Eislingen,,0.93706
60854,e73281fc-9bb9-11e8-9a84-0242ac120002,57629,987457,Alfred Sternjakob GmbH & Co. KG.,,steinmann@steinmanngruppe.de,49623349010.0,http://http//www.sternjakob.de,,,,Frankenstr. 47-55,67227,Frankenthal (Pfalz),,0.793146


In [121]:
r = re.compile('[A-Z]{1,3}\.')
txt = text_of_all_names
tokens = pd.Series(list(filter(r.match, nltk.tokenize.WhitespaceTokenizer().tokenize(txt))))
tokens.value_counts()[:50]

H.          184
K.          153
KG.         133
A.          115
W.          109
J.           90
M.           80
G.           75
E.           73
F.           67
C.           63
G.m.b.H.     59
U.           46
R.           43
B.           43
L.           33
P.           30
S.           30
D.           26
K.G.         26
E.K.         19
CO.          17
V.           16
O.           13
N.           12
T.           12
E.G.         11
F.W.         11
B.V.         11
E.V.         10
I.            9
DR.           8
G.M.B.H.      8
M.B.H.        7
H.-J.         5
W.H.          4
H.P.          4
I.S.T.        4
G.m.b.H       4
S.A.          4
INC.          3
K.H.          3
G.U.T.        3
OHG.          3
G.H.          3
W.F.          3
J.G.          3
F.A.          3
I.C.S.        2
AG.           2
dtype: int64

In [32]:
["G.m.b.H.",