# Step-by-step guide to finding potential matches
1. Prepare the data
2. Push the data to Elasticsearch
3. Create the first similarity matrix
4. Use the Explorer to weed out implausible matches
5. Do further scoring and add new features to the similarity matrix
6. Train a machine learning model on the data
 

## 1. Prepare the data

In [None]:
from suricate.data.companies import getleft, getright

In [None]:
# Load the source ("left") and target ("right") company tables.
# nrows=None is passed to both loaders (presumably "no row limit" -- confirm).
df_source_raw = getleft(nrows=None)
df_target_raw = getright(nrows=None)
# Show how many records were loaded on each side.
print(len(df_source_raw))
print(len(df_target_raw))

In [None]:
def prepare_source(df):
    """Prepare the raw source table before it is matched.

    This is a placeholder hook: no cleaning is applied yet. Add any
    source-specific normalisation here.

    Args:
        df (pd.DataFrame): raw source records.

    Returns:
        pd.DataFrame: a copy of ``df``, so that in-place operations on the
        prepared frame (e.g. ``set_index(..., inplace=True)``) leave the raw
        input untouched.
    """
    return df.copy()

def prepare_target(df):
    """Prepare the raw target table before it is indexed.

    This is a placeholder hook: no cleaning is applied yet. Add any
    target-specific normalisation here.

    Args:
        df (pd.DataFrame): raw target records.

    Returns:
        pd.DataFrame: a copy of ``df``, so that in-place operations on the
        prepared frame (e.g. ``set_index(..., inplace=True)``) leave the raw
        input untouched.
    """
    return df.copy()

# Run the preparation hooks on the raw tables; the prepared frames are the
# ones used by the rest of the notebook.
df_source = prepare_source(df_source_raw)
df_target = prepare_target(df_target_raw)

## 2. Push the data to Elasticsearch

In [None]:
import elasticsearch
import pandas as pd
import time
from suricate.dbconnectors.esconnector import index_with_es

In [None]:
# Index the target table by its 'ix' column. The guard keeps this cell
# re-runnable: once 'ix' is the index it is no longer available as a column.
if df_target.index.name != 'ix':
    df_target.set_index('ix', drop=True, inplace=True)
esclient = elasticsearch.Elasticsearch()
es_indice = 'df_target'
# Set to False to keep an index that has already been created and populated.
rebuild_index = True
if rebuild_index:
    # Drop any previous version of the index. A missing index is the only
    # failure that is expected here; any other error should surface.
    try:
        esclient.indices.delete(index=es_indice)
    except elasticsearch.NotFoundError:
        pass
    request_body = {
        "settings": {
            "number_of_shards": 5,
            "number_of_replicas": 5
        },

        "mappings": {
            "_doc": {
                "properties": {
                    "ix": {"type": "keyword"},
                    "name": {"type": "text"},
                    "street": {"type": "text"},
                    "city": {"type": "text"},
                    "postalcode": {"type": "text"},
                    "countrycode": {"type": "keyword"}
                }
            }
        }
    }
    esclient.indices.create(index=es_indice, body=request_body)
    index_with_es(client=esclient, df=df_target, index=es_indice, ixname="ix", reset_index=True, doc_type='_doc')
    # Refresh explicitly so the new documents are visible to count/search
    # requests, instead of sleeping and hoping the refresh has happened.
    esclient.indices.refresh(index=es_indice)

In [None]:
# Sanity check: the number of documents reported by Elasticsearch for the
# index must equal the number of rows in the target table.
count_response = esclient.count(index=es_indice)
catcount = count_response['count']
assert df_target.shape[0] == catcount
print(catcount)

## 3. Create the first similarity matrix

In [None]:
from suricate.dbconnectors import EsConnector

In [None]:
# Scoring plan handed to the connector: one entry per compared field, each
# with the comparison type to use for it.
scoreplan = {
        'name': {
            'type': 'FreeText'
        },
        'street': {
            'type': 'FreeText'
        },
        'city': {
            'type': 'FreeText'
        },
        'duns': {
            'type': 'Exact'
        },
        'postalcode': {
            'type': 'FreeText'
        },
        'countrycode': {
            'type': 'Exact'
        }
    }
# Query the index that was created and populated above (es_indice); the
# previous hard-coded "right" did not match the index this notebook builds.
escon = EsConnector(
    client=esclient,
    scoreplan=scoreplan,
    index=es_indice,
    explain=False,
    size=10
)
# Index the source table by 'ix'. The guard keeps this cell re-runnable: once
# 'ix' is the index it is no longer available as a column.
if df_source.index.name != 'ix':
    df_source.set_index('ix', drop=True, inplace=True)
# Similarity matrix returned by the connector for the source records.
Xtc = escon.fit_transform(X=df_source)
ix = Xtc.index
# Side-by-side view of the pairs listed in the similarity matrix index.
Xsbs = escon.getsbs(X=df_source, on_ix=ix)

## 4. Explore the data

In [None]:
from suricate.data.companies import getytrue
from suricate.explore import Explorer

Cheat sheet: load the labels that have already been determined

In [None]:
# Load the known labels and show how often each value occurs.
y_true = getytrue()
label_counts = y_true.value_counts()
print(label_counts)

In [None]:
# Passed to the Explorer as both n_simple and n_hard.
n_questions = 100
## Fit the cluster on unlabelled (non-supervised) data
exp = Explorer(n_simple=n_questions, n_hard=n_questions)
# NOTE(review): the cluster is fitted on the 'es_score' column only, while
# pred_cluster below receives the whole Xtc frame -- confirm this is intended.
exp.fit_cluster(X=Xtc[['es_score']])
y_cluster = pd.Series(data=exp.pred_cluster(X=Xtc), index=Xtc.index, name='y_cluster')
# Summary table on Xtc's index: predicted cluster, score, known label and 'ix'.
X_cluster = pd.DataFrame(y_cluster)
# Row-wise mean over a single column, i.e. the es_score value itself.
X_cluster['avg_score'] = Xtc[['es_score']].mean(axis=1)
X_cluster['y_true'] = y_true['y_true']
X_cluster['ix']=Xtc['ix']
# Keep the previous index as regular columns, then index the table by 'ix'.
X_cluster.reset_index(inplace=True, drop=False)
X_cluster.set_index('ix', inplace=True)

### Ask simple questions
# The answers are looked up in the pre-loaded y_true labels.
ix_simple = exp.ask_simple(X=Xtc)
Sbs_simple = Xsbs.loc[ix_simple]
y_simple = y_true.loc[ix_simple]['y_true']

### Fit the explorer with the supervised (labelled) data
# fit_cluster=False is passed; the cluster was already fitted above.
exp.fit(X=Xtc, y=y_simple, fit_cluster=False)

### Ask hard (pointed) questions
# Again answered from the pre-loaded y_true labels.
ix_hard = exp.ask_hard(X=Xtc, y=y_simple)
Sbs_hard = Xsbs.loc[ix_hard]
y_hard = y_true.loc[ix_hard]['y_true']

### Obtain the results of the labels
# Labels for the union of the hard and simple question indexes.
y_questions = y_true.loc[ix_hard.union(ix_simple)]['y_true']
# Side-by-side rows of every questioned pair, with its cluster and label added.
X_questions = Xsbs.loc[y_questions.index].copy()
X_questions['y_cluster'] = y_cluster
X_questions['y_true'] = y_questions
# Keep the previous index as regular columns, then index the table by 'ix'.
X_questions.reset_index(inplace=True, drop=False)
X_questions.set_index('ix', inplace=True)

## 5. Pruning the data

In [None]:
import pandas as pd
from suricate.sbsdftransformers import FuncSbsComparator
from sklearn.pipeline import FeatureUnion

In [None]:
# Keep only the pairs whose Elasticsearch score is strictly above the cut-off,
# and restrict the similarity matrix, the side-by-side view and the labels to
# those pairs.
pruning_threshold = 15
keep_mask = (Xtc['es_score'] > pruning_threshold).values
ix_further = Xtc.index[keep_mask]
Xtc = Xtc.loc[ix_further]
Xsbs = Xsbs.loc[ix_further]
y_true = y_true.loc[ix_further]

In [None]:
# (field, comparator) pairs to evaluate on the side-by-side view; the order
# fixes the order of the resulting score columns.
_comparisons = [
    ('name', 'fuzzy'),
    ('street', 'fuzzy'),
    ('name', 'token'),
    ('street', 'token'),
    ('city', 'fuzzy'),
    ('postalcode', 'fuzzy'),
    ('postalcode', 'contains'),
]
# One named comparator per pair, labelled "<field>_<comparator>".
_sbs_score_list = [
    (field + '_' + method, FuncSbsComparator(on=field, comparator=method))
    for field, method in _comparisons
]

scorer_sbs = FeatureUnion(transformer_list=_sbs_score_list)
score_columns = [label for label, _ in _sbs_score_list]
# Wrap the union's output in a frame aligned on the pruned pair index.
Xscores2 = pd.DataFrame(
    data=scorer_sbs.fit_transform(X=Xsbs),
    index=ix_further,
    columns=score_columns,
)
# Carry over the pair identifier and the Elasticsearch score from Xtc.
for c in ('ix', 'es_score'):
    Xscores2[c] = Xtc[c]
# Keep the previous index as regular columns, then index the table by 'ix'.
Xscores2 = Xscores2.reset_index(drop=False).set_index('ix')

## 6. Apply the machine-learning model

In [None]:
from suricate.pipeline import PartialClf
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Drop any column named 'ix' so that it is not used as a feature.
feature_columns = [col for col in Xscores2.columns if col != 'ix']
Xscores2 = Xscores2[feature_columns]

### Make the pipeline
# Fill missing values with 0, normalise, reduce to 4 components, then classify.
pipeline_steps = [
    ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('Scaler', Normalizer()),
    ('PCA', PCA(n_components=4)),
    ('Predictor', GradientBoostingClassifier(n_estimators=500)),
]
pipe = Pipeline(steps=pipeline_steps)
pred = PartialClf(classifier=pipe)
pred.fit(X=Xscores2, y=y_true)
# NOTE: the score is computed on the same data that was used for fitting.
train_score = pred.score(X=Xscores2, y=y_true)
print(train_score)
y_pred = pred.predict(X=Xscores2)
print(len(y_pred))


