In [None]:
import random
from collections import Counter

from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator import InputSample
from presidio_evaluator.models.crf_model import CRFModel
from presidio_evaluator.validation import split_dataset, save_to_json

import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [None]:
# Location of the dataset JSON file that the cells below load with read_synth_dataset.
DATA_PATH = "../../presidio-research/data/generated_address_size_2000_date_October_27_2021.json"

# Baselines
* CRF (Conditional Random Field)
* spaCy (evaluate an existing spaCy model)

## Data Split
Note that we don't want the same pattern to be in more than one set. ([code sample](https://github.com/microsoft/presidio-research/blob/master/notebooks/Split%20by%20pattern%20%23.ipynb))    
Note that the `split_dataset` function is based on `template#` in `meta_data`.

In [None]:
# Read every sample from DATA_PATH and print how many were loaded.
all_samples = read_synth_dataset(DATA_PATH)
print(len(all_samples))


In [None]:
# Display the first loaded sample as a quick sanity check of the data.
all_samples[0]

In [None]:
# Ratios handed to split_dataset; the two resulting parts are unpacked as train and test
# (presumably 70% train / 30% test — confirm against split_dataset).
# NOTE(review): `train` and `test` are not referenced by the CRF cells below,
# which build their own random split — confirm which split is intended.
TRAIN_TEST_RATIOS = [0.7,0.3]
train,test = split_dataset(all_samples, TRAIN_TEST_RATIOS)

## CRF

In [None]:
# Build the CRF train/test tables: reload the samples, keep only those with at
# least one tagged span, shuffle, split, and convert both parts with
# InputSample.create_conll_dataset.
# NOTE(review): this is a plain random split and does not use the
# `split_dataset` result computed earlier — confirm this is intended.
SEED = 42          # fixed seed so the shuffle below is reproducible
TRAIN_RATIO = 0.80  # share of the shuffled samples used for training

all_samples = read_synth_dataset(DATA_PATH)
all_samples = [sample for sample in all_samples if len(sample.spans) > 0]
print("Kept {} samples after removal of non-tagged samples".format(len(all_samples)))

# Seed the generator right before shuffling: without it the split (and every
# metric derived from it) changes on each Restart Kernel -> Run All.
random.seed(SEED)
random.shuffle(all_samples)

train_len = int(len(all_samples) * TRAIN_RATIO)
# Keep the raw sample lists under their own names so `train_data` / `test_data`
# only ever refer to the converted tables.
train_samples = all_samples[:train_len]
test_samples = all_samples[train_len:]

train_data = InputSample.create_conll_dataset(train_samples, to_bio=False)
test_data = InputSample.create_conll_dataset(test_samples, to_bio=False)

test_data.head()

In [None]:
# Group the token rows by sentence and turn each group into a list of
# [text, pos, label] rows (one inner list per token).
test_sents = (
    test_data
    .groupby('sentence')[['text', 'pos', 'label']]
    .apply(lambda sentence_rows: sentence_rows.values.tolist())
)
train_sents = (
    train_data
    .groupby('sentence')[['text', 'pos', 'label']]
    .apply(lambda sentence_rows: sentence_rows.values.tolist())
)

In [None]:
# Display the features CRFModel.sent2features produces for the first token of
# the training sentence stored under key 0.
CRFModel.sent2features(train_sents[0])[0]

In [None]:
%time
X_train = [CRFModel.sent2features(s) for s in train_sents]
y_train = [CRFModel.sent2labels(s) for s in train_sents]

X_test = [CRFModel.sent2features(s) for s in test_sents]
y_test = [CRFModel.sent2labels(s) for s in test_sents]

In [None]:
%%time
# Configure a sklearn_crfsuite CRF and fit it on the training sequences;
# the cell magic above reports how long training takes.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient
    c2=0.1,  # L2 regularization coefficient
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

In [None]:
import os
import pickle

# Persist the trained CRF so it can be reloaded without retraining.
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when "../model_weights" does not exist yet (e.g. on a fresh checkout).
os.makedirs("../model_weights", exist_ok=True)
with open("../model_weights/crf.pickle",'wb') as f:
    pickle.dump(crf, f,protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the CRF from the pickle file written to "../model_weights/crf.pickle".
# NOTE(review): pickle.load can execute arbitrary code contained in the file —
# only load pickles produced by a trusted run of this notebook.
with open("../model_weights/crf.pickle", 'rb') as f:
    crf = pickle.load(f)

In [None]:
# Collect the classes of the fitted CRF; this list is passed as `labels=` to
# the metric calls in the following cells.
labels = list(crf.classes_)
# Uncomment the next line to leave the 'O' label out of the reported metrics.
# labels.remove('O')
labels

In [None]:
# Predict a label sequence for every test sentence, then display the
# weighted-average F1 score computed over the labels in `labels`.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, labels=labels, average='weighted')

In [None]:
# Predict the labels of a single test sentence (index 5). The result is only
# stored; uncomment the last line to display the predicted sequence.
y_5_pred = crf.predict([X_test[5]])
# y_5_pred[0]

In [None]:
# Order the labels by everything after their first character, then by the
# first character itself, and print the classification report in that order.
# NOTE(review): this key looks designed for prefixed tags such as "B-XXX";
# the data above was built with to_bio=False — confirm the ordering is useful.
sorted_labels = list(labels)
sorted_labels.sort(key=lambda label: (label[1:], label[0]))

print(
    metrics.flat_classification_report(
        y_test,
        y_pred,
        labels=sorted_labels,
        digits=3,
    )
)

## Evaluate an existing spaCy trained model
* Using [this notebook](https://github.com/microsoft/presidio-research/blob/master/notebooks/models/Evaluate%20spacy%20models.ipynb)

In [None]:
import spacy

from collections import Counter

from presidio_evaluator.models import SpacyModel

from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.data_generator import read_synth_dataset
%reload_ext autoreload
%autoreload 2

synth_samples = read_synth_dataset("../../presidio-research/data/generated_address_size_2000_date_October_27_2021.json")
print(len(synth_samples))
DATASET = synth_samples

In [None]:
# Tally how many spans of each entity type occur across all samples in DATASET.
entity_counter = Counter(
    span.entity_type
    for sample in DATASET
    for span in sample.spans
)

In [None]:
# Display the number of spans found for each entity type.
entity_counter

In [None]:
# Display the second sample of the evaluation set as a sanity check.
DATASET[1]

In [None]:
# Length, in tokens, of the longest sample in DATASET.
max(len(sample.tokens) for sample in DATASET)

In [None]:
# Names of the spaCy pipelines to evaluate; each entry is passed to spacy.load
# by the evaluation loop. Commented-out entries are skipped.
models = [
    "en_core_web_lg", 
    # "en_core_web_trf",
    ]

In [None]:
# Evaluate every configured spaCy pipeline on DATASET and print its scores.
for model in models:
    print("-----------------------------------")
    print("Evaluating model {}".format(model))
    # Load the pipeline by name and wrap it for the presidio evaluator.
    nlp = spacy.load(model)
    # entities_to_keep=['GPE'] — presumably restricts the evaluation to the
    # GPE entity type; confirm against SpacyModel.
    spacy_model = SpacyModel(model=nlp,entities_to_keep=['GPE'])
    evaluator = Evaluator(model=spacy_model)
    evaluation_results = evaluator.evaluate_all(DATASET)
    scores = evaluator.calculate_score(evaluation_results)
    
    print("Confusion matrix:")
    print(scores.results)

    print("Precision and recall")
    scores.print()
    # NOTE: `errors` is reassigned on every iteration, so once the loop ends it
    # holds the model errors of the last evaluated model only.
    errors = scores.model_errors