# Baselines
* CRF (Conditional Random Field)
* spaCy (evaluate an existing spaCy model)

## CRF

In [1]:
import random
from collections import Counter

from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator import InputSample
from presidio_evaluator.models.crf_model import CRFModel

import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Seed the RNG before shuffling so the train/test split (and therefore every
# metric reported further down) is reproducible under "Restart & Run All".
random.seed(42)

# Load the synthetic address dataset and keep only samples that have at least one tagged span.
all_samples = read_synth_dataset("../../presidio-research/data/generated_address_size_2000_date_October_27_2021.json")
all_samples = [sample for sample in all_samples if len(sample.spans) > 0]
print("Kept {} samples after removal of non-tagged samples".format(len(all_samples)))

random.shuffle(all_samples)

# 80% of the shuffled samples go to training, the remaining 20% to testing.
train_len = int(len(all_samples)* 0.80)
train_data = all_samples[:train_len]
test_data = all_samples[train_len:]

# Convert both splits to a token-level table (columns shown by `.head()` below:
# text, pos, tag, label, sentence). `to_bio=False` is passed through unchanged.
train_data = InputSample.create_conll_dataset(train_data, to_bio=False)
test_data = InputSample.create_conll_dataset(test_data, to_bio=False)

test_data.head()

Flair is not installed by default
Flair is not installed
Kept 2000 samples after removal of non-tagged samples


Unnamed: 0,text,pos,tag,label,sentence
0,*,PUNCT,NFP,O,0
1,Asia,PROPN,NNP,O,0
2,Pacific,PROPN,NNP,O,0
3,Energy,PROPN,NNP,O,0
4,-,PUNCT,HYPH,O,0


In [2]:
def _group_into_sentences(conll_df):
    """Return one entry per sentence id, each a list of [text, pos, label] token rows."""
    token_columns = ['text', 'pos', 'label']
    return conll_df.groupby('sentence')[token_columns].apply(lambda rows: rows.values.tolist())


# Every sentence becomes a list of lists (token text + POS + label).
test_sents = _group_into_sentences(test_data)
train_sents = _group_into_sentences(train_data)

In [3]:
# Show the feature dict produced for the first token of training sentence 0.
first_train_sentence = train_sents[0]
CRFModel.sent2features(first_train_sentence)[0]

{'bias': 1.0,
 'word.lower()': 'let',
 'word[-3:]': 'Let',
 'word[-2:]': 'et',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'postag': 'VERB',
 'postag[:2]': 'VE',
 'BOS': True,
 '+1:word.lower()': "'s",
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'PRON',
 '+1:postag[:2]': 'PR'}

In [4]:
%time
X_train = [CRFModel.sent2features(s) for s in train_sents]
y_train = [CRFModel.sent2labels(s) for s in train_sents]

X_test = [CRFModel.sent2features(s) for s in test_sents]
y_test = [CRFModel.sent2labels(s) for s in test_sents]

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [5]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 8.72 s, sys: 30.4 ms, total: 8.75 s
Wall time: 8.75 s




CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [6]:
import os
import pickle

# Create the output directory if it is missing; otherwise `open(..., 'wb')`
# raises FileNotFoundError on a fresh checkout.
os.makedirs("../model_weights", exist_ok=True)

# Persist the trained CRF so it can be reloaded without retraining.
with open("../model_weights/crf.pickle",'wb') as f:
    pickle.dump(crf, f,protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Reload the CRF from disk.
# NOTE: unpickling executes arbitrary code, so only load pickle files you trust.
with open("../model_weights/crf.pickle", mode='rb') as weights_file:
    crf = pickle.load(weights_file)

In [20]:
# All classes known to the trained CRF. 'O' is deliberately kept in the list
# (call `labels.remove('O')` here to score entity tags only).
labels = [*crf.classes_]
labels

['O', 'B-GPE', 'I-GPE', 'L-GPE', 'U-GPE']

In [21]:
# Predict tags for the held-out sentences and report the weighted F1 over `labels`.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(
    y_test,
    y_pred,
    labels=labels,
    average='weighted',
)

0.992169451864869

In [22]:
# Predict on a single test sentence (index 5); inspect the tags via `y_5_pred[0]`.
single_sentence_features = X_test[5]
y_5_pred = crf.predict([single_sentence_features])

In [23]:
def _entity_then_prefix(tag_name):
    """Sort key: text after the first character, then the first character itself."""
    return tag_name[1:], tag_name[0]


# Order the labels so tags of the same entity sit next to each other in the report.
sorted_labels = sorted(labels, key=_entity_then_prefix)

report = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=sorted_labels,
    digits=3,
)
print(report)

              precision    recall  f1-score   support

           O      0.993     0.998     0.996     25032
       B-GPE      0.976     0.942     0.959       482
       I-GPE      0.990     0.969     0.979      3707
       L-GPE      0.963     0.929     0.946       482
       U-GPE      1.000     0.400     0.571        10

    accuracy                          0.992     29713
   macro avg      0.985     0.848     0.890     29713
weighted avg      0.992     0.992     0.992     29713





## Evaluate an existing spaCy trained model
* Using [this notebook](https://github.com/microsoft/presidio-research/blob/master/notebooks/models/Evaluate%20spacy%20models.ipynb)

In [12]:
import spacy

from collections import Counter

from presidio_evaluator.models import SpacyModel

from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.data_generator import read_synth_dataset
%reload_ext autoreload
%autoreload 2

synth_samples = read_synth_dataset("../../presidio-research/data/generated_address_size_2000_date_October_27_2021.json")
print(len(synth_samples))
DATASET = synth_samples

2000


In [13]:
# Count how many annotated spans of each entity type appear across all samples.
entity_counter = Counter(
    span.entity_type
    for sample in DATASET
    for span in sample.spans
)

In [14]:
# Display the span counts per entity type held in `entity_counter`.
entity_counter

Counter({'LOCATION': 2532})

In [15]:
# Display the sample at index 1 to eyeball its text, spans, tokens and tags.
DATASET[1]

Full text: Key Corporate Statistics

Headquarters    
Hwy 42 and I-40
(918) 266-3060
www.gearainey.com <http://www.gearainey.com/>
Locations
Spans: [Type: LOCATION, value: Hwy 42 and I-40, start: 43, end: 58]
Tokens: [Key, Corporate, Statistics, 

, Headquarters,    
, Hwy, 42, and, I-40, 
, (, 918, ), 266, -, 3060, 
, www.gearainey.com, <, http://www.gearainey.com/, >, 
, Locations]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'L-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [16]:
# Largest number of tokens found in any single sample
max(len(sample.tokens) for sample in DATASET)

334

In [17]:
# spaCy pipelines to evaluate; add "en_core_web_trf" to the list to include the transformer model.
models = ["en_core_web_lg"]

In [19]:
# Evaluate each listed spaCy pipeline on the full DATASET, keeping only GPE entities.
for model in models:
    print("-----------------------------------")
    print(f"Evaluating model {model}")

    # Load the pipeline and wrap it for the evaluator.
    nlp = spacy.load(model)
    spacy_model = SpacyModel(model=nlp, entities_to_keep=['GPE'])

    evaluator = Evaluator(model=spacy_model)
    evaluation_results = evaluator.evaluate_all(DATASET)
    scores = evaluator.calculate_score(evaluation_results)

    print("Confusion matrix:", scores.results, sep="\n")

    print("Precision and recall")
    scores.print()

    # Kept for later inspection; overwritten on every iteration of the loop.
    errors = scores.model_errors

-----------------------------------
Evaluating model en_core_web_lg


Evaluating <class 'presidio_evaluator.evaluation.evaluator.Evaluator'>:   0%|          | 10/2000 [00:00<00:21, 93.01it/s]

Translating entites using this dictionary: {'ORGANIZATION': 'ORG', 'COUNTRY': 'GPE', 'CITY': 'GPE', 'LOCATION': 'GPE', 'PERSON': 'PERSON', 'FIRST_NAME': 'PERSON', 'LAST_NAME': 'PERSON', 'NATION_MAN': 'GPE', 'NATION_WOMAN': 'GPE', 'NATION_PLURAL': 'GPE', 'NATIONALITY': 'GPE', 'GPE': 'GPE', 'ORG': 'ORG'}


Evaluating <class 'presidio_evaluator.evaluation.evaluator.Evaluator'>: 100%|██████████| 2000/2000 [00:22<00:00, 89.46it/s]

Confusion matrix:
Counter({('O', 'O'): 121491, ('GPE', 'O'): 21004, ('GPE', 'GPE'): 3118, ('O', 'GPE'): 983})
Precision and recall
                        Entity                     Precision                        Recall
                           GPE                        76.03%                        12.93%
                           PII                        76.03%                        12.93%
PII F measure: 0.1459704836840185



