In [1]:
import os
import pickle
import random
from collections import Counter

import sklearn_crfsuite
import spacy
from sklearn_crfsuite import metrics

from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.models import SpacyModel
from presidio_evaluator.models.crf_model import CRFModel
from presidio_evaluator.validation import split_dataset, save_to_json

Flair is not installed by default
Flair is not installed


In [2]:
# Relative path to the synthetic dataset (JSON) that read_synth_dataset loads below.
DATA_PATH = "../../presidio-research/data/generated_address_size_1000_date_November_07_2021.json"

# Baselines
* CRF (Conditional Random Field)
* spaCy (evaluate an existing spaCy model)

## Data Split
We don't want the same pattern (template) to appear in more than one set. ([code sample](https://github.com/microsoft/presidio-research/blob/master/notebooks/Split%20by%20pattern%20%23.ipynb))    
Note that the `split_dataset` function splits the samples based on `template#` in `meta_data`.

In [3]:
# Load all samples from DATA_PATH and print how many were read.
all_samples = read_synth_dataset(DATA_PATH)
print(len(all_samples))


1000


In [4]:
# Inspect the first sample: its full text, spans, tokens and tags.
all_samples[0]

Full text: X-Folder: \Vincent_Kaminski_Jun2001_9\Notes Folders\Audit
X-Origin: Kaminski-V
X-FileName: vkamins.nsf

Professor Darrel Duffie
24201 North Highway One

         May 8, 2000
Spans: [Type: LOCATION, value: 24201 North Highway One, start: 128, end: 151]
Tokens: [X, -, Folder, :, \Vincent_Kaminski_Jun2001_9\Notes, Folders\Audit, 
, X, -, Origin, :, Kaminski, -, V, 
, X, -, FileName, :, vkamins.nsf, 

, Professor, Darrel, Duffie, 
, 24201, North, Highway, One, 

         , May, 8, ,, 2000]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'L-LOCATION', 'O', 'O', 'O', 'O', 'O']

In [5]:
# Ratios passed to split_dataset; the two resulting sets are unpacked as train, test.
TRAIN_TEST_RATIOS = [0.7,0.3]
# Split the loaded samples into a train set and a test set.
train_data,test_data = split_dataset(all_samples, TRAIN_TEST_RATIOS)

In [6]:
# Inspect the first training sample produced by the split.
train_data[0]

Full text: Sally,

That tiki person is out according to mom....

Fabian and Mary Ann Block
550 15th Street, Denver
Barney and Joann Boyette
10400 Ridgeview Crt., Cupertino
Spans: [Type: LOCATION, value: 550 15th Street, Denver, start: 80, end: 103, Type: LOCATION, value: 10400 Ridgeview Crt., Cupertino, start: 129, end: 160]
Tokens: [Sally, ,, 

, That, tiki, person, is, out, according, to, mom, ...., 

, Fabian, and, Mary, Ann, Block, 
, 550, 15th, Street, ,, Denver, 
, Barney, and, Joann, Boyette, 
, 10400, Ridgeview, Crt, ., ,, Cupertino]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'L-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'L-LOCATION']

## CRF

In [7]:
# Convert both splits into token-level, CoNLL-style DataFrames (one row per token
# with its text, POS, tag, label and sentence id).
# NOTE: this rebinds train_data / test_data from the split output to DataFrames,
# so the split cell must be re-run before this cell is run again.
train_data = InputSample.create_conll_dataset(train_data, to_bio=False)
test_data = InputSample.create_conll_dataset(test_data, to_bio=False)

# Preview the first rows of the test frame.
test_data.head()

Unnamed: 0,text,pos,tag,label,sentence
0,Best,ADJ,JJ,O,0
1,Regards,PROPN,NNPS,O,0
2,",",PUNCT,",",O,0
3,\n,SPACE,_SP,O,0
4,Dave,PROPN,NNP,O,0


In [8]:
# Collapse the token-level frames into one entry per sentence: each entry is a
# list of [text, pos, label] rows for that sentence.
SENTENCE_FIELDS = ['text', 'pos', 'label']


def _rows_to_lists(sentence_rows):
    """Return the rows of one sentence group as a list of lists."""
    return sentence_rows.values.tolist()


train_sents = train_data.groupby('sentence')[SENTENCE_FIELDS].apply(_rows_to_lists)
test_sents = test_data.groupby('sentence')[SENTENCE_FIELDS].apply(_rows_to_lists)

In [9]:
# Show the feature dict that CRFModel.sent2features builds for the first token
# of training sentence 0.
CRFModel.sent2features(train_sents[0])[0]

{'bias': 1.0,
 'word.lower()': 'sally',
 'word[-3:]': 'lly',
 'word[-2:]': 'ly',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'postag': 'PROPN',
 'postag[:2]': 'PR',
 'BOS': True,
 '+1:word.lower()': ',',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'PUNCT',
 '+1:postag[:2]': 'PU'}

In [10]:
%time
X_train = [CRFModel.sent2features(s) for s in train_sents]
y_train = [CRFModel.sent2labels(s) for s in train_sents]

X_test = [CRFModel.sent2features(s) for s in test_sents]
y_test = [CRFModel.sent2labels(s) for s in test_sents]

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.53 µs


In [11]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 3.42 s, sys: 2.21 ms, total: 3.42 s
Wall time: 3.42 s




CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [12]:
# Persist the fitted CRF so it can be reloaded without retraining.
weights_path = "../model_weights/crf.pickle"
# open() does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
with open(weights_path, 'wb') as f:
    pickle.dump(crf, f, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Reload the pickled model from disk, replacing the in-memory `crf`.
# pickle.load can execute arbitrary code: only load pickle files you trust.
with open("../model_weights/crf.pickle", 'rb') as f:
    crf = pickle.load(f)

In [14]:
# Labels known to the fitted CRF; used below for the F1 score and the report.
labels = list(crf.classes_)
# NOTE(review): 'O' is kept (its removal is commented out), so the weighted F1
# computed from `labels` also counts the 'O' class.
# labels.remove('O')
labels

['O', 'B-GPE', 'I-GPE', 'L-GPE', 'U-GPE']

In [15]:
# Predict a tag sequence for every test sentence.
y_pred = crf.predict(X_test)
# Weighted-average F1 over the labels in `labels`; displayed as the cell output.
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9742526374999367

In [16]:
# Predict the tags of a single test sentence (index 5); it is wrapped in a list
# because predict is called with a list of sentences.
y_5_pred = crf.predict([X_test[5]])
# y_5_pred[0]

In [17]:
def _entity_then_prefix(name):
    """Sort key: the text after the first character, then the first character."""
    return name[1:], name[0]


# Group the tags by entity ('O' sorts first because its remainder is empty).
sorted_labels = sorted(labels, key=_entity_then_prefix)

# Per-label precision / recall / F1 for the test predictions.
report = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=sorted_labels,
    digits=3,
)
print(report)

              precision    recall  f1-score   support

           O      0.994     0.977     0.985     17487
       B-GPE      0.922     0.902     0.912       368
       I-GPE      0.883     0.977     0.928      2933
       L-GPE      0.880     0.878     0.879       368
       U-GPE      1.000     0.333     0.500         3

    accuracy                          0.974     21159
   macro avg      0.936     0.814     0.841     21159
weighted avg      0.975     0.974     0.974     21159





## Evaluate an existing spaCy trained model
* Using [this notebook](https://github.com/microsoft/presidio-research/blob/master/notebooks/models/Evaluate%20spacy%20models.ipynb)

In [18]:
import spacy

from collections import Counter

from presidio_evaluator.models import SpacyModel

from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.data_generator import read_synth_dataset
%reload_ext autoreload
%autoreload 2

synth_samples = read_synth_dataset(DATA_PATH)
print(len(synth_samples))
DATASET = synth_samples

1000


In [19]:
# Count how many spans of each entity type appear across the whole dataset.
entity_counter = Counter(
    span.entity_type
    for sample in DATASET
    for span in sample.spans
)

In [20]:
# Display the number of spans per entity type.
entity_counter

Counter({'LOCATION': 1250})

In [21]:
# Inspect the second sample of the dataset (text, spans, tokens and tags).
DATASET[1]

Full text: * Asia Pacific Energy - April 18 

Location and Accommodations: 

The Four Seasons Hotel 
19191 Vallco Pkwy , Cupertino  200 Forest Street
Tel.: +1 713 650 1300 
Fax: +1 713 650 1203 
* Please contact the hotel directly for room reservations.
Spans: [Type: LOCATION, value: 19191 Vallco Pkwy , Cupertino  200 Forest Street, start: 90, end: 138]
Tokens: [*, Asia, Pacific, Energy, -, April, 18, 

, Location, and, Accommodations, :, 

, The, Four, Seasons, Hotel, 
, 19191, Vallco, Pkwy, ,, Cupertino,  , 200, Forest, Street, 
, Tel, ., :, +1, 713, 650, 1300, 
, Fax, :, +1, 713, 650, 1203, 
, *, Please, contact, the, hotel, directly, for, room, reservations, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'L-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [22]:
# Token count of the longest sample in the dataset.
max(len(sample.tokens) for sample in DATASET)

332

In [23]:
# Names of the spaCy pipelines to evaluate; uncomment an entry to include it.
models = [
    "en_core_web_lg", 
    # "en_core_web_trf",
    ]

In [24]:
# Evaluate every listed spaCy pipeline on the full dataset and print its scores.
for model in models:
    print("-----------------------------------")
    print(f"Evaluating model {model}")

    # Load the pipeline and wrap it for the evaluator, restricted to 'GPE'
    # through entities_to_keep.
    nlp = spacy.load(model)
    spacy_model = SpacyModel(model=nlp, entities_to_keep=['GPE'])
    evaluator = Evaluator(model=spacy_model)

    # Run the model over every sample, then aggregate the results into scores.
    evaluation_results = evaluator.evaluate_all(DATASET)
    scores = evaluator.calculate_score(evaluation_results)

    print("Confusion matrix:")
    print(scores.results)

    print("Precision and recall")
    scores.print()

    # Rebound on each iteration: after the loop this holds the errors of the
    # last model evaluated.
    errors = scores.model_errors

-----------------------------------
Evaluating model en_core_web_lg


Evaluating <class 'presidio_evaluator.evaluation.evaluator.Evaluator'>:   1%|          | 11/1000 [00:00<00:09, 104.06it/s]

Translating entites using this dictionary: {'ORGANIZATION': 'ORG', 'COUNTRY': 'GPE', 'CITY': 'GPE', 'LOCATION': 'GPE', 'PERSON': 'PERSON', 'FIRST_NAME': 'PERSON', 'LAST_NAME': 'PERSON', 'NATION_MAN': 'GPE', 'NATION_WOMAN': 'GPE', 'NATION_PLURAL': 'GPE', 'NATIONALITY': 'GPE', 'GPE': 'GPE', 'ORG': 'ORG'}


Evaluating <class 'presidio_evaluator.evaluation.evaluator.Evaluator'>: 100%|██████████| 1000/1000 [00:10<00:00, 94.21it/s]

Confusion matrix:
Counter({('O', 'O'): 61596, ('GPE', 'O'): 10536, ('GPE', 'GPE'): 1536, ('O', 'GPE'): 468})
Precision and recall
                        Entity                     Precision                        Recall
                           GPE                        76.65%                        12.72%
                           PII                        76.65%                        12.72%
PII F measure: 0.14377566039197456



