CRF trainer using the sklearn_crfsuite package (Python wrapper for CRFSuite): https://sklearn-crfsuite.readthedocs.io/en/latest/

In [None]:
import os
import pickle
from collections import Counter

import sklearn_crfsuite
from sklearn_crfsuite import metrics

from presidio_evaluator import InputSample
from presidio_evaluator.models.crf_model import CRFModel

In [None]:
# Date suffix selecting which dataset files are read below
# (../../data/train_<DATA_DATE>.json and ../../data/test_<DATA_DATE>.json).
DATA_DATE = "Jan-15-2022"

Load the datasets used for training and testing:

In [None]:
# Build the train / test file names for the configured dataset date,
# then read each dataset file.
train_path = "../../data/train_{}.json".format(DATA_DATE)
test_path = "../../data/test_{}.json".format(DATA_DATE)

train_samples = InputSample.read_dataset_json(train_path)
test_samples = InputSample.read_dataset_json(test_path)

In [None]:
# Training uses only samples that carry at least one annotated span;
# the test set is left unfiltered.
train_tagged = list(filter(lambda sample: len(sample.spans) > 0, train_samples))
kept_message = "Kept {} train samples after removal of non-tagged samples"
print(kept_message.format(len(train_tagged)))

# Convert both sets to CoNLL datasets and preview the test one
train_data = InputSample.create_conll_dataset(train_tagged)
test_data = InputSample.create_conll_dataset(test_samples)
test_data.head()

In [None]:
# Collapse the token-level frames into one entry per sentence, where each
# entry is a list of [text, pos, label] rows.
SENTENCE_COLUMNS = ["text", "pos", "label"]


def group_tokens_by_sentence(conll_df):
    """Group rows by the "sentence" column and return each group's rows as a list of lists."""
    return conll_df.groupby("sentence")[SENTENCE_COLUMNS].apply(
        lambda rows: rows.values.tolist()
    )


train_sents = group_tokens_by_sentence(train_data)
test_sents = group_tokens_by_sentence(test_data)

Create features for CRF

In [None]:
# Preview the first feature entry produced for the first training sentence.
# .iloc selects by position, so this works whatever sentence ids the groupby
# index holds (plain [0] would look up the index label 0 instead).
CRFModel.sent2features(train_sents.iloc[0])[0]

In [None]:
%%time
X_train = [CRFModel.sent2features(s) for s in train_sents]
y_train = [CRFModel.sent2labels(s) for s in train_sents]

X_test = [CRFModel.sent2features(s) for s in test_sents]
y_test = [CRFModel.sent2labels(s) for s in test_sents]

In [None]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm="lbfgs", c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True
)
crf.fit(X_train, y_train)

Save trained model to pickle

In [None]:
# Persist the fitted CRF so it can be reloaded later without retraining.
# (`os` and `pickle` are imported in the notebook's import cell at the top.)

# Create the output directory on first use; exist_ok makes re-running the cell safe
os.makedirs("../../models/", exist_ok=True)

with open("../../models/crf.pickle", "wb") as model_file:
    pickle.dump(crf, model_file, protocol=pickle.HIGHEST_PROTOCOL)

Load the saved model

In [None]:
# Reload the pickled CRF from disk, replacing the in-memory `crf`.
# NOTE: only unpickle files you trust; unpickling can execute arbitrary code.
with open("../../models/crf.pickle", "rb") as model_file:
    crf = pickle.load(model_file)

Extract info and predictions from model

In [None]:
# Keep every class the model learned except the "O" tag, so the metrics below
# are computed over the remaining labels only. Filtering (instead of
# list.remove) avoids a ValueError when "O" is not among the model's classes.
labels = [label for label in crf.classes_ if label != "O"]
labels

In [None]:
# Predict tag sequences for the whole test set, then compute a weighted F1
# restricted to `labels` (which leaves out the "O" tag).
y_pred = crf.predict(X_test)
metrics.flat_f1_score(
    y_test,
    y_pred,
    labels=labels,
    average="weighted",
)

In [None]:
# Predict a single test sentence: the sentence is wrapped in a one-element
# list for `predict`, and the one result is unwrapped again for display.
single_sentence_batch = [X_test[5]]
y_5_pred = crf.predict(single_sentence_batch)
y_5_pred[0]

In [None]:
def entity_then_prefix(tag):
    """Sort key: everything after the first character, then the first character."""
    return tag[1:], tag[0]


# Order labels so the B and I tags sharing the same remainder sit next to each other
sorted_labels = sorted(labels, key=entity_then_prefix)
report = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
)
print(report)