After training a name-detection model on Few-NERD, I ran it over the full training set. This notebook explores those predictions.

This is primarily to find examples that contain names that are not considered PII.

In [9]:
import json

# Load the training documents. The context manager closes the file handle
# deterministically, and the explicit encoding avoids depending on the
# platform default.
with open("../data/train.json", encoding="utf-8") as f:
    data = json.load(f)

# Document ids whose label sequence holds more than one distinct value.
# NOTE(review): presumably this means "O" plus at least one PII tag; a document
# labelled entirely with a single PII tag would be missed — confirm.
docs_with_pii = [d["document"] for d in data if len(set(d["labels"])) > 1]

len(data), len(docs_with_pii)

(6807, 945)

In [10]:
import pandas as pd


# Token-level predictions written by the inference run.
preds = pd.read_csv("../inference/outputs/fewnerd-d3b/preds.csv")

# Per-token helper features: character length and whether the token starts
# with an upper-case letter. `x[:1]` (rather than `x[0]`) makes an empty token
# evaluate to False instead of raising IndexError.
preds["text_len"] = [len(x) for x in preds.token_text]
preds["titled"] = [x[:1].isupper() for x in preds.token_text]

print(preds.document.nunique())
print(len(preds))
print(preds.text_len.mean(), preds.text_len.median())
print(preds.titled.value_counts())

# Predicted tokens that are longer than 3 characters, capitalised, and belong
# to documents outside `docs_with_pii`. All three conditions are combined with
# `&`; the previous mix of `*` and `&` only worked through operator precedence
# and boolean multiplication.
print(
    (
        (preds.text_len > 3)
        & preds.titled
        & ~preds.document.isin(set(docs_with_pii))
    ).sum()
)

preds.head()

2904
12331
5.179547481956046 5.0
True     10346
False     1985
Name: titled, dtype: int64
5082


Unnamed: 0,row_id,document,token,label,token_text,text_len,titled
0,0,7,9,PERSON,Nathalie,8,True
1,1,7,10,PERSON,Sylla,5,True
2,2,7,52,PERSON,Buzan,5,True
3,3,7,53,PERSON,T.,2,True
4,4,7,55,PERSON,Buzan,5,True


In [17]:
import json
import os
import random

import gradio as gr
import spacy
from spacy import displacy
from spacy.tokens import Span

# spaCy model ---
# Install once if missing: `python -m spacy download en_core_web_sm`

# Highlight colour per entity label, handed to displaCy through `options`.
options = {
    "colors": {
        "PERSON": "#6EB5FF",          # lighter blue
        "EMAIL": "#42D4B5",           # light teal
        "USERNAME": "#D8B4E2",        # light lavender
        "ID_NUM": "#7AE88F",          # light green
        "PHONE_NUM": "#FFB87D",       # light peach
        "URL_PERSONAL": "#C9B4E2",    # pale purple
        "STREET_ADDRESS": "#B4B77F",  # light olive
    }
}

# Dropdown choices: every coloured label plus a catch-all "Random" entry.
pii_types = [*options["colors"], "Random"]

# Only the vocab of this pipeline is needed to build Doc objects later on.
nlp = spacy.load("en_core_web_sm")
# render sample ---


def render_sample(pii_type, doc_id=-1):
    """Render one document with the model's predicted name tokens highlighted.

    Args:
        pii_type: "Random" to accept any eligible document, otherwise a value
            that must appear in the document's ``'piis'`` entry.
            NOTE(review): the schema of ``'piis'`` is not visible in this
            notebook — confirm it exists in the loaded data.
        doc_id: Document id to render. ``-1`` (or ``None``) means "pick a
            random eligible document". An eligible document has every gold
            label equal to "O" and at least one row in ``preds``.

    Returns:
        A ``(info, html)`` tuple: ``info`` is ``{'document': <id>}`` and
        ``html`` is the displaCy entity markup (or a short message when no
        document could be selected).
    """
    if doc_id is not None and doc_id != -1:
        # Explicit lookup: no random draw needed.
        matches = [d for d in data if d['document'] == doc_id]
        if not matches:
            return {'document': doc_id}, f"<p>No document with id {doc_id} found.</p>"
        sample = matches[0]
    else:
        # Build the eligible pool up front instead of rejection-sampling in an
        # unbounded loop, which could never terminate when nothing matches.
        docs_with_preds = set(preds['document'].unique().tolist())
        candidates = [
            d for d in data
            if d['document'] in docs_with_preds
            and all(label == "O" for label in d['labels'])
            and (pii_type == "Random" or pii_type in d.get('piis', ()))
        ]
        if not candidates:
            return {'document': None}, f"<p>No sample matches PII type {pii_type!r}.</p>"
        sample = random.choice(candidates)

    # Always take the predictions of the document actually being rendered;
    # previously an explicit doc_id kept the predictions of an unrelated
    # randomly drawn document.
    p = preds[preds['document'] == sample['document']]

    print("---" * 10)
    print(sample['document'])
    print("---" * 10)
    print(p)
    # render
    doc = spacy.tokens.Doc(nlp.vocab, words=sample['tokens'], spaces=sample['trailing_whitespace'])

    print(p["token"])

    # Hoisted out of the loop: set membership instead of rebuilding a list
    # for every token.
    predicted_tokens = set(p["token"].tolist())

    # Merge consecutive predicted tokens whose gold label is not a NAME tag
    # into one span each.
    ents = []
    start = None  # index of the first token of the open run, if any
    for index, label in enumerate(sample['labels']):
        if index in predicted_tokens and "NAME" not in label:
            print(index)
            if start is None:
                start = index
        elif start is not None:
            ents.append(Span(doc, start, index, "PERSON"))
            start = None

    # A run that reaches the last token ends at the end of the label
    # sequence, not one token after its start.
    if start is not None:
        ents.append(Span(doc, start, len(sample['labels']), "PERSON"))

    doc.ents = ents
    output = displacy.render(doc, style="ent", jupyter=False, options=options)
    print(ents)
    return {'document': sample['document']}, output


# app layout & callback ---
# (Optional theming: gr.Blocks(theme=gr.themes.Soft()))
demo = gr.Blocks()

with demo:
    # Top row: the two inputs that drive sampling.
    with gr.Row():
        focus_pii = gr.Dropdown(
            choices=pii_types,
            value="Random",
            label="PII Focus",
            info="Select a PII type to focus on...",
        )
        doc_id_input = gr.Number(
            value=-1,
            label="Document ID",
            info="Enter a document ID to focus on...",
        )

    # Trigger plus the two outputs filled by render_sample.
    sample_btn = gr.Button("Sample")
    document_id_display = gr.JSON(label="Document ID")
    sample_display = gr.HTML(label="Example")

    # callback ---
    callback_inputs = [focus_pii, doc_id_input]
    callback_outputs = [document_id_display, sample_display]
    sample_btn.click(
        fn=render_sample,
        inputs=callback_inputs,
        outputs=callback_outputs,
    )

# launch app ----
demo.launch()

Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.




------------------------------
9286
------------------------------
      row_id  document  token   label token_text
3404    3404      9286    371  PERSON    Prakash
3405    3405      9286    572  PERSON    Prakash
3406    3406      9286    605  PERSON    Prakash
3407    3407      9286    725  PERSON    Prakash
3404    371
3405    572
3406    605
3407    725
Name: token, dtype: int64
[]




------------------------------
7858
------------------------------
      row_id  document  token   label  token_text
2500    2500      7858      5  PERSON     Mickael
2501    2501      7858      6  PERSON  Richardson
2502    2502      7858    857  PERSON     Mickael
2503    2503      7858    858  PERSON  Richardson
2504    2504      7858   1450  PERSON  Volunteers
2500       5
2501       6
2502     857
2503     858
2504    1450
Name: token, dtype: int64
1450
[Volunteers]
