# Visualizing essays

This notebook provides a convenient way to browse essays with their labelled PII spans highlighted, color-coded by label.

In [2]:
import json
import os
import random

import gradio as gr
import spacy
from spacy import displacy
from spacy.tokens import Span

# download spacy model ---
# os.system('python -m spacy download en_core_web_sm')

# set up colors for PII types ---
# displaCy render options: maps each entity label to its highlight color.
options = {
    "colors": {
        "NAME_STUDENT": "#6EB5FF",    # Lighter blue
        "EMAIL": "#42D4B5",           # Light teal
        "USERNAME": "#D8B4E2",        # Light lavender
        "ID_NUM": "#7AE88F",          # Light green
        "PHONE_NUM": "#FFB87D",       # Light peach
        "URL_PERSONAL": "#C9B4E2",    # Pale purple
        "STREET_ADDRESS": "#B4B77F"   # Light olive
    }
}


# load data ---
# `data` maps a subset name to its collection of samples; render_sample()
# indexes it by subset name and draws samples from the result.
# JSON is read explicitly as UTF-8 so loading does not depend on the locale.
with open("../../data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# Additional subsets, each stored in its own JSON file.
# NOTE(review): these are absolute, machine-specific paths; consider making
# them relative to a configurable data directory.
EXTRA_SUBSET_PATHS = {
    "mixtral-v1a": "/drive2/kaggle/pii-dd/data/mixtral-v1a.json",
    "mixtral-v2": "/drive2/kaggle/pii-dd/data/mixtral-v2.json",
    "mixtral-v3": "/drive2/kaggle/pii-dd/data/mixtral-v3.json",
}
for _subset_name, _subset_path in EXTRA_SUBSET_PATHS.items():
    # `with` closes each file handle; the original left them open.
    with open(_subset_path, encoding="utf-8") as f:
        data[_subset_name] = json.load(f)

# Choices for the "PII Focus" dropdown: every colored label plus "Random",
# which render_sample() treats as "no filtering".
pii_types = list(options["colors"].keys())
pii_types.append("Random")

# Choices for the "Subset" dropdown, in insertion order of `data`.
subsets = list(data.keys())

nlp = spacy.load("en_core_web_sm")
# render sample ---


def _bio_to_spans(labels):
    """Group a sequence of BIO labels into ``(start, end, type)`` token spans.

    ``start`` is inclusive and ``end`` exclusive. A ``B-`` label opens a new
    entity (closing any open one). An ``I-`` label extends the open entity
    whatever type it names, and is ignored when no entity is open. Any other
    label closes the open entity. The span type is taken from the opening
    label with its two-character prefix stripped.
    """
    spans = []
    start = None  # index of the token that opened the current entity, if any

    for index, label in enumerate(labels):
        if label.startswith('B-'):
            if start is not None:  # a new entity begins right after another
                spans.append((start, index, labels[start][2:]))
            start = index
        elif label.startswith('I-') and start is not None:
            continue  # still inside the open entity
        elif start is not None:
            spans.append((start, index, labels[start][2:]))
            start = None

    # Close an entity that runs to the end of the sequence.
    if start is not None:
        spans.append((start, len(labels), labels[start][2:]))

    return spans


def render_sample(subset, pii_type, doc_id=-1):
    """Select one sample from ``data[subset]`` and render its labelled spans.

    Args:
        subset: Key into the module-level ``data`` dict.
        pii_type: ``"Random"`` to draw from all samples; otherwise only
            samples whose ``'piis'`` entry contains this value are eligible.
        doc_id: ``-1`` to draw a random sample. Any other value selects the
            sample whose ``'document'`` field equals it, and ``pii_type`` is
            then ignored.

    Returns:
        A tuple ``({'document': <document id>}, html)`` where ``html`` is the
        displaCy entity rendering of the sample.

    Raises:
        gr.Error: If ``doc_id`` matches no sample in the subset, or if no
            sample in the subset is eligible for ``pii_type``.
    """
    candidates = data[subset]

    # Presumably the Number input passes None when the field is cleared;
    # treat that like the "no specific document" sentinel.
    if doc_id is None:
        doc_id = -1

    if doc_id != -1:
        sample = next((d for d in candidates if d['document'] == doc_id), None)
        if sample is None:
            raise gr.Error(f"Document {doc_id} not found in subset '{subset}'.")
    else:
        if pii_type == "Random":
            pool = candidates
        else:
            # Filter up front instead of rejection sampling, which never
            # terminates when no sample contains the requested type.
            pool = [d for d in candidates if pii_type in (d.get('piis') or ())]
        if not pool:
            raise gr.Error(f"No samples with PII type '{pii_type}' in subset '{subset}'.")
        sample = random.choice(pool)

    # Rebuild the document from the stored tokens and whitespace flags so that
    # token indices line up with the label sequence.
    doc = spacy.tokens.Doc(nlp.vocab, words=sample['tokens'], spaces=sample['trailing_whitespace'])
    doc.ents = [
        Span(doc, start, end, label)
        for start, end, label in _bio_to_spans(sample['labels'])
    ]

    output = displacy.render(doc, style="ent", jupyter=False, options=options)
    return {'document': sample['document']}, output


# app layout & callback ---
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
with gr.Blocks() as demo:
    # Controls, laid out side by side: subset, PII type filter, document id.
    with gr.Row():
        subset_dropdown = gr.Dropdown(
            choices=subsets, value=subsets[0], label="Subset", info="Select data subset..."
        )
        focus_pii = gr.Dropdown(
            choices=pii_types, value="Random", label="PII Focus", info="Select a PII type to focus on..."
        )
        doc_id_input = gr.Number(
            value=-1, label="Document ID", info="Enter a document ID to focus on..."
        )

    # Trigger button, followed by the two outputs of render_sample.
    sample_btn = gr.Button(value="Sample")
    document_id_display = gr.JSON(label="Document ID")
    sample_display = gr.HTML(label="Example")

    # callback ---
    # Clicking the button passes the three control values to render_sample
    # and routes its (dict, html) return pair to the JSON and HTML components.
    sample_btn.click(
        fn=render_sample,
        inputs=[subset_dropdown, focus_pii, doc_id_input],
        outputs=[document_id_display, sample_display],
    )

# launch app ----
demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




In [4]:
# Scratch check: length of this one-character string literal.
# NOTE(review): the character shows up as a replacement character here; it may
# be an encoding artifact of the original cell -- confirm what was intended.
len('�')

1

In [7]:
# from spacy.lang.en import English

# tokenizer = English().tokenizer
# [[str(x)] for x in tokenizer("Hello there \n\n good\n\nsir")]
# [['Hello'], ['there'], ['\n\n '], ['good'], ['\n\n'], ['sir']]

In [17]:
# Look at a single token of the sample `d`.
# NOTE: `d` is assigned in the `In [3]` cell further down in this notebook;
# on a fresh kernel that cell has to run before this one.
d["tokens"][562]

'Kolusu'

In [5]:
# Token 11 of `d` together with its trailing-whitespace flag.
# NOTE: relies on `d` from the `In [3]` cell further down in this notebook.
d["tokens"][11], d["trailing_whitespace"][11]

('\xa0', False)

In [6]:
# First 200 characters of the sample's raw text, to compare with its tokens.
# NOTE: relies on `d` from the `In [3]` cell further down in this notebook.
d["full_text"][:200]

'Assignment:\xa0 Visualization\xa0Reflection\xa0 Submitted\xa0by:\xa0Nadine Born\xa0 Course:\xa0 Design\xa0Thinking\xa0for\xa0Innovation\xa0 \xa0 Trail\xa0Challenge:\xa0To\xa0Build\xa0or\xa0Not\xa0to\xa0Build\xa0 \xa0 An\xa0environmental\xa0charity\xa0wanted\xa0to\xa0conduct\xa0a\xa0f'

In [3]:
# Pick the sample with document id 56 from the "competition" subset; the
# scratch cells above inspect this `d`. Raises IndexError if there is no match.
d = [sample for sample in data["competition"] if sample["document"] == 56][0]

# Dump the full token list of that sample.
print(d["tokens"])

['Assignment', ':', '\xa0 ', 'Visualization', '\xa0', 'Reflection', '\xa0 ', 'Submitted', '\xa0', 'by', ':', '\xa0', 'Nadine', 'Born', '\xa0 ', 'Course', ':', '\xa0 ', 'Design', '\xa0', 'Thinking', '\xa0', 'for', '\xa0', 'Innovation', '\xa0 \xa0 ', 'Trail', '\xa0', 'Challenge', ':', '\xa0', 'To', '\xa0', 'Build', '\xa0', 'or', '\xa0', 'Not', '\xa0', 'to', '\xa0', 'Build', '\xa0 \xa0 ', 'An', '\xa0', 'environmental', '\xa0', 'charity', '\xa0', 'wanted', '\xa0', 'to', '\xa0', 'conduct', '\xa0', 'a', '\xa0', 'fundraising', '\xa0', 'campaign', '\xa0', 'to', '\xa0', 'raise', '\xa0', '$', '4', '\xa0', 'million', '\xa0', 'to', '\xa0', 'build', '\xa0', 'a', '\xa0 ', 'public', '\xa0', 'path', '\xa0', 'in', '\xa0', 'a', '\xa0', 'busy', '\xa0', 'tourist', '\xa0', 'area', '\xa0', 'of', '\xa0', 'a', '\xa0', 'small', '\xa0', 'town', '\xa0', 'in', '\xa0', 'British', '\xa0', 'Columbia', ',', '\xa0', 'Canada', '.', '\xa0', 'They', '\xa0', 'had', '\xa0', 'been', '\xa0', 'gifted', '\xa0', 'a', '\xa0 ', '