# Plotting NER Datasets

In this notebook, we'll try to plot the following characteristics:
- Span frequency as a function of span length
- Boundary Distinctiveness (BD) versus Span Distinctiveness (SD)

There are only a handful of values, so I type the data in directly instead of loading it.

In [1]:
import pandas as pd
import altair as alt

In [2]:
# Hand-entered statistics, keyed by dataset name.
# Each value is a 4-item list in the order [length, frequency, sd, bd],
# matching the column names supplied when the DataFrame is built.
data = dict(
    riqua=[7.03, 4026, 1.65, 1.16],
    parc=[7.89, 16840, 1.34, 1.43],
    conll00=[1.55, 37168, 1.27, 0.44],
    conll03=[1.34, 5874, 2.79, 1.06],
    ontonotes=[1.62, 16861, 3.35, 1.00],
    ebmnlp=[3.65, 21788, 0.71, 0.59],
)

In [3]:
# Turn the {dataset: [values]} mapping into a frame with one row per dataset.
# Naming the index "dataset" before resetting it makes the dataset names land
# in a regular column called "dataset" with a fresh 0..n-1 index.
df = (
    pd.DataFrame.from_dict(
        data,
        orient="index",
        columns=["length", "frequency", "sd", "bd"],
    )
    .rename_axis("dataset")
    .reset_index()
)
df

Unnamed: 0,dataset,length,frequency,sd,bd
0,riqua,7.03,4026,1.65,1.16
1,parc,7.89,16840,1.34,1.43
2,conll00,1.55,37168,1.27,0.44
3,conll03,1.34,5874,2.79,1.06
4,ontonotes,1.62,16861,3.35,1.0
5,ebmnlp,3.65,21788,0.71,0.59


In [25]:
# Scatter plot of Boundary Distinctiveness (y) against Span Distinctiveness (x),
# one circle per row of `df`; the tooltip shows every column of that row.
#
# Order of the config calls matters: `.configure(...)` installs a brand-new
# config object, so it has to run before the `configure_*` helpers. With the
# previous order (`configure_axis` first) the `grid=False` setting was
# discarded and grid lines were still drawn.
chart = (
    alt.Chart(df)
    .mark_circle(size=180)
    .encode(
        x=alt.X("sd", axis=alt.Axis(titleFontSize=12, title="Span Distinctiveness (SD)")),
        y=alt.Y("bd", axis=alt.Axis(titleFontSize=12, title="Boundary Distinctiveness (BD)")),
        tooltip=["dataset", "length", "frequency", "sd", "bd"],
    )
    .interactive()
    .configure(background="#FFFFF8")
    .configure_axis(grid=False)
    .configure_mark(color="#A00000")
)
# Export a standalone HTML copy of the chart.
chart.save("bd_vs_sd.html")

In [24]:
# Bare last expression: the notebook renders the current `chart` object as this cell's output.
chart

In [27]:
# Scatter plot of span frequency (y) against span length (x), one circle per
# row of `df`; the tooltip shows every column of that row.
#
# Order of the config calls matters: `.configure(...)` installs a brand-new
# config object, so it has to run before the `configure_*` helpers. With the
# previous order (`configure_axis` first) the `grid=False` setting was
# discarded and grid lines were still drawn.
chart = (
    alt.Chart(df)
    .mark_circle(size=180)
    .encode(
        x=alt.X("length", axis=alt.Axis(titleFontSize=12, title="Span Length")),
        y=alt.Y("frequency", axis=alt.Axis(titleFontSize=12, title="Span Frequency")),
        tooltip=["dataset", "length", "frequency", "sd", "bd"],
    )
    .interactive()
    .configure(background="#FFFFF8")
    .configure_axis(grid=False)
    .configure_mark(color="#A00000")
)
# Export a standalone HTML copy of the chart.
chart.save("length_vs_freq.html")

In [28]:
# Bare last expression: the notebook renders the current `chart` object as this cell's output.
chart

In [6]:
# Emit the frame as a Markdown table (row index omitted) so it can be pasted into a document.
markdown_table = df.to_markdown(index=False)
print(markdown_table)

| dataset   |   length |   frequency |   sd |   bd |
|:----------|---------:|------------:|-----:|-----:|
| riqua     |     7.03 |        4026 | 1.65 | 1.16 |
| parc      |     7.89 |       16840 | 1.34 | 1.43 |
| conll00   |     1.55 |       37168 | 1.27 | 0.44 |
| conll03   |     1.34 |        5874 | 2.79 | 1.06 |
| ontonotes |     1.62 |       16861 | 3.35 | 1    |
| ebmnlp    |     3.65 |       21788 | 0.71 | 0.59 |
