# Setup 

In [None]:
# Directory that holds the input data; train.json and test.json are read from it below.
SYS_INPUT_DIR = '/kaggle/input/pii-detection-removal-from-educational-data'

In [None]:
import os
import numpy as np
import warnings
import pandas as pd

# Silence pandas' SettingWithCopyWarning for the rest of the notebook
warnings.filterwarnings(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Show every column and allow wide rows when DataFrames are displayed
pd.options.display.max_columns = None
pd.options.display.width = 1000

# Read train data

In [None]:
import json
# Open the file in a context manager so the handle is closed as soon as
# parsing finishes (json.load(open(...)) leaves it open until garbage
# collection). JSON is read explicitly as UTF-8 instead of the platform default.
with open(os.path.join(SYS_INPUT_DIR, "train.json"), encoding="utf-8") as train_file:
    train_json = json.load(train_file)

# Flatten the parsed JSON into a DataFrame
train = pd.json_normalize(train_json)

In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# Sanity check: the number of distinct document ids must equal the row count,
# i.e. no document appears on more than one row
assert train.shape[0] == train['document'].nunique()

In [None]:
# Every NER label used in this notebook: for each entity type, the "B-" tag
# immediately followed by the matching "I-" tag.
full_ner_labels = [
    f'{prefix}-{entity_type}'
    for entity_type in (
        'NAME_STUDENT',
        'URL_PERSONAL',
        'ID_NUM',
        'EMAIL',
        'STREET_ADDRESS',
        'PHONE_NUM',
        'USERNAME',
    )
    for prefix in ('B', 'I')
]

# Calculate stats

In [None]:
# Work on a copy so the raw `train` frame stays untouched and this cell can be re-run
train_eda = train.copy()

# Find documents with a high number of entities
train_eda['ner_labels'] = train_eda['labels'].apply(lambda labels: [label for label in labels if label != 'O'])
train_eda['count_ner_labels'] = train_eda['ner_labels'].apply(len)
train_eda['count_distinct_ner_labels'] = train_eda['ner_labels'].apply(lambda labels: len(set(labels)))
train_eda = train_eda.sort_values(by='count_distinct_ner_labels', ascending=False)

# One row per entity token, one-hot encoded by label. reset_index() keeps the
# originating document's row label in an "index" column so the dummies can be
# summed back per document below.
exploded_df = train_eda['ner_labels'].explode()
dummies = pd.get_dummies(exploded_df).reset_index()

# Order the label columns from most to least frequent. Filtering against
# full_ner_labels drops the helper "index" column and any label that is not in
# the known list; such a label would otherwise raise a KeyError after the
# reindex below.
frequency = dummies.sum().sort_values(ascending=False)
ordered_columns = [column for column in frequency.index if column in full_ner_labels]

# Per-document count of every known label (0 where a label never occurs)
counted = dummies.groupby('index').sum()
counted = counted.reindex(columns=full_ner_labels, fill_value=0)
counted = counted[ordered_columns + [i for i in full_ner_labels if i not in ordered_columns]]

# Attach the per-label counts to each document (joined on the row index)
train_eda = train_eda.join(counted)

# Explore most interesting documents

Interesting documents are those with many distinct NER labels

In [None]:
# train_eda is sorted by number of distinct labels, so the first rows are the
# most varied documents. Take an explicit copy: a column is added to this
# sample later, and that write should not target a slice of train_eda.
sample_train = train_eda.head(3).copy()

## Display functions

In [None]:
from bs4 import BeautifulSoup

# CSS color used for each NER label when rendering documents as HTML.
# Every value must be a valid CSS color string, because it is interpolated
# verbatim into `style` attributes.
ner_colors = {
    'B-NAME_STUDENT': 'rgb(255, 179, 179)',
    'I-NAME_STUDENT': 'rgb(139, 0, 0)',
    'B-URL_PERSONAL': 'rgb(255, 223, 179)',
    'I-URL_PERSONAL': 'rgb(255, 140, 0)',
    # Fixed: was 'rgb((255, 255, 179)' (doubled parenthesis), which is not a valid CSS color
    'B-ID_NUM': 'rgb(255, 255, 179)',
    'I-ID_NUM': 'rgb(204, 204, 0)',
    'B-EMAIL': 'rgb(179, 255, 179)',
    'I-EMAIL': 'rgb(0, 100, 0)',
    'B-STREET_ADDRESS': 'rgb(179, 223, 255)',
    'I-STREET_ADDRESS': 'rgb(0, 0, 139)',
    'B-PHONE_NUM': 'rgb(223, 179, 255)',
    'I-PHONE_NUM': 'rgb(75, 0, 130)',
    'B-USERNAME': 'rgb(255, 179, 223)',
    'I-USERNAME': 'rgb(231, 84, 128)',
}

def generate_legend():
    """Return the HTML legend for the entity colors.

    The legend is a ``<div class="legend">`` holding one ``<span>`` per entry
    of ``ner_colors``; each span shows the label name as text, drawn in that
    label's color.
    """
    legend_entries = ''.join(
        f'<span style="color: {color}; margin-right: 10px;">{entity}</span>'
        for entity, color in ner_colors.items()
    )
    return '<div class="legend" style="margin-bottom: 10px;">' + legend_entries + '</div>'

def highlight_entities(text, tokens, labels):
    """Render a document as HTML with its labelled tokens highlighted.

    Tokens are located in ``text`` from left to right. Tokens labelled 'O'
    are emitted as plain text; any other token is wrapped in a colored
    ``<span>`` followed by a bold ``[LABEL]`` marker. The output starts with
    the legend from ``generate_legend()``.

    Parameters:
        text: full document text.
        tokens: token strings, in document order.
        labels: one label per token ('O' for tokens that are not entities).

    Returns:
        An HTML string in which every newline is replaced by ``<br/>``.
    """
    soup = BeautifulSoup(generate_legend(), 'html.parser')  # Start with the legend

    last_idx = 0
    for token, label in zip(tokens, labels):
        start = text.find(token, last_idx)
        if start == -1:
            # Token not found after the current position: skip it and keep the cursor
            continue
        end = start + len(token)

        # Text between the previous token and this one is document content,
        # not markup. Append it as a plain string so '<', '>' and '&' are
        # escaped on output instead of being parsed as HTML.
        if start > last_idx:
            soup.append(text[last_idx:start])

        if label != 'O':
            color = ner_colors.get(label, "black")

            token_span = soup.new_tag('span', style=f'background-color: {color}; font-family: Tahoma; padding: 0 2px; border-radius: 3px;')
            token_span.string = token
            soup.append(token_span)

            label_span = soup.new_tag('span', style=f'background-color: {color}; font-family: Tahoma; font-weight: bold; padding: 0 2px; border-radius: 3px;')
            label_span.string = f" [{label}]"
            soup.append(label_span)
        else:
            soup.append(token)

        last_idx = end

    # Remaining text after the last located token, again as plain (escaped) text
    if last_idx < len(text):
        soup.append(text[last_idx:])
    return str(soup).replace('\n', '<br/>')

def label_color(label):
    """Return the CSS color registered for ``label`` in ``ner_colors``, or 'black' if there is none."""
    default_color = 'black'
    return ner_colors.get(label, default_color)

## Display top interesting documents

In [None]:
# Render every sampled document to an HTML string with its entities highlighted
html_per_document = sample_train.apply(
    lambda doc: highlight_entities(doc['full_text'], doc['tokens'], doc['labels']),
    axis=1,
)
sample_train.loc[:, 'html'] = html_per_document

In [None]:
from IPython.display import display, HTML

# Show each rendered document as rich HTML output
for document_html in sample_train['html']:
    rendered = HTML(document_html)
    display(rendered)

You can adjust the coloring in the `ner_colors` dictionary.

# NER labels data distribution

## Data

In [None]:
import pandas 

num_documents = len(train_eda)

# Long format: one row per (document, label) pair holding that label's count
ner_labels_data = train_eda[full_ner_labels].melt(var_name='ner_label', value_name='count')

# Per label: number of documents containing it, and total number of tagged tokens
ner_labels_stat = (
    ner_labels_data
    .groupby('ner_label')
    .agg(
        doc_count=('count', lambda counts: (counts > 0).sum()),
        ner_count=('count', 'sum'),
    )
    .reset_index()
)

# Shares are stored as fractions in [0, 1], rounded to 4 decimals
total_ner_count = ner_labels_stat['ner_count'].sum()
ner_labels_stat['doc_count_percentage'] = np.round(ner_labels_stat['doc_count'] / num_documents, 4)
ner_labels_stat['ner_count_percentage'] = np.round(ner_labels_stat['ner_count'] / total_ner_count, 4)

ner_labels_stat = ner_labels_stat.sort_values('doc_count', ascending=False)

In [None]:
# Display the per-label statistics table (sorted by doc_count, descending)
ner_labels_stat

## Plot function

In [None]:
def plot_ner_distribution(ner_labels_stat, count_col, percentage_col):
    """Plot one bar per NER label plus the matching share as a line on a twin axis.

    Parameters:
        ner_labels_stat: DataFrame with an 'ner_label' column and the two
            columns named by ``count_col`` and ``percentage_col``.
        count_col: name of the column drawn as bars (left y-axis).
        percentage_col: name of the column drawn as a red line (right y-axis).

    Returns:
        None. The figure is created as a side effect.
    """
    # Imported inside the function, as before, so it does not depend on
    # imports made in later cells.
    import numpy as np
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(16, 8))

    labels = ner_labels_stat['ner_label']
    unique_labels = labels.unique()
    # hsv is a cyclic colormap: sampling both 0 and 1 would give the first and
    # last label the same red, so the endpoint is excluded.
    colors = plt.cm.hsv(np.linspace(0, 1, len(unique_labels), endpoint=False))
    color_dict = dict(zip(unique_labels, colors))

    ax.bar(labels, ner_labels_stat[count_col], color=[color_dict[label] for label in labels])

    # Titles and labels are set on the primary axes explicitly. Setting them
    # through pyplot after twinx() would target the twin axes, whose x-axis is
    # hidden, so the x label would never be drawn.
    ax.set_title('Count / Percentage of NER Labels')
    ax.set_xlabel('NER Label')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', labelrotation=45)

    # Secondary y-axis for the percentage line
    sec_axis = ax.twinx()
    sec_axis.plot(labels, ner_labels_stat[percentage_col], color='r')
    sec_axis.set_ylabel('Percentage')

## NER labels vs. Number of documents

In [None]:
# Bars: number of documents containing each label; line: doc_count_percentage
plot_ner_distribution(ner_labels_stat, 'doc_count', 'doc_count_percentage')

## NER labels distributions

In [None]:
# Bars: total number of tokens carrying each label; line: ner_count_percentage
plot_ner_distribution(ner_labels_stat, 'ner_count', 'ner_count_percentage')

# NER label vs. POS tag correlation

In [None]:
import spacy

# Load the spaCy model
# The "en_core_web_sm" package must be installed in the environment; the
# loaded pipeline is used below to assign POS tags to entity tokens.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Keep only documents with at least one entity token. .loc with .copy() gives
# an independent frame, so adding a column does not write into a slice of train_eda.
has_entities = train_eda['count_ner_labels'] > 0
train_eda_ner_vs_pos = train_eda.loc[has_entities, ['document', 'tokens', 'labels', 'ner_labels']].copy()


def _pos_tags_for_entity_tokens(tokens, labels):
    """Return one POS tag per entity token (tokens whose label is not 'O').

    The entity tokens are handed to spaCy as an already tokenized Doc rather
    than as a joined string. Re-tokenizing joined text may split or merge
    tokens, which would leave the POS tags out of step with `ner_labels`.
    """
    entity_tokens = [token for token, label in zip(tokens, labels) if label != 'O']
    doc = spacy.tokens.Doc(nlp.vocab, words=entity_tokens)
    # Run the pipeline components directly on the prepared Doc (the tokenizer is skipped)
    for _, component in nlp.pipeline:
        doc = component(doc)
    return [token.pos_ for token in doc]


train_eda_ner_vs_pos['pos_tags'] = [
    _pos_tags_for_entity_tokens(tokens, labels)
    for tokens, labels in zip(train_eda_ner_vs_pos['tokens'], train_eda_ner_vs_pos['labels'])
]

In [None]:
# One row per entity token: its NER label next to its POS tag
label_and_pos = train_eda_ner_vs_pos[['ner_labels', 'pos_tags']]
exploded_df = label_and_pos.explode(['ner_labels', 'pos_tags']).reset_index(drop=True)

# Count of tokens for every (NER label, POS tag) combination; 0 where a combination never occurs
pivot_table = pd.pivot_table(
    exploded_df,
    index='ner_labels',
    columns='pos_tags',
    aggfunc=len,
    fill_value=0,
)

# Row-wise percentages: each NER label's counts divided by that label's total
row_totals = pivot_table.sum(axis=1)
pivot_percentages = pivot_table.div(row_totals, axis=0) * 100

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Heatmap of NER labels (rows) against POS tags (columns)
n_labels, n_pos_tags = pivot_percentages.shape
fig, ax = plt.subplots(figsize=(12, 12))
heatmap = ax.imshow(pivot_percentages, cmap="YlOrBr", aspect='auto')

# Color bar above the heatmap
colorbar = fig.colorbar(heatmap, ax=ax, location='top', fraction=0.05, pad=0.04)
colorbar.set_label('Percentage (%)', labelpad=10)

# X ticks: one per POS tag, labels rotated to vertical
ax.set_xticks(np.arange(n_pos_tags))
ax.set_xticklabels(pivot_percentages.columns)
plt.xticks(rotation=90)

# Y ticks: one per NER label
ax.set_yticks(np.arange(n_labels))
ax.set_yticklabels(pivot_percentages.index)

# Mirror ticks and tick labels on the top and right edges as well
ax.tick_params(top=True, labeltop=True, right=True, labelright=True)

# Write "<percentage>%" and "(<count>)" into every cell
for row_pos, col_pos in np.ndindex(n_labels, n_pos_tags):
    cell_text = (
        f"{pivot_percentages.iloc[row_pos, col_pos]:.1f}%"
        "\n"
        f"({pivot_table.iloc[row_pos, col_pos]})"
    )
    ax.text(col_pos, row_pos, cell_text, ha="center", va="center", color="black", fontsize=9)

# Axis labels (x on top, y on the right) and title
ax.set_xlabel('POS Tags', labelpad=20)
ax.xaxis.set_label_position('top')
ax.set_ylabel('NER Labels', labelpad=20)
ax.yaxis.set_label_position('right')
ax.set_title('Percentage and Count of NER Label per POS Tag')

plt.show()


# Train vs Test token distribution

The test data is very limited, making any visualization appear less meaningful. However, once you acquire more extensive test or unlabeled data, calculating joint tokens could be useful.

In [None]:
import json
# Open the file in a context manager so the handle is closed as soon as
# parsing finishes (json.load(open(...)) leaves it open until garbage
# collection). JSON is read explicitly as UTF-8 instead of the platform default.
with open(os.path.join(SYS_INPUT_DIR, "test.json"), encoding="utf-8") as test_file:
    test_json = json.load(test_file)

# Flatten the parsed JSON into a DataFrame
test = pd.json_normalize(test_json)

In [None]:
# NER label analysed in the train-vs-test comparison below; any entry of full_ner_labels can be used
input_ner_label = 'B-NAME_STUDENT'

In [None]:
# Documents that contain the selected label at least once. .loc with .copy()
# replaces chained indexing, so columns added afterwards go to an independent
# frame instead of a slice of train_eda.
has_selected_label = train_eda[input_ner_label] > 0
train_eda_ner_label = train_eda.loc[has_selected_label, ['document', 'tokens', 'labels', input_ner_label]].copy()

def process_row(row, value):
    """Collect the positions and tokens of a row whose label equals ``value``.

    Parameters:
        row: mapping with 'labels' and 'tokens' sequences.
        value: the label to look for.

    Returns:
        A two-element pandas Series: the list of matching positions, then the
        list of tokens found at those positions.
    """
    matched_positions = []
    matched_tokens = []
    for position, label in enumerate(row['labels']):
        if label == value:
            matched_positions.append(position)
            matched_tokens.append(row['tokens'][position])
    return pd.Series([matched_positions, matched_tokens])
    
# For every document: positions and tokens tagged with the selected label
positions_and_tokens = train_eda_ner_label.apply(process_row, axis=1, value=input_ner_label)
train_eda_ner_label[['ner_label_idxs', 'ner_label_tokens']] = positions_and_tokens

In [None]:
test_eda_ner_label = test.copy()

# Pool every train token carrying the selected label into one vocabulary.
# The previous version copied the train column into the test frame by index
# alignment, which paired each test document with an unrelated train row (or
# with NaN, making set(NaN) raise a TypeError). The misaligned
# 'ner_label_tokens' column is therefore no longer added to the test frame.
train_ner_label_vocab = set(train_eda_ner_label['ner_label_tokens'].explode().dropna())

# Tokens of each test document that also occur as a labelled token in train;
# sorted so the output is deterministic across runs.
test_eda_ner_label['joint_tokens'] = test_eda_ner_label['tokens'].apply(
    lambda tokens: sorted(set(tokens) & train_ner_label_vocab)
)

In [None]:
# Display the test documents together with their tokens shared with train
test_eda_ner_label

# To be continued ...

Looks like it's time to find more data and start building a model.

Please share in the comments any useful EDA techniques that you find interesting and would like to see implemented.