# Applying and Visualizing NER Predictions on Unseen Data

In this notebook, we will apply the best performing Named Entity Recognition (NER) model to make predictions on unseen data, which includes the Leeuwarder Courant test set, Het Parool dataset, and the Trouw dataset. The key steps in this process are as follows:

1. **Model Application:** Use the best NER model to predict book titles in the unseen datasets.
2. **Visualization:** Highlight the predicted tokens classified as book titles in the text.
3. **Export:** Export these visualizations to HTML format to facilitate manual analysis of the NER predictions.

By the end of this notebook, we will have a comprehensive set of visualizations that highlight the NER model's predictions, allowing us to manually inspect and analyze the accuracy and effectiveness of the model.

Let's get started!


## Import necessary libraries

In [1]:
# !python -m spacy download nl_core_news_sm

In [39]:
# Import necessary libraries
import pandas as pd
import numpy as np
import string
import re
import random

from datasets import Dataset, load_metric
from transformers import DataCollatorForTokenClassification, pipeline, AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, TrainerCallback, XLMRobertaForTokenClassification, XLMRobertaTokenizer

import torch

import spacy
from spacy import displacy

import os
from datetime import datetime
import json

from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
import pickle
import math

In [2]:
# Resolve the directory the notebook kernel was started in
current_dir = os.path.abspath(os.curdir)

# The project root is taken to be one level above that directory
# (adjust the relative step if your project layout differs)
main_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# Make the project root the working directory so relative paths resolve from it
os.chdir(main_dir)

# Echo the resulting working directory as a sanity check
print(f"Current working directory: {os.getcwd()}")

Current working directory: C:\Users\niels\PycharmProjects\BookReviewsThesis


In [3]:
# Import custom functions from the dataset preparation module
from scripts.loss_functions import DiceLoss, MoMLoss
from scripts.dataset_preparation import remove_punctuation, find_sentence_in_text, create_mask_for_sentence, process_text, \
                                        create_data_set, trouw_parool_create_dataset, save_dataset, load_dataset, split_samples, remove_extra_spaces

## Data Preparation

In [4]:
# Lift the column limit so wide DataFrames are displayed without truncation
pd.options.display.max_columns = None

In [5]:
# Load the raw annotation files (paths are relative to the working directory) into DataFrames

# Leeuwarder Courant data: Excel workbook, read with the openpyxl engine
df_lc = pd.read_excel('data/raw/manullay_check_partially_matched_titles.xlsx', engine='openpyxl')

# Trouw and Het Parool annotated book review data: CSV file
df_trouw_parool = pd.read_csv('data/raw/trouw_and_parool_annotated_book_titles.csv')

In [6]:
# Run remove_extra_spaces over the review text and the two title columns of df_lc
for column in ('content', 'title1', 'title4'):
    df_lc[column] = df_lc[column].apply(remove_extra_spaces)

In [7]:
# Collect every distinct review text that has at least one row flagged manually_removed == 1
content_removed = df_lc.loc[df_lc['manually_removed'].eq(1), 'content'].unique()

# Keep only rows whose review text is not among the flagged texts
df_lc_clean = df_lc.loc[~df_lc['content'].isin(content_removed)]

In [8]:
# Path (relative to the working directory) of the saved token-classification
# checkpoint; both the tokenizer and the model are loaded from it further down

model_checkpoint = "models/transformer/2024-06-05_08_04/model"

In [9]:
# Define the list of labels for NER tagging
# NOTE(review): presumably 'O' = token outside a book title, 'I' = token inside one — confirm
# against the dataset preparation code; this list is not referenced in the visible cells below
label_list = ['O', 'I']

In [10]:
# Load the small Dutch spaCy pipeline; it is passed as `nlp` to the dataset-creation helpers below
# (install it first with: python -m spacy download nl_core_news_sm)
nlp = spacy.load("nl_core_news_sm")

In [11]:
# Preprocessing flags forwarded to the dataset-creation helpers:
# both punctuation removal and lower-casing are switched off here
remove_punc = False
force_lower_case = False

In [12]:
# File paths for saving/loading the cached datasets.
# The paths are relative to the project root (the working directory set at the
# top of this notebook), like the raw-data and model paths used elsewhere, so the
# notebook no longer depends on one specific machine's absolute directory layout.
lc_train_filename = 'data/processed/lc_train_dataset.pkl'
lc_val_filename = 'data/processed/lc_val_dataset.pkl'
lc_test_filename = 'data/processed/lc_test_dataset.pkl'

trouw_parool_test_filename = 'data/processed/trouw_parool_test_dataset.pkl'

In [13]:
# Reproduce the train/validation/test split and load (or build) the matching datasets

# Seed NumPy's global RNG before splitting
# (presumably what split_samples draws from — confirm in scripts.dataset_preparation)
np.random.seed(42)

# Every distinct review text counts as one sample
samples = df_lc_clean['content'].unique()

# 70% / 15% / 15% split into train, validation and test samples
lc_train_samples, lc_val_samples, lc_test_samples = split_samples(
    samples=samples, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15
)

# Reuse the cached datasets only when all three files are present; otherwise rebuild all of them
lc_dataset_files = (lc_train_filename, lc_val_filename, lc_test_filename)
if all(os.path.exists(path) for path in lc_dataset_files):
    print("Loading training, validation, and test datasets....")
    lc_train_dataset, lc_val_dataset, lc_test_dataset = (
        load_dataset(path) for path in lc_dataset_files
    )
else:
    print("Creating training, validation, and test datasets....")
    # Build one dataset per split, in train / validation / test order
    lc_train_dataset, lc_val_dataset, lc_test_dataset = (
        Dataset.from_list(
            create_data_set(
                samples=split,
                df=df_lc_clean,
                nlp=nlp,
                remove_punc=remove_punc,
                force_lower_case=force_lower_case,
            )
        )
        for split in (lc_train_samples, lc_val_samples, lc_test_samples)
    )

    # Cache the datasets so they do not have to be rebuilt on the next run
    for dataset, path in zip((lc_train_dataset, lc_val_dataset, lc_test_dataset), lc_dataset_files):
        save_dataset(dataset, path)

print("Done...")

Loading training, validation, and test datasets....
Done...


In [14]:
# Display a summary (features and number of rows) of the Leeuwarder Courant training dataset
lc_train_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 8774
})

In [15]:
# Display a summary (features and number of rows) of the Leeuwarder Courant validation dataset
lc_val_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1880
})

In [16]:
# Display a summary (features and number of rows) of the Leeuwarder Courant test dataset
lc_test_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1881
})

### Create Trouw & Parool TEST dataset

In [17]:
# Build the Trouw/Parool test dataset once and reuse the cached copy afterwards

if not os.path.exists(trouw_parool_test_filename):
    print("Creating trouw parool test dataset....")
    trouw_parool_records = trouw_parool_create_dataset(
        df=df_trouw_parool,
        nlp=nlp,
        remove_punc=remove_punc,
        force_lower_case=force_lower_case,
    )
    trouw_parool_test_dataset = Dataset.from_list(trouw_parool_records)

    # Cache the dataset so it does not have to be rebuilt on the next run
    save_dataset(trouw_parool_test_dataset, trouw_parool_test_filename)
else:
    print("Loading trouw parool test dataset....")
    trouw_parool_test_dataset = load_dataset(trouw_parool_test_filename)

Loading trouw parool test dataset....


In [18]:
# Display a summary (features and number of rows) of the Trouw/Parool test dataset
trouw_parool_test_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 308
})

## Load the tokenizer and the fine-tuned model

In [19]:
# Load the tokenizer from the same checkpoint directory the model is loaded from
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Disabled alternative, kept for reference only (not executed):
# from transformers import RobertaTokenizerFast

# # Instantiate the tokenizer with add_prefix_space=True
# tokenizer = RobertaTokenizerFast.from_pretrained(model_checkpoint, add_prefix_space=True, truncation=True, max_length=512, padding='max_length')

In [20]:
# Pick the device the model will run on: the GPU when CUDA is available, the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [21]:
# Load the saved token-classification model with two output classes
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    ignore_mismatched_sizes=False,
)
# Give the two class indices readable names: 0 -> 'NO_BOOK', 1 -> 'BOOK'
model.config.id2label = dict(enumerate(('NO_BOOK', 'BOOK')))

In [22]:
# Move the model to the selected device (GPU/CPU); as the last expression of the
# cell, the returned module is also displayed as the cell output
model.to(device)

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_featu

## Visualization of predictions on unseen data

In [23]:
# Define functions to merge overlapping intervals and visualize NER output

def merge_overlapping_intervals(intervals):
    """
    Merge overlapping or near-adjacent intervals in a list of intervals.

    Two intervals are merged when the next one starts at most one position
    after the current one ends (``next_start <= end + 1``), so spans that are
    separated by a single character (e.g. one space) are joined as well.
    A merged interval keeps the label of its first (left-most) member.

    The input list is not modified.

    Args:
        intervals (list of tuple): List of intervals (start, end, label).

    Returns:
        list of tuple: List of merged intervals, sorted by start position.
    """
    if not intervals:
        return []

    # Work on a sorted copy so the caller's list keeps its original order
    ordered = sorted(intervals, key=lambda item: item[0])

    merged_intervals = []

    # The running interval starts out as the left-most interval
    start, end, label = ordered[0]

    for next_start, next_end, next_label in ordered[1:]:
        if next_start <= end + 1:
            # Overlapping or at most one position apart: extend the running interval
            end = max(end, next_end)
        else:
            # Gap found: emit the running interval and start a new one
            merged_intervals.append((start, end, label))
            start, end, label = next_start, next_end, next_label

    # Emit the final running interval with its own label
    # (previously this was always labelled "BOOK", regardless of the input label)
    merged_intervals.append((start, end, label))

    return merged_intervals

def visualize_output(output, text, export_path=None, alignment_mode="expand"):
    """
    Visualize NER output using spaCy's displaCy.

    Predictions labelled 'BOOK' are merged into contiguous character spans and
    highlighted as entities in the text. The result is either written to an
    HTML file or rendered inline in the notebook.

    Args:
        output (list): List of dictionaries containing NER predictions; each
            entry must provide the keys 'start', 'end' and 'entity'.
        text (str): The input text.
        export_path (str, optional): Path to save the visualization as an HTML file.
            When omitted, the visualization is rendered inline instead.
        alignment_mode (str, optional): How predicted character spans are mapped
            onto spaCy tokens (passed to ``Doc.char_span``). The default
            "expand" grows a span to the tokens it touches, so predictions that
            do not fall exactly on spaCy token boundaries are still shown.
            Pass "strict" to drop such spans instead (the previous behaviour).

    Returns:
        None
    """
    # Keep only the tokens predicted as book title and join neighbouring ones
    spans = [(res['start'], res['end'], res['entity']) for res in output if res['entity'] == 'BOOK']
    spans = merge_overlapping_intervals(spans)

    # A blank Dutch pipeline is enough here: only its tokenizer is needed
    blank_nlp = spacy.blank('nl')
    doc = blank_nlp.make_doc(text)

    ents = []
    for span_start, span_end, label in spans:
        ent = doc.char_span(span_start, span_end, label=label, alignment_mode=alignment_mode)
        if ent is None:
            # No token could be matched to this character range
            continue

        ents.append(ent)

    # Expanding to token boundaries can make two spans cover the same token;
    # doc.ents rejects overlapping spans, so keep only non-overlapping ones
    doc.ents = spacy.util.filter_spans(ents)

    if export_path:
        html = displacy.render(doc, style="ent", jupyter=False, page=True)
        # Save the HTML to a file
        with open(export_path, "w", encoding="utf-8") as file:
            file.write(html)
    else:
        displacy.render(doc, style="ent", jupyter=True)

In [24]:
# Initialize the pipeline for token classification.
# The device is passed explicitly so the pipeline places its inputs on the same
# device the model was moved to, instead of relying on the library's default.
pipe = pipeline(task="token-classification", model=model, tokenizer=tokenizer, device=device)

## Export predictions on unseen data

In [32]:
# Root folder (relative to the working directory) for the exported HTML visualizations;
# the trailing slash is required because sub-folder names are appended by string concatenation
export_path = "prediction_visualisation/"

### Leeuwarder Courant

In [33]:
# Sub-folder for the Leeuwarder Courant visualizations (trailing slash kept for later concatenation)
export_path_lc = export_path + "leeuwarden_courant/"

In [34]:
# Create the export folder (including missing parents); no error if it already exists
os.makedirs(export_path_lc, exist_ok=True)

In [43]:
# Draw a reproducible random selection of 300 distinct test texts
np.random.seed(42)
lc_export_samples = np.random.choice(lc_test_samples, 300, replace=False)

for i, lc_sample in enumerate(lc_export_samples):
    # Token-level predictions of the pipeline for this text
    output = pipe(lc_sample)
    # Write one HTML file per text, named after its position in the selection
    visualize_output(output, lc_sample, export_path=f"{export_path_lc}{i}.html")

In [30]:
# Display the corresponding book titles in the sample from df_lc_clean
# df_lc_clean[df_lc_clean['content'] == lc_sample].title4

6064    ontsporingen
Name: title4, dtype: object

### Het Parool

In [44]:
# Keep only the rows of the combined Trouw/Parool frame that belong to Het Parool
df_parool = df_trouw_parool.loc[df_trouw_parool['newspaper'].eq('Parool')]

In [50]:
# Sub-folder for the Het Parool visualizations; create it when it does not exist yet
export_path_parool = f"{export_path}parool/"
os.makedirs(export_path_parool, exist_ok=True)

In [51]:
# Collect every distinct review text from the Parool data
parool_samples = df_parool['text'].unique()

In [57]:
# Export one HTML visualization per Parool text
for i, parool_sample in enumerate(parool_samples):
    # Token-level predictions of the pipeline for this text
    output = pipe(parool_sample)
    # File name is the position of the text in parool_samples
    visualize_output(output, parool_sample, export_path=f"{export_path_parool}{i}.html")

In [58]:
# Display the corresponding book titles in the Parool sample from df_parool
# df_parool[df_parool['text'] == parool_sample].book_title

### Trouw

In [59]:
# Keep only the rows of the combined Trouw/Parool frame that belong to Trouw
df_trouw = df_trouw_parool.loc[df_trouw_parool['newspaper'].eq('Trouw')]

In [60]:
# Sub-folder for the Trouw visualizations; create it when it does not exist yet
export_path_trouw = f"{export_path}trouw/"
os.makedirs(export_path_trouw, exist_ok=True)

In [61]:
# Collect every distinct review text from the Trouw data
trouw_samples = df_trouw['text'].unique()

In [62]:
# Export one HTML visualization per Trouw text
for i, trouw_sample in enumerate(trouw_samples):
    # Token-level predictions of the pipeline for this text
    output = pipe(trouw_sample)
    # File name is the position of the text in trouw_samples
    visualize_output(output, trouw_sample, export_path=f"{export_path_trouw}{i}.html")