# Evaluating NER Model Performance with the Nederlandse Bibliografie Totaal (NBT)

In this notebook, we use the best-performing Named Entity Recognition (NER) model to identify book titles in historical newspaper texts. The output of the NER model, which consists of a few tokens representing a book title, is compared to the book titles listed in the Nederlandse Bibliografie Totaal (NBT), a comprehensive bibliography of all Dutch books.

The main steps involved in this notebook are:
1. **Model Application:** Use the best-performing NER model to extract text spans representing book titles from our dataset, which includes articles from the Leeuwarder Courant test set, Trouw, and Het Parool.
2. **Data Comparison:** Match the predicted book title text to the book titles in the NBT.
3. **Performance Evaluation:** Determine the NER system's performance by evaluating its ability to identify actual books mentioned in the newspaper archives, in addition to assessing the token classification performance.

By comparing the predicted book titles to the NBT, we can gain a more comprehensive understanding of the NER model's effectiveness in identifying real-world books from historical newspaper texts.

Let's get started!


In [1]:
# !python -m spacy download nl_core_news_sm

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import string
import re

from datasets import Dataset, load_metric
from transformers import DataCollatorForTokenClassification, pipeline, AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, TrainerCallback

import torch

import spacy
from spacy import displacy

import os
from datetime import datetime
import json
import random

from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
import pickle
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from rapidfuzz import process, fuzz
from concurrent.futures import ThreadPoolExecutor, as_completed
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
import itertools

In [3]:
# Get the current notebook directory.
# It is captured only once per kernel session: os.chdir() below changes what
# os.path.abspath('') resolves to, so recomputing it when this cell is re-run
# would move the working directory one level further up each time.
if 'current_dir' not in globals():
    current_dir = os.path.abspath('')

# Set the main directory (modify as needed to point to your main project directory)
main_dir = os.path.abspath(os.path.join(current_dir, '../'))

# Change the working directory to the main directory
os.chdir(main_dir)

# Verify that the working directory has been set correctly
print(f"Current working directory: {os.getcwd()}")

Current working directory: C:\Users\niels\PycharmProjects\BookReviewsThesis


In [4]:
# Import custom functions from the dataset preparation module
from scripts.loss_functions import DiceLoss, MoMLoss
from scripts.dataset_preparation import remove_punctuation, find_sentence_in_text, create_mask_for_sentence, process_text, \
                                        create_data_set, trouw_parool_create_dataset, save_dataset, load_dataset, split_samples

## Data Preparation

In [5]:
# Show every column when a DataFrame is rendered (no column truncation)
pd.options.display.max_columns = None

In [6]:
# Read the two annotated source files into DataFrames

# Leeuwarder Courant: Excel workbook, read through the openpyxl engine
df_lc = pd.read_excel(
    'data/raw/manullay_check_partially_matched_titles.xlsx',
    engine='openpyxl',
)

# Trouw and Het Parool: CSV file with annotated book reviews
df_trouw_parool = pd.read_csv(
    'data/raw/trouw_and_parool_annotated_book_titles.csv',
)

In [7]:
# Define function to remove extra spaces from text
def remove_extra_spaces(text):
    """
    Collapse every run of whitespace into a single space and trim both ends.

    Args:
        text (str): Text that may contain repeated spaces, tabs or line breaks.

    Returns:
        str: The text with exactly one space between words and no leading
        or trailing whitespace.
    """
    return re.sub(r'\s+', ' ', text).strip()

In [8]:
# Normalise whitespace in the text columns of df_lc that are used further on
for column in ('content', 'title1', 'title4'):
    df_lc[column] = df_lc[column].apply(remove_extra_spaces)

In [9]:
# Collect the distinct review texts that have at least one row flagged as manually removed
content_removed = df_lc.loc[df_lc['manually_removed'] == 1, 'content'].unique()

# Keep only the rows whose review text is not among the flagged ones
df_lc_clean = df_lc[~df_lc['content'].isin(content_removed)]

In [10]:
# Read the Nederlandse Bibliografie Totaal export (semicolon-separated CSV)
NBT = pd.read_csv(
    "NBT/full_NBT_database.csv",
    sep=';',
    low_memory=False,
)

# Preview the first two records
NBT.head(n=2)

Unnamed: 0.1,Id_Pub1,givenName,familyName,title1,title2,year,place,publisher,numberOfPages,bookEdition,inLanguage,isPartOf,translationOfWork,genre,ISBN,bookFormat,description,comment,alternateName,name,fullName,birthDate,deathDate,subject1,subject2,subject3,publicationId,publisherId,Id_Pers,Unnamed: 0,issn
0,p036003166,Johannes Stephanus Antonius Joseph Maria,van Aken,Kalk-suiker : melasse ontsuikeren met kalk : k...,Kalk-suiker : melasse ontsuikeren met kalk : k...,1930.0,Delft,Meinema,"184 p, 5 vouwbl",,ned,,,Book,,PrintBook,Proefschrift Delft,,,Johannes Stephanus Antonius Joseph Maria van Aken,"Aken, Johannes Stephanus Antonius Joseph Maria...",1903,1988,,,,_:b11670325,_:b7149548,p072312238,,
1,p036004146,J.H.R.,Boumans,Driedaagsche retraite / J.H.R. Boumans,Driedaagsche retraite,1930.0,[S.l.,s.n.],4 dl,,ned,,,Book,,PrintBook,,Jean,,J.H.R. Boumans,"Boumans, J.H.R. (1833-1907 ; C.ss.R.)",1833,1907,,,,_:b11670360,_:b7149583,p070405441,,


In [11]:
# Number of records in the NBT table
NBT.shape[0]

1954801

In [12]:
# Load the spaCy model for Dutch language processing
# (requires the `nl_core_news_sm` package; the commented-out first cell shows the download command)
nlp = spacy.load("nl_core_news_sm")

In [13]:
# Pre-processing switches: both punctuation removal and lower-casing are disabled
remove_punc, force_lower_case = False, False

In [14]:
# Define file paths for saving/loading datasets.
# The paths are relative to the project root, which an earlier cell makes the
# working directory, so they match the other `data/...` paths in this notebook
# and no longer depend on one user's absolute directory layout.
lc_val_filename = 'data/processed/lc_val_dataset.pkl'
lc_test_filename = 'data/processed/lc_test_dataset.pkl'

trouw_parool_test_filename = 'data/processed/trouw_parool_test_dataset.pkl'

In [15]:
# Partition the distinct Leeuwarder Courant reviews into train / validation / test subsets

# Every distinct review text is one sample
samples = df_lc_clean['content'].unique()

# Fix NumPy's global random state before splitting, for reproducibility.
# NOTE(review): only NumPy's RNG is seeded here; confirm split_samples draws from it.
np.random.seed(42)

# 70% training, 15% validation, 15% test
lc_train_samples, lc_val_samples, lc_test_samples = split_samples(
    samples=samples,
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
)

In [None]:
# Define functions to merge overlapping intervals and visualize NER output

def merge_overlapping_intervals(intervals):
    """
    Merge overlapping or directly adjacent intervals.

    Two intervals are merged when the next one starts at most one position
    after the current one ends (``next_start <= end + 1``), so spans that are
    separated by a single character are joined as well. A merged interval
    keeps the label of its first (left-most) member.

    The input list is not modified.

    Args:
        intervals (list of tuple): Intervals as (start, end, label) tuples.

    Returns:
        list of tuple: Merged (start, end, label) tuples, sorted by start.
        An empty list is returned for empty input.
    """
    # sorted() works on a copy, so the caller's list keeps its original order
    ordered = sorted(intervals or (), key=lambda interval: interval[0])
    if not ordered:
        return []

    merged_intervals = []

    # Running interval that absorbs every following overlapping/adjacent one
    start, end, label = ordered[0]

    for next_start, next_end, next_label in ordered[1:]:
        if next_start <= end + 1:
            # Overlapping or adjacent: extend the running interval
            end = max(end, next_end)
        else:
            # Gap found: emit the running interval and start a new one
            merged_intervals.append((start, end, label))
            start, end, label = next_start, next_end, next_label

    # Emit the final running interval with its own label (not a hard-coded one)
    merged_intervals.append((start, end, label))

    return merged_intervals

def visualize_output(output, text, export_path=None):
    """
    Render the predicted BOOK spans of a text with spaCy's displaCy.

    Token-level predictions labelled 'BOOK' are merged into contiguous spans
    and attached to a blank Dutch spaCy document as entities. Spans that
    `Doc.char_span` cannot map onto the document (it returns None) are skipped.

    Args:
        output (list): Prediction dictionaries with 'start', 'end' and 'entity' keys.
        text (str): The text the predictions refer to.
        export_path (str, optional): When given, the rendering is written to this
            path as a standalone HTML page; otherwise it is shown inline.

    Returns:
        None
    """
    book_spans = merge_overlapping_intervals(
        [(pred['start'], pred['end'], pred['entity']) for pred in output if pred['entity'] == 'BOOK']
    )

    # A blank pipeline is enough: only tokenisation is needed to build the Doc
    blank_pipeline = spacy.blank('nl')
    doc = blank_pipeline.make_doc(text)

    candidate_ents = (
        doc.char_span(span_start, span_end, label=label)
        for span_start, span_end, label in book_spans
    )
    doc.ents = [ent for ent in candidate_ents if ent is not None]

    if not export_path:
        # No export requested: render directly in the notebook
        displacy.render(doc, style="ent", jupyter=True)
        return

    html = displacy.render(doc, style="ent", jupyter=False, page=True)
    # Save the HTML to a file
    with open(export_path, "w", encoding="utf-8") as file:
        file.write(html)

### Load best NER model

In [17]:
# Directory that holds the trained model ("model") and tokenizer ("tokenizer") sub-folders.
# Set the NER_MODEL_DIR environment variable to use a different location;
# when it is unset the original location is used.
_DEFAULT_MODEL_DIR = "D:/UU/models/transformer/2024-06-05_08_04/"
export_path = os.environ.get("NER_MODEL_DIR", _DEFAULT_MODEL_DIR)

# Sub-folder names are appended by plain string concatenation, so a trailing separator is required
if not export_path.endswith(("/", "\\")):
    export_path += "/"

In [18]:
# Load the fine-tuned token-classification model and its tokenizer from the export directory
model = AutoModelForTokenClassification.from_pretrained(f"{export_path}model")
tokenizer = AutoTokenizer.from_pretrained(f"{export_path}tokenizer")

In [19]:
# Wrap model and tokenizer in a token-classification inference pipeline
pipe = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
)

### Load Trouw and Het Parool data

In [20]:
# Read the manually matched Trouw / Het Parool title annotations (semicolon-separated CSV)
trouw_parool_titles = pd.read_csv(
    "data/processed/trouw_and_parool_annotated_book_titles_manually_matched.csv",
    sep=';',
    low_memory=False,
)

# Preview the first two rows
trouw_parool_titles.head(n=2)

Unnamed: 0,text,start_index,end_index,book_title,RecId,genre,date,year,article_tit,newspaper,month,title2,name,comment
0,Macabere trekken bij Cortázar In een van zijn ...,0.0,29.0,Macabere trekken bij Cortázar,ddd:010818290:mpeg21:a0275,artikel,1971-11-13,1971,,Trouw,11.0,,,Not a title
1,Macabere trekken bij Cortázar In een van zijn ...,1651.0,1664.0,'Het Gerucht',ddd:010818290:mpeg21:a0275,artikel,1971-11-13,1971,,Trouw,11.0,,,Not a title


In [25]:
# Collect the distinct review texts that have at least one annotation without a `title2` value
content_removed = trouw_parool_titles.loc[trouw_parool_titles['title2'].isnull(), 'text'].unique()

# Keep only the rows whose review text is not among those
trouw_parool_titles_clean = trouw_parool_titles[~trouw_parool_titles['text'].isin(content_removed)]

In [26]:
# Display the cleaned Trouw / Het Parool annotations (reviews with any missing `title2` removed)
trouw_parool_titles_clean

Unnamed: 0,text,start_index,end_index,book_title,RecId,genre,date,year,article_tit,newspaper,month,title2,name,comment
3,'Ik ben altijd gelijk’ door Ad Zuiderent J. Be...,73.0,83.0,Het verlof,ddd:010828144:mpeg21:a0319,artikel,1971-09-18,1971,,Trouw,9.0,Het verlof,J. Bernlef,
4,'Ik ben altijd gelijk’ door Ad Zuiderent J. Be...,5601.0,5611.0,Het verlof,ddd:010828144:mpeg21:a0319,artikel,1971-09-18,1971,,Trouw,9.0,Het verlof,J. Bernlef,
5,Het conflict van de mens met zijn vorm door Ad...,158.0,166.0,Inspraak,ddd:010828180:mpeg21:a0293,artikel,1971-10-30,1971,,Trouw,10.0,Inspraak,Bert Schierbeek,
6,Het conflict van de mens met zijn vorm door Ad...,4733.0,4742.0,Inspraak-,ddd:010828180:mpeg21:a0293,artikel,1971-10-30,1971,,Trouw,10.0,Inspraak,Bert Schierbeek,
7,Goede pleiter Ze waren eigenlijk te vreemd om ...,1703.0,1741.0,"Nederlandse Cultuur in de Gouden Eeuw,",ddd:010826784:mpeg21:a0697,artikel,1988-01-23,1988,,Trouw,1.0,Nederlandse cultuur in de gouden eeuw,J.L. Price,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531,'We zullen Galbraith weer omarmen' door KERS T...,6221.0,6285.0,John Kenneth Galbraith: het economisch denken ...,ddd:010841776:mpeg21:a0144,artikel,1994-11-28,1994,,Parool,11.0,John Kenneth Galbraith : het economisch denken...,J. Beishuizen,
532,JAN PEN Bewondering Mint het van kritiek in Ga...,6123.0,6187.0,John Kenneth Galbraith: het economisch denken ...,ddd:010841777:mpeg21:a0196,artikel,1994-11-29,1994,,Parool,11.0,John Kenneth Galbraith : het economisch denken...,J. Beishuizen,
533,FINANCIEN & ECONOMIE Hoge inkomens hebben mees...,7615.0,7643.0,Profijt van de overheid 111.,ddd:010842199:mpeg21:a0297,artikel,1994-12-24,1994,,Parool,12.0,Profijt van de overheid III : de verdeling van...,Evert Pommer,
534,BART TROMP De contramine voorbij ER ZIJN TWEE ...,5446.0,5469.0,Ik kan alles uitleggen.,ddd:010842201:mpeg21:a0117,artikel,1994-12-28,1994,,Parool,12.0,Ik kan alles uitleggen,Michel Korzec,


In [27]:
# Ensure that NBT has unique title2 values (the first record per title2 is kept),
# so the left merge below cannot duplicate annotation rows
unique_NBT = NBT.drop_duplicates(subset=['title2'])

# Add title1 by looking up each annotation's title2 in the NBT.
# Any `title1` column left over from a previous run of this cell is dropped first;
# otherwise a re-run would produce `title1_x` / `title1_y` instead of `title1`.
trouw_parool_titles_clean = pd.merge(
    trouw_parool_titles_clean.drop(columns=['title1'], errors='ignore'),
    unique_NBT[['title2', 'title1']],
    on='title2',
    how='left',
)

### Run the NER model in parallel (thread pool) and match its output to the NBT database

In [29]:
def process_review(review):
    """
    Run the NER pipeline on one review and collect the predicted book-title spans.

    Args:
        review (str): The text of the book review.

    Returns:
        dict: {"review": <the input text>, "predictions": <list of merged spans>},
              where each span is a tuple (start_index, end_index, entity_type).
    """
    token_predictions = pipe(review)

    # Keep only the tokens tagged as BOOK
    book_spans = []
    for token in token_predictions:
        if token['entity'] == 'BOOK':
            book_spans.append((token['start'], token['end'], token['entity']))

    # Join neighbouring token spans into title-level spans
    return {"review": review, "predictions": merge_overlapping_intervals(book_spans)}

# Run NER inference on the Leeuwarder Courant test reviews with a pool of worker threads
lc_test_predictions = []
with ThreadPoolExecutor() as executor:
    # One future per review; the dict keeps the future -> review association
    futures = {executor.submit(process_review, review): review for review in lc_test_samples}

    # Results are gathered in completion order, not submission order
    for finished in tqdm(as_completed(futures), total=len(futures)):
        lc_test_predictions.append(finished.result())

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1881/1881 [21:50<00:00,  1.43it/s]


In [30]:
# Run NER inference on the distinct Trouw reviews with a pool of worker threads
trouw_reviews = trouw_parool_titles_clean[trouw_parool_titles_clean['newspaper'] == 'Trouw'].text.unique()

trouw_test_predictions = []
with ThreadPoolExecutor() as executor:
    # One future per review; the dict keeps the future -> review association
    futures = {executor.submit(process_review, review): review for review in trouw_reviews}

    # Results are gathered in completion order, not submission order
    for finished in tqdm(as_completed(futures), total=len(futures)):
        trouw_test_predictions.append(finished.result())

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 82/82 [00:59<00:00,  1.37it/s]


In [31]:
# Run NER inference on the distinct Het Parool reviews with a pool of worker threads
parool_reviews = trouw_parool_titles_clean[trouw_parool_titles_clean['newspaper'] == 'Parool'].text.unique()

parool_test_predictions = []
with ThreadPoolExecutor() as executor:
    # One future per review; the dict keeps the future -> review association
    futures = {executor.submit(process_review, review): review for review in parool_reviews}

    # Results are gathered in completion order, not submission order
    for finished in tqdm(as_completed(futures), total=len(futures)):
        parool_test_predictions.append(finished.result())

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:32<00:00,  1.40it/s]


In [32]:
def process_item_lc(item, title1=True, year_range=3, scorer=fuzz.partial_ratio):
    """
    Look up every predicted title span of a Leeuwarder Courant review in the NBT.

    Candidate NBT titles are limited to records whose `year` lies in the window
    (review_year - year_range, review_year], where review_year is the `yearRev`
    value of the first df_lc_clean row holding this review text. Each predicted
    span is then passed to `process.extractOne` against those candidates.

    Args:
        item (dict): Holds the review text under 'review' and the predicted
            (start, end, label) spans under 'predictions'.
        title1 (bool): Match against the NBT `title1` column when True,
            against `title2` otherwise. Defaults to True.
        year_range (int): Width of the publication-year window. Defaults to 3.
        scorer (function): rapidfuzz scoring function. Defaults to fuzz.partial_ratio.

    Returns:
        dict: {"review": <review text>, "book_titles": <one extractOne result per span>}.
    """
    review, predictions = item['review'], item['predictions']

    review_year = df_lc_clean[df_lc_clean['content'] == review].iloc[0].yearRev

    # Restrict the candidates to the publication-year window
    in_window = (NBT['year'] <= review_year) & (NBT['year'] > review_year - year_range)
    title_column = 'title1' if title1 else 'title2'
    NBT_titles = NBT.loc[in_window, title_column].tolist()

    book_titles = []
    for span_start, span_end, *_ in predictions:
        predicted_text = review[span_start:span_end]
        book_titles.append(process.extractOne(predicted_text, NBT_titles, scorer=scorer))

    return {"review": review, "book_titles": book_titles}

def process_item_trouw_parool(item, year_range=3, scorer=fuzz.partial_ratio):
    """
    Look up every predicted title span of a Trouw / Het Parool review in the NBT.

    Candidate NBT titles (`title1` column) are limited to records whose `year`
    lies in the window (review_year - year_range, review_year], where review_year
    is the `year` value of the first trouw_parool_titles_clean row holding this
    review text. Each predicted span is then passed to `process.extractOne`
    against those candidates.

    Args:
        item (dict): Holds the review text under 'review' and the predicted
            (start, end, label) spans under 'predictions'.
        year_range (int): Width of the publication-year window. Defaults to 3.
        scorer (function): rapidfuzz scoring function. Defaults to fuzz.partial_ratio.

    Returns:
        dict: {"review": <review text>, "book_titles": <one extractOne result per span>}.
    """
    review, predictions = item['review'], item['predictions']

    review_year = trouw_parool_titles_clean[trouw_parool_titles_clean['text'] == review].iloc[0].year

    # Restrict the candidates to the publication-year window
    in_window = (NBT['year'] <= review_year) & (NBT['year'] > review_year - year_range)
    NBT_titles = NBT.loc[in_window, 'title1'].tolist()

    book_titles = []
    for span_start, span_end, *_ in predictions:
        predicted_text = review[span_start:span_end]
        book_titles.append(process.extractOne(predicted_text, NBT_titles, scorer=scorer))

    return {"review": review, "book_titles": book_titles}


In [33]:
def calculate_tp_fp_fn(predicted_titles, actual_titles):
    """
    Count true positives, false positives and false negatives between two title lists.

    Both lists are reduced to sets first, so duplicate titles count once.

    Args:
        predicted_titles (list): Titles produced by the system.
        actual_titles (list): Ground-truth titles.

    Returns:
        tuple: (TP, FP, FN) where TP is the number of titles in both sets,
        FP the number only in the predicted set and FN the number only in
        the actual set.
    """
    predicted, actual = set(predicted_titles), set(actual_titles)

    true_positives = len(predicted.intersection(actual))
    false_positives = len(predicted.difference(actual))
    false_negatives = len(actual.difference(predicted))

    return true_positives, false_positives, false_negatives


def calculate_precision(tp, fp):
    """Return tp / (tp + fp), or 0 when there are no positive predictions at all."""
    predicted_positives = tp + fp
    return tp / predicted_positives if predicted_positives != 0 else 0

def calculate_recall(tp, fn):
    """Return tp / (tp + fn), or 0 when there are no actual positives at all."""
    actual_positives = tp + fn
    return tp / actual_positives if actual_positives != 0 else 0

def calculate_f1_score(tp, fp, fn):
    """
    Return the F1 score, i.e. the harmonic mean of precision and recall.

    Precision and recall are obtained from calculate_precision and
    calculate_recall; when both are 0 the result is 0.
    """
    precision = calculate_precision(tp, fp)
    recall = calculate_recall(tp, fn)

    denominator = precision + recall
    if denominator != 0:
        return 2 * (precision * recall) / denominator
    return 0

In [34]:
def validate_estimated_book_titles_lc(item, title1=True):
    """
    Compare the NBT titles matched for a Leeuwarder Courant review with its annotated titles.

    Args:
        item (dict): Holds the review text under 'review' and, under 'book_titles',
            one `process.extractOne` result per predicted span. Only the first
            element of each result (the matched title) is used.
        title1 (bool): Validate against the `title1` column of df_lc_clean when True,
            against `title2` otherwise. Defaults to True.
            NOTE(review): a `title2` column of df_lc_clean is not visible in this
            notebook - confirm it exists before using title1=False.

    Returns:
        tuple: (TP, FP, FN) counts as returned by calculate_tp_fp_fn.
    """
    review = item['review']

    # extractOne yields None when it has no candidate to return (e.g. an empty
    # candidate list); such predictions have no title to compare and are skipped
    # instead of crashing on `None[0]`.
    predicted_book_titles = [match[0] for match in item['book_titles'] if match is not None]

    review_rows = df_lc_clean[df_lc_clean['content'] == review]
    if title1:
        actual_book_titles = review_rows.title1.tolist()
    else:
        actual_book_titles = review_rows.title2.tolist()

    return calculate_tp_fp_fn(predicted_titles=predicted_book_titles, actual_titles=actual_book_titles)

def validate_estimated_book_titles_trouw_parool(item):
    """
    Compare the NBT titles matched for a Trouw / Het Parool review with its annotated titles.

    The ground truth is the `title1` column of the trouw_parool_titles_clean rows
    that hold this review text.

    Args:
        item (dict): Holds the review text under 'review' and, under 'book_titles',
            one `process.extractOne` result per predicted span. Only the first
            element of each result (the matched title) is used.

    Returns:
        tuple: (TP, FP, FN) counts as returned by calculate_tp_fp_fn.
    """
    review = item['review']

    # extractOne yields None when it has no candidate to return (e.g. an empty
    # candidate list); such predictions have no title to compare and are skipped
    # instead of crashing on `None[0]`.
    predicted_book_titles = [match[0] for match in item['book_titles'] if match is not None]

    actual_book_titles = trouw_parool_titles_clean[trouw_parool_titles_clean['text'] == review].title1.tolist()

    return calculate_tp_fp_fn(predicted_titles=predicted_book_titles, actual_titles=actual_book_titles)

### Leeuwarder Courant

In [37]:
# Match the predicted spans of every Leeuwarder Courant test review against NBT `title1`
lc_test_book_titles = []
for prediction_item in tqdm(lc_test_predictions):
    lc_test_book_titles.append(
        process_item_lc(prediction_item, title1=True, year_range=3, scorer=fuzz.partial_ratio)
    )



  0%|                                                                                                                                                                                                     | 0/1881 [00:00<?, ?it/s][A[A

  0%|                                                                                                                                                                                             | 1/1881 [00:00<06:50,  4.58it/s][A[A

  0%|▏                                                                                                                                                                                            | 2/1881 [00:00<06:03,  5.17it/s][A[A

  0%|▎                                                                                                                                                                                            | 3/1881 [00:00<05:33,  5.64it/s][A[A

  0%|▍                                                    

In [39]:
# Aggregate TP / FP / FN over all Leeuwarder Courant test reviews (micro-averaging)
total_TP, total_FP, total_FN = 0, 0, 0

for item in lc_test_book_titles:
    TP, FP, FN = validate_estimated_book_titles_lc(item=item, title1=True)

    total_TP += TP
    total_FP += FP
    total_FN += FN

# The result is stored as `f1` rather than `f1_score`, so the
# sklearn.metrics.f1_score function imported at the top is not overwritten by a float.
f1 = calculate_f1_score(tp=total_TP, fp=total_FP, fn=total_FN)
precision = calculate_precision(tp=total_TP, fp=total_FP)
recall = calculate_recall(tp=total_TP, fn=total_FN)

print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

F1 Score: 0.5938092229943147
Precision: 0.6587245970567625
Recall: 0.5405405405405406


In [40]:
# Show the aggregated (TP, FP, FN) counts accumulated by the most recently run evaluation loop
total_TP, total_FP, total_FN

(1880, 974, 1598)

### Trouw

In [41]:
# Match the predicted spans of every Trouw review against NBT `title1`
trouw_test_book_titles = []
for prediction_item in tqdm(trouw_test_predictions):
    trouw_test_book_titles.append(
        process_item_trouw_parool(prediction_item, year_range=3, scorer=fuzz.partial_ratio)
    )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 82/82 [00:28<00:00,  2.83it/s]


In [44]:
# Aggregate TP / FP / FN over all Trouw reviews (micro-averaging)
total_TP, total_FP, total_FN = 0, 0, 0

for item in trouw_test_book_titles:
    TP, FP, FN = validate_estimated_book_titles_trouw_parool(item=item)

    total_TP += TP
    total_FP += FP
    total_FN += FN

# The result is stored as `f1` rather than `f1_score`, so the
# sklearn.metrics.f1_score function imported at the top is not overwritten by a float.
f1 = calculate_f1_score(tp=total_TP, fp=total_FP, fn=total_FN)
precision = calculate_precision(tp=total_TP, fp=total_FP)
recall = calculate_recall(tp=total_TP, fn=total_FN)

print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

F1 Score: 0.5882352941176471
Precision: 0.6875
Recall: 0.514018691588785


In [45]:
# Show the aggregated (TP, FP, FN) counts accumulated by the most recently run evaluation loop
total_TP, total_FP, total_FN

(55, 25, 52)

### Parool

In [46]:
# Match the predicted spans of every Het Parool review against NBT `title1`
parool_test_book_titles = []
for prediction_item in tqdm(parool_test_predictions):
    parool_test_book_titles.append(
        process_item_trouw_parool(prediction_item, year_range=3, scorer=fuzz.partial_ratio)
    )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:13<00:00,  3.37it/s]


In [47]:
# Aggregate TP / FP / FN over all Het Parool reviews (micro-averaging)
total_TP, total_FP, total_FN = 0, 0, 0

for item in parool_test_book_titles:
    TP, FP, FN = validate_estimated_book_titles_trouw_parool(item=item)

    total_TP += TP
    total_FP += FP
    total_FN += FN

# The result is stored as `f1` rather than `f1_score`, so the
# sklearn.metrics.f1_score function imported at the top is not overwritten by a float.
f1 = calculate_f1_score(tp=total_TP, fp=total_FP, fn=total_FN)
precision = calculate_precision(tp=total_TP, fp=total_FP)
recall = calculate_recall(tp=total_TP, fn=total_FN)

print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

F1 Score: 0.5416666666666666
Precision: 0.6190476190476191
Recall: 0.48148148148148145


In [48]:
# Show the aggregated (TP, FP, FN) counts accumulated by the most recently run evaluation loop
total_TP, total_FP, total_FN

(26, 16, 28)