## Import necessary libraries

In [1]:
# !python -m spacy download nl_core_news_sm

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import string
import re

from datasets import Dataset, load_metric
from transformers import DataCollatorForTokenClassification, pipeline, AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, TrainerCallback

import torch

import spacy
from spacy import displacy

import os
from datetime import datetime
import json
import random

from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
import pickle
import math

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from rapidfuzz import process, fuzz
from concurrent.futures import ThreadPoolExecutor, as_completed
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
import itertools

In [4]:
# Get the current notebook directory
current_dir = os.path.abspath('')

# Set the main directory (modify as needed to point to your main project directory)
main_dir = os.path.abspath(os.path.join(current_dir, '../'))

# Change the working directory to the main directory
os.chdir(main_dir)

# Verify that the working directory has been set correctly
print(f"Current working directory: {os.getcwd()}")

Current working directory: C:\Users\niels\PycharmProjects\BookReviewsThesis


In [5]:
# Import custom functions from the dataset preparation module
from scripts.loss_functions import DiceLoss, MoMLoss
from scripts.dataset_preparation import remove_punctuation, find_sentence_in_text, create_mask_for_sentence, process_text, \
                                        create_data_set, trouw_parool_create_dataset, save_dataset, load_dataset, split_samples

## Data Preparation

In [6]:
# Set pandas display option to show all columns
pd.set_option('display.max_columns', None)

In [7]:
# Load data from Excel and CSV files into DataFrames

# Load leeuwarde courant Excel file into a DataFrame
df_lc = pd.read_excel('data/raw/manullay_check_partially_matched_titles.xlsx', engine='openpyxl')

# Load Trouw and Het Parool annotated book review file into a DataFrame
df_trouw_parool = pd.read_csv('data/raw/trouw_and_parool_annotated_book_titles.csv')

In [8]:
# Define function to remove extra spaces from text
def remove_extra_spaces(text):
    """
    Remove extra spaces from a string by replacing multiple spaces with a single space.

    Args:
        text (str): The input text with potential extra spaces.

    Returns:
        str: The cleaned text with single spaces between words.
    """
    cleaned_text = re.sub(r'\s+', ' ', text)
    return cleaned_text.strip()

In [9]:
# Apply the remove_extra_spaces function to relevant columns in df_lc
df_lc['content'] = df_lc['content'].apply(remove_extra_spaces)
df_lc['title1'] = df_lc['title1'].apply(remove_extra_spaces)
df_lc['title4'] = df_lc['title4'].apply(remove_extra_spaces)

In [10]:
# Filter out rows with 'manually_removed' set to 1 and get unique 'content' values
content_removed = df_lc[df_lc['manually_removed'] == 1]['content'].unique()

# Filter out the removed 'content' values from the main DataFrame
df_lc_clean = df_lc[~df_lc['content'].isin(content_removed)]

In [11]:
NBT = pd.read_csv("NBT/full_NBT_database.csv", sep=';', low_memory=False)
NBT.head(2)

Unnamed: 0.1,Id_Pub1,givenName,familyName,title1,title2,year,place,publisher,numberOfPages,bookEdition,inLanguage,isPartOf,translationOfWork,genre,ISBN,bookFormat,description,comment,alternateName,name,fullName,birthDate,deathDate,subject1,subject2,subject3,publicationId,publisherId,Id_Pers,Unnamed: 0,issn
0,p036003166,Johannes Stephanus Antonius Joseph Maria,van Aken,Kalk-suiker : melasse ontsuikeren met kalk : k...,Kalk-suiker : melasse ontsuikeren met kalk : k...,1930.0,Delft,Meinema,"184 p, 5 vouwbl",,ned,,,Book,,PrintBook,Proefschrift Delft,,,Johannes Stephanus Antonius Joseph Maria van Aken,"Aken, Johannes Stephanus Antonius Joseph Maria...",1903,1988,,,,_:b11670325,_:b7149548,p072312238,,
1,p036004146,J.H.R.,Boumans,Driedaagsche retraite / J.H.R. Boumans,Driedaagsche retraite,1930.0,[S.l.,s.n.],4 dl,,ned,,,Book,,PrintBook,,Jean,,J.H.R. Boumans,"Boumans, J.H.R. (1833-1907 ; C.ss.R.)",1833,1907,,,,_:b11670360,_:b7149583,p070405441,,


In [12]:
# Set the model checkpoint to be used for token classification

model_checkpoint = "Babelscape/wikineural-multilingual-ner"

# Alternative model checkpoints (commented out):
# model_checkpoint = "FacebookAI/xlm-roberta-large-finetuned-conll03-english"
# model_checkpoint = "pdelobelle/robbert-v2-dutch-ner"
# model_checkpoint = "GroNLP/bert-base-dutch-cased"
# model_checkpoint = "pdelobelle/robbert-v2-dutch-base"

In [13]:
# Define the list of labels for NER tagging
label_list = ['O', 'I']

In [14]:
# Load the spaCy model for Dutch language processing
nlp = spacy.load("nl_core_news_sm")

In [15]:
# Set flags for punctuation removal and case conversion
remove_punc = False
force_lower_case = False

In [16]:
# Define file paths for saving/loading datasets
lc_train_filename = 'C:/Users/niels/PycharmProjects/BookReviewsThesis/data/processed/lc_train_dataset.pkl'
lc_val_filename = 'C:/Users/niels/PycharmProjects/BookReviewsThesis/data/processed/lc_val_dataset.pkl'
lc_test_filename = 'C:/Users/niels/PycharmProjects/BookReviewsThesis/data/processed/lc_test_dataset.pkl'

trouw_parool_test_filename = 'C:/Users/niels/PycharmProjects/BookReviewsThesis/data/processed/trouw_parool_test_dataset.pkl'

In [17]:
# Split the samples into training, validation, and test sets

# Set the random seed for reproducibility
np.random.seed(42)

# Get unique content samples
samples = df_lc_clean['content'].unique()

# Split the samples into training, validation, and test sets
lc_train_samples, lc_val_samples, lc_test_samples = split_samples(samples=samples, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15)

In [18]:
# Load the tokenizer from the specified model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [19]:
# Define functions to merge overlapping intervals and visualize NER output

def merge_overlapping_intervals(intervals):
    """
    Merge overlapping intervals in a list of intervals.

    Args:
        intervals (list of tuple): List of intervals (start, end, label).

    Returns:
        list of tuple: List of merged intervals.
    """
    merged_intervals = []
    if not intervals:
        return merged_intervals

    # Sort intervals based on the start value
    intervals.sort(key=lambda x: x[0])

    # Initialize variables for the first interval
    start, end, label = intervals[0]

    # Iterate through the intervals
    for interval in intervals[1:]:
        next_start, next_end, next_label = interval

        # If the intervals overlap, merge them
        if next_start <= end + 1:
            end = max(end, next_end)
        else:
            # If no overlap, add the merged interval to the result and update start, end, label
            merged_intervals.append((start, end, label))
            start, end, label = next_start, next_end, next_label

    # Add the last merged interval
    merged_intervals.append((start, end, "BOOK"))

    return merged_intervals

def visualize_output(output, text, export_path=None):
    """
    Visualize NER output using spaCy's displaCy.

    Args:
        output (list): List of dictionaries containing NER predictions.
        text (str): The input text.
        export_path (str, optional): Path to save the visualization as an HTML file.

    Returns:
        None
    """
    spans = [(res['start'], res['end'], res['entity']) for res in output if res['entity'] == 'BOOK']
    spans = merge_overlapping_intervals(spans)

    nlp = spacy.blank('nl')
    doc = nlp.make_doc(text)
    ents = []
    for span_start, span_end, label in spans:
        ent = doc.char_span(span_start, span_end, label=label)
        if ent is None:
            continue
    
        ents.append(ent)
    
    doc.ents = ents
    
    if export_path:
        html = displacy.render(doc, style="ent", jupyter=False, page=True)
        # Save the HTML to a file
        with open(export_path, "w", encoding="utf-8") as file:
            file.write(html)
    else:
        displacy.render(doc, style="ent", jupyter=True)

In [20]:
export_path = "models/transformer/2024-05-28_15_55/"

In [21]:
model = AutoModelForTokenClassification.from_pretrained(export_path + "model")
tokenizer = AutoTokenizer.from_pretrained(export_path + "tokenizer")

In [22]:
pipe = pipeline(task="token-classification", model=model, tokenizer=tokenizer)

In [23]:
# Shuffle the list with the given seed
random.seed(42)
random.shuffle(lc_test_samples)

In [24]:
def process_review(review):
    output = pipe(review)
    spans = [(res['start'], res['end'], res['entity']) for res in output if res['entity'] == 'BOOK']
    spans = merge_overlapping_intervals(spans)
    return {"review": review, "predictions": spans}

# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor() as executor:
    futures = {executor.submit(process_review, review): review for review in lc_test_samples[:200]}
    lc_test_predictions = []

    for future in tqdm(as_completed(futures), total=len(futures)):
        lc_test_predictions.append(future.result())

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:42<00:00,  4.66it/s]


In [25]:
# Function to adjust predictions with a single adjustment value
def adjust_prediction_indices(prediction, review_length, adjustment):
    if adjustment > 0: 
        new_start = max(prediction[0] - adjustment, 0)
        new_end = min(prediction[1] + adjustment, review_length)
        return (new_start, new_end)
    else:
        return (prediction[0], prediction[1])

def process_item(item, adjustment=0, title1=True, year_range=5, scorer=fuzz.partial_ratio):
    review = item['review']
    predictions = item['predictions']
    
    
    review_year = df_lc_clean[df_lc_clean['content'] == review].iloc[0].yearRev

    if title1:
        NBT_titles = NBT[(NBT['year'] <= review_year ) & (NBT['year'] > review_year - year_range)].title1.tolist()
    else:
        NBT_titles = NBT[(NBT['year'] <= review_year ) & (NBT['year'] > review_year - year_range)].title2.tolist()
    
    book_titles = [
        process.extractOne(review[adjusted[0]:adjusted[1]], NBT_titles, scorer=scorer)
        for prediction in predictions
        for adjusted in [adjust_prediction_indices(prediction, review_length=len(review), adjustment=adjustment)]
    ]

    # labels = df_lc_clean[df_lc_clean['content'] == review].title4.tolist()


    # book_titles = [
    #     process.extractOne(label, NBT_titles, scorer=scorer)
    #     for label in labels]

    
    return {"review": review, "book_titles": book_titles}

In [26]:
def calculate_tp_fp_fn(predicted_titles, actual_titles):
    # Convert lists to sets for easier comparison
    predicted_set = set(predicted_titles)
    actual_set = set(actual_titles)
    
    # Calculate True Positives (TP): Titles present in both predicted and actual sets
    TP = len(predicted_set & actual_set)
    
    # Calculate False Positives (FP): Titles in predicted set but not in actual set
    FP = len(predicted_set - actual_set)
    
    # Calculate False Negatives (FN): Titles in actual set but not in predicted set
    FN = len(actual_set - predicted_set)
    
    return TP, FP, FN


def calculate_precision(tp, fp):
    """Calculate precision."""
    if tp + fp == 0:
        return 0
    return tp / (tp + fp)

def calculate_recall(tp, fn):
    """Calculate recall."""
    if tp + fn == 0:
        return 0
    return tp / (tp + fn)

def calculate_f1_score(tp, fp, fn):
    """Calculate F1 score using precision and recall."""
    precision = calculate_precision(tp, fp)
    recall = calculate_recall(tp, fn)
    
    if precision + recall == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

In [27]:
def validate_estimated_book_titles(item, title1=True):
    review = item['review']
                
    predicted_book_titles = [title[0] for title in item['book_titles']]
    if title1:
        actual_book_titles = df_lc_clean[df_lc_clean['content'] == review].title1.tolist()
    else:
        actual_book_titles = df_lc_clean[df_lc_clean['content'] == review].title2.tolist()
    return calculate_tp_fp_fn(predicted_titles=predicted_book_titles, actual_titles=actual_book_titles)

In [28]:
# lc_test_book_titles = [process_item(x, adjustment=10, title1=True, year_range=3, scorer=fuzz.partial_ratio) for x in tqdm(lc_test_predictions)]

In [29]:
# total_TP, total_FP, total_FN = 0, 0, 0 

# for item in lc_test_book_titles:
#     TP, FP, FN = validate_estimated_book_titles(item=item, title1=True)
    
#     total_TP += TP
#     total_FP += FP
#     total_FN += FN

# f1_score = calculate_f1_score(tp=total_TP, fp=total_FP, fn=total_FN)
# precision = calculate_precision(tp=total_TP, fp=total_FP)
# recall = calculate_recall(tp=total_TP, fn=total_FN)

# print(f"F1 Score: {f1_score}")
# print(f"Precision: {precision}")
# print(f"Recall: {recall}")

In [30]:
# year range 2 0.5859473023839399
# year range 3 0.5772485114384206

In [31]:
# Convert the columns to numeric
df_lc_clean['yearPub'] = pd.to_numeric(df_lc_clean['yearPub'], errors='coerce')
df_lc_clean['yearRev'] = pd.to_numeric(df_lc_clean['yearRev'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_lc_clean['yearPub'] = pd.to_numeric(df_lc_clean['yearPub'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_lc_clean['yearRev'] = pd.to_numeric(df_lc_clean['yearRev'], errors='coerce')


In [32]:
# Compute the difference
differences = df_lc_clean['yearRev'] - df_lc_clean['yearPub']

# Calculate the 2.5th percentile and the 97.5th percentile
lower_bound = np.percentile(differences.dropna(), 2.5)
upper_bound = np.percentile(differences.dropna(), 97.5)

print(f"The 95% interval for the years difference between the publication of the review and the publication of the book is: ({lower_bound:.2f}, {upper_bound:.2f})")

The 95% interval for the years difference between the publication of the review and the publication of the book is: (0.00, 2.00)


In [33]:
# Define the range of values for each parameter
title1_options = [True]
adjustment_options = [0, 10, 20, 60, 100] 
year_range_options = [3]
scorers = [
    fuzz.ratio,
    fuzz.partial_ratio,
    fuzz.token_sort_ratio,
    fuzz.token_set_ratio,
    fuzz.partial_token_sort_ratio,
    fuzz.partial_token_set_ratio
]

# Calculate total combinations
len_title1 = len(title1_options)
len_adjustment = len(adjustment_options)
len_year_range = len(year_range_options)
len_scorers = len(scorers)

total_combinations = len_title1 * len_adjustment * len_year_range * len_scorers
print(f"Total combinations: {total_combinations}")

best_f1 = 0
best_params = {'title1': None, 'adjustment': None, 'year_range': None, 'scorer': None}
results = []  # List to store all F1 scores and parameters

Total combinations: 30


In [None]:
# Grid search
iteration_count = 0  # Initialize the iteration counter

for title1, adjustment, year_range, scorer in tqdm(itertools.product(title1_options, adjustment_options, year_range_options, scorers), total=total_combinations):
    lc_test_book_titles = [process_item(x, adjustment=adjustment, title1=title1, year_range=year_range, scorer=scorer) for x in lc_test_predictions]
    
    total_TP, total_FP, total_FN = 0, 0, 0 

    for item in lc_test_book_titles:
        TP, FP, FN = validate_estimated_book_titles(item=item, title1=title1)
        
        total_TP += TP
        total_FP += FP
        total_FN += FN
    
    f1 = calculate_f1_score(tp=total_TP, fp=total_FP, fn=total_FN)
    
    results.append({
        'title1': title1,
        'adjustment': adjustment,
        'year_range': year_range,
        'scorer': scorer.__name__,
        'f1_score': f1
    })
    
    if f1 > best_f1:
        best_f1 = f1
        best_params = {
            'title1': title1,
            'adjustment': adjustment,
            'year_range': year_range,
            'scorer': scorer.__name__
        }

    iteration_count += 1  # Increment the iteration counter

    # Print best F1 score and parameters every 10 iterations
    if iteration_count % 10 == 0:
        print(f"Iteration {iteration_count}")
        print(f"Best F1 Score so far: {best_f1}")
        print(f"Best Parameters so far: {best_params}")

# Final best F1 score and parameters
print(f"Best F1 Score: {best_f1}")
print(f"Best Parameters: {best_params}")


 33%|███████████████████████████████████████████████████████████████▎                                                                                                                              | 10/30 [12:48<31:17, 93.85s/it]

Iteration 10
Best F1 Score so far: 0.6427503736920777
Best Parameters so far: {'title1': True, 'adjustment': 0, 'year_range': 3, 'scorer': 'partial_ratio'}


 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                              | 20/30 [45:00<1:03:49, 382.92s/it]

Iteration 20
Best F1 Score so far: 0.6427503736920777
Best Parameters so far: {'title1': True, 'adjustment': 0, 'year_range': 3, 'scorer': 'partial_ratio'}


 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 25/30 [1:10:38<23:31, 282.38s/it]

In [None]:
results

In [None]:
df = pd.DataFrame(results)

# Plotting f1 vs adjustment
plt.figure(figsize=(10, 6))
plt.plot(df['adjustment'], df['f1_score'], 'o-')
plt.xlabel('Adjustment')
plt.ylabel('F1 Score')
plt.title('F1 Score vs Adjustment')
plt.grid(True)
plt.show()

# Plotting f1 vs year_range
plt.figure(figsize=(10, 6))
plt.plot(df['year_range'], df['f1_score'], 'o-')
plt.xlabel('Year Range')
plt.ylabel('F1 Score')
plt.title('F1 Score vs Year Range')
plt.grid(True)
plt.show()

# Plotting f1 vs scorer
plt.figure(figsize=(10, 6))
plt.bar(df['scorer'], df['f1_score'])
plt.xlabel('Scorer')
plt.ylabel('F1 Score')
plt.title('F1 Score vs Scorer')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

In [None]:
# Best Parameters so far: {'title1': True, 'adjustment': 0, 'year_range': 3, 'scorer': 'partial_ratio'}
# Best F1 Score so far: 0.6397608370702541