In [1]:
# !python -m spacy download nl_core_news_sm

In [2]:
import pandas as pd
import numpy as np
import string
import re

from datasets import Dataset, load_metric
from transformers import DataCollatorForTokenClassification, pipeline, AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

import torch

import spacy
from spacy import displacy

import os
from datetime import datetime
import json

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from loss_functions import DiceLoss, MoMLoss

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
# Load an Excel file into a DataFrame
df = pd.read_excel('manullay_check_partially_matched_titles.xlsx', engine='openpyxl')
# df = pd.read_excel('LCReviewsIntegrated_1962-1994.xlsx', engine='openpyxl')

In [6]:
def remove_extra_spaces(text):
    # Replace multiple spaces with a single space
    cleaned_text = re.sub(r'\s+', ' ', text)
    return cleaned_text.strip()

In [7]:
df['content'] = df['content'].apply(remove_extra_spaces)
df['title1'] = df['title1'].apply(remove_extra_spaces)
df['title4'] = df['title4'].apply(remove_extra_spaces)

In [8]:
# def remove_punctuation(input_string):
#     # Create a translation table that maps each punctuation character to None
#     translator = str.maketrans('', '', string.punctuation)
#     # Translate the input string using the translation table
#     return input_string.translate(translator)

In [9]:
# First, filter rows where 'manually_removed' is 1 and get unique 'content' values in these rows
content_removed = df[df['manually_removed'] == 1]['content'].unique()

# Now, filter out these 'content' values from the main DataFrame and find unique 'content' not removed
df_clean = df[~df['content'].isin(content_removed)]

## Check what tokens are present before the title

In [10]:
task = "ner"  # Should be one of "ner", "pos" or "chunk"
# model_checkpoint = "distilbert-base-uncased"

model_checkpoint = "Babelscape/wikineural-multilingual-ner"   # BEST F1 0.65

# TO USE:
# model_checkpoint = "FacebookAI/xlm-roberta-large-finetuned-conll03-english"
# model_checkpoint = "pdelobelle/robbert-v2-dutch-ner"
# model_checkpoint = "GroNLP/bert-base-dutch-cased"   # BEST F1 0.59
# model_checkpoint = "pdelobelle/robbert-v2-dutch-base"

In [11]:
label_list = ['O', 'I']

In [12]:
def find_sentence_in_text(full_text, sentence):
    start_index = full_text.find(sentence)
    if start_index == -1:
        raise ValueError("Sentence not found in text.")
    end_index = start_index + len(sentence)
    return start_index, end_index


def create_mask_for_sentence(full_text, sentence, nlp, force_lower_case=False):
    # Use the already loaded nlp model to process the text
    doc = nlp(full_text)
    start_index, end_index = find_sentence_in_text(full_text.lower(), sentence.lower())
    if start_index is None:
        return None, None
    if force_lower_case:
        tokens = [token.text.lower() for token in doc]
    else:
        tokens = [token.text for token in doc]
    mask = [0] * len(doc)

    for i, token in enumerate(doc):
        token_end_idx = token.idx + len(token.text)
        if token.idx <= end_index and token_end_idx >= start_index:
            mask[i] = 1

    return tokens, mask


def create_data_set(samples, df, nlp, remove_punc=False, force_lower_case=False):
    data = []
    for sample in tqdm(samples):
        unique_content_df = df[df['content'] == sample]
        masks = []

        if remove_punc:
            review = remove_punctuation(sample)
        else:
            review = sample
        
        for _, row in unique_content_df.iterrows():
            if remove_punc:
                book = remove_punctuation(row['title4'])
            else:
                book = row['title4']
                
            tokens, mask = create_mask_for_sentence(full_text=review, sentence=book, nlp=nlp, force_lower_case=force_lower_case) 
            if mask is not None:
                masks.append(mask)

        if masks:
            combined_mask = np.bitwise_or.reduce(np.array(masks), axis=0)
            data.append({"tokens": tokens, "ner_tags": combined_mask})

    return data

In [13]:
nlp = spacy.load("nl_core_news_sm")

In [None]:
remove_punc = False
force_lower_case = False

In [14]:
# Set the random seed for reproducibility
np.random.seed(42)

# Shuffle the unique samples
samples = df_clean['content'].unique()
np.random.shuffle(samples)

# Calculate the split index
split_idx = int(len(samples) * 0.85)

# Split the samples into training and validation sets
train_samples = samples[:split_idx]
val_samples = samples[split_idx:]

# Create training and validation datasets
train_dataset = Dataset.from_list(create_data_set(samples=train_samples, df=df_clean, nlp=nlp, remove_punc=remove_punc, force_lower_case=force_lower_case))
val_dataset = Dataset.from_list(create_data_set(samples=val_samples, df=df_clean, nlp=nlp, remove_punc=remove_punc, force_lower_case=force_lower_case))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10654/10654 [17:06<00:00, 10.37it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1881/1881 [02:58<00:00, 10.54it/s]


In [15]:
train_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 10654
})

In [16]:
val_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1881
})

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [18]:
label_all_tokens = True

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [19]:
tokenized_dataset_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_dataset_val = val_dataset.map(tokenize_and_align_labels, batched=True)

tokenized_dataset_train

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10654/10654 [00:20<00:00, 526.69 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1881/1881 [00:03<00:00, 529.63 examples/s]


Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10654
})

In [20]:
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)
    ]

    total_count_O = 0
    total_count_I = 0
    for pred in true_predictions:
        total_count_O = total_count_O + sum(s.count("O") for s in pred)
        total_count_I = total_count_I + sum(s.count("I") for s in pred)



    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "total_count_O": total_count_O,
        "total_count_I": total_count_I,
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [21]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [22]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=2, ignore_mismatched_sizes=True
)
model.config.id2label = {0: 'NO_BOOK', 1: 'BOOK'}

Some weights of BertForTokenClassification were not initialized from the model checkpoint at Babelscape/wikineural-multilingual-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        """ MoM LOSS """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits  # Assuming logits are stored in outputs.logits

        # Compute loss using DiceLoss
        loss_fn = MoMLoss()
        loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

    # def compute_loss(self, model, inputs, return_outputs=False):
    #     # """ Custom weighted CrossEntropyLoss """
    #     labels = inputs.pop("labels")
    #     outputs = model(**inputs)
    #     logits = outputs[0]

    #     # Reshape logits to [batch_size * sequence_length, num_classes]
    #     logits = logits.view(-1, logits.size(-1))

    #     # Reshape labels to [batch_size * sequence_length]
    #     labels = labels.view(-1)

    #     class_weights = torch.tensor([0.5, 28.27], dtype=torch.float32, device=torch.device("cuda"))
    #     loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

    #     loss = loss_fn(logits.to(torch.device("cuda")), labels.to(torch.device("cuda")) )

    #     return (loss, outputs) if return_outputs else loss

    # def compute_loss(self, model, inputs, return_outputs=False):
    #     """ DICE LOSS """
    #     # TODO: IGNORE -100 labels
    #     labels = inputs.pop("labels")
    #     outputs = model(**inputs)
    #     logits = outputs.logits  # Assuming logits are stored in outputs.logits

    #     # Compute loss using DiceLoss
    #     loss_fn = DiceLoss()
    #     loss = loss_fn(logits, labels)

    #     return (loss, outputs) if return_outputs else loss

In [24]:
training_args = TrainingArguments(
    output_dir="test_model",
    learning_rate=2e-5,
    per_device_train_batch_size=14,
    per_device_eval_batch_size=14,
    num_train_epochs=30,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    push_to_hub=False
)

In [25]:
# Create a custom Trainer instance
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [26]:
del df
del train_dataset
del val_dataset

In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss,Total Count O,Total Count I,F1,Accuracy
1,0.3893,0.265928,813573,51896,0.151651,0.960353
2,0.2395,0.268553,827138,38331,0.268569,0.975451
3,0.1747,0.244735,822122,43347,0.214773,0.970264
4,0.1297,0.448475,836294,29175,0.345721,0.984316
5,0.1057,0.541129,835258,30211,0.349723,0.983475
6,0.0828,0.457607,836598,28871,0.392194,0.984921
7,0.0721,0.678063,839269,26200,0.450374,0.987132
8,0.0599,0.815517,841087,24382,0.460865,0.988481
9,0.0518,0.736648,839147,26322,0.416176,0.986931
10,0.0483,0.832799,840807,24662,0.441864,0.988484


TrainOutput(global_step=22830, training_loss=0.054971887101429466, metrics={'train_runtime': 25102.9699, 'train_samples_per_second': 12.732, 'train_steps_per_second': 0.909, 'total_flos': 8.351566938796032e+16, 'train_loss': 0.054971887101429466, 'epoch': 30.0})

#### Export model, tokenizer & history

In [28]:
export_path = f"C:/Users/niels/PycharmProjects/BookReviewsThesis/models/{datetime.now().strftime('%Y-%m-%d_%H_%M')}/"
export_path

'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-05-06_22_33/'

In [29]:
os.makedirs(export_path, exist_ok=True)

In [30]:
trainer.save_model(export_path + "model")
tokenizer.save_pretrained(export_path + "tokenizer")

('C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-05-06_22_33/tokenizer\\tokenizer_config.json',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-05-06_22_33/tokenizer\\special_tokens_map.json',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-05-06_22_33/tokenizer\\vocab.txt',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-05-06_22_33/tokenizer\\added_tokens.json',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-05-06_22_33/tokenizer\\tokenizer.json')

In [31]:
with open(export_path + "history.json", 'w') as file:
    json.dump(trainer.state.log_history, file, indent=4)
with open(export_path + "model_name.txt", 'w') as file:
    file.write(model_checkpoint)  # Writing the string to the file

#### Make and visualize predictions on unseen data

In [45]:
def merge_overlapping_intervals(intervals):
    merged_intervals = []
    if not intervals:
        return merged_intervals

    # Sort intervals based on the start value
    intervals.sort(key=lambda x: x[0])

    # Initialize variables for the first interval
    start, end, label = intervals[0]

    # Iterate through the intervals
    for interval in intervals[1:]:
        next_start, next_end, next_label = interval

        # If the intervals overlap, merge them
        if next_start <= end + 1:
            end = max(end, next_end)
        else:
            # If no overlap, add the merged interval to the result and update start, end, label
            merged_intervals.append((start, end, label))
            start, end, label = next_start, next_end, next_label

    # Add the last merged interval
    merged_intervals.append((start, end, "BOOK"))

    return merged_intervals

def visualize_output(output, text):
    spans = [(res['start'], res['end'], res['entity']) for res in output if res['entity'] == 'BOOK']
    spans = merge_overlapping_intervals(spans)

    nlp = spacy.blank('nl')
    doc = nlp.make_doc(text)
    ents = []
    for span_start, span_end, label in spans:
        ent = doc.char_span(span_start, span_end, label=label)
        if ent is None:
            continue
    
        ents.append(ent)
    
    doc.ents = ents
    displacy.render(doc, style="ent", jupyter=True)

In [85]:
pipe = pipeline(task="token-classification", model=model, tokenizer=tokenizer)

In [110]:
visualize_index = -7

In [111]:
validation_df = df_clean[df_clean['content'].isin(val_samples)]

In [112]:
output = pipe(validation_df.iloc[visualize_index]["content"].lower())

In [113]:
validation_df[validation_df['content'] == validation_df.iloc[visualize_index]["content"]].title4

25900        Vreugde van het wildspoor
25901                100 kleine tuinen
25902                     Het keerpunt
25903                      Beter skiën
25904    Eleazar, de rabbijn van Worms
Name: title4, dtype: object

In [114]:
visualize_output(output, validation_df.iloc[visualize_index]["content"])

#### Load model and tokenizer

In [None]:
model_new = AutoModelForTokenClassification.from_pretrained(export_path + "model")
tokenizer_new = AutoTokenizer.from_pretrained(export_path + "tokenizer")

In [None]:
pipe = pipeline(task="token-classification", model=model_new, tokenizer=tokenizer_new)

In [None]:
sentence = remove_punctuation(df_clean['content'].unique()[0].lower())

In [None]:
output = pipe(sentence)

In [None]:
visualize_output(output)