In [25]:
!python -m spacy download nl_core_news_sm

Collecting nl-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-3.7.0/nl_core_news_sm-3.7.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 393.8 kB/s eta 0:00:33
      --------------------------------------- 0.2/12.8 MB 1.2 MB/s eta 0:00:11
     - -------------------------------------- 0.4/12.8 MB 1.9 MB/s eta 0:00:07
     - -------------------------------------- 0.6/12.8 MB 2.7 MB/s eta 0:00:05
     -- ------------------------------------- 0.9/12.8 MB 3.2 MB/s eta 0:00:04
     ---- ----------------------------------- 1.4/12.8 MB 4.1 MB/s eta 0:00:03
     ---- ----------------------------------- 1.5/12.8 MB 4.1 MB/s eta 0:00:03
     ------ --------------------------------- 2.0/1

In [1]:
import pandas as pd
import numpy as np
import string
import re

from datasets import Dataset, load_metric
from transformers import DataCollatorForTokenClassification, pipeline, AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

import torch

import spacy
from spacy import displacy

import os
from datetime import datetime
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from loss_functions import DiceLoss, MoMLoss

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')
# folder_path = '/content/drive/MyDrive/UU/Thesis'

# Load an Excel file into a DataFrame
# df = pd.read_excel(folder_path + '/LCReviewsIntegrated_1962-1994.xlsx', engine='openpyxl')

In [5]:
# Load an Excel file into a DataFrame
df = pd.read_excel('cleaned.xlsx', engine='openpyxl')
# df = pd.read_excel('LCReviewsIntegrated_1962-1994.xlsx', engine='openpyxl')

In [6]:
def remove_extra_spaces(text):
    # Replace multiple spaces with a single space
    cleaned_text = re.sub(r'\s+', ' ', text)
    return cleaned_text.strip()

In [7]:
# df['content'] = df['content'].apply(remove_extra_spaces)
# df['title1'] = df['title1'].apply(remove_extra_spaces)

In [8]:
def remove_punctuation(input_string):
    # Create a translation table that maps each punctuation character to None
    translator = str.maketrans('', '', string.punctuation)
    # Translate the input string using the translation table
    return input_string.translate(translator)

In [9]:
# def extract_title(x):
#     if not (type(x.title1) == str):
#         return np.nan
#     if not (type(x.content) == str):
#         return np.nan
#     if x.title1.lower() in x.content.lower():
#         return x.title1

#     sentence_parts = re.split(r' / | : ', x.title1.lower())
#     sentence_parts = sorted(sentence_parts, key=len, reverse=True)
#     for part in sentence_parts:
#         if part in x.content.lower():
#             return part
#         elif remove_punctuation(part) in x.content.lower():
#             return remove_punctuation(part)

#     if remove_punctuation(x.title1).lower() in x.content.lower():
#         return remove_punctuation(x.title1)

#     return "error"

In [10]:
df_clean = df

## Check what tokens are present before the title

In [12]:
task = "ner"  # Should be one of "ner", "pos" or "chunk"
# model_checkpoint = "distilbert-base-uncased"
model_checkpoint = "Babelscape/wikineural-multilingual-ner"

# TO USE:
# "FacebookAI/xlm-roberta-large-finetuned-conll03-english"
# "pdelobelle/robbert-v2-dutch-ner"
# "GroNLP/bert-base-dutch-cased"
# "pdelobelle/robbert-v2-dutch-base"

In [13]:
label_list = ['O', 'I']

In [24]:
def find_sentence_in_text(full_text, sentence):
    start_index = full_text.find(sentence)
    if start_index == -1:
        raise ValueError("Sentence not found in text.")
    end_index = start_index + len(sentence)
    return start_index, end_index


def create_mask_for_sentence(full_text, sentence, nlp):
    # Use the already loaded nlp model to process the text
    doc = nlp(full_text)
    start_index, end_index = find_sentence_in_text(full_text, sentence)
    if start_index is None:
        return None, None

    tokens = [token.text for token in doc]
    mask = [0] * len(doc)

    for i, token in enumerate(doc):
        token_end_idx = token.idx + len(token.text)
        if token.idx <= end_index and token_end_idx >= start_index:
            mask[i] = 1

    return tokens, mask


def create_data_set(samples, df, nlp):
    data = []
    for sample in samples:
        unique_content_df = df[df['content'] == sample]
        masks = []
        review = remove_punctuation(sample.lower())
        for _, row in unique_content_df.iterrows():
            book = remove_punctuation(row['title4'].lower())
            tokens, mask = create_mask_for_sentence(review, book, nlp)
            if mask is not None:
                masks.append(mask)

        if masks:
            combined_mask = np.bitwise_or.reduce(np.array(masks), axis=0)
            data.append({"tokens": tokens, "ner_tags": combined_mask})

    return data

In [19]:
nlp = spacy.load("nl_core_news_sm")

In [46]:
samples = df_clean['content'].unique()

train_dataset = Dataset.from_list(create_data_set(samples[:int(len(samples) * 0.85)], df_clean, nlp))
val_dataset = Dataset.from_list(create_data_set(samples[int(len(samples) * 0.85):], df_clean, nlp))

In [47]:
train_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 6299
})

In [48]:
val_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1112
})

In [49]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [50]:
label_all_tokens = True

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [51]:
tokenized_dataset_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_dataset_val = val_dataset.map(tokenize_and_align_labels, batched=True)

tokenized_dataset_train

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6299/6299 [00:11<00:00, 548.07 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1112/1112 [00:02<00:00, 508.93 examples/s]


Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 6299
})

In [52]:
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)
    ]

    total_count_O = 0
    total_count_I = 0
    for pred in true_predictions:
        total_count_O = total_count_O + sum(s.count("O") for s in pred)
        total_count_I = total_count_I + sum(s.count("I") for s in pred)



    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "total_count_O": total_count_O,
        "total_count_I": total_count_I,
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [53]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [54]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=2, ignore_mismatched_sizes=True
)
model.config.id2label = {0: 'NO_BOOK', 1: 'BOOK'}

Some weights of BertForTokenClassification were not initialized from the model checkpoint at Babelscape/wikineural-multilingual-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        """ MoM LOSS """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits  # Assuming logits are stored in outputs.logits

        # Compute loss using DiceLoss
        loss_fn = MoMLoss()
        loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

    # def compute_loss(self, model, inputs, return_outputs=False):
    #     # """ Custom weighted CrossEntropyLoss """
    #     labels = inputs.pop("labels")
    #     outputs = model(**inputs)
    #     logits = outputs[0]

    #     # Reshape logits to [batch_size * sequence_length, num_classes]
    #     logits = logits.view(-1, logits.size(-1))

    #     # Reshape labels to [batch_size * sequence_length]
    #     labels = labels.view(-1)

    #     class_weights = torch.tensor([0.5, 28.27], dtype=torch.float32, device=torch.device("cuda"))
    #     loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

    #     loss = loss_fn(logits.to(torch.device("cuda")), labels.to(torch.device("cuda")) )

    #     return (loss, outputs) if return_outputs else loss

    # def compute_loss(self, model, inputs, return_outputs=False):
    #     """ DICE LOSS """
    #     # TODO: IGNORE -100 labels
    #     labels = inputs.pop("labels")
    #     outputs = model(**inputs)
    #     logits = outputs.logits  # Assuming logits are stored in outputs.logits

    #     # Compute loss using DiceLoss
    #     loss_fn = DiceLoss()
    #     loss = loss_fn(logits, labels)

    #     return (loss, outputs) if return_outputs else loss

In [56]:
training_args = TrainingArguments(
    output_dir="test_model",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=30,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    push_to_hub=False
)

In [57]:
# Create a custom Trainer instance
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [58]:
del df
del train_dataset
del val_dataset

In [59]:
trainer.train()

Epoch,Training Loss,Validation Loss,Total Count O,Total Count I,F1,Accuracy
1,0.5108,0.436569,469231,38464,0.062181,0.932207
2,0.3171,0.440617,474522,33173,0.090609,0.942566
3,0.2443,0.666952,486777,20918,0.196432,0.965247
4,0.1761,0.938795,490292,17403,0.223087,0.97152
5,0.1483,1.04385,493725,13970,0.27207,0.977451
6,0.1224,1.054049,489527,18168,0.221188,0.970073
7,0.1113,1.400311,495110,12585,0.309162,0.979785
8,0.0939,1.820776,497806,9889,0.349967,0.983996
9,0.087,1.539507,495093,12602,0.319496,0.979736
10,0.0757,2.013481,496640,11055,0.336403,0.982157


TrainOutput(global_step=15750, training_loss=0.07869710150219145, metrics={'train_runtime': 12633.4373, 'train_samples_per_second': 14.958, 'train_steps_per_second': 1.247, 'total_flos': 4.937724812040192e+16, 'train_loss': 0.07869710150219145, 'epoch': 30.0})

#### Export model, tokenizer & history

In [61]:
export_path = f"C:/Users/niels/PycharmProjects/BookReviewsThesis/models/{datetime.now().strftime('%Y-%m-%d_%H_%M')}/"
export_path

'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-30_10_04/'

In [62]:
os.makedirs(export_path, exist_ok=True)

In [63]:
trainer.save_model(export_path + "model")
tokenizer.save_pretrained(export_path + "tokenizer")

('C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-30_10_04/tokenizer\\tokenizer_config.json',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-30_10_04/tokenizer\\special_tokens_map.json',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-30_10_04/tokenizer\\vocab.txt',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-30_10_04/tokenizer\\added_tokens.json',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-30_10_04/tokenizer\\tokenizer.json')

In [64]:
with open(export_path + "history.json", 'w') as file:
    json.dump(trainer.state.log_history, file, indent=4)
with open(export_path + "model_name.txt", 'w') as file:
    file.write(model_checkpoint)  # Writing the string to the file

#### Make and visualize predictions on unseen data

In [65]:
def merge_overlapping_intervals(intervals):
    merged_intervals = []
    if not intervals:
        return merged_intervals

    # Sort intervals based on the start value
    intervals.sort(key=lambda x: x[0])

    # Initialize variables for the first interval
    start, end, label = intervals[0]

    # Iterate through the intervals
    for interval in intervals[1:]:
        next_start, next_end, next_label = interval

        # If the intervals overlap, merge them
        if next_start <= end + 1:
            end = max(end, next_end)
        else:
            # If no overlap, add the merged interval to the result and update start, end, label
            merged_intervals.append((start, end, label))
            start, end, label = next_start, next_end, next_label

    # Add the last merged interval
    merged_intervals.append((start, end, "BOOK"))

    return merged_intervals

def visualize_output(output):
    spans = [(res['start'], res['end'], res['entity']) for res in output if res['entity'] == 'BOOK']
    spans = merge_overlapping_intervals(spans)

    nlp = spacy.blank('nl')
    doc = nlp.make_doc(sentence)
    ents = []
    for span_start, span_end, label in spans:
        ent = doc.char_span(span_start, span_end, label=label)
        if ent is None:
            continue
    
        ents.append(ent)
    
    doc.ents = ents
    displacy.render(doc, style="ent", jupyter=True)

In [88]:
visualize_index = -3

In [89]:
sentence = remove_punctuation(df_clean['content'].unique()[visualize_index].lower())

In [90]:
pipe = pipeline(task="token-classification", model=model, tokenizer=tokenizer)

In [91]:
output = pipe(sentence)

In [92]:
df_clean[df_clean['content'] == df_clean['content'].unique()[visualize_index]].title1

11279          Volgend jaar in Holysloot / Eric Terduyn
11280    Een huis vol angst / [door] Aleida Leeuwenberg
11281       Het lot van de kunst / [door] Joop Waasdorp
Name: title1, dtype: object

In [93]:
visualize_output(output)

#### Load model and tokenizer

In [96]:
model_new = AutoModelForTokenClassification.from_pretrained(export_path + "model")
tokenizer_new = AutoTokenizer.from_pretrained(export_path + "tokenizer")

In [97]:
pipe = pipeline(task="token-classification", model=model_new, tokenizer=tokenizer_new)

In [98]:
sentence = remove_punctuation(df_clean['content'].unique()[0].lower())

In [99]:
output = pipe(sentence)

In [103]:
visualize_output(output)