In [1]:
import pandas as pd
import numpy as np
import string
import re

import nltk
from nltk.tokenize import word_tokenize

from datasets import Dataset, load_metric
from transformers import DataCollatorForTokenClassification, pipeline, AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

import torch

import spacy
from spacy import displacy

import os
from datetime import datetime
import json


# Ensure you have the necessary NLTK tokenizer models downloaded
# nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm


In [85]:
from loss_functions import DiceLoss, MoMLoss

In [3]:
pd.set_option('display.max_columns', None)

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')
# folder_path = '/content/drive/MyDrive/UU/Thesis'

# Load an Excel file into a DataFrame
# df = pd.read_excel(folder_path + '/LCReviewsIntegrated_1962-1994.xlsx', engine='openpyxl')

In [4]:
# Load an Excel file into a DataFrame
df = pd.read_excel('LCReviewsIntegrated_1962-1994.xlsx', engine='openpyxl')

In [5]:
def remove_extra_spaces(text):
    # Replace multiple spaces with a single space
    cleaned_text = re.sub(r'\s+', ' ', text)
    return cleaned_text.strip()

In [6]:
df['content'] = df['content'].apply(remove_extra_spaces)
df['title1'] = df['title1'].apply(remove_extra_spaces)

In [7]:
def remove_punctuation(input_string):
    # Create a translation table that maps each punctuation character to None
    translator = str.maketrans('', '', string.punctuation)
    # Translate the input string using the translation table
    return input_string.translate(translator)

In [8]:
def extract_title(x):
    if not (type(x.title1) == str):
        return np.nan
    if not (type(x.content) == str):
        return np.nan
    if x.title1.lower() in x.content.lower():
        return x.title1

    sentence_parts = re.split(r' / | : ', x.title1.lower())
    sentence_parts = sorted(sentence_parts, key=len, reverse=True)
    for part in sentence_parts:
        if part in x.content.lower():
            return part
        elif remove_punctuation(part) in x.content.lower():
            return remove_punctuation(part)

    if remove_punctuation(x.title1).lower() in x.content.lower():
        return remove_punctuation(x.title1)

    return "error"

In [9]:
def find_sentence_in_text(full_text, sentence):
    start_index = full_text.find(sentence)
    if start_index == -1:
        print("EROR!!!")
        return False
    end_index = start_index + len(sentence)
    return start_index, end_index

In [10]:
result = []
for index, row in df.iterrows():
    result.append(extract_title(row))

df['title3'] = result

In [11]:
df_clean = df[df['title3'] != 'error']

## Check what tokens are present before the title

In [12]:
def create_mask_for_sentence(full_text, sentence):
    # Tokenize the full text
    tokens = word_tokenize(full_text)
    # Find the start and end indices of the sentence in the full text
    start_index = full_text.find(sentence)
    if start_index == -1:
        return None, None
    end_index = start_index + len(sentence)
    # Tokenize the sentence separately to match tokens exactly
    sentence_tokens = word_tokenize(sentence)
    # Initialize mask with zeros
    mask = [0] * len(tokens)
    # Loop through the full text tokens to set the mask
    sentence_pos = 0
    for i, token in enumerate(tokens):
        if sentence_pos < len(sentence_tokens) and token == sentence_tokens[sentence_pos]:
            # Check if the full sequence matches and ensure it doesn't go out of bounds
            if i + len(sentence_tokens) - sentence_pos <= len(tokens) and \
                all(tokens[i + j] == sentence_tokens[sentence_pos + j] for j in range(len(sentence_tokens) - sentence_pos)):
                # Mark the mask for the length of the sentence tokens
                mask[i:i + len(sentence_tokens)] = [1] * len(sentence_tokens)
                break
            else:
                # Increment if it's not a full sequence match
                sentence_pos += 1
        elif token == sentence_tokens[0]:  # Reset if it's the beginning of sentence tokens
            sentence_pos = 0
    return tokens, mask

In [13]:
task = "ner"  # Should be one of "ner", "pos" or "chunk"
# model_checkpoint = "distilbert-base-uncased"
model_checkpoint = "Babelscape/wikineural-multilingual-ner"

In [14]:
label_list = ['O', 'I']

In [15]:
def create_data_set(samples):
    data = []

    for sample in samples:
        unique_content_df = df_clean[df_clean['content'] == sample]
        masks = []

        review = remove_punctuation(sample.lower())

        for index, row in unique_content_df.iterrows():
            book = remove_punctuation(row['title3'].lower())

            tokens, mask = create_mask_for_sentence(review, book)  # Assuming this returns a mask of the same length as tokens
            masks.append(mask)
        masks = np.bitwise_or.reduce(np.array(masks), axis=0)
        mask[mask == 0] = "O"
        mask[mask == 1] = "I"

        data.append({
            "tokens": tokens,
            "ner_tags": masks
        })

    return data

In [16]:
samples = df_clean['content'].unique()[:11000]

train_dataset = Dataset.from_list(create_data_set(samples[:int(len(samples) * 0.8)]))
val_dataset = Dataset.from_list(create_data_set(samples[int(len(samples) * 0.8):]))

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [18]:
label_all_tokens = True

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [19]:
tokenized_dataset_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_dataset_val = val_dataset.map(tokenize_and_align_labels, batched=True)

tokenized_dataset_train

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8800/8800 [00:15<00:00, 580.77 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2200/2200 [00:03<00:00, 569.87 examples/s]


Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 8800
})

In [20]:
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)
    ]

    total_count_O = 0
    total_count_I = 0
    for pred in true_predictions:
        total_count_O = total_count_O + sum(s.count("O") for s in pred)
        total_count_I = total_count_I + sum(s.count("I") for s in pred)



    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "total_count_O": total_count_O,
        "total_count_I": total_count_I,
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [21]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [22]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=2, ignore_mismatched_sizes=True
)
model.config.id2label = {0: 'NO_BOOK', 1: 'BOOK'}

Some weights of BertForTokenClassification were not initialized from the model checkpoint at Babelscape/wikineural-multilingual-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        """ MoM LOSS """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits  # Assuming logits are stored in outputs.logits

        # Compute loss using DiceLoss
        loss_fn = MoMLoss()
        loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

    # def compute_loss(self, model, inputs, return_outputs=False):
    #     # """ Custom weighted CrossEntropyLoss """
    #     labels = inputs.pop("labels")
    #     outputs = model(**inputs)
    #     logits = outputs[0]

    #     # Reshape logits to [batch_size * sequence_length, num_classes]
    #     logits = logits.view(-1, logits.size(-1))

    #     # Reshape labels to [batch_size * sequence_length]
    #     labels = labels.view(-1)

    #     class_weights = torch.tensor([0.5, 28.27], dtype=torch.float32, device=torch.device("cuda"))
    #     loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

    #     loss = loss_fn(logits.to(torch.device("cuda")), labels.to(torch.device("cuda")) )

    #     return (loss, outputs) if return_outputs else loss

    # def compute_loss(self, model, inputs, return_outputs=False):
    #     """ DICE LOSS """
    #     # TODO: IGNORE -100 labels
    #     labels = inputs.pop("labels")
    #     outputs = model(**inputs)
    #     logits = outputs.logits  # Assuming logits are stored in outputs.logits

    #     # Compute loss using DiceLoss
    #     loss_fn = DiceLoss()
    #     loss = loss_fn(logits, labels)

    #     return (loss, outputs) if return_outputs else loss

In [26]:
training_args = TrainingArguments(
    output_dir="test_model",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=50,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    push_to_hub=False
)

In [27]:
# Create a custom Trainer instance
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [28]:
del df
del train_dataset
del val_dataset

In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss,Total Count O,Total Count I,F1,Accuracy
1,0.5351,0.447255,935816,62185,0.140778,0.950552
2,0.3868,0.406253,938136,59865,0.198198,0.95323
3,0.2649,0.447727,934936,63065,0.152897,0.950105
4,0.2272,0.633426,948626,49375,0.234079,0.962628
5,0.1689,0.635409,954619,43382,0.268703,0.96806
6,0.143,1.00692,963678,34323,0.340251,0.975504
7,0.1172,1.133216,964220,33781,0.337367,0.975666
8,0.1097,1.390558,969124,28877,0.37937,0.979474
9,0.0994,1.62874,971388,26613,0.403083,0.980967
10,0.0818,1.468858,968423,29578,0.38781,0.978872


SafetensorError: Error while serializing: IoError(Os { code: 112, kind: StorageFull, message: "There is not enough space on the disk." })

#### Export model, tokenizer & history

In [32]:
export_path = f"C:/Users/niels/PycharmProjects/BookReviewsThesis/models/{datetime.now().strftime('%Y-%m-%d_%H_%M')}/"
export_path

'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-25_10_15/'

In [33]:
os.makedirs(export_path, exist_ok=True)

In [34]:
trainer.save_model(export_path + "model")
tokenizer.save_pretrained(export_path + "tokenizer")

('C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-25_10_15/tokenizer\\tokenizer_config.json',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-25_10_15/tokenizer\\special_tokens_map.json',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-25_10_15/tokenizer\\vocab.txt',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-25_10_15/tokenizer\\added_tokens.json',
 'C:/Users/niels/PycharmProjects/BookReviewsThesis/models/2024-04-25_10_15/tokenizer\\tokenizer.json')

In [35]:
with open(export_path + "history.json", 'w') as file:
    json.dump(trainer.state.log_history, file, indent=4)
with open(export_path + "model_name.txt", 'w') as file:
    file.write(model_checkpoint)  # Writing the string to the file

#### Make and visualize predictions on unseen data

In [102]:
def merge_overlapping_intervals(intervals):
    merged_intervals = []
    if not intervals:
        return merged_intervals

    # Sort intervals based on the start value
    intervals.sort(key=lambda x: x[0])

    # Initialize variables for the first interval
    start, end, label = intervals[0]

    # Iterate through the intervals
    for interval in intervals[1:]:
        next_start, next_end, next_label = interval

        # If the intervals overlap, merge them
        if next_start <= end + 1:
            end = max(end, next_end)
        else:
            # If no overlap, add the merged interval to the result and update start, end, label
            merged_intervals.append((start, end, label))
            start, end, label = next_start, next_end, next_label

    # Add the last merged interval
    merged_intervals.append((start, end, "BOOK"))

    return merged_intervals

def visualize_output(output):
    spans = [(res['start'], res['end'], res['entity']) for res in output if res['entity'] == 'BOOK']
    spans = merge_overlapping_intervals(spans)

    nlp = spacy.blank('nl')
    doc = nlp.make_doc(sentence)
    ents = []
    for span_start, span_end, label in spans:
        ent = doc.char_span(span_start, span_end, label=label)
        if ent is None:
            continue
    
        ents.append(ent)
    
    doc.ents = ents
    displacy.render(doc, style="ent", jupyter=True)

In [86]:
sentence = remove_punctuation(df_clean['content'].unique()[-1].lower())

In [87]:
pipe = pipeline(task="token-classification", model=model, tokenizer=tokenizer)

In [88]:
output = pipe(sentence)

In [95]:
visualize_output(output)

#### Load model and tokenizer

In [96]:
model_new = AutoModelForTokenClassification.from_pretrained(export_path + "model")
tokenizer_new = AutoTokenizer.from_pretrained(export_path + "tokenizer")

In [97]:
pipe = pipeline(task="token-classification", model=model_new, tokenizer=tokenizer_new)

In [98]:
sentence = remove_punctuation(df_clean['content'].unique()[0].lower())

In [99]:
output = pipe(sentence)

In [103]:
visualize_output(output)