In [None]:
# !python -m spacy download nl_core_news_lg

In [1]:
import pandas as pd
import numpy as np
import string
import re

from datasets import Dataset, load_metric
from transformers import DataCollatorForTokenClassification, pipeline, AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

import torch

import spacy
from spacy import displacy

import os
from datetime import datetime
import json

from tqdm import tqdm
from sklearn.metrics import f1_score

from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy.tokens import Doc
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Never truncate wide DataFrames: show every column when a frame is displayed.
pd.options.display.max_columns = None

In [3]:
# Read the partially-matched-titles spreadsheet into a DataFrame (openpyxl backend).
df = pd.read_excel(
    'manullay_check_partially_matched_titles.xlsx',
    engine='openpyxl',
)

In [4]:
def remove_extra_spaces(text):
    """Return ``text`` with each run of whitespace squeezed to one space.

    Leading and trailing whitespace is dropped from the result.
    """
    return re.sub(r'\s+', ' ', text).strip()

In [5]:
# Normalise whitespace in the review text and in both title columns.
for text_column in ('content', 'title1', 'title4'):
    df[text_column] = df[text_column].apply(remove_extra_spaces)

In [6]:
# Collect every 'content' value that has at least one row flagged as manually removed ...
content_removed = df.loc[df['manually_removed'] == 1, 'content'].unique()

# ... and keep only the rows whose 'content' was never flagged.
df_clean = df.loc[~df['content'].isin(content_removed)]

In [7]:
def find_sentence_in_text(full_text, sentence):
    """Locate the first occurrence of ``sentence`` inside ``full_text``.

    Returns:
        A ``(start, end)`` pair of character offsets, with ``end`` exclusive.

    Raises:
        ValueError: If ``sentence`` does not occur in ``full_text``.
    """
    position = full_text.find(sentence)
    if position < 0:
        raise ValueError("Sentence not found in text.")
    return position, position + len(sentence)


def create_mask_for_sentence(full_text, sentence, nlp, force_lower_case=False):
    """Tokenise ``full_text`` and flag the tokens covered by ``sentence``.

    The sentence is located case-insensitively (first occurrence only) by
    comparing the lower-cased strings.

    Args:
        full_text: Text to tokenise.
        sentence: Substring to look for inside ``full_text``.
        nlp: Callable that turns ``full_text`` into a sized, iterable document
            whose tokens expose ``.text`` and ``.idx`` (character offset).
        force_lower_case: When true, the returned token strings are lower-cased.

    Returns:
        ``(tokens, mask)`` - two lists of equal length, where ``mask[i]`` is 1
        if token ``i`` overlaps the matched span and 0 otherwise - or
        ``(None, None)`` when ``sentence`` does not occur in ``full_text``.
    """
    # Search first, and do it inline: a missing sentence must yield (None, None)
    # rather than an exception, and there is no point running the pipeline when
    # there is nothing to mark.
    # NOTE(review): offsets are taken from the lower-cased text; this assumes
    # lower-casing does not change the string length (true for most, but not
    # all, Unicode characters).
    needle = sentence.lower()
    start_index = full_text.lower().find(needle)
    if start_index == -1:
        return None, None
    end_index = start_index + len(needle)  # exclusive

    doc = nlp(full_text)
    if force_lower_case:
        tokens = [token.text.lower() for token in doc]
    else:
        tokens = [token.text for token in doc]

    mask = [0] * len(doc)
    for i, token in enumerate(doc):
        token_end_idx = token.idx + len(token.text)  # exclusive
        # Two half-open spans overlap only with strict inequalities; using
        # <= / >= would also tag the token that merely touches the match
        # (e.g. the full stop right after a title).
        if token.idx < end_index and token_end_idx > start_index:
            mask[i] = 1

    return tokens, mask


def create_data_set(samples, df, nlp, remove_punc=False, force_lower_case=False):
    """Build token-level tagging records for the given review texts.

    For every sample, each ``title4`` value of the rows in ``df`` whose
    ``content`` equals that sample is located in the text, and the per-title
    token masks are OR-ed into a single tag sequence.

    Args:
        samples: Iterable of review texts (values of ``df['content']``).
        df: DataFrame with at least the columns ``content`` and ``title4``.
        nlp: Pipeline/tokeniser forwarded to ``create_mask_for_sentence``.
        remove_punc: When true, the review and the titles are passed through
            ``remove_punctuation`` first. NOTE(review): that helper is not
            defined in the visible part of this notebook - confirm it exists
            before enabling this flag.
        force_lower_case: Forwarded to ``create_mask_for_sentence``.

    Returns:
        A list of ``{"tokens": [...], "ner_tags": array}`` dicts, one per sample
        for which at least one title produced a mask. ``ner_tags`` is the NumPy
        array produced by the bitwise OR of the individual masks.
    """
    # Group the titles per review once instead of scanning the whole frame for
    # every sample. Row order inside each group is preserved.
    titles_by_content = {
        content: titles.tolist()
        for content, titles in df.groupby('content', sort=False)['title4']
    }

    data = []
    for sample in tqdm(samples):
        review = remove_punctuation(sample) if remove_punc else sample

        # Tokens from a *successful* match; the result of the last call may be
        # None when that particular title could not be located.
        review_tokens = None
        masks = []
        for title in titles_by_content.get(sample, []):
            book = remove_punctuation(title) if remove_punc else title

            tokens, mask = create_mask_for_sentence(full_text=review, sentence=book, nlp=nlp, force_lower_case=force_lower_case)
            if mask is None:
                continue
            review_tokens = tokens
            masks.append(mask)

        if masks:
            combined_mask = np.bitwise_or.reduce(np.array(masks), axis=0)
            data.append({"tokens": review_tokens, "ner_tags": combined_mask})

    return data

In [8]:
# Load the Dutch `nl_core_news_lg` spaCy pipeline; the rest of the notebook uses it via `nlp`.
nlp = spacy.load("nl_core_news_lg")

In [None]:
# Fix NumPy's global RNG so the shuffle below (and hence the split) is repeatable
np.random.seed(42)

# Every distinct 'content' value is one sample; shuffle them in place
samples = df_clean['content'].unique()
np.random.shuffle(samples)

# The first 85% of the shuffled samples go to training, the remainder to validation
split_idx = int(len(samples) * 0.85)
train_samples, val_samples = samples[:split_idx], samples[split_idx:]

# Build the token/tag datasets for both splits with identical settings
dataset_options = dict(df=df_clean, nlp=nlp, remove_punc=False, force_lower_case=False)
train_dataset = create_data_set(samples=train_samples, **dataset_options)
val_dataset = create_data_set(samples=val_samples, **dataset_options)

 14%|███████████████████▋                                                                                                                         | 1487/10654 [03:28<15:59,  9.55it/s]

In [None]:
def merge_entities(entities):
    """Merge overlapping or touching entity spans that share a label.

    Args:
        entities: Sequence of ``(start, end, label)`` triples. The sequence is
            not modified.

    Returns:
        A new list of ``(start, end, label)`` tuples ordered by start offset.
        A span is folded into the previous one when it has the same label and
        starts no later than one character after the previous span's end.
        An empty (or ``None``) input gives ``[]``.
    """
    if not entities:
        return []

    merged_entities = []
    # sorted() works on a copy, so the caller's list keeps its original order.
    for current_start, current_end, current_label in sorted(entities):
        if merged_entities:
            last_start, last_end, last_label = merged_entities[-1]
            # "+ 1" treats spans separated by a single character as adjacent.
            if current_start <= last_end + 1 and current_label == last_label:
                merged_entities[-1] = (last_start, max(last_end, current_end), last_label)
                continue
        merged_entities.append((current_start, current_end, current_label))

    return merged_entities


def convert_data_to_spacy_format(data):
    """Turn one ``{'tokens', 'ner_tags'}`` record into a spaCy ``Example``.

    The tokens are joined with single spaces, every token with a non-zero tag
    becomes a ``WORK_OF_ART`` character span in that joined text, and touching
    spans are merged before the ``Example`` is built with the global ``nlp``.
    """
    words = data['tokens']
    labels = data['ner_tags']

    spans = []
    cursor = 0
    for word, label in zip(words, labels):
        word_end = cursor + len(word)
        if label != 0:
            spans.append((cursor, word_end, 'WORK_OF_ART'))
        # Step over the single space that separates tokens in the joined text.
        cursor = word_end + 1

    joined_text = ' '.join(words)
    annotations = {'entities': merge_entities(spans)}
    return Example.from_dict(nlp.make_doc(joined_text), annotations)

In [None]:
def evaluate_on_unseen_data(val_data_set, trained_nlp):
    """Compute the token-level macro F1 of ``WORK_OF_ART`` predictions.

    Args:
        val_data_set: Iterable of records with ``'tokens'`` (list of token
            strings) and ``'ner_tags'`` (one 0/1 tag per token).
        trained_nlp: spaCy pipeline whose currently enabled components are run
            on the pre-tokenised documents.

    Returns:
        The macro-averaged F1 score over all tokens of all records.
    """
    val_items = list(val_data_set)

    # A freshly constructed Doc carries no annotations, so it has to be passed
    # through the pipeline before ``ent_type_`` can hold a prediction. Feeding
    # Docs (not raw text) keeps the stored tokenisation, so predictions stay
    # aligned with the gold tags. Doc input requires spaCy >= 3.2.
    docs = (Doc(vocab=trained_nlp.vocab, words=item['tokens']) for item in val_items)

    all_preds, all_truths = [], []
    for item, doc in zip(val_items, trained_nlp.pipe(docs)):
        all_preds.extend(1 if token.ent_type_ == "WORK_OF_ART" else 0 for token in doc)
        all_truths.extend(item['ner_tags'])
    return f1_score(y_true=all_truths, y_pred=all_preds, average='macro')

In [None]:
# Convert only the first 1000 training records into spaCy Example objects
spacy_train_dataset = list(map(convert_data_to_spacy_format, train_dataset[:1000]))

In [None]:
# Seed the RNGs spaCy relies on (Python's `random`, NumPy and, if present, CuPy)
# so the per-iteration shuffling is repeatable when this cell is re-run.
spacy.util.fix_random_seed(42)

# Fine-tune only the NER component: every other pipe is disabled inside the
# `with` block and restored when it exits. `select_pipes` replaces the
# deprecated `disable_pipes`.
disabled_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.select_pipes(disable=disabled_pipes):
    optimizer = nlp.resume_training()
    for iteration in range(60):
        random.shuffle(spacy_train_dataset)
        losses = {}
        # Batch size starts at 4 and compounds by a factor of 1.001 up to 32
        batches = minibatch(spacy_train_dataset, size=compounding(4., 32., 1.001))
        for batch in batches:
            nlp.update(batch, drop=0.5, losses=losses, sgd=optimizer)
        print(f"Iteration {iteration}, Losses: {losses}, Validation F1: {evaluate_on_unseen_data(val_dataset, nlp)}")

In [None]:
# Run the pipeline on the seventh-from-last entry of the shuffled `samples` array
doc = nlp(samples[-7])

In [None]:
# Render the entities found in `doc` inline in the notebook
displacy.render(doc, style="ent", jupyter=True)