In [1]:
import pandas as pd
import numpy as np
import string
import re

import nltk
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud
import matplotlib.pyplot as plt

import spacy
from spacy.training import Example
import random

from datasets import Dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments

import sklearn
import numpy as np


# Ensure you have the necessary NLTK tokenizer models downloaded
# nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import TFAutoModelForTokenClassification
from transformers import create_optimizer
import tensorflow as tf
from transformers import DataCollatorForTokenClassification
from transformers.keras_callbacks import KerasMetricCallback

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# folder_path = '/content/drive/MyDrive/UU/Thesis'

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
# Load an Excel file into a DataFrame
df = pd.read_excel('LCReviewsIntegrated_1962-1994.xlsx', engine='openpyxl')
# df = pd.read_excel(folder_path + '/LCReviewsIntegrated_1962-1994.xlsx', engine='openpyxl')

In [4]:
def remove_extra_spaces(text):
    # Replace multiple spaces with a single space
    cleaned_text = re.sub(r'\s+', ' ', text)
    return cleaned_text.strip()

In [5]:
df['content'] = df['content'].apply(remove_extra_spaces)
df['title1'] = df['title1'].apply(remove_extra_spaces)

In [6]:
def remove_punctuation(input_string):
    # Create a translation table that maps each punctuation character to None
    translator = str.maketrans('', '', string.punctuation)
    # Translate the input string using the translation table
    return input_string.translate(translator)

In [7]:
def extract_title(x):
    if not (type(x.title1) == str):
        return np.nan
    if not (type(x.content) == str):
        return np.nan
    if x.title1.lower() in x.content.lower():
        return x.title1
    
    sentence_parts = re.split(r' / | : ', x.title1.lower())
    sentence_parts = sorted(sentence_parts, key=len, reverse=True)
    for part in sentence_parts:
        if part in x.content.lower():
            return part
        elif remove_punctuation(part) in x.content.lower():
            return remove_punctuation(part)

    if remove_punctuation(x.title1).lower() in x.content.lower():
        return remove_punctuation(x.title1)
        
    return "error"

In [8]:
def find_sentence_in_text(full_text, sentence):
    start_index = full_text.find(sentence)
    if start_index == -1:
        print("EROR!!!")
        return False
    end_index = start_index + len(sentence)
    return start_index, end_index

In [9]:
result = []
for index, row in df.iterrows():
    result.append(extract_title(row))

df['title3'] = result

In [10]:
df_clean = df[df['title3'] != 'error']

## Check what tokens are present before the title

In [11]:
def create_mask_for_sentence(full_text, sentence):
    # Tokenize the full text
    tokens = word_tokenize(full_text)
    # Find the start and end indices of the sentence in the full text
    start_index = full_text.find(sentence)
    if start_index == -1:
        return None, None
    end_index = start_index + len(sentence)
    # Tokenize the sentence separately to match tokens exactly
    sentence_tokens = word_tokenize(sentence)
    # Initialize mask with zeros
    mask = [0] * len(tokens)
    # Loop through the full text tokens to set the mask
    sentence_pos = 0
    for i, token in enumerate(tokens):
        if sentence_pos < len(sentence_tokens) and token == sentence_tokens[sentence_pos]:
            # Check if the full sequence matches and ensure it doesn't go out of bounds
            if i + len(sentence_tokens) - sentence_pos <= len(tokens) and \
                all(tokens[i + j] == sentence_tokens[sentence_pos + j] for j in range(len(sentence_tokens) - sentence_pos)):
                # Mark the mask for the length of the sentence tokens
                mask[i:i + len(sentence_tokens)] = [1] * len(sentence_tokens)
                break
            else:
                # Increment if it's not a full sequence match
                sentence_pos += 1
        elif token == sentence_tokens[0]:  # Reset if it's the beginning of sentence tokens
            sentence_pos = 0
    return tokens, mask

In [12]:
task = "ner"  # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [47]:
def create_data_set(samples):
    data = []

    for sample in samples:
        unique_content_df = df_clean[df_clean['content'] == sample]
        masks = []
    
        review = remove_punctuation(sample.lower())
        input_ids = tokenizer(review, truncation=True, padding='max_length', max_length=512, return_tensors='pt')['input_ids'][0]
    
        for index, row in unique_content_df.iterrows():
            book = remove_punctuation(row['title3'].lower())
            
            tokens, mask = create_mask_for_sentence(review, book)  # Assuming this returns a mask of the same length as tokens
            masks.append(mask)
        masks = np.bitwise_or.reduce(np.array(masks), axis=0)
        
        data.append({
            "tokens": tokens,
            "ner_tags": masks
        })

    return data

In [40]:
samples = np.random.choice(df_clean['content'].unique(), size=100, replace=False)

train_dataset = Dataset.from_list(create_data_set(samples[:int(len(samples) * 0.8)]))
val_dataset = Dataset.from_list(create_data_set(samples[int(len(samples) * 0.8):]))

In [20]:
label_all_tokens = True

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [21]:
tokenized_dataset_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_dataset_val = val_dataset.map(tokenize_and_align_labels, batched=True)

tokenized_dataset_train

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 248.16 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 240.11 examples/s]


Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

In [23]:
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=2
)





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [25]:
num_train_epochs = 3
num_train_steps = (len(tokenized_dataset_train) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [27]:
model.compile(optimizer=optimizer)

In [29]:
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="np")

In [30]:
train_set = model.prepare_tf_dataset(
    tokenized_dataset_train,
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

validation_set = model.prepare_tf_dataset(
    tokenized_dataset_val,
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [33]:
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)
    ]
    
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "macro_f1": sklearn.metrics.f1_score(y_true=true_labels, y_pred=true_predictions, average='macro'),
        "accuracy": results["overall_accuracy"],
    }


metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=validation_set
)

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [34]:
callbacks = [metric_callback]

model.fit(
    train_set,
    validation_data=validation_set,
    epochs=10,
    callbacks=callbacks,
)

Epoch 1/10
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported


 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


[[-100    0    0 ...    0    0 -100]
 [-100    0    0 ...    0    0 -100]
 [-100    0    0 ...    0    0 -100]
 ...
 [-100    0    0 ...    0    0 -100]
 [-100    0    0 ...    0    0 -100]
 [-100    0    0 ...    0    0 -100]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Epoch 2/10

KeyboardInterrupt: 

In [None]:
tokenized = tokenizer([sample], return_tensors="np", truncation=True)

In [None]:
tokenized

In [None]:
outputs = model(tokenized).logits
classes = np.argmax(outputs, axis=-1)[0]
print(classes)