In [1]:
# !python -m spacy download nl_core_news_sm

In [2]:
# pip install regex datasets spacy scikit-learn tqdm pickle5 matplotlib torchtext torchcrf torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


In [3]:
import pandas as pd
import numpy as np
import re

from datasets import Dataset
import spacy
import gensim
import os
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm
import pickle
import math
import time
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import torch
# from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchtext.vocab import Vectors
from torchtext.vocab import GloVe
from torch.optim import Adam
import torch.nn as nn
from TorchCRF import CRF
from torch.cuda.amp import GradScaler, autocast



In [5]:
from loss_functions import compute_class_weights

In [6]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [7]:
pd.set_option('display.max_columns', None)

In [8]:
# Load the manually checked Leeuwarder Courant title matches (Excel file) into a DataFrame
df_lc = pd.read_excel('dataset/data/manullay_check_partially_matched_titles.xlsx', engine='openpyxl')

In [9]:
# Load the annotated Trouw and Het Parool book-review file (CSV) into a DataFrame
df_trouw_parool = pd.read_csv('dataset/data/trouw_and_parool_annotated_book_titles.csv')

In [10]:
def remove_extra_spaces(text):
    """Collapse each run of whitespace into one space and trim both ends.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string.
    """
    return re.sub(r'\s+', ' ', text).strip()

In [11]:
# Normalise whitespace in the review text and in both title columns.
for column_name in ('content', 'title1', 'title4'):
    df_lc[column_name] = df_lc[column_name].apply(remove_extra_spaces)

In [12]:
def remove_punctuation(input_string):
    """Strip ASCII punctuation and old Dutch quotation marks from a string.

    Args:
        input_string: the text to clean.

    Returns:
        A copy of `input_string` without any character from
        `string.punctuation` and without the '„' / '“' quotation marks.
    """
    # Imported locally: the notebook's import cell does not import `string`,
    # so referencing `string.punctuation` used to raise a NameError.
    import string

    # Adding old Dutch quotation marks to the punctuation list
    extended_punctuation = string.punctuation + '„“'
    # Translation table that maps each punctuation character to None
    translator = str.maketrans('', '', extended_punctuation)
    return input_string.translate(translator)

In [13]:
# Collect every review text that has at least one row flagged as manually removed
removed_rows = df_lc['manually_removed'] == 1
content_removed = df_lc.loc[removed_rows, 'content'].unique()

# Keep only the rows whose review text was never flagged
unflagged_rows = ~df_lc['content'].isin(content_removed)
df_lc_clean = df_lc[unflagged_rows]

## Prepare for training

In [14]:
label_list = ['O', 'I']

In [15]:
def find_sentence_in_text(full_text, sentence):
    """Locate the first occurrence of `sentence` inside `full_text`.

    Args:
        full_text: the text to search in.
        sentence: the substring to look for.

    Returns:
        (start_index, end_index) character offsets; `end_index` is exclusive
        (start plus the length of `sentence`).

    Raises:
        ValueError: if `sentence` does not occur in `full_text`.
    """
    start_index = full_text.find(sentence)
    if start_index == -1:
        raise ValueError("Sentence not found in text.")
    end_index = start_index + len(sentence)
    return start_index, end_index


def create_mask_for_sentence(full_text, sentence, nlp, force_lower_case=False):
    """Tokenise `full_text` and mark the tokens that overlap `sentence`.

    The match is case-insensitive and uses the first occurrence only.

    Args:
        full_text: the text to tokenise.
        sentence: the substring (e.g. a book title) to mark.
        nlp: callable that turns a string into a sequence of tokens exposing
            `.text` and `.idx` (character offset of the token in the text).
        force_lower_case: when true, the returned token strings are lower-cased.

    Returns:
        (tokens, mask). `mask` is a list of 0/1 with one entry per token, or
        None when `sentence` does not occur in `full_text`. The tokens are
        returned in both cases, so a caller that keeps the tokens of the last
        call never ends up holding None.
    """
    # Use the already loaded nlp model to process the text
    doc = nlp(full_text)
    if force_lower_case:
        tokens = [token.text.lower() for token in doc]
    else:
        tokens = [token.text for token in doc]

    # The helper signals "not found" by raising; translate that into the
    # `mask is None` contract callers check for (the former
    # `start_index is None` test could never be true).
    # NOTE(review): lower-casing can change string length for a few Unicode
    # characters, which would shift the offsets — confirm the data is free of them.
    try:
        start_index, end_index = find_sentence_in_text(full_text.lower(), sentence.lower())
    except ValueError:
        return tokens, None

    mask = [0] * len(tokens)
    for i, token in enumerate(doc):
        token_end_idx = token.idx + len(token.text)
        # Half-open overlap test: `end_index` is exclusive, so a token that merely
        # touches the match (an opening quote right before it, a comma right
        # after it) is not part of it.
        if token.idx < end_index and token_end_idx > start_index:
            mask[i] = 1

    return tokens, mask


def create_data_set(samples, df, nlp, remove_punc=False, force_lower_case=False):
    """Build token/label records for the given review texts.

    For each sample, every `title4` value of the rows in `df` whose `content`
    equals that sample is located in the review. The per-title masks are
    OR-ed into a single label sequence.

    Args:
        samples: iterable of review texts (values of `df['content']`).
        df: frame with at least the columns `content` and `title4`.
        nlp: tokenizer passed through to `create_mask_for_sentence`.
        remove_punc: when true, punctuation is stripped from both the review
            and the titles before matching.
        force_lower_case: when true, the stored tokens are lower-cased.

    Returns:
        A list of dicts with keys "tokens" and "ner_tags", one per sample for
        which at least one title produced a mask.
    """
    # Index the titles by review text once, instead of re-filtering the whole
    # frame for every sample (which is quadratic in the number of rows).
    titles_by_content = {}
    for content, title in zip(df['content'], df['title4']):
        titles_by_content.setdefault(content, []).append(title)

    data = []
    for sample in tqdm(samples):
        review = remove_punctuation(sample) if remove_punc else sample

        tokens = None
        masks = []
        for title in titles_by_content.get(sample, []):
            book = remove_punctuation(title) if remove_punc else title

            row_tokens, mask = create_mask_for_sentence(full_text=review, sentence=book, nlp=nlp, force_lower_case=force_lower_case)
            if mask is not None:
                # Only keep tokens that belong to a usable mask, so a later
                # unmatched title cannot replace them with an empty result.
                tokens = row_tokens
                masks.append(mask)

        if masks:
            combined_mask = np.bitwise_or.reduce(np.array(masks), axis=0)
            data.append({"tokens": tokens, "ner_tags": combined_mask})

    return data

def trouw_parool_create_dataset(df, nlp, remove_punc=False, force_lower_case=False):
    """Build token/label records from the annotated Trouw & Het Parool frame.

    Rows are grouped by their `text` value. Each group yields one record whose
    `ner_tags` is the element-wise OR of the per-row masks; a token is set to 1
    when its character span touches the row's `start_index`..`end_index` range.

    Args:
        df: frame with the columns `text`, `start_index` and `end_index`.
            The offsets are presumably character positions in `text` — TODO
            confirm against the annotation export.
        nlp: callable that tokenises a string into tokens exposing `.text`
            and `.idx`.
        remove_punc: when true, punctuation is stripped from the review before
            tokenising, and the annotated offsets are remapped to the stripped text.
        force_lower_case: when true, the stored tokens are lower-cased.

    Returns:
        A list of dicts with keys "tokens" and "ner_tags".
    """
    data = []

    for sample in tqdm(df['text'].unique()):
        unique_content_df = df[df['text'] == sample]

        if remove_punc:
            review = remove_punctuation(sample)
            # The annotated offsets point into the unstripped text, while token
            # offsets point into the stripped one. kept_before[k] is the number
            # of characters of sample[:k] that survive stripping, i.e. the
            # position original offset k has in `review`.
            kept_before = [0]
            for char in sample:
                kept_before.append(kept_before[-1] + len(remove_punctuation(char)))
        else:
            review = sample
            kept_before = None

        doc = nlp(review)
        if force_lower_case:
            tokens = [token.text.lower() for token in doc]
        else:
            tokens = [token.text for token in doc]

        masks = []
        for _, row in unique_content_df.iterrows():
            start_index, end_index = row['start_index'], row['end_index']
            if kept_before is not None:
                # Clamp to the text bounds before translating the offsets.
                last_offset = len(kept_before) - 1
                start_index = kept_before[min(max(int(start_index), 0), last_offset)]
                end_index = kept_before[min(max(int(end_index), 0), last_offset)]

            mask = [0] * len(doc)

            for i, token in enumerate(doc):
                token_end_idx = token.idx + len(token.text)
                # NOTE(review): both bounds are inclusive, so a token that only
                # touches the annotated span is labelled too. Whether that is
                # intended depends on `end_index` being inclusive — confirm.
                if token.idx <= end_index and token_end_idx >= start_index:
                    mask[i] = 1

            masks.append(mask)

        if masks:
            combined_mask = np.bitwise_or.reduce(np.array(masks), axis=0)
            data.append({"tokens": tokens, "ner_tags": combined_mask})

    return data

In [16]:
nlp = spacy.load("nl_core_news_sm")

In [17]:
# Preprocessing switches passed to the dataset builders.
# NOTE: they only take effect when a dataset is (re)built. When the cached
# pickle files already exist they are loaded as-is, whatever these flags say.
remove_punc = False
force_lower_case = False

In [18]:
# Cached dataset pickles live next to the raw data. The location is relative to
# the working directory, like the 'dataset/data/...' paths the raw files are
# read from, instead of one user's absolute home directory.
DATA_DIR = os.path.join('dataset', 'data')

lc_train_filename = os.path.join(DATA_DIR, 'lc_train_dataset.pkl')
lc_val_filename = os.path.join(DATA_DIR, 'lc_val_dataset.pkl')
lc_test_filename = os.path.join(DATA_DIR, 'lc_test_dataset.pkl')

trouw_parool_test_filename = os.path.join(DATA_DIR, 'trouw_parool_test_dataset.pkl')

In [19]:
# Function to save dataset to file
def save_dataset(dataset, filename):
    """Pickle `dataset` to `filename`, creating the parent directory if needed.

    Args:
        dataset: any picklable object.
        filename: destination path; may be a bare file name.
    """
    directory = os.path.dirname(filename)
    # A bare file name has an empty directory part, and os.makedirs('') raises.
    if directory:
        os.makedirs(directory, exist_ok=True)  # Ensure the directory exists
    with open(filename, 'wb') as f:
        pickle.dump(dataset, f)

# Function to read a pickled dataset back from file
def load_dataset(filename):
    """Unpickle and return the object stored in `filename`.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    that were written by a trusted source.
    """
    with open(filename, 'rb') as handle:
        dataset = pickle.load(handle)
    return dataset

In [20]:
# Fix the seed so the shuffle, and therefore the split, is reproducible
np.random.seed(42)

# Unique review texts, shuffled in place
samples = df_lc_clean['content'].unique()
np.random.shuffle(samples)

# Split proportions
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Boundaries between the three partitions
n_samples = len(samples)
train_end = int(n_samples * train_ratio)
val_end = train_end + int(n_samples * val_ratio)

# Train / validation / test partitions; the test split takes the remainder
lc_train_samples = samples[:train_end]
lc_val_samples = samples[train_end:val_end]
lc_test_samples = samples[val_end:]


def _build_lc_dataset(split_samples):
    """Tokenise and label one split and wrap the records in a `Dataset`."""
    records = create_data_set(samples=split_samples, df=df_lc_clean, nlp=nlp, remove_punc=remove_punc, force_lower_case=force_lower_case)
    return Dataset.from_list(records)


# The cache is used only when all three pickles are present
lc_cache_files = (lc_train_filename, lc_val_filename, lc_test_filename)
if all(os.path.exists(cache_file) for cache_file in lc_cache_files):
    print("Loading training, validation, and test datasets....")
    lc_train_dataset = load_dataset(lc_train_filename)
    lc_val_dataset = load_dataset(lc_val_filename)
    lc_test_dataset = load_dataset(lc_test_filename)
else:
    print("Creating training, validation, and test datasets....")
    # Build all three splits first ...
    lc_train_dataset = _build_lc_dataset(lc_train_samples)
    lc_val_dataset = _build_lc_dataset(lc_val_samples)
    lc_test_dataset = _build_lc_dataset(lc_test_samples)

    # ... then store them, so they do not have to be rebuilt on the next run
    save_dataset(lc_train_dataset, lc_train_filename)
    save_dataset(lc_val_dataset, lc_val_filename)
    save_dataset(lc_test_dataset, lc_test_filename)

print("Done...")

Loading training, validation, and test datasets....
Done...


In [21]:
lc_train_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 8774
})

In [22]:
lc_val_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1880
})

In [23]:
lc_test_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1881
})

In [24]:
if not os.path.exists(trouw_parool_test_filename):
    print("Creating trouw parool test dataset....")
    trouw_parool_records = trouw_parool_create_dataset(df=df_trouw_parool, nlp=nlp, remove_punc=remove_punc, force_lower_case=force_lower_case)
    trouw_parool_test_dataset = Dataset.from_list(trouw_parool_records)

    # Store the result so it does not have to be rebuilt on the next run
    save_dataset(trouw_parool_test_dataset, trouw_parool_test_filename)
else:
    print("Loading trouw parool test dataset....")
    trouw_parool_test_dataset = load_dataset(trouw_parool_test_filename)

Loading trouw parool test dataset....


In [25]:
trouw_parool_test_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 308
})

In [None]:
# Pre-trained word vectors in word2vec text format.
# NOTE(review): this is a machine-specific absolute path; consider a path
# relative to the project, like the dataset files.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format("C:\\Users\\niels\\PycharmProjects\\BookReviewsThesis\\wordembeddings\\cc.nl.300.vec")

# Token id 0 doubles as the padding id: batches are padded with 0 and the
# training/evaluation masks are built as `sentences != 0`. The word that owns
# row 0 must therefore never be emitted as a real token id, or every occurrence
# of it punches a hole in the mask. It is left out of the vocabulary and falls
# back to `unk_index`. (A dedicated padding row would keep its vector, but needs
# the embedding matrix and the masks to change together.)
vocab = {word: idx for idx, word in enumerate(w2v_model.index_to_key) if idx != 0}

# One past the last pretrained row. Derived from the model, not from
# `len(vocab)`, because the vocabulary now has one entry fewer than there are rows.
unk_index = len(w2v_model.index_to_key)  # Index for unknown words

In [None]:
# Stack the pretrained vectors with one extra all-zero row and convert the result to a float tensor.
# The zero row is the last one (index = number of pretrained vectors), which is the
# index `unk_index` is set to, so unknown tokens look up an all-zero vector.
pretrained_embeddings = torch.FloatTensor(np.vstack((w2v_model.vectors, np.zeros((1, w2v_model.vector_size)))))

In [None]:
# Right-padding helper
def pad_sequence(sequence, max_len, pad_value=0):
    """Return `sequence` extended on the right with `pad_value` up to `max_len` items.

    A sequence that is already `max_len` long or longer comes back unchanged in
    content (the repeat count is then zero or negative, which yields no padding).
    """
    missing = max_len - len(sequence)
    return sequence + [pad_value] * missing

# Collate function for DataLoader
def collate_fn(batch):
    """Turn a list of dataset items into padded index tensors.

    Args:
        batch: list of items with a "tokens" list of strings and a "ner_tags"
            sequence of integer labels.

    Returns:
        (tokens_tensor, ner_tags_tensor), both of shape (len(batch), max_len),
        where max_len is the longest token list in the batch. Shorter items are
        right-padded with 0 — the value the training and evaluation loops test
        for when they build their masks.
    """
    max_len = max(len(item['tokens']) for item in batch)
    token_ids = [pad_sequence([vocab.get(token, unk_index) for token in item['tokens']], max_len) for item in batch]
    # list(...) keeps the padding a concatenation even when the tags arrive as a
    # NumPy array, where `array + list` would be element-wise arithmetic.
    tag_ids = [pad_sequence(list(item['ner_tags']), max_len) for item in batch]

    # Build each tensor once and return exactly what was validated.
    tokens_tensor = torch.tensor(token_ids)
    ner_tags_tensor = torch.tensor(tag_ids)

    # Sanity check: every token id must be a valid row of the embedding matrix
    assert tokens_tensor.max().item() < pretrained_embeddings.size(0), f"Found index out of range: {tokens_tensor.max().item()}"
    assert tokens_tensor.min().item() >= 0, f"Found negative index: {tokens_tensor.min().item()}"

    return tokens_tensor, ner_tags_tensor




# Batch the training and validation splits with the padding collate function;
# only the training batches are shuffled.
train_dataloader = DataLoader(lc_train_dataset, batch_size=64, collate_fn=collate_fn, shuffle=True)
val_dataloader = DataLoader(lc_val_dataset, batch_size=64, collate_fn=collate_fn, shuffle=False)

In [None]:
# Function to count unknown tokens in the dataset
def count_unknown_tokens(dataset, vocab, unk_index=0):
    """Count how many tokens of `dataset` are missing from `vocab`.

    Args:
        dataset: iterable of items that have a "tokens" list.
        vocab: mapping from token string to index.
        unk_index: kept for backward compatibility and no longer used. The
            count is based on vocabulary membership, so a real word whose id
            happens to equal `unk_index` (e.g. id 0 with the default) is not
            miscounted as unknown.

    Returns:
        (unknown_token_count, total_token_count).
    """
    unknown_token_count = 0
    total_token_count = 0

    for item in dataset:
        for token in item['tokens']:
            total_token_count += 1
            if token not in vocab:
                unknown_token_count += 1

    return unknown_token_count, total_token_count

# Calculate the number of unknown tokens in the train_dataset
unknown_token_count, total_token_count = count_unknown_tokens(lc_train_dataset, vocab, unk_index)

print(f"Percentage of unknown tokens: {100 * unknown_token_count / total_token_count:.2f}%")

In [None]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class BiLSTMCRF(nn.Module):
    """Bidirectional LSTM tagger with a CRF layer on top of pretrained embeddings.

    `forward` produces per-token emission scores only; the CRF is applied
    separately through `loss` and `predict`.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, pretrained_embeddings):
        """Create the layers.

        Args:
            vocab_size: accepted for interface compatibility; not used here
                (the embedding table size comes from `pretrained_embeddings`).
            tagset_size: number of output tags.
            embedding_dim: input size of the LSTM.
            hidden_dim: hidden size of each LSTM direction.
            pretrained_embeddings: weight tensor for the embedding layer.
        """
        super().__init__()
        # Built with from_pretrained's default arguments.
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, bidirectional=True, batch_first=True)
        # Both directions are concatenated, hence twice the hidden size.
        self.hidden2tag = nn.Linear(in_features=2 * hidden_dim, out_features=tagset_size)
        self.crf = CRF(tagset_size)

    def forward(self, sentences):
        """Map a batch of token ids to emission scores for each tag."""
        embedded = self.embedding(sentences)
        lstm_out = self.lstm(embedded)[0]
        return self.hidden2tag(lstm_out)

    def loss(self, emissions, tags, mask):
        """Return the negated output of the CRF layer for the given tags."""
        crf_score = self.crf(emissions, tags, mask=mask)
        return -crf_score

    def predict(self, emissions, mask):
        """Return the CRF layer's Viterbi decoding of `emissions` under `mask`."""
        return self.crf.viterbi_decode(emissions, mask=mask)

# Hyperparameters
EMBEDDING_DIM = 300
HIDDEN_DIM = 50
TAGSET_SIZE = 2

# Initialize model
model = BiLSTMCRF(len(vocab), TAGSET_SIZE, EMBEDDING_DIM, HIDDEN_DIM, pretrained_embeddings)


# Move the model to the GPU
model.to(device)

In [None]:
def evaluate(model, dataloader):
    """Run the model over `dataloader` and score its token-level predictions.

    Batches are moved to the global `device`. Positions whose token id is 0 are
    treated as padding, except the first position, which is always kept.

    Args:
        model: tagger exposing `__call__` (emissions) and `predict`.
        dataloader: iterable of (sentences, tags) tensor pairs.

    Returns:
        Dict with "I_sum" (sum of predicted tags), "accuracy", "macro_f1",
        "macro_precision" and "macro_recall".
    """
    model.eval()
    predicted_tags = []
    gold_tags = []

    with torch.no_grad():
        for sentences, tags in dataloader:
            # Move data to GPU
            sentences = sentences.to(device)
            tags = tags.to(device)

            mask = sentences != 0
            mask[:, 0] = 1  # Ensure the first timestep mask is always on

            decoded = model.predict(model(sentences), mask)
            for row in range(len(sentences)):
                valid_length = mask[row].sum().item()
                predicted_tags.extend(decoded[row][:valid_length])
                gold_tags.extend(tags[row][:valid_length].tolist())

    macro_f1 = f1_score(y_true=gold_tags, y_pred=predicted_tags, average='macro')
    macro_precision = precision_score(y_true=gold_tags, y_pred=predicted_tags, average='macro')
    macro_recall = recall_score(y_true=gold_tags, y_pred=predicted_tags, average='macro')
    accuracy = accuracy_score(y_true=gold_tags, y_pred=predicted_tags)

    metrics = {"I_sum": sum(predicted_tags)}
    metrics["accuracy"] = accuracy
    metrics["macro_f1"] = macro_f1
    metrics["macro_precision"] = macro_precision
    metrics["macro_recall"] = macro_recall
    return metrics

In [None]:
def count_parameters(model):
    """Return the total element count of the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

num_parameters = count_parameters(model)
print(f'Total number of trainable parameters: {num_parameters}')

In [None]:
class MoMLoss(nn.Module):
    """CRF loss of the wrapped model, re-weighted per sequence by class weights."""

    def __init__(self, model):
        """Keep a reference to the model whose `loss` method is used."""
        super().__init__()
        self.model = model

    def forward(self, emissions, tags, mask):
        """Return the mean of the per-sequence weighted CRF loss.

        Each sequence's loss is multiplied by the average class weight of its
        unmasked positions.
        """
        # Class weights are derived from every tag in the batch.
        # NOTE(review): this includes padded positions, so padding may count as
        # tag 0 — confirm that compute_class_weights is meant to see them.
        class_weights = compute_class_weights(tags.view(-1))

        # CRF loss from the model; presumably one value per sequence — TODO confirm
        crf_loss = self.model.loss(emissions, tags, mask)

        # Look up a weight for every position and zero out the masked ones
        token_weights = class_weights[tags] * mask.float()

        # Average weight over the unmasked positions of each sequence
        mean_weight = token_weights.sum(dim=1) / mask.sum(dim=1).float()

        return (crf_loss * mean_weight).mean()

In [None]:
# Training loop
def train(model, dataloader, val_dataloader, epochs=5):
    """Train `model` with the MoM-weighted CRF loss and evaluate after every epoch.

    Batches are moved to the global `device`. Positions whose token id is 0 are
    treated as padding, except the first position, which is always kept.

    Args:
        model: the tagger to train; must expose `loss` as used by `MoMLoss`.
        dataloader: training batches of (sentences, tags).
        val_dataloader: validation batches passed to `evaluate`.
        epochs: number of passes over `dataloader`.

    Returns:
        A history dict with one list entry per epoch for "epoch", "loss",
        "eval_f1", "eval_precision", "eval_recall", "accuracy" and
        "total_I_pred", plus the scalar "total_train_time" in seconds.
    """
    len_dataloader = len(dataloader)
    optimizer = Adam(model.parameters())
    scaler = GradScaler()  # Mixed precision scaler
    criterion = MoMLoss(model)  # Use the custom MoMLoss with CRF loss

    # History dictionary to store loss and evaluation metrics
    history = {
        "epoch": [],
        "loss": [],
        "eval_f1": [],
        "eval_precision": [],
        "eval_recall": [],
        "accuracy": [],
        "total_I_pred": []
    }

    # Start the timer
    start_time = time.time()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for sentences, tags in tqdm(dataloader):
            sentences, tags = sentences.to(device), tags.to(device)  # Move data to GPU
            mask = sentences != 0
            mask[:, 0] = 1  # Ensure the first timestep mask is always on

            optimizer.zero_grad()
            with autocast():  # Mixed precision context
                emissions = model(sentences)
                loss = criterion(emissions, tags, mask)  # Use custom MoMLoss with CRF loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        eval_metrics = evaluate(model, val_dataloader)
        avg_loss = total_loss / len_dataloader

        print(f'Epoch {epoch + 1}, Loss: {avg_loss}, Eval F1: {eval_metrics["macro_f1"]}, '
              f'Macro Precision: {eval_metrics["macro_precision"]}, Macro Recall: {eval_metrics["macro_recall"]}, '
              f'Accuracy: {eval_metrics["accuracy"]}, Number of I preds {eval_metrics["I_sum"]}')

        # Update history. Every value comes from `eval_metrics`; the former
        # references to `eval_f1` and `I_total_pred` were undefined names, and
        # precision, recall and accuracy were never recorded.
        history["epoch"].append(epoch + 1)
        history["loss"].append(avg_loss)
        history["eval_f1"].append(eval_metrics["macro_f1"])
        history["eval_precision"].append(eval_metrics["macro_precision"])
        history["eval_recall"].append(eval_metrics["macro_recall"])
        history["accuracy"].append(eval_metrics["accuracy"])
        history["total_I_pred"].append(eval_metrics["I_sum"])

    # End the timer
    training_time = time.time() - start_time

    history['total_train_time'] = training_time

    print(f'Training completed in: {training_time:.2f} seconds')

    return history

In [None]:
training_history = train(model, train_dataloader, val_dataloader, epochs=10)

In [None]:
training_history

In [None]:
# Plot the per-epoch training loss (left axis) and validation F1 (right axis)
fig, ax1 = plt.subplots(figsize=(10, 6))
line1, = ax1.plot(training_history['epoch'], training_history['loss'], marker='o', color='b', label='Training Loss (MoM)')
ax1.set_title("Training Loss and Evaluation F1 Score vs. Epochs")
ax1.set_xlabel("Epochs")
ax1.set_ylabel("Training Loss")
ax1.grid(True)

# Create secondary y-axis for F1 scores
ax2 = ax1.twinx()
line2, = ax2.plot(training_history['epoch'], training_history['eval_f1'], marker='s', color='r', label='Evaluation F1 Score')
ax2.set_ylabel('F1 Score')

# Combine legends from both axes
lines = [line1, line2]
labels = [line.get_label() for line in lines]
ax1.legend(lines, labels, loc='center right');

## Evaluate on Leeuwarder Courant test data and Trouw & Parool test data

In [None]:
lc_test_dataloader = DataLoader(lc_test_dataset, batch_size=64, collate_fn=collate_fn, shuffle=False)
trouw_parool_test_dataloader = DataLoader(trouw_parool_test_dataset, batch_size=64, collate_fn=collate_fn, shuffle=False)

In [None]:
evaluate(model, lc_test_dataloader)

In [None]:
evaluate(model, trouw_parool_test_dataloader)