In [26]:
"""
Author: Stanley Joel Gona
Last modified: 29-03-2023
Description: An LSTM-based neural network for classifying spoiler types in clickbait posts.
Dataset: https://pan.webis.de/semeval23/pan23-web/clickbait-challenge.html#data
"""

'\nAuthor: Stanley Joel Gona\nLast modified: 29-03-2023\nDescription: An LSTM-based neural network for classifying spoiler types in clickbait posts.\nDataset: https://pan.webis.de/semeval23/pan23-web/clickbait-challenge.html#data\n'

In [27]:
import nltk
nltk.download('punkt') # Downloading the 'punkt' package from NLTK
nltk.download('stopwords') # Downloading the 'stopwords' package from NLTK

import json
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import torch

def load_jsonl(file_path, field):
    """
    Load every record from a JSON Lines file.

    Args:
        file_path: Path to the JSON Lines file (read as UTF-8).
        field: Field name(s) the caller intends to read from each record.
            Accepted for interface compatibility; it is not used for
            filtering, so every record is returned in full and callers
            index the fields they need themselves.

    Returns:
        List with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a stray newline at the end of the
            # file) instead of failing in json.loads on an empty string.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

# Load training and validation data
train_data = load_jsonl("/content/train.jsonl", ['postText', 'targetTitle', 'tags'])
valid_data = load_jsonl("/content/validation.jsonl", ['postText', 'targetTitle', 'tags'])

# The stop-word set is the same for every entry, so build it once up front
# rather than on each loop iteration.
stop_word_set = set(stopwords.words('english'))

# Count the frequency of each word in the training data
word_counter = Counter()
for data_entry in train_data:
    # Entries without a post text or a target title cannot be combined below.
    if not data_entry['postText'] or not data_entry['targetTitle']:
        continue
    # Only the first element of each field is used.
    text_content = data_entry['postText'][0].lower() + ' ' + data_entry['targetTitle'][0].lower()
    text_content = re.sub(f"[{re.escape(string.punctuation)}]", " ", text_content)  # punctuation -> space
    text_content = re.sub(r'\d+', '', text_content)  # drop digit runs
    token_list = word_tokenize(text_content)
    token_list = [token for token in token_list if token not in stop_word_set]
    word_counter.update(token_list)

# Create the vocabulary dictionary: indices 0 and 1 are reserved for the
# padding and unknown-word markers, then words follow in descending frequency.
vocab_map = {'<PAD>': 0, '<UNK>': 1}
for word, count in word_counter.most_common():
    vocab_map[word] = len(vocab_map)

def clean_dataset(dataset, vocab_map):
    """
    Clean each entry of a dataset and encode its text as vocabulary indices.

    For every entry, the first 'postText' string and the first 'targetTitle'
    string are joined and then:
      - lowercased
      - stripped of punctuation (replaced by spaces) and of digits
      - tokenized with NLTK's word_tokenize
      - filtered of English stop words
      - mapped token-by-token to indices through vocab_map

    Entries whose 'postText' or 'targetTitle' is empty are skipped.

    Args:
        dataset: List of dictionaries containing 'postText' and 'targetTitle' keys.
        vocab_map: Dictionary mapping tokens to their corresponding indices.
            Out-of-vocabulary tokens receive the index stored under '<UNK>';
            if vocab_map has no '<UNK>' key they fall back to 0.

    Returns:
        List of shallow copies of the kept entries, each with an added 'text'
        key holding the list of token indices.
    """
    # Unknown tokens must not share an index with padding: use the dedicated
    # '<UNK>' slot when the vocabulary provides one.
    unk_index = vocab_map.get('<UNK>', 0)
    # Loop-invariant helpers, built once instead of once per entry.
    stop_words = set(stopwords.words('english'))
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"

    cleaned_data = []
    for data_entry in dataset:
        if not data_entry['postText'] or not data_entry['targetTitle']:
            continue
        cleaned_entry = data_entry.copy()
        # Lowercase and join the first post text with the first target title
        content = data_entry['postText'][0].lower() + ' ' + data_entry['targetTitle'][0].lower()
        content = re.sub(punctuation_pattern, " ", content)  # Replace punctuation with spaces
        content = re.sub(r'\d+', '', content)  # Remove digits
        tokens = word_tokenize(content)  # Tokenize the text
        tokens = [token for token in tokens if token not in stop_words]  # Remove stop words
        # Map each token to its index in the provided vocabulary dictionary
        cleaned_entry['text'] = [vocab_map.get(token, unk_index) for token in tokens]
        cleaned_data.append(cleaned_entry)
    return cleaned_data

# Run the same cleaning pipeline over both splits, encoding each with the
# vocabulary built from the training data (training split first).
train_preprocessed, valid_preprocessed = (
    clean_dataset(split, vocab_map) for split in (train_data, valid_data)
)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

def encode_tags(tags, tag_dict):
    """
    Translate a list of tag names into their integer indices.

    Args:
        tags (list): Tag names belonging to a single data entry.
        tag_dict (dict): Lookup table from tag name to integer index.

    Returns:
        list: The index of every tag, in the same order as the input.

    Raises:
        KeyError: If a tag is missing from tag_dict.
    """
    encoded = []
    for tag in tags:
        encoded.append(tag_dict[tag])
    return encoded


# Lookup table from spoiler-type tag name to integer class index.
tag_dict = dict(multi=0, phrase=1, passage=2)

# Token-index sequences of the preprocessed training and validation entries.
train_sequences = [entry['text'] for entry in train_preprocessed]
valid_sequences = [entry['text'] for entry in valid_preprocessed]

# Per-entry tag lists, encoded through tag_dict.
train_encoded_tags = [encode_tags(entry['tags'], tag_dict) for entry in train_preprocessed]
valid_encoded_tags = [encode_tags(entry['tags'], tag_dict) for entry in valid_preprocessed]


class TextTaggingDataset(Dataset):
    """
    PyTorch Dataset pairing each token-index sequence with its encoded tags.

    Indexing returns both members of the pair as LongTensors.
    """

    def __init__(self, sequences, tags):
        """
        Args:
          sequences (list): One list of token indices per data entry
          tags (list): One list of encoded tags per data entry, aligned with sequences
        """
        self.sequences, self.tags = sequences, tags

    def __len__(self):
        # The number of samples is the number of stored sequences.
        return len(self.sequences)

    def __getitem__(self, idx):
        # Tensors are created on access; the stored lists stay untouched.
        return torch.LongTensor(self.sequences[idx]), torch.LongTensor(self.tags[idx])
    
def collate_fn(batch):
    """
    Assemble a list of (sequence, tag) samples into two batch tensors.

    Args:
        batch (list): Samples whose first element is a sequence tensor and
            whose second element is the matching tag tensor

    Returns:
        tuple: (sequences, tags) where sequences are zero-padded to the longest
        sequence in the batch with the batch dimension first, and tags are the
        tag tensors stacked along a new leading dimension
    """
    sequence_tensors = []
    tag_tensors = []
    for sample in batch:
        sequence_tensors.append(sample[0])
        tag_tensors.append(sample[1])
    # pad_sequence fills shorter sequences with its default padding value (0).
    padded = pad_sequence(sequence_tensors, batch_first=True)
    # torch.stack requires every tag tensor in the batch to have the same shape.
    stacked = torch.stack(tag_tensors, dim=0)
    return padded, stacked

batch_size = 64  # Choose an appropriate batch size

# Wrap the encoded training and validation splits in TextTaggingDataset objects
train_dataset = TextTaggingDataset(sequences=train_sequences, tags=train_encoded_tags)
valid_dataset = TextTaggingDataset(sequences=valid_sequences, tags=valid_encoded_tags)

# Only the training loader shuffles; both pad their batches through collate_fn.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)


In [29]:
import numpy as np

def create_vocabulary(tokenized_texts):
    """
    Collect the distinct entries found under each item's 'text' key.

    Every entry is converted with str() before being stored, and the marker
    '<UNK>' for unknown words is always part of the result.

    Args:
        tokenized_texts: Iterable of dictionaries, each with a 'text' list

    Returns:
        Sorted list of the unique stringified entries plus '<UNK>'

    """
    seen = {"<UNK>"}
    for entry in tokenized_texts:
        for word in entry['text']:
            seen.add(str(word))
    return sorted(seen)


def create_word_embeddings(vocab, embedding_dim):
    """
    Build lookup tables for a vocabulary and a random embedding matrix.

    Args:
        vocab (list): Vocabulary words; a word's position is its index.
        embedding_dim (int): Number of columns of the embedding matrix.

    Returns:
        tuple: (word_to_index, index_to_word, embeddings), where embeddings is
        a NumPy array of shape (len(vocab), embedding_dim) drawn uniformly
        from [-1, 1).
    """
    word_to_index = {}
    index_to_word = {}
    for position, word in enumerate(vocab):
        word_to_index[word] = position
        index_to_word[position] = word
    matrix_shape = (len(vocab), embedding_dim)
    embeddings = np.random.uniform(low=-1, high=1, size=matrix_shape)
    return word_to_index, index_to_word, embeddings

# Width of each word-embedding vector
embedding_dim = 100

# Encoded 'text' lists of the training entries
train_tokenized_texts = [record['text'] for record in train_preprocessed]

# Vocabulary of the training split; create_vocabulary stringifies each
# element of the 'text' lists and adds '<UNK>'
vocab = create_vocabulary(train_preprocessed)

# Index lookups and randomly initialised embeddings for that vocabulary
(word_to_index,
 index_to_word,
 embeddings) = create_word_embeddings(vocab, embedding_dim)

def texts_to_sequences(tokenized_texts, word_to_index):
    """
    Converts a list of tokenized texts to a list of sequences of word indices using the provided vocabulary dictionary.

    Each element of an entry's 'text' list is looked up as-is first and, when
    that misses, by its str() form. The second lookup matters because
    create_vocabulary stores str(word): without it, non-string elements
    (such as integer token ids) would never match and every one of them
    would collapse to '<UNK>'.

    Args:
        tokenized_texts: Iterable of dictionaries, each with a 'text' list.
        word_to_index: Dictionary mapping vocabulary words to indices; must
            contain '<UNK>' if any element can be out of vocabulary.

    Returns:
        List with one list of indices per input entry.

    Raises:
        KeyError: If an element is out of vocabulary and word_to_index has
            no '<UNK>' key.
    """
    sequences = []
    for text in tokenized_texts:
        sequence = []
        for word in text['text']:
            if word in word_to_index:
                sequence.append(word_to_index[word])
                continue
            key = str(word)
            if key in word_to_index:
                sequence.append(word_to_index[key])
            else:
                sequence.append(word_to_index["<UNK>"])
        sequences.append(sequence)
    return sequences  # returns a list of sequences

# Re-encode both preprocessed splits through word_to_index (training split first).
train_sequences, valid_sequences = (
    texts_to_sequences(split, word_to_index)
    for split in (train_preprocessed, valid_preprocessed)
)


def create_tag_mapping(tags):
    """
    Build tag -> index and index -> tag lookups from per-entry tag lists.

    Indices follow the sorted order of the distinct tags.

    Args:
        tags: Iterable of tag lists, one per data entry.

    Returns:
        Tuple (tag_to_index, index_to_tag) of dictionaries.
    """
    distinct = set()
    for tags_entry in tags:
        distinct.update(tags_entry)
    ordered = sorted(distinct)
    tag_to_index = dict(zip(ordered, range(len(ordered))))
    index_to_tag = dict(enumerate(ordered))
    return tag_to_index, index_to_tag

# Tag lists of the training entries, used to derive the tag <-> index lookups.
tags = [record['tags'] for record in train_preprocessed]
(tag_to_index,
 index_to_tag) = create_tag_mapping(tags)

def encode_tags(tag_data, tag_to_index):
    """
    Encode per-entry tag lists with a tag-to-index dictionary.

    Args:
        tag_data: Iterable of tag lists, one per data entry.
        tag_to_index: Dictionary mapping each tag to its integer index.

    Returns:
        List of lists of encoded tags, mirroring the structure of tag_data.

    Raises:
        KeyError: If a tag is missing from tag_to_index.
    """
    encoded_tags = []
    for tags_entry in tag_data:
        encoded_tags.append([tag_to_index[tag] for tag in tags_entry])
    return encoded_tags

# Encode the tag lists of both splits with tag_to_index (training split first).
train_encoded_tags, valid_encoded_tags = (
    encode_tags([record['tags'] for record in split], tag_to_index)
    for split in (train_preprocessed, valid_preprocessed)
)


In [30]:
import torch
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

class LSTMClassifier(torch.nn.Module):
    """
    Sequence classifier: embedding layer -> LSTM -> linear layer.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, num_layers=1):
        """
        Args:
            vocab_size (int): Number of rows of the embedding table.
            embedding_dim (int): Size of each embedding vector (LSTM input size).
            hidden_dim (int): Size of the LSTM hidden state.
            num_classes (int): Number of output logits.
            num_layers (int): Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, num_classes)
        self.hidden_size = hidden_dim
        self.num_layers = num_layers

    def forward(self, x):
        """
        Compute class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices with the batch dimension first.

        Returns:
            Tensor of logits with one row per sequence and num_classes columns.
        """
        embedded = self.embedding(x)
        # Create the zero initial states on the same device and with the same
        # dtype as the embeddings, rather than on a module-level `device`
        # global: the model then works wherever it and its input live.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = embedded.new_zeros(state_shape)
        c0 = embedded.new_zeros(state_shape)
        output, _ = self.lstm(embedded, (h0, c0))
        # NOTE(review): this reads the final time step of the padded batch, so
        # for sequences shorter than the batch maximum it is the state after
        # the padding positions; confirm whether that is intended.
        last_output = output[:, -1, :]
        logits = self.fc(last_output)
        return logits


# Define the model hyperparameters
vocab_size = len(vocab_map)
num_classes = len(tag_to_index)

# Hidden sizes to compare; one model is trained from scratch for each.
hidden_dims = [64, 96, 128]

# Settings shared by every run
learning_rate = 0.007
num_epochs = 25

for hidden_dim in hidden_dims:
    print(f'Training with hidden_dim: {hidden_dim}')

    # Define the model with the current hidden_dim and place it on the
    # selected device so it lives where the batches are sent below.
    model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, num_classes).to(device)

    # Loss function and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Print header
    header = "{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format("Epoch", 'Hidden dim', 'Learning Rate', "Train Loss", "Train Acc", "Val Loss", "Val Acc")
    print(header)
    print("-" * len(header))

    # Train the model
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        for sequences, tags in train_loader:
            # Move the batch to the model's device.
            sequences, tags = sequences.to(device), tags.to(device)
            optimizer.zero_grad()
            logits = model(sequences)
            # Only the first tag of each entry is used as the class label.
            loss = criterion(logits, tags[:, 0])
            loss.backward()
            optimizer.step()
            # criterion returns the batch mean; weight it by the batch size so
            # the epoch figure is a per-sample average.
            train_loss += loss.item() * sequences.size(0)
            train_correct += (torch.argmax(logits, dim=1) == tags[:, 0]).sum().item()
        train_loss /= len(train_dataset)
        train_acc = train_correct / len(train_dataset)

        # Evaluate on the validation split without tracking gradients.
        model.eval()
        valid_loss = 0.0
        valid_correct = 0
        with torch.no_grad():
            for sequences, tags in valid_loader:
                sequences, tags = sequences.to(device), tags.to(device)
                logits = model(sequences)
                loss = criterion(logits, tags[:, 0])
                valid_loss += loss.item() * sequences.size(0)
                valid_correct += (torch.argmax(logits, dim=1) == tags[:, 0]).sum().item()
        valid_loss /= len(valid_dataset)
        valid_acc = valid_correct / len(valid_dataset)

        # Print formatted results for this epoch
        row = "{:<10} {:<10} {:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(epoch + 1, hidden_dim, learning_rate, train_loss, train_acc, valid_loss, valid_acc)
        print(row)

    print("\n")


Training with hidden_dim: 64
Epoch      Hidden dim Learning Rate Train Loss Train Acc  Val Loss   Val Acc   
-------------------------------------------------------------------------------
1          64         0.007      1.0475     0.4186     1.0444     0.3980    
2          64         0.007      1.0359     0.4108     1.0339     0.4318    
3          64         0.007      0.9918     0.4617     1.0130     0.4456    
4          64         0.007      0.8125     0.6283     1.0910     0.4681    
5          64         0.007      0.5610     0.7937     1.2448     0.4831    
6          64         0.007      0.3321     0.8887     1.5274     0.4768    
7          64         0.007      0.1982     0.9409     1.7132     0.4856    
8          64         0.007      0.1185     0.9672     1.9275     0.4756    
9          64         0.007      0.0698     0.9806     2.0733     0.4931    
10         64         0.007      0.0422     0.9900     2.2209     0.4743    
11         64         0.007      0.0270  