We initially experimented with logistic regression, RoBERTa-base, and RoBERTa-large.

The following code implements the final model: a stacked ensemble of DeBERTa-v3-large and ELECTRA-large-discriminator, whose logits are combined by a logistic-regression meta-classifier. This ensemble achieved the highest F1 score of all the models we tried.


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from torch.utils.data import Dataset
import torch
import re
import os
from scipy.stats import mode
import transformers
import sys
import torch
import os
import shutil

# Print the installed transformers version as part of the run output.
print(f"Transformers version: {transformers.__version__}")

In [None]:
class PrintMetricsCallback(TrainerCallback):
    """Print the running training loss and a summary row after each evaluation.

    The most recent training loss seen in ``on_log`` is remembered so it can
    be shown next to the validation loss and F1 once evaluation metrics are
    available.
    """

    def __init__(self):
        # Last training loss reported through ``on_log`` (None until the
        # first log event that carries a 'loss' entry).
        self.train_loss = None

    @staticmethod
    def _fmt(value):
        """Return ``value`` with 4 decimals, or 'N/A' if it is not numeric."""
        try:
            return f"{float(value):.4f}"
        except (TypeError, ValueError):
            return 'N/A'

    def _print_summary(self, state, metrics):
        """Print one table row with train loss, validation loss and F1."""
        # ``state.epoch`` may be unset (e.g. evaluation outside training).
        epoch = int(state.epoch) if state.epoch is not None else 0
        train_loss = self._fmt(self.train_loss)
        val_loss = self._fmt(metrics.get('eval_loss'))
        eval_f1 = self._fmt(metrics.get('eval_f1'))
        print(f"\nEpoch | Training Loss | Validation Loss | F1")
        print(f"{epoch}     | {train_loss}       | {val_loss}        | {eval_f1}\n")
        sys.stdout.flush()

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Remember and print the training loss whenever it is logged."""
        if logs and 'loss' in logs:
            self.train_loss = logs['loss']
            print(f"Step {state.global_step}: Train Loss: {logs['loss']:.4f}")
            sys.stdout.flush()

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Print the summary row when evaluation metrics are delivered.

        Evaluation metrics are handed to callbacks through this event, so
        this is where the per-epoch table is actually produced.
        """
        if metrics is not None:
            self._print_summary(state, metrics)

    def on_epoch_end(self, args, state, control, metrics=None, **kwargs):
        """Print the summary row only if a caller supplies ``metrics``.

        NOTE(review): the Trainer is not expected to pass ``metrics`` to this
        event; the hook is kept so any caller that does still gets output.
        """
        if metrics is not None:
            self._print_summary(state, metrics)

class ClickbaitDataset(Dataset):
    """Torch dataset that turns one DataFrame row into tokenized model input.

    Each item concatenates the ``postText``, ``targetTitle`` and
    ``targetParagraphs`` fields of a row, separated by the tokenizer's
    separator token, and tokenizes the result to a fixed length.

    Args:
        data: DataFrame with ``postText``, ``targetTitle`` and
            ``targetParagraphs`` columns, plus ``label`` unless ``is_test``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors with a leading batch dimension.
        max_length: Length every sequence is padded/truncated to.
        is_test: When True, no ``labels`` entry is added to the items.
    """

    def __init__(self, data, tokenizer, max_length=512, is_test=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _as_text(value):
        """Flatten a field (string, list of strings, or missing) to one string."""
        if value is None:
            return ''
        if isinstance(value, (list, tuple)):
            return ' '.join(str(part) for part in value if part is not None)
        if isinstance(value, float) and value != value:  # NaN marks a missing field
            return ''
        return str(value)

    @staticmethod
    def _clean(text):
        """Lowercase, replace punctuation with spaces and collapse whitespace."""
        return ' '.join(re.sub(r'[^\w\s]', ' ', text.lower()).split())

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Clean every field on its own and only then insert the separators:
        # cleaning the already-joined string would strip the brackets and
        # lowercase the marker, turning "[SEP]" into the ordinary word "sep".
        separator = getattr(self.tokenizer, 'sep_token', None) or '[SEP]'
        fields = (row['postText'], row['targetTitle'], row['targetParagraphs'])
        text = f' {separator} '.join(
            self._clean(self._as_text(field)) for field in fields
        )

        # Tokenizing the text
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch dimension added by return_tensors='pt'.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

        # Return labels if it's not the test set
        if not self.is_test:
            item['labels'] = torch.tensor(int(row['label']), dtype=torch.long)

        return item

def compute_class_weights(df):
    """Return square-root-damped inverse-frequency weights for ``df['tags']``.

    Classes are ordered by sorted tag value. Each weight is
    ``sqrt(n_samples / (n_classes * class_count))``; the square root makes
    the re-weighting less aggressive than plain inverse frequency.

    Returns:
        A float tensor with one weight per class, placed on CUDA when it is
        available and on the CPU otherwise.
    """
    per_class = df['tags'].value_counts().sort_index()
    n_classes = len(per_class)
    n_samples = len(df)

    # Inverse-frequency weight for every class, in sorted-tag order.
    balanced = [n_samples / (n_classes * occurrences) for occurrences in per_class.values]
    damped = np.sqrt(balanced)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.tensor(damped, dtype=torch.float).to(device)

class WeightedTrainer(Trainer):
    """Trainer that optimises a class-weighted cross-entropy loss.

    Args:
        class_weights: 1-D tensor with one weight per class.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        """Compute weighted cross-entropy between the model logits and labels.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its ``'labels'`` entry is removed before the
                forward pass so the loss is computed here with the weights.
            num_items_in_batch: Accepted for compatibility; not used.
            return_outputs: When True, return ``(loss, outputs)``.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        # NOTE(review): these two optional parameters are assumed to be
        # passed by keyword — confirm against the installed Trainer.
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits

        # The weights were placed on a device when they were created; align
        # them with the logits so the loss also works if the model lives on
        # a different device, and compute the loss in float32.
        weights = self.class_weights.to(device=logits.device, dtype=torch.float32)
        loss = torch.nn.functional.cross_entropy(logits.float(), labels, weight=weights)
        return (loss, outputs) if return_outputs else loss


def compute_metrics(eval_pred):
    """Return the macro-averaged F1 score for an evaluation pass.

    Args:
        eval_pred: Pair of ``(logits, labels)``.

    Returns:
        Dict with a single ``'eval_f1'`` entry.
    """
    raw_scores, gold = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(raw_scores, axis=-1)
    return {'eval_f1': f1_score(gold, predicted, average='macro')}

In [None]:
# Input and output locations of the run.
INPUT_DIR = '/kaggle/input/dataset'
OUTPUT_DIR = '/kaggle/working'
TRAIN_PATH = os.path.join(INPUT_DIR, 'train.jsonl')
VAL_PATH = os.path.join(INPUT_DIR, 'val.jsonl')
TEST_PATH = os.path.join(INPUT_DIR, 'test.jsonl')


# Fail early, naming the missing file, before anything expensive starts.
for path in [TRAIN_PATH, VAL_PATH, TEST_PATH]:
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")
    print(f"Found: {path}")


# Seed the torch and numpy random number generators.
torch.manual_seed(42)
np.random.seed(42)


# Only the reads are guarded; the original exception is chained so the
# underlying parse/IO error stays visible in the traceback.
try:
    train_data = pd.read_json(TRAIN_PATH, lines=True)
    val_data = pd.read_json(VAL_PATH, lines=True)
    test_data = pd.read_json(TEST_PATH, lines=True)
except Exception as e:
    raise ValueError(f"Failed to load JSON files: {e}") from e

print("Data loaded successfully")
print("Training samples:", len(train_data))
print("Validation samples:", len(val_data))
print("Test samples:", len(test_data))

# Preprocess tags column to handle lists
def preprocess_tags(tags):
    """Reduce a list-valued tag entry to a single tag.

    A non-empty list yields its first element, an empty list yields NaN,
    and any value that is not a list is returned unchanged.
    """
    if not isinstance(tags, list):
        return tags
    if tags:
        return tags[0]
    return np.nan

# Collapse list-valued 'tags' entries to a single tag in both labelled splits
for labelled_split in (train_data, val_data):
    labelled_split['tags'] = labelled_split['tags'].apply(preprocess_tags)

# Quick look at what the training tags contain after preprocessing
train_tags = train_data['tags']
print("Sample tags (train):", train_tags.head(10).tolist())
print("Tags types (train):", train_tags.apply(type).value_counts())
print("Unique tags (train):", train_tags.unique())


# An empty tag list becomes NaN above; refuse to continue if any remain
if any(split['tags'].isna().any() for split in (train_data, val_data)):
    raise ValueError("Found NaN values in tags after preprocessing")

# Label encoding: fit on the training tags, then map both splits to integers
label_encoder = LabelEncoder()
label_encoder.fit(train_data['tags'])
train_data['label'] = label_encoder.transform(train_data['tags'])
val_data['label'] = label_encoder.transform(val_data['tags'])

print("Encoded labels:", label_encoder.classes_)

# Class weights (from the training split) to counter class imbalance
class_weights = compute_class_weights(train_data)
print("Class weights:", class_weights)




In [None]:
# Checkpoints fine-tuned for the ensemble; every one uses the same input length.
_ENSEMBLE_CHECKPOINTS = (
    "microsoft/deberta-v3-large",
    "google/electra-large-discriminator",
    # "roberta-large",  # for future work, we can try this one as well.
)
models = [
    {"name": checkpoint, "max_length": 512}
    for checkpoint in _ENSEMBLE_CHECKPOINTS
]

# Shared training configuration; the training loop overrides output_dir and
# logging_dir per model before building each trainer.
training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "results"),
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    learning_rate=2e-5, #1e-5 for tuning
    warmup_steps=100,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    # F1 is a score, not a loss: state explicitly that higher is better.
    greater_is_better=True,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=10,
    # Mixed precision needs a GPU; fall back to full precision on CPU-only
    # runtimes instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
    # 2 samples per step x 4 accumulation steps = 8 samples per update per device.
    gradient_accumulation_steps=4,
    report_to="none",
    save_total_limit=1,
    max_grad_norm=1.0,
    dataloader_pin_memory=False,
    # The datasets already return exactly the tensors the model needs.
    remove_unused_columns=False,
)

In [None]:
# Initialize empty lists to store the logits (predictions) for validation and test sets
val_logits = []
test_logits = []

# Clean the output directory before saving new results (In kaggle, we have limited output storage)
print(f"Clearing previous output in {OUTPUT_DIR}...")
sys.stdout.flush()
# Count entries while removing them: listing the directory afterwards would
# report what is left (normally 0), not what was cleared.
cleared_count = 0
for item in os.listdir(OUTPUT_DIR):
    item_path = os.path.join(OUTPUT_DIR, item)
    if os.path.isfile(item_path):
        os.remove(item_path)
        cleared_count += 1
    elif os.path.isdir(item_path):
        shutil.rmtree(item_path, ignore_errors=True)
        cleared_count += 1
print(f"Cleared {cleared_count} items from {OUTPUT_DIR}")
sys.stdout.flush()


# Now train and evaluate each model
import traceback  # used below to show the full stack of a failed run

# Size the classification head from the fitted label encoder rather than a
# hard-coded constant, so it always matches the labels and class weights.
num_labels = len(label_encoder.classes_)

for model_config in models:
    model_name = model_config["name"]
    max_length = model_config["max_length"]
    run_tag = model_name.split('/')[-1]
    print(f"Training {model_name} at {pd.Timestamp.now()}...", flush=True)

    # Pre-bind the names so the cleanup in `finally` is valid even when the
    # failure happens before they are assigned.
    model = trainer = tokenizer = None
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,  # one output per spoiler type
            problem_type="single_label_classification"
        )

        print(f"Creating datasets for {model_name}...", flush=True)
        train_dataset = ClickbaitDataset(train_data, tokenizer, max_length)
        val_dataset = ClickbaitDataset(val_data, tokenizer, max_length)
        test_dataset = ClickbaitDataset(test_data, tokenizer, max_length, is_test=True)
        print(f"Dataset sizes: Train={len(train_dataset)}, Val={len(val_dataset)}, Test={len(test_dataset)}", flush=True)

        # The shared arguments object is re-pointed at per-model directories
        # so checkpoints and logs of different models do not collide.
        training_args.output_dir = os.path.join(OUTPUT_DIR, f"results_{run_tag}")
        training_args.logging_dir = os.path.join(OUTPUT_DIR, f"logs_{run_tag}")

        print(f"Trainer initialized for {model_name}, starting training...", flush=True)
        trainer = WeightedTrainer(
            class_weights=class_weights,
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[PrintMetricsCallback()] # We can monitor the training process
        )

        # Train the model
        print("Starting trainer.train()...", flush=True)
        trainer.train()
        print("Finished trainer.train()", flush=True)

        print(f"Predicting for validation and test sets with {model_name}...", flush=True)
        val_pred = trainer.predict(val_dataset).predictions
        test_pred = trainer.predict(test_dataset).predictions
        print(f"Logits shapes: Val={val_pred.shape}, Test={test_pred.shape}", flush=True)

        # Store the logits (predictions) for later use in ensembling. Both are
        # appended only after both predictions succeeded, so the two lists
        # always stay aligned model-for-model.
        val_logits.append(val_pred)
        test_logits.append(test_pred)

    except Exception as e:
        # Best effort: a failing model is skipped so the remaining ones can
        # still train, but the full traceback is shown for diagnosis.
        print(f"Error training {model_name}: {e}")
        traceback.print_exc()
        sys.stdout.flush()

    finally:
        # Drop the references on success and failure alike before freeing
        # cached GPU memory for the next model.
        model = trainer = tokenizer = None
        torch.cuda.empty_cache()


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

print("Creating ensemble predictions...")

# Every model run may have failed and been skipped; stop with a clear message
# instead of letting the stacking below fail on an empty list.
if not val_logits or len(val_logits) != len(test_logits):
    raise RuntimeError(
        "No usable model predictions for ensembling "
        f"(val: {len(val_logits)}, test: {len(test_logits)}); check the training output."
    )

# Place the per-model logits side by side: one block of columns per model.
val_logits_stacked = np.hstack(val_logits)
test_logits_stacked = np.hstack(test_logits)
print(f"val_logits_stacked shape: {val_logits_stacked.shape}")
print(f"test_logits_stacked shape: {test_logits_stacked.shape}")


# Meta-classifier over the stacked logits. The deprecated `multi_class`
# argument is omitted; the default solver already fits a multinomial model
# for multi-class targets.
# NOTE(review): the meta-classifier is fitted on the validation split, so
# scoring it on that same split would be optimistic.
stacking_classifier = LogisticRegression(max_iter=1000)
stacking_classifier.fit(val_logits_stacked, val_data['label'])

final_test_preds = stacking_classifier.predict(test_logits_stacked)
# Map the integer predictions back to the original tag values.
final_spoiler_types = label_encoder.inverse_transform(final_test_preds)


print("Available columns in test_data:", test_data.columns.tolist())


# Prefer the dataset's 'uuid' column as id; fall back to a running index.
try:
    submission = pd.DataFrame({
        'id': test_data['uuid'],
        'spoilerType': final_spoiler_types
    })
except KeyError as e:
    print(f"KeyError: {e}. Using index as id if uuid is missing.")
    submission = pd.DataFrame({
        'id': range(len(final_spoiler_types)),
        'spoilerType': final_spoiler_types
    })

submission_path = os.path.join(OUTPUT_DIR, 'submission.csv')
submission.to_csv(submission_path, index=False)

print(f"Submission file created at {submission_path}")

# Read the file back to confirm what was actually written.
print("Submission preview:")
print(pd.read_csv(submission_path).head())
