In [1]:
#imports
import pandas as pd
import re
import ast
from transformers import XLNetTokenizerFast
from sklearn.model_selection import StratifiedKFold
import numpy as np
import torch
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

In [2]:
stacked_df = pd.read_csv('combined_data.csv')

In [3]:
stacked_df.head()

Unnamed: 0,TID,Tweet,Subjects,Action/Phrases,Category
0,1496737903631904774,Russian President declared war. On the record....,['everyone'],['stop the war'],CTA
1,1496966148088291335,"Dear Black people, Don’t let Starr convince yo...",['Black people'],['Don’t let Starr convince you to defend the U...,CTA
2,1497463617767227395,"Kyiv, our splendid, peaceful city, survived an...",['world'],"['isolate Russia', 'expel ambassadors', 'oil e...",CTA
3,1497479069658664963,"Today, a Russian missile hit an apartment buil...",['world'],['a decisive response and isolation of Russia ...,CTA
4,1498243321659109376,Cut all business ties with Russia. All of them...,[],['Cut all business ties with Russia'],CTA


In [4]:
stacked_df['Subjects'] = stacked_df['Subjects'].apply(ast.literal_eval)
stacked_df['Action/Phrases'] = stacked_df['Action/Phrases'].apply(ast.literal_eval)

## Sequence Tokenization

In [7]:
def detailed_labeling(row):
    tweet = row['Tweet']
    subjects = row['Subjects']
    actions = row['Action/Phrases']
    category = row['Category']

    # Tokenize tweet and initialize labels
    words = re.findall(r'\w+|[^\w\s]', tweet, re.UNICODE)
    labels = ['0'] * len(words)

    def label_phrase(phrase, label_type):
        # Tokenize phrase
        phrase_words = re.findall(r'\w+|[^\w\s]', phrase, re.UNICODE)
        phrase_len = len(phrase_words)

        # Search for phrase in tokenized words
        for i in range(len(words) - phrase_len + 1):
            if words[i:i + phrase_len] == phrase_words:
                # Check bounds before assigning labels
                if i < len(labels):
                    labels[i] = f'B-{category}-{label_type}'
                for j in range(1, phrase_len):
                    if i + j < len(labels):
                        labels[i + j] = f'I-{category}-{label_type}'

    # Process actions and subjects
    for action in actions:
        label_phrase(action, 'action')
    for subject in subjects:
        label_phrase(subject, 'subject')

    # Combine words and labels into the final format
    labeled_tweet = ' '.join(f'{word}/{label}' for word, label in zip(words, labels))
    return labeled_tweet

In [8]:
stacked_df['detailed_labeling'] = stacked_df.apply(detailed_labeling, axis=1)

In [9]:
stacked_df['Tweet'].iloc[0]

'Russian President declared war. On the record. Should I play the video of your President Ambassador, shall I do it? Or will you confirm it? It’s the responsibility of this body to stop the war. I call on everyone to do everything possible to stop the war'

In [10]:
stacked_df['detailed_labeling'].iloc[0]

'Russian/0 President/0 declared/0 war/0 ./0 On/0 the/0 record/0 ./0 Should/0 I/0 play/0 the/0 video/0 of/0 your/0 President/0 Ambassador/0 ,/0 shall/0 I/0 do/0 it/0 ?/0 Or/0 will/0 you/0 confirm/0 it/0 ?/0 It/0 ’/0 s/0 the/0 responsibility/0 of/0 this/0 body/0 to/0 stop/B-CTA-action the/I-CTA-action war/I-CTA-action ./0 I/0 call/0 on/0 everyone/B-CTA-subject to/0 do/0 everything/0 possible/0 to/0 stop/B-CTA-action the/I-CTA-action war/I-CTA-action'

### Preparing Dataset for XLNET fine-tuning

In [11]:
tokenizer = XLNetTokenizerFast.from_pretrained("xlnet-base-cased")

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [12]:
# Define label-to-ID mapping
label_to_id = {
    "B-CTA-action": 0,
    "I-CTA-action": 1,
    "B-CTA-subject": 2,
    "I-CTA-subject": 3,
     "B-DE-action": 5,
    "I-DE-action": 6,
    "B-DE-subject": 7,
    "I-DE-subject": 8,
    "0": 4,  # Default label
    "-100": -100,  # Ignore label for subword tokens
}
id_to_label = {v: k for k, v in label_to_id.items()}  # Reverse mapping for predictions

In [13]:
def process_labeled_tweet_with_display(labeled_tweet, max_length=512):
    words = []
    labels = []

    # Split words and labels
    for token_label in labeled_tweet.split():
        if "/" not in token_label:
            print(f"Warning: Token without a label - {token_label}. Skipping.")
            continue
        word, label = token_label.rsplit("/", 1)
        words.append(word)
        labels.append(label)

    # Tokenize and align labels
    encoding = tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_offsets_mapping=True,
        # Remove return_tensors='pt'
    )

    # Get word IDs to align labels
    word_ids = encoding.word_ids()
    labels_aligned = []
    for idx, word_id in enumerate(word_ids):
        if word_id is None:
            labels_aligned.append(-100)
        else:
            label = labels[word_id]
            labels_aligned.append(label_to_id.get(label, -100))

    encoding["labels"] = labels_aligned
    encoding.pop("offset_mapping")  # Remove offset mapping as it's no longer needed

    # For debugging
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
    token_label_pairs = [
        (token, id_to_label.get(label_id, "IGNORED")) if label_id != -100 else (token, "IGNORED")
        for token, label_id in zip(tokens, encoding["labels"])
    ]
    encoding["token_label_pairs"] = token_label_pairs

    return encoding

In [14]:
stacked_df["Processed"] = stacked_df["detailed_labeling"].apply(process_labeled_tweet_with_display)
processed_data = list(stacked_df["Processed"])

In [15]:
# Display tokens and labels for a sample row
for i, row in stacked_df.iterrows():
    print(f"\nRow {i + 1} Tokens and Labels:")
    for token, label in row["Processed"]["token_label_pairs"]:
        print(f"{token:20} -> {label}")
    break


Row 1 Tokens and Labels:
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>                -> IGNORED
<pad>         

In [16]:
# Remove 'token_label_pairs' from the 'Processed' column
stacked_df['Processed'] = stacked_df['Processed'].apply(lambda x: {k: v for k, v in x.items() if k != 'token_label_pairs'})

# Remove 'token_label_pairs' from the processed_data list
for i in range(len(processed_data)):
    processed_data[i] = {k: v for k, v in processed_data[i].items() if k != 'token_label_pairs'}

In [17]:
print(stacked_df['Processed'].iloc[0])

{'input_ids': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5

### 10-fold Cross Validation by fine tuning XLNET

In [18]:
from transformers import Trainer, TrainingArguments, XLNetForTokenClassification, DataCollatorForTokenClassification
from torch.utils.data import Subset

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [21]:
for item in processed_data[1:2]:
    print(f"Input IDs (Flat): {item['input_ids']}")
    print(f"Labels: {item['labels']}")
    print(f"Input IDs Length: {len(item['input_ids'])}")
    print(f"Labels Length: {len(item['labels'])}")

Input IDs (Flat): [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 

In [22]:
# Extract labels for stratification
# Use the majority label for each labeled sentence
def get_majority_label(processed):
    labels = processed["labels"]
    labels = [label for label in labels if label not in [-100, 4] ]
    if not labels:
        return 0
    return max(set(labels), key=labels.count)

majority_labels = [get_majority_label(item) for item in processed_data]

In [23]:
num_labels = len(label_to_id) - 1
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5644)

In [24]:
majority_labels

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 5,
 7,
 6,
 8,
 6,
 6,
 6,
 6,
 6,
 5,
 6,
 6,
 6,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 8,
 6,
 6,
 6,
 6,
 7,
 6,
 6,
 6,
 6,
 6,
 5,
 6,
 6,
 6,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 6,
 6,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 5,
 6,
 5,
 8,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 8,
 6,
 6,
 6,
 8,
 5,
 5,
 7,
 8,
 6,
 6,
 6,
 6,
 7,
 7,
 6,
 7,
 6,
 7,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 5,
 6,
 6,
 7,
 6,
 6,
 6,
 8,
 6,
 6,
 6,
 6,
 6,
 6,
 5,
 6,
 6,
 6,
 8,
 6,
 7,
 5,
 6,
 6,
 6,
 6,
 0,
 6,


In [25]:
def extract_spans(tokens, labels):
    spans = []
    current_span = []
    current_label = None

    for token, label in zip(tokens, labels):
        if label.startswith("B-"):
            if current_span:
                spans.append((current_label, " ".join(current_span)))
                current_span = []
            current_label = label[2:]  # Extract category (e.g., "CTA-action")
            current_span.append(token)
        elif label.startswith("I-") and current_label == label[2:]:
            current_span.append(token)
        else:
            if current_span:
                spans.append((current_label, " ".join(current_span)))
                current_span = []
                current_label = None

    if current_span:
        spans.append((current_label, " ".join(current_span)))

    return spans

In [26]:
def get_tweet_level_labels(decoded_labels):
    categories = set()
    label_counts = {"CTA": 0, "DE": 0}  # Count occurrences of CTA and DE

    for label in decoded_labels:
        if label.startswith("B-") or label.startswith("I-"):
            parts = label.split("-")
            if len(parts) >= 2:
                category = parts[1]  # Extract category (e.g., "CTA" or "DE")
                categories.add(category)
                if category in label_counts:
                    label_counts[category] += 1

    # Determine binary category based on majority count
    if label_counts["CTA"] > label_counts["DE"]:
        bin_category = "CTA"
    else:
        bin_category = "DE"

    return categories, bin_category

In [27]:
def perform_span_classification(model, val_dataset):
    model.eval()
    all_predictions = []
    all_true_labels = []
    all_spans = []
    predicted_tweet_labels = []
    predicted_tweet_labels_bin = []
    true_tweet_labels = []
    true_tweet_labels_bin = []
    indices = val_dataset.indices  # Get the indices of the validation samples

    for idx_in_val_set, item in enumerate(val_dataset):
        index = indices[idx_in_val_set]  # Original index in processed_data / stacked_df

        input_ids = torch.tensor(item["input_ids"]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(item["attention_mask"]).unsqueeze(0).to(device)

        # True labels
        true_labels = item["labels"]

        # Predict logits
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=-1).squeeze(0).tolist()

        # Convert predicted label IDs to label names
        tokens = tokenizer.convert_ids_to_tokens(item["input_ids"])
        decoded_labels = [
            id_to_label[label] if label != -100 else "IGNORED"
            for label in predicted_labels
        ]

        # Similarly, decode true labels
        decoded_true_labels = [
            id_to_label[label] if label != -100 else "IGNORED"
            for label in true_labels
        ]

        # Extract spans from predictions
        spans = extract_spans(tokens, decoded_labels)
        all_predictions.append(decoded_labels)
        all_true_labels.append(decoded_true_labels)
        all_spans.append(spans)

        # Get tweet-level labels (as sets)
        predicted_tweet_label, predicted_tweet_label_bin = get_tweet_level_labels(decoded_labels)  # Returns a set
        true_tweet_label, true_tweet_label_bin = get_tweet_level_labels(decoded_true_labels)  # Returns a set

        predicted_tweet_labels.append(predicted_tweet_label)
        predicted_tweet_labels_bin.append(predicted_tweet_label_bin)
        true_tweet_labels.append(true_tweet_label)
        true_tweet_labels_bin.append(true_tweet_label_bin)

    return all_predictions, all_true_labels, all_spans, predicted_tweet_labels, true_tweet_labels, true_tweet_labels_bin, predicted_tweet_labels_bin

In [28]:
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding='max_length',
    max_length=512,
    label_pad_token_id=-100,
    return_tensors='pt',  # Ensure tensors are returned
)

In [29]:
all_labels = ["CTA", "DE"]  # Define all possible labels
label_to_index = {label: idx for idx, label in enumerate(all_labels)}
all_fold_metrics = []
all_fold_metrics_binary = []

In [30]:
def train_fold(train_idx, val_idx, fold):
    print(f"\nFold {fold + 1}")

    # Create train and validation subsets
    train_dataset = Subset(processed_data, train_idx)
    val_dataset = Subset(processed_data, val_idx)
    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./xlnet_fold_{fold + 1}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir=f"./logs_fold_{fold + 1}",
        logging_steps=10,
        save_total_limit=2,
        report_to="none",
    )

    # Initialize the model and trainer
    model = XLNetForTokenClassification.from_pretrained("xlnet-base-cased", num_labels=num_labels).to(device)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    results = trainer.evaluate()
    print(f"Results for Fold {fold + 1}: {results}")

    # Perform span classification for validation set
    predictions, true_labels, spans, predicted_tweet_labels, true_tweet_labels, true_tweet_labels_bin, predicted_tweet_labels_bin = perform_span_classification(model, val_dataset)

    tid_to_true_labels = defaultdict(set)
    tid_to_pred_labels = defaultdict(set)

    for idx_in_val_set, item in enumerate(val_dataset):
        index = val_dataset.indices[idx_in_val_set]
        TID = stacked_df.iloc[index]["TID"]
        true_label_set = true_tweet_labels[idx_in_val_set]
        pred_label_set = predicted_tweet_labels[idx_in_val_set]
        tid_to_true_labels[TID].update(true_label_set)
        tid_to_pred_labels[TID].update(pred_label_set)


    # Prepare data for multi-label evaluation
    mlb = MultiLabelBinarizer(classes=all_labels)
    mlb.fit([all_labels])
    aggregated_true_labels = list(tid_to_true_labels.values())
    aggregated_pred_labels = list(tid_to_pred_labels.values())

    # Transform into binary indicator vectors
    aggregated_true_label_vectors = mlb.transform(aggregated_true_labels)
    aggregated_pred_label_vectors = mlb.transform(aggregated_pred_labels)

    # Compute evaluation metrics
    precision = precision_score(aggregated_true_label_vectors, aggregated_pred_label_vectors, average='micro', zero_division=0)
    recall = recall_score(aggregated_true_label_vectors, aggregated_pred_label_vectors, average='micro', zero_division=0)
    f1 = f1_score(aggregated_true_label_vectors, aggregated_pred_label_vectors, average='micro', zero_division=0)
    accuracy = accuracy_score(aggregated_true_label_vectors, aggregated_pred_label_vectors)


    fold_metrics = {
        "Fold": fold + 1,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1,
    }
    all_fold_metrics.append(fold_metrics)

    print("\nMulti-Label Classification Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Micro Precision: {precision:.4f}")
    print(f"Micro Recall: {recall:.4f}")
    print(f"Micro F1-score: {f1:.4f}")

    # Detailed classification report
    print("\nClassification Report:")
    print(classification_report(aggregated_true_label_vectors, aggregated_pred_label_vectors, target_names=all_labels, zero_division=0))

    # Binary Classification Evaluation
    binary_true_labels = [1 if label == "CTA" else 0 for label in true_tweet_labels_bin]
    binary_pred_labels = [1 if label == "CTA" else 0 for label in predicted_tweet_labels_bin]

    # Compute binary classification metrics
    accuracy_bin = accuracy_score(binary_true_labels, binary_pred_labels)
    precision_bin = precision_score(binary_true_labels, binary_pred_labels, zero_division=0)
    recall_bin = recall_score(binary_true_labels, binary_pred_labels, zero_division=0)
    f1_bin = f1_score(binary_true_labels, binary_pred_labels, zero_division=0)

    fold_metrics_binary = {
        "Fold": fold + 1,
        "Accuracy": accuracy_bin,
        "Precision": precision_bin,
        "Recall": recall_bin,
        "F1-score": f1_bin,
    }
    all_fold_metrics_binary.append(fold_metrics_binary)

    print("\nBinary Classification Metrics:")
    print(f"Accuracy: {accuracy_bin:.4f}")
    print(f"Precision: {precision_bin:.4f}")
    print(f"Recall: {recall_bin:.4f}")
    print(f"F1-score: {f1_bin:.4f}")

    # Detailed binary classification report
    print("\nBinary Classification Report:")
    print(classification_report(binary_true_labels, binary_pred_labels, target_names=["DE", "CTA"], zero_division=0))

    return results, spans

In [31]:
results_per_fold = []
spans_per_fold = []


for fold, (train_idx, val_idx) in enumerate(skf.split(processed_data, majority_labels)):
    results, spans = train_fold(train_idx, val_idx, fold)
    results_per_fold.append(results)
    spans_per_fold.append(spans)


Fold 1


pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6,0.61954
2,0.5162,0.567434
3,0.4256,0.603385


Results for Fold 1: {'eval_loss': 0.6033850312232971, 'eval_runtime': 2.3573, 'eval_samples_per_second': 67.026, 'eval_steps_per_second': 8.484, 'epoch': 3.0}

Multi-Label Classification Metrics:
Accuracy: 0.7070
Micro Precision: 0.7632
Micro Recall: 0.9177
Micro F1-score: 0.8333

Classification Report:
              precision    recall  f1-score   support

         CTA       0.59      0.85      0.70        41
          DE       0.84      0.94      0.89       117

   micro avg       0.76      0.92      0.83       158
   macro avg       0.72      0.90      0.79       158
weighted avg       0.78      0.92      0.84       158
 samples avg       0.81      0.92      0.85       158


Binary Classification Metrics:
Accuracy: 0.8101
Precision: 0.6170
Recall: 0.7073
F1-score: 0.6591

Binary Classification Report:
              precision    recall  f1-score   support

          DE       0.89      0.85      0.87       117
         CTA       0.62      0.71      0.66        41

    accuracy        

Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6628,0.66259
2,0.6173,0.625713
3,0.3386,0.721211


Results for Fold 2: {'eval_loss': 0.7212113738059998, 'eval_runtime': 2.3497, 'eval_samples_per_second': 67.244, 'eval_steps_per_second': 8.512, 'epoch': 3.0}

Multi-Label Classification Metrics:
Accuracy: 0.7244
Micro Precision: 0.7705
Micro Recall: 0.8924
Micro F1-score: 0.8270

Classification Report:
              precision    recall  f1-score   support

         CTA       0.66      0.80      0.73        41
          DE       0.81      0.92      0.86       117

   micro avg       0.77      0.89      0.83       158
   macro avg       0.74      0.86      0.79       158
weighted avg       0.77      0.89      0.83       158
 samples avg       0.81      0.89      0.84       158


Binary Classification Metrics:
Accuracy: 0.8165
Precision: 0.6500
Recall: 0.6341
F1-score: 0.6420

Binary Classification Report:
              precision    recall  f1-score   support

          DE       0.87      0.88      0.88       117
         CTA       0.65      0.63      0.64        41

    accuracy        

Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6653,0.623875
2,0.5605,0.62401
3,0.3223,0.647606


Results for Fold 3: {'eval_loss': 0.6476061940193176, 'eval_runtime': 2.367, 'eval_samples_per_second': 66.75, 'eval_steps_per_second': 8.449, 'epoch': 3.0}

Multi-Label Classification Metrics:
Accuracy: 0.7580
Micro Precision: 0.8101
Micro Recall: 0.9177
Micro F1-score: 0.8605

Classification Report:
              precision    recall  f1-score   support

         CTA       0.64      0.73      0.68        41
          DE       0.87      0.98      0.92       117

   micro avg       0.81      0.92      0.86       158
   macro avg       0.75      0.86      0.80       158
weighted avg       0.81      0.92      0.86       158
 samples avg       0.84      0.92      0.86       158


Binary Classification Metrics:
Accuracy: 0.8291
Precision: 0.6842
Recall: 0.6341
F1-score: 0.6582

Binary Classification Report:
              precision    recall  f1-score   support

          DE       0.88      0.90      0.89       117
         CTA       0.68      0.63      0.66        41

    accuracy          

Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.7073,0.576671
2,0.516,0.552611
3,0.384,0.590327


Results for Fold 4: {'eval_loss': 0.5903272032737732, 'eval_runtime': 2.3707, 'eval_samples_per_second': 66.647, 'eval_steps_per_second': 8.436, 'epoch': 3.0}

Multi-Label Classification Metrics:
Accuracy: 0.7821
Micro Precision: 0.8182
Micro Recall: 0.9172
Micro F1-score: 0.8649

Classification Report:
              precision    recall  f1-score   support

         CTA       0.67      0.87      0.76        39
          DE       0.88      0.93      0.91       118

   micro avg       0.82      0.92      0.86       157
   macro avg       0.77      0.90      0.83       157
weighted avg       0.83      0.92      0.87       157
 samples avg       0.85      0.91      0.87       157


Binary Classification Metrics:
Accuracy: 0.8291
Precision: 0.6429
Recall: 0.6923
F1-score: 0.6667

Binary Classification Report:
              precision    recall  f1-score   support

          DE       0.90      0.87      0.89       119
         CTA       0.64      0.69      0.67        39

    accuracy        

Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6073,0.631604
2,0.5196,0.655632
3,0.3353,0.703931


Results for Fold 5: {'eval_loss': 0.7039305567741394, 'eval_runtime': 2.3665, 'eval_samples_per_second': 66.766, 'eval_steps_per_second': 8.451, 'epoch': 3.0}

Multi-Label Classification Metrics:
Accuracy: 0.7372
Micro Precision: 0.7903
Micro Recall: 0.9363
Micro F1-score: 0.8571

Classification Report:
              precision    recall  f1-score   support

         CTA       0.66      0.90      0.76        39
          DE       0.84      0.95      0.89       118

   micro avg       0.79      0.94      0.86       157
   macro avg       0.75      0.92      0.83       157
weighted avg       0.80      0.94      0.86       157
 samples avg       0.84      0.93      0.87       157


Binary Classification Metrics:
Accuracy: 0.8228
Precision: 0.6410
Recall: 0.6410
F1-score: 0.6410

Binary Classification Report:
              precision    recall  f1-score   support

          DE       0.88      0.88      0.88       119
         CTA       0.64      0.64      0.64        39

    accuracy        

Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5758,0.621272
2,0.4889,0.596
3,0.3704,0.648651


Results for Fold 6: {'eval_loss': 0.648651123046875, 'eval_runtime': 2.3517, 'eval_samples_per_second': 67.184, 'eval_steps_per_second': 8.504, 'epoch': 3.0}

Multi-Label Classification Metrics:
Accuracy: 0.7355
Micro Precision: 0.7898
Micro Recall: 0.8797
Micro F1-score: 0.8323

Classification Report:
              precision    recall  f1-score   support

         CTA       0.64      0.75      0.69        40
          DE       0.84      0.92      0.88       118

   micro avg       0.79      0.88      0.83       158
   macro avg       0.74      0.84      0.79       158
weighted avg       0.79      0.88      0.83       158
 samples avg       0.82      0.88      0.84       158


Binary Classification Metrics:
Accuracy: 0.8165
Precision: 0.6410
Recall: 0.6250
F1-score: 0.6329

Binary Classification Report:
              precision    recall  f1-score   support

          DE       0.87      0.88      0.88       118
         CTA       0.64      0.62      0.63        40

    accuracy         

Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6354,0.659913
2,0.4506,0.595579
3,0.4945,0.615717


Results for Fold 7: {'eval_loss': 0.6157168745994568, 'eval_runtime': 2.3513, 'eval_samples_per_second': 67.198, 'eval_steps_per_second': 8.506, 'epoch': 3.0}

Multi-Label Classification Metrics:
Accuracy: 0.7580
Micro Precision: 0.8032
Micro Recall: 0.9557
Micro F1-score: 0.8728

Classification Report:
              precision    recall  f1-score   support

         CTA       0.67      0.88      0.76        40
          DE       0.85      0.98      0.91       118

   micro avg       0.80      0.96      0.87       158
   macro avg       0.76      0.93      0.84       158
weighted avg       0.81      0.96      0.87       158
 samples avg       0.86      0.96      0.89       158


Binary Classification Metrics:
Accuracy: 0.8924
Precision: 0.7949
Recall: 0.7750
F1-score: 0.7848

Binary Classification Report:
              precision    recall  f1-score   support

          DE       0.92      0.93      0.93       118
         CTA       0.79      0.78      0.78        40

    accuracy        

Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6077,0.61927
2,0.492,0.571224
3,0.4242,0.585327


Results for Fold 8: {'eval_loss': 0.5853270888328552, 'eval_runtime': 2.3698, 'eval_samples_per_second': 66.671, 'eval_steps_per_second': 8.439, 'epoch': 3.0}

Multi-Label Classification Metrics:
Accuracy: 0.7962
Micro Precision: 0.8239
Micro Recall: 0.9177
Micro F1-score: 0.8683

Classification Report:
              precision    recall  f1-score   support

         CTA       0.72      0.82      0.77        40
          DE       0.86      0.95      0.90       118

   micro avg       0.82      0.92      0.87       158
   macro avg       0.79      0.89      0.84       158
weighted avg       0.83      0.92      0.87       158
 samples avg       0.86      0.92      0.88       158


Binary Classification Metrics:
Accuracy: 0.8544
Precision: 0.7429
Recall: 0.6500
F1-score: 0.6933

Binary Classification Report:
              precision    recall  f1-score   support

          DE       0.89      0.92      0.90       118
         CTA       0.74      0.65      0.69        40

    accuracy        

Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6529,0.63259
2,0.5792,0.555764
3,0.356,0.600224


Results for Fold 9: {'eval_loss': 0.6002240180969238, 'eval_runtime': 2.3479, 'eval_samples_per_second': 66.869, 'eval_steps_per_second': 8.518, 'epoch': 3.0}

Multi-Label Classification Metrics:
Accuracy: 0.7244
Micro Precision: 0.7760
Micro Recall: 0.9103
Micro F1-score: 0.8378

Classification Report:
              precision    recall  f1-score   support

         CTA       0.62      0.82      0.70        39
          DE       0.84      0.94      0.89       117

   micro avg       0.78      0.91      0.84       156
   macro avg       0.73      0.88      0.80       156
weighted avg       0.78      0.91      0.84       156
 samples avg       0.81      0.90      0.84       156


Binary Classification Metrics:
Accuracy: 0.8217
Precision: 0.6571
Recall: 0.5897
F1-score: 0.6216

Binary Classification Report:
              precision    recall  f1-score   support

          DE       0.87      0.90      0.88       118
         CTA       0.66      0.59      0.62        39

    accuracy        

Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6588,0.653964
2,0.5219,0.558409
3,0.406,0.614099


Results for Fold 10: {'eval_loss': 0.6140989065170288, 'eval_runtime': 2.3527, 'eval_samples_per_second': 66.731, 'eval_steps_per_second': 8.501, 'epoch': 3.0}

Multi-Label Classification Metrics:
Accuracy: 0.7885
Micro Precision: 0.8155
Micro Recall: 0.8782
Micro F1-score: 0.8457

Classification Report:
              precision    recall  f1-score   support

         CTA       0.66      0.74      0.70        39
          DE       0.87      0.92      0.90       117

   micro avg       0.82      0.88      0.85       156
   macro avg       0.77      0.83      0.80       156
weighted avg       0.82      0.88      0.85       156
 samples avg       0.84      0.88      0.85       156


Binary Classification Metrics:
Accuracy: 0.8408
Precision: 0.6750
Recall: 0.6923
F1-score: 0.6835

Binary Classification Report:
              precision    recall  f1-score   support

          DE       0.90      0.89      0.89       118
         CTA       0.68      0.69      0.68        39

    accuracy       

## Printing few Classified Spans from each fold

In [32]:
for fold, spans in enumerate(spans_per_fold):
    print(f"\nFold {fold + 1} Spans:")
    for span in spans[:5]:  # Display spans for the first 5 validation samples
        print(span)


Fold 1 Spans:
[('CTA-action', '▁Help'), ('CTA-action', '▁protect ▁our ▁people'), ('DE-action', 'barb'), ('DE-action', 'arian'), ('CTA-action', '▁Help ▁us'), ('CTA-action', '▁Provide ▁us ▁with ▁combat ▁aircraft'), ('CTA-action', '▁Do')]
[('CTA-action', '▁Help ▁deprive ▁ <unk> ▁ <unk> ▁a g gress or ▁of ▁its ▁assets'), ('CTA-action', '▁submit ▁information ▁that ▁will ▁help ▁in ▁search ▁ , ▁seizure ▁ , ▁ conf is cation ▁of ▁ <unk> ▁ <unk> ▁assets')]
[('DE-action', '▁bar'), ('DE-action', 'bar'), ('DE-action', 'ic'), ('DE-subject', '▁Russian'), ('DE-action', '▁war')]
[('DE-subject', '▁Russia')]
[('CTA-action', '▁double ▁down ▁and ▁ensure ▁Ukraine ▁defeat s ▁Russia ▁so ▁President ▁Zel en sky y ▁can ▁negotiate ▁from ▁a ▁position ▁of ▁strength')]

Fold 2 Spans:
[('DE-subject', '▁Putin'), ('CTA-action', '▁partners'), ('CTA-action', '▁help ▁Ukraine ▁defend ▁itself'), ('CTA-action', '▁Close ▁the ▁sky ▁now')]
[('DE-subject', '▁Russian'), ('DE-subject', '▁Russian'), ('CTA-subject', 'In'), ('CTA-act

## Average Metrics for Multi-Label Classification

In [33]:
metrics_df = pd.DataFrame(all_fold_metrics)
print(metrics_df)

   Fold  Accuracy  Precision    Recall  F1-score
0     1  0.707006   0.763158  0.917722  0.833333
1     2  0.724359   0.770492  0.892405  0.826979
2     3  0.757962   0.810056  0.917722  0.860534
3     4  0.782051   0.818182  0.917197  0.864865
4     5  0.737179   0.790323  0.936306  0.857143
5     6  0.735484   0.789773  0.879747  0.832335
6     7  0.757962   0.803191  0.955696  0.872832
7     8  0.796178   0.823864  0.917722  0.868263
8     9  0.724359   0.775956  0.910256  0.837758
9    10  0.788462   0.815476  0.878205  0.845679


In [34]:
average_metrics = metrics_df.drop(columns="Fold").mean()
print("Average Metrics (Multi-Label):")
average_metrics

Average Metrics (Multi-Label):


Unnamed: 0,0
Accuracy,0.7511
Precision,0.796047
Recall,0.912298
F1-score,0.849972


## Average Metrics for Binary Classification

In [35]:
metrics_df_binary = pd.DataFrame(all_fold_metrics_binary)
print(metrics_df_binary)

   Fold  Accuracy  Precision    Recall  F1-score
0     1  0.810127   0.617021  0.707317  0.659091
1     2  0.816456   0.650000  0.634146  0.641975
2     3  0.829114   0.684211  0.634146  0.658228
3     4  0.829114   0.642857  0.692308  0.666667
4     5  0.822785   0.641026  0.641026  0.641026
5     6  0.816456   0.641026  0.625000  0.632911
6     7  0.892405   0.794872  0.775000  0.784810
7     8  0.854430   0.742857  0.650000  0.693333
8     9  0.821656   0.657143  0.589744  0.621622
9    10  0.840764   0.675000  0.692308  0.683544


In [36]:
average_metrics_binary = metrics_df_binary.drop(columns="Fold").mean()
print("Average Metrics (Binary):")
average_metrics_binary

Average Metrics (Binary):


Unnamed: 0,0
Accuracy,0.833331
Precision,0.674601
Recall,0.664099
F1-score,0.668321
