In [None]:
#importing libraries
import pandas as pd
import numpy as np
import os
from google.colab import drive
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
# !pip install datasets
from datasets import Dataset, DatasetDict
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import numpy as np
from sklearn.metrics import classification_report
from collections import Counter



In [None]:
file_path = "Merged/concatenated_data.txt"
df = pd.read_csv(file_path, sep="\t", header=None, names=["Document_ID", "High_Level_Narratives", "Sub_Narratives"])

In [3]:
# Group by Document_ID and create lists of narratives
df = df.groupby("Document_ID").agg({
    "High_Level_Narratives": lambda x: list(set(";".join(x).split(";"))),
    "Sub_Narratives": lambda x: list(set(";".join(x).split(";")))
}).reset_index()

df.rename(columns={
    "High_Level_Narratives": "High_Level_Narratives_List",
    "Sub_Narratives": "Sub_Narratives_List"
}, inplace=True)


In [None]:

documents_folder = 'Merged/subtask-2-documents'


def load_documents(folder_path):
    documents = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):  # Ensure only text files are read
            doc_id = filename  # Extract Document_ID from filename
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                documents[doc_id] = file.read().strip()
    return documents

# Load document texts into a dictionary
document_texts = load_documents(documents_folder)

# Map document texts to the DataFrame
df['Text'] = df['Document_ID'].map(document_texts)

# Check if all documents are successfully mapped
print(f"Number of missing documents: {df['Text'].isnull().sum()}")
df


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of missing documents: 0


Unnamed: 0,Document_ID,High_Level_Narratives_List,Sub_Narratives_List,Text
0,EN_CC_100000.txt,[CC: Hidden plots by secret schemes of powerfu...,[CC: Criticism of institutions and authorities...,Pentagon plans to serve LAB-GROWN MEAT to troo...
1,EN_CC_100002.txt,[CC: Hidden plots by secret schemes of powerfu...,[CC: Criticism of institutions and authorities...,Oxford Residents Mount Resistance Against the ...
2,EN_CC_100003.txt,"[CC: Criticism of climate movement, CC: Critic...",[CC: Criticism of climate movement: Ad hominem...,"Fonda Heads To Canada For Oil Sands Protest, M..."
3,EN_CC_100004.txt,"[CC: Criticism of climate movement, CC: Contro...",[CC: Controversy about green technologies: Oth...,A Tesla Owner Just Exposed A Sick Secret About...
4,EN_CC_100005.txt,[CC: Criticism of climate movement],[CC: Criticism of climate movement: Climate mo...,Climate Crazies Fail in Attempt to Vandalize A...
...,...,...,...,...
435,EN_UA_DEV_216.txt,"[URW: Negative Consequences for the West, URW:...","[URW: Discrediting the West, Diplomacy: The We...","EU 'biggest loser' in Ukraine conflicts, Hunga..."
436,EN_UA_DEV_22.txt,"[URW: Discrediting the West, Diplomacy, URW: B...","[URW: Discrediting the West, Diplomacy: The EU...",European Parliament members clash over support...
437,EN_UA_DEV_23.txt,"[URW: Distrust towards Media, URW: Praise of R...",[URW: Amplifying war-related fears: By continu...,What is the current trajectory of the evil emp...
438,EN_UA_DEV_24.txt,"[URW: Negative Consequences for the West, URW:...","[URW: Discrediting the West, Diplomacy: Diplom...",Europe ‘Shot Itself in the Lungs’ With Sanctio...


In [5]:
unique_cc = sorted(set(narrative for narratives in df['High_Level_Narratives_List'] for narrative in narratives if narrative.startswith("CC:")))
unique_urw = sorted(set(narrative for narratives in df['High_Level_Narratives_List'] for narrative in narratives if narrative.startswith("URW:")))
label_space = ["Other"] + unique_cc + unique_urw

In [6]:
label_space

['Other',
 'CC: Amplifying Climate Fears',
 'CC: Climate change is beneficial',
 'CC: Controversy about green technologies',
 'CC: Criticism of climate movement',
 'CC: Criticism of climate policies',
 'CC: Criticism of institutions and authorities',
 'CC: Downplaying climate change',
 'CC: Green policies are geopolitical instruments',
 'CC: Hidden plots by secret schemes of powerful groups',
 'CC: Questioning the measurements and science',
 'URW: Amplifying war-related fears',
 'URW: Blaming the war on others rather than the invader',
 'URW: Discrediting Ukraine',
 'URW: Discrediting the West, Diplomacy',
 'URW: Distrust towards Media',
 'URW: Hidden plots by secret schemes of powerful groups',
 'URW: Negative Consequences for the West',
 'URW: Overpraising the West',
 'URW: Praise of Russia',
 'URW: Russia is the Victim',
 'URW: Speculating war outcomes']

In [7]:
len(label_space)

22

In [8]:
def create_multi_label_targets(row):
    target = [0] * len(label_space)
    if "Other" in row['High_Level_Narratives_List']:
        target[0] = 1
    for narrative in row['High_Level_Narratives_List']:
        if narrative in label_space:
            target[label_space.index(narrative)] = 1
    return target


In [9]:
df['multi_label_targets'] = df.apply(create_multi_label_targets, axis=1)

In [10]:
df

Unnamed: 0,Document_ID,High_Level_Narratives_List,Sub_Narratives_List,Text,multi_label_targets
0,EN_CC_100000.txt,[CC: Hidden plots by secret schemes of powerfu...,[CC: Criticism of institutions and authorities...,Pentagon plans to serve LAB-GROWN MEAT to troo...,"[0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,EN_CC_100002.txt,[CC: Hidden plots by secret schemes of powerfu...,[CC: Criticism of institutions and authorities...,Oxford Residents Mount Resistance Against the ...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ..."
2,EN_CC_100003.txt,"[CC: Criticism of climate movement, CC: Critic...",[CC: Criticism of climate movement: Ad hominem...,"Fonda Heads To Canada For Oil Sands Protest, M...","[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,EN_CC_100004.txt,"[CC: Criticism of climate movement, CC: Contro...",[CC: Controversy about green technologies: Oth...,A Tesla Owner Just Exposed A Sick Secret About...,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,EN_CC_100005.txt,[CC: Criticism of climate movement],[CC: Criticism of climate movement: Climate mo...,Climate Crazies Fail in Attempt to Vandalize A...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
435,EN_UA_DEV_216.txt,"[URW: Negative Consequences for the West, URW:...","[URW: Discrediting the West, Diplomacy: The We...","EU 'biggest loser' in Ukraine conflicts, Hunga...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
436,EN_UA_DEV_22.txt,"[URW: Discrediting the West, Diplomacy, URW: B...","[URW: Discrediting the West, Diplomacy: The EU...",European Parliament members clash over support...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."
437,EN_UA_DEV_23.txt,"[URW: Distrust towards Media, URW: Praise of R...",[URW: Amplifying war-related fears: By continu...,What is the current trajectory of the evil emp...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
438,EN_UA_DEV_24.txt,"[URW: Negative Consequences for the West, URW:...","[URW: Discrediting the West, Diplomacy: Diplom...",Europe ‘Shot Itself in the Lungs’ With Sanctio...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [11]:
df['High_Level_Narratives_List'][0]

['CC: Hidden plots by secret schemes of powerful groups',
 'CC: Controversy about green technologies',
 'CC: Criticism of institutions and authorities']

In [12]:
df['multi_label_targets'][0]

[0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [13]:
len(df['multi_label_targets'][0])

22

In [14]:
!pip install scikit-multilearn



In [15]:
from skmultilearn.model_selection import iterative_train_test_split
labels = np.array(df['multi_label_targets'].tolist(), dtype=np.float32)

# Convert indices to NumPy array for splitting
indices = np.arange(len(df))

# Perform iterative stratified split on indices
train_indices, train_labels, test_indices, test_labels = iterative_train_test_split(indices.reshape(-1, 1), labels, test_size=0.2)


train_indices = train_indices.ravel()
test_indices = test_indices.ravel()
train_texts = df.iloc[train_indices]["Text"].tolist()
test_texts = df.iloc[test_indices]["Text"].tolist()


In [16]:
import pandas as pd
import numpy as np
import random

def handle_rare_classes(data_df, labels, min_samples=15, max_duplication=3):
    """Handles rare classes and returns updated DataFrame and labels."""

    label_sums = np.sum(labels, axis=0)
    rare_classes = np.where(label_sums < min_samples)[0]

    if len(rare_classes) == 0:
        print("No rare classes found. Skipping balancing.")
        return data_df, labels

    print(f"Rare classes found: {len(rare_classes)}. Applying undersampling & controlled oversampling...")

    # 1. Undersample Majority Classes (using DataFrame operations)
    label_counts = np.sum(labels, axis=0)
    majority_classes = np.where(label_counts > min_samples * 5)[0]

    if len(majority_classes) > 0:
        print(f"Undersampling {len(majority_classes)} majority classes...")
        drop_indices = []
        for maj_class in majority_classes:
            maj_indices = data_df[labels[:, maj_class] == 1].index.tolist()
            keep_size = min(len(maj_indices), min_samples * 5)
            drop_indices.extend(random.sample(maj_indices, len(maj_indices) - keep_size))

        data_df = data_df.drop(drop_indices).reset_index(drop=True)
        labels = np.delete(labels, drop_indices, axis=0)

    # 2. Controlled Oversampling for Rare Classes
    new_rows = []  # List to store new rows (including 'Text' and 'Document_ID')
    new_labels = []

    for rare_class in rare_classes:
        rare_indices = data_df[labels[:, rare_class] == 1].index.tolist()

        if len(rare_indices) == 0:
            continue

        num_to_add = min_samples - len(rare_indices)
        num_to_add = min(num_to_add, max_duplication * len(rare_indices))

        for _ in range(num_to_add):
            idx = random.choice(rare_indices)
            original_row = data_df.iloc[idx].copy() # Get a copy of the row
            new_text = original_row["Text"]

            if len(new_text) > 20:
                words = new_text.split()
                if len(words) > 5:
                    random.shuffle(words[:5])
                new_text = " ".join(words)

            original_row["Text"] = new_text #Update text with augmentation
            new_rows.append(original_row)
            new_labels.append(labels[idx])

    if new_rows:
        new_data_df = pd.DataFrame(new_rows)
        data_df = pd.concat([data_df, new_data_df], ignore_index=True)
        labels = np.vstack([labels, np.array(new_labels)])

    print(f"Final dataset size: {len(data_df)} samples")
    return data_df, labels.astype(np.float32)

In [17]:
import numpy as np

def print_label_distribution(labels):
    """Prints the count of positive instances for each class."""
    label_counts = np.sum(labels, axis=0)
    print("Label Counts:", label_counts)
    print("Min:", label_counts.min(), "Max:", label_counts.max())


train_df = df.iloc[train_indices].copy().reset_index(drop=True)
train_texts = train_df["Text"].tolist()
train_labels = np.array(train_labels)

# Check class distribution after applying handle_rare_classes
print("Before balancing:")
print_label_distribution(np.array(train_labels))

train_df, train_labels = handle_rare_classes(train_df, train_labels, min_samples=15, max_duplication=3)
train_document_ids = train_df["Document_ID"].tolist()
print(train_document_ids)
print("After balancing:")
print_label_distribution(np.array(train_labels))


Before balancing:
Label Counts: [144.   6.   4.  18.  37.  33.  56.  16.   5.  31.  17.  32.  30.  32.
  54.  11.  10.  13.   9.  13.  14.  18.]
Min: 4.0 Max: 144.0
Rare classes found: 9. Applying undersampling & controlled oversampling...
Undersampling 1 majority classes...
Final dataset size: 331 samples
['EN_CC_100002.txt', 'EN_CC_100004.txt', 'EN_CC_100005.txt', 'EN_CC_100007.txt', 'EN_CC_100011.txt', 'EN_CC_100021.txt', 'EN_CC_100024.txt', 'EN_CC_100030.txt', 'EN_CC_100034.txt', 'EN_CC_100037.txt', 'EN_CC_100042.txt', 'EN_CC_100044.txt', 'EN_CC_100047.txt', 'EN_CC_100054.txt', 'EN_CC_100066.txt', 'EN_CC_100069.txt', 'EN_CC_100076.txt', 'EN_CC_100091.txt', 'EN_CC_100095.txt', 'EN_CC_100106.txt', 'EN_CC_100124.txt', 'EN_CC_100136.txt', 'EN_CC_100139.txt', 'EN_CC_100146.txt', 'EN_CC_100147.txt', 'EN_CC_100172.txt', 'EN_CC_100213.txt', 'EN_CC_100232.txt', 'EN_CC_200007.txt', 'EN_CC_200009.txt', 'EN_CC_200015.txt', 'EN_CC_200016.txt', 'EN_CC_200022.txt', 'EN_CC_200030.txt', 'EN_CC_2000

In [18]:


print(f"Length of train_df: {len(train_df)}")
print(f"Length of train_labels: {len(train_labels)}")

train_texts = train_df["Text"].tolist()
train_document_ids = train_df["Document_ID"].tolist()
print(f"Length of train_texts: {len(train_texts)}")
print(f"Length of train_document_ids: {len(train_document_ids)}")

Length of train_df: 331
Length of train_labels: 331
Length of train_texts: 331
Length of train_document_ids: 331


In [20]:
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_dict({
    "text": train_texts,
    "labels": [np.array(lbl, dtype=np.float32).tolist() for lbl in train_labels],
    "Document_ID": train_document_ids
})

test_dataset = Dataset.from_dict({
    "text": test_texts,
    "labels": [np.array(lbl, dtype=np.float32).tolist() for lbl in test_labels],
    "Document_ID": df["Document_ID"].iloc[test_indices].tolist()
})


In [21]:
datasets = DatasetDict({"train": train_dataset, "test": test_dataset})

In [22]:
datasets['train'][0]

 'labels': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'Document_ID': 'EN_CC_100002.txt'}

In [23]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
base_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_space),  problem_type="multi_label_classification")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/331 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [26]:
def find_best_threshold(preds, labels):
    """Find the best threshold for F1-score optimization."""
    best_f1 = 0
    best_threshold = 0.1  # Default threshold
    thresholds = np.arange(0.05, 0.3, 0.01)  # Try thresholds from 0.1 to 0.9

    for threshold in thresholds:
        binarized_preds = (preds > threshold).astype(int)
        f1 = f1_score(labels, binarized_preds, average="weighted", zero_division=0)

        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold

    return best_threshold



def compute_metrics(eval_pred):
    logits, labels = eval_pred
    sigmoid_preds = torch.sigmoid(torch.tensor(logits)).numpy()
    best_threshold = find_best_threshold(sigmoid_preds, labels)
    predictions = (sigmoid_preds > best_threshold).astype(int)

    adjusted_preds = []
    for i, pred in enumerate(predictions):
        doc_id = test_dataset["Document_ID"][i]  # Access metadata

        # Start with the original predictions
        adjusted_pred = pred.copy()

        if doc_id.startswith("EN_CC"):
            # Remove URW predictions
            for urw in unique_urw:
                if urw in label_space:
                    adjusted_pred[label_space.index(urw)] = 0

            # If valid CC labels are predicted, remove "Other"
            if any(adjusted_pred[label_space.index(cc)] == 1 for cc in unique_cc if cc in label_space):
                adjusted_pred[label_space.index("Other")] = 0

            # Fallback to "Other" if no valid CC labels are predicted
            if not any(adjusted_pred[label_space.index(cc)] == 1 for cc in unique_cc if cc in label_space):
                adjusted_pred[label_space.index("Other")] = 1

        elif doc_id.startswith("EN_UA") or doc_id.startswith("EN_UA_DEV"):
            # Remove CC predictions
            for cc in unique_cc:
                if cc in label_space:
                    adjusted_pred[label_space.index(cc)] = 0

            # If valid URW labels are predicted, remove "Other"
            if any(adjusted_pred[label_space.index(urw)] == 1 for urw in unique_urw if urw in label_space):
                adjusted_pred[label_space.index("Other")] = 0

            # Fallback to "Other" if no valid URW labels are predicted
            if not any(adjusted_pred[label_space.index(urw)] == 1 for urw in unique_urw if urw in label_space):
                adjusted_pred[label_space.index("Other")] = 1

        adjusted_preds.append(adjusted_pred)

    adjusted_preds = np.array(adjusted_preds)



    # Compute metrics for multi-label classification
    precision = precision_score(labels, adjusted_preds, average="samples", zero_division=0)
    recall = recall_score(labels, adjusted_preds, average="samples", zero_division=0)
    f1 = f1_score(labels, adjusted_preds, average="samples", zero_division=0)
    return {"precision": precision, "recall": recall, "f1": f1}

In [27]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [28]:
# Ensure Document_ID is treated as a string
df['Document_ID'] = df['Document_ID'].astype(str)


In [29]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=30,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1,
    seed=42,
    weight_decay=0.01,
    fp16=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:


trainer = Trainer (
    model=base_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer (


In [32]:
trainer.train()


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.263924,0.132121,0.6,0.209994
2,No log,0.234151,0.359339,0.453704,0.371283
3,0.321000,0.206547,0.344436,0.636111,0.415931
4,0.321000,0.193753,0.500317,0.635185,0.525634
5,0.217800,0.180587,0.559228,0.740741,0.597848
6,0.217800,0.170723,0.506883,0.712963,0.559444
7,0.217800,0.173138,0.436539,0.687963,0.501254
8,0.165200,0.167786,0.45683,0.70463,0.521839
9,0.165200,0.158274,0.568126,0.760185,0.620522
10,0.135100,0.167567,0.474603,0.727778,0.542135


TrainOutput(global_step=1260, training_loss=0.10805722020921253, metrics={'train_runtime': 1490.9519, 'train_samples_per_second': 6.66, 'train_steps_per_second': 0.845, 'total_flos': 2613161946009600.0, 'train_loss': 0.10805722020921253, 'epoch': 30.0})

In [33]:
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.15624403953552246, 'eval_precision': 0.6032671957671958, 'eval_recall': 0.7342592592592592, 'eval_f1': 0.6378964245630913, 'eval_runtime': 0.9253, 'eval_samples_per_second': 97.261, 'eval_steps_per_second': 12.968, 'epoch': 30.0}


In [34]:
raw_predictions = trainer.predict(tokenized_datasets["test"])

In [35]:
raw_predictions

PredictionOutput(predictions=array([[-2.3808594 , -4.578125  , -4.796875  , ..., -4.8632812 ,
        -4.9179688 , -4.8476562 ],
       [-2.9648438 , -4.3789062 , -4.8398438 , ..., -4.7773438 ,
        -4.9648438 , -4.8945312 ],
       [-2.8964844 , -4.6289062 , -4.7070312 , ..., -4.265625  ,
        -4.8515625 , -4.5898438 ],
       ...,
       [-1.7197266 , -5.421875  , -5.6835938 , ..., -3.5957031 ,
        -3.8886719 , -2.5       ],
       [-3.4121094 , -4.1796875 , -4.4726562 , ..., -2.1640625 ,
        -3.0761719 , -0.98876953],
       [-3.7265625 , -3.046875  , -3.1074219 , ..., -0.7216797 ,
        -0.26879883, -0.94970703]], dtype=float32), label_ids=array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]], dtype=float32), metrics={'test_loss': 0.15624403953552246, 'test_precision': 0.603267195767

In [36]:
sigmoid_preds = torch.sigmoid(torch.tensor(raw_predictions.predictions)).numpy()

In [37]:
def find_best_threshold(preds, labels):
    """Find the best threshold for F1-score optimization."""
    best_f1 = 0
    best_threshold = 0.5  # Default threshold
    thresholds = np.arange(0.1, 0.9, 0.05)  # Try thresholds from 0.1 to 0.9

    for threshold in thresholds:
        binarized_preds = (preds > threshold).astype(int)
        f1 = f1_score(labels, binarized_preds, average="weighted", zero_division=0)

        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold

    return best_threshold

In [38]:
labels = np.array(test_labels)  # Convert test labels to NumPy array
best_threshold = find_best_threshold(sigmoid_preds, labels)

In [39]:
best_threshold

0.20000000000000004

In [40]:
binary_predictions = (sigmoid_preds > best_threshold).astype(int)


In [41]:
adjusted_preds = []

In [42]:
def decode_labels(binary_vector, label_space):
    return [label for i, label in enumerate(label_space) if binary_vector[i] == 1]

In [57]:
# Step 1: Adjust predictions based on compute_metrics logic
adjusted_preds = []

for i, pred in enumerate(binary_predictions):
    doc_id = test_dataset["Document_ID"][i]  # Access Document_ID
    adjusted_pred = pred.copy()

    # Apply the same adjustment logic as compute_metrics
    if doc_id.startswith("EN_CC"):
            # Remove URW predictions
            for urw in unique_urw:
                if urw in label_space:
                    adjusted_pred[label_space.index(urw)] = 0

            # If valid CC labels are predicted, remove "Other"
            if any(adjusted_pred[label_space.index(cc)] == 1 for cc in unique_cc if cc in label_space):
                adjusted_pred[label_space.index("Other")] = 0

            # Fallback to "Other" if no valid CC labels are predicted
            if not any(adjusted_pred[label_space.index(cc)] == 1 for cc in unique_cc if cc in label_space):
                adjusted_pred[label_space.index("Other")] = 1

    elif doc_id.startswith("EN_UA") or doc_id.startswith("EN_UA_DEV"):
            # Remove CC predictions
            for cc in unique_cc:
                if cc in label_space:
                    adjusted_pred[label_space.index(cc)] = 0

            # If valid URW labels are predicted, remove "Other"
            if any(adjusted_pred[label_space.index(urw)] == 1 for urw in unique_urw if urw in label_space):
                adjusted_pred[label_space.index("Other")] = 0

            # Fallback to "Other" if no valid URW labels are predicted
            if not any(adjusted_pred[label_space.index(urw)] == 1 for urw in unique_urw if urw in label_space):
                adjusted_pred[label_space.index("Other")] = 1

    adjusted_preds.append(adjusted_pred)

# Step 2: Decode adjusted predictions and ground truths
decoded_preds = [decode_labels(pred, label_space) for pred in adjusted_preds]
decoded_labels = [decode_labels(label, label_space) for label in labels]

# Step 3: Create DataFrame with Document_ID, adjusted predicted labels, and ground truths
result_df = pd.DataFrame({
    "Document_ID": test_dataset["Document_ID"],
    "Adjusted_Predicted_Labels": decoded_preds,
    "Ground_Truths": decoded_labels
})

# Step 4: Identify incorrect predictions
incorrect_predictions = []

for idx, row in result_df.iterrows():
    doc_id = row["Document_ID"]
    predicted = row["Adjusted_Predicted_Labels"]
    ground_truth = row["Ground_Truths"]

    # Check if the document is EN_CC but predicts URW
    if doc_id.startswith("EN_CC") and any(label.startswith("URW:") for label in predicted):
        incorrect_predictions.append((doc_id, "Predicted URW for CC"))

    # Check if the document is EN_UA but predicts CC
    elif doc_id.startswith("EN_UA") and any(label.startswith("CC:") for label in predicted):
        incorrect_predictions.append((doc_id, "Predicted CC for URW"))

    # Check if both Other and CC/URW are predicted
    if "Other" in predicted and (any(label.startswith("CC:") for label in predicted) or any(label.startswith("URW:") for label in predicted)):
        incorrect_predictions.append((doc_id, "Predicted both Other and CC/URW"))

# Step 5: Print results
print("Incorrect Predictions:")
for doc_id, issue in incorrect_predictions:
    print(f"Document_ID: {doc_id}, Issue: {issue}")



Incorrect Predictions:


In [44]:
result_df

Unnamed: 0,Document_ID,Adjusted_Predicted_Labels,Ground_Truths
0,EN_CC_100000.txt,"[CC: Criticism of climate movement, CC: Critic...","[CC: Controversy about green technologies, CC:..."
1,EN_CC_100003.txt,"[CC: Criticism of climate movement, CC: Critic...","[CC: Criticism of climate movement, CC: Critic..."
2,EN_CC_100008.txt,"[CC: Criticism of climate movement, CC: Critic...",[Other]
3,EN_CC_100010.txt,[Other],[Other]
4,EN_CC_100012.txt,"[CC: Criticism of climate movement, CC: Critic...",[CC: Criticism of institutions and authorities...
...,...,...,...
85,EN_UA_300076.txt,[Other],[URW: Blaming the war on others rather than th...
86,EN_UA_300079.txt,[URW: Blaming the war on others rather than th...,"[URW: Discrediting Ukraine, URW: Discrediting ..."
87,EN_UA_300080.txt,"[URW: Discrediting the West, Diplomacy]","[URW: Discrediting Ukraine, URW: Praise of Rus..."
88,EN_UA_300090.txt,[URW: Blaming the war on others rather than th...,"[URW: Discrediting the West, Diplomacy, URW: P..."


In [55]:
!zip -r results.zip ./results


  adding: results/ (stored 0%)
  adding: results/checkpoint-504/ (stored 0%)
  adding: results/checkpoint-504/trainer_state.json (deflated 75%)
  adding: results/checkpoint-504/vocab.json (deflated 59%)
  adding: results/checkpoint-504/tokenizer_config.json (deflated 75%)
  adding: results/checkpoint-504/scheduler.pt (deflated 56%)
  adding: results/checkpoint-504/model.safetensors (deflated 12%)
  adding: results/checkpoint-504/merges.txt (deflated 53%)
  adding: results/checkpoint-504/special_tokens_map.json (deflated 52%)
  adding: results/checkpoint-504/training_args.bin (deflated 52%)
  adding: results/checkpoint-504/optimizer.pt (deflated 28%)
  adding: results/checkpoint-504/config.json (deflated 63%)
  adding: results/checkpoint-504/rng_state.pth (deflated 25%)
  adding: results/checkpoint-504/tokenizer.json (deflated 82%)


In [56]:
from google.colab import files
files.download("results.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>