In [1]:
#importing libraries
import pandas as pd
import numpy as np
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
!pip install datasets
from datasets import Dataset, DatasetDict
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction

  from .autonotebook import tqdm as notebook_tqdm



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
file_path = "data/EN/subtask-2-annotations.txt"
df = pd.read_csv(file_path, sep="\t", header=None, names=["Document_ID", "High_Level_Narratives", "Sub_Narratives"])

In [3]:
df

Unnamed: 0,Document_ID,High_Level_Narratives,Sub_Narratives
0,EN_CC_100013.txt,CC: Criticism of climate movement,CC: Criticism of climate movement: Ad hominem ...
1,EN_UA_300009.txt,Other,Other
2,EN_UA_300017.txt,Other,Other
3,EN_CC_100021.txt,Other,Other
4,EN_UA_300041.txt,Other,Other
...,...,...,...
394,EN_CC_200022.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...
395,EN_CC_100028.txt,Other,Other
396,EN_CC_300010.txt,CC: Amplifying Climate Fears,CC: Amplifying Climate Fears: Other
397,EN_UA_013257.txt,URW: Russia is the Victim;URW: Blaming the war...,URW: Russia is the Victim: Russia actions in U...


In [4]:
# Group by Document_ID and create lists of narratives
df = df.groupby("Document_ID").agg({
    "High_Level_Narratives": lambda x: list(set(";".join(x).split(";"))),
    "Sub_Narratives": lambda x: list(set(";".join(x).split(";")))
}).reset_index()

df.rename(columns={
    "High_Level_Narratives": "High_Level_Narratives_List",
    "Sub_Narratives": "Sub_Narratives_List"
}, inplace=True)


In [5]:
documents_folder = 'data/EN/raw-documents'


def load_documents(folder_path):
    documents = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):  # Ensure only text files are read
            doc_id = filename  # Extract Document_ID from filename
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                documents[doc_id] = file.read().strip()
    return documents

# Load document texts into a dictionary
document_texts = load_documents(documents_folder)

# Map document texts to the DataFrame
df['Text'] = df['Document_ID'].map(document_texts)

# Check if all documents are successfully mapped
print(f"Number of missing documents: {df['Text'].isnull().sum()}")
df


Number of missing documents: 0


Unnamed: 0,Document_ID,High_Level_Narratives_List,Sub_Narratives_List,Text
0,EN_CC_100000.txt,[CC: Criticism of institutions and authorities...,[CC: Controversy about green technologies: Oth...,Pentagon plans to serve LAB-GROWN MEAT to troo...
1,EN_CC_100002.txt,[CC: Criticism of institutions and authorities...,[CC: Hidden plots by secret schemes of powerfu...,Oxford Residents Mount Resistance Against the ...
2,EN_CC_100003.txt,[CC: Criticism of institutions and authorities...,[CC: Criticism of climate movement: Ad hominem...,"Fonda Heads To Canada For Oil Sands Protest, M..."
3,EN_CC_100004.txt,[CC: Criticism of institutions and authorities...,[CC: Criticism of institutions and authorities...,A Tesla Owner Just Exposed A Sick Secret About...
4,EN_CC_100005.txt,[CC: Criticism of climate movement],"[CC: Criticism of climate movement: Other, CC:...",Climate Crazies Fail in Attempt to Vandalize A...
...,...,...,...,...
394,EN_UA_DEV_100028.txt,[URW: Negative Consequences for the West],[URW: Negative Consequences for the West: Othe...,European gas prices surge 20% as Russia's late...
395,EN_UA_DEV_216.txt,"[URW: Discrediting the West, Diplomacy, URW: N...","[URW: Discrediting the West, Diplomacy: The EU...","EU 'biggest loser' in Ukraine conflicts, Hunga..."
396,EN_UA_DEV_23.txt,"[URW: Amplifying war-related fears, URW: Distr...",[URW: Amplifying war-related fears: By continu...,What is the current trajectory of the evil emp...
397,EN_UA_DEV_24.txt,"[URW: Discrediting the West, Diplomacy, URW: N...",[URW: Negative Consequences for the West: Sanc...,Europe ‘Shot Itself in the Lungs’ With Sanctio...


fixesh kon

In [6]:
unique_urw = sorted(
    set(
        narrative for narratives in df['High_Level_Narratives_List'] 
        for narrative in narratives if narrative.startswith("URW:")
    )
)

print(unique_urw)  # Output the unique URW narratives

['URW: Amplifying war-related fears', 'URW: Blaming the war on others rather than the invader', 'URW: Discrediting Ukraine', 'URW: Discrediting the West, Diplomacy', 'URW: Distrust towards Media', 'URW: Hidden plots by secret schemes of powerful groups', 'URW: Negative Consequences for the West', 'URW: Overpraising the West', 'URW: Praise of Russia', 'URW: Russia is the Victim', 'URW: Speculating war outcomes']


In [7]:
df

Unnamed: 0,Document_ID,High_Level_Narratives_List,Sub_Narratives_List,Text
0,EN_CC_100000.txt,[CC: Criticism of institutions and authorities...,[CC: Controversy about green technologies: Oth...,Pentagon plans to serve LAB-GROWN MEAT to troo...
1,EN_CC_100002.txt,[CC: Criticism of institutions and authorities...,[CC: Hidden plots by secret schemes of powerfu...,Oxford Residents Mount Resistance Against the ...
2,EN_CC_100003.txt,[CC: Criticism of institutions and authorities...,[CC: Criticism of climate movement: Ad hominem...,"Fonda Heads To Canada For Oil Sands Protest, M..."
3,EN_CC_100004.txt,[CC: Criticism of institutions and authorities...,[CC: Criticism of institutions and authorities...,A Tesla Owner Just Exposed A Sick Secret About...
4,EN_CC_100005.txt,[CC: Criticism of climate movement],"[CC: Criticism of climate movement: Other, CC:...",Climate Crazies Fail in Attempt to Vandalize A...
...,...,...,...,...
394,EN_UA_DEV_100028.txt,[URW: Negative Consequences for the West],[URW: Negative Consequences for the West: Othe...,European gas prices surge 20% as Russia's late...
395,EN_UA_DEV_216.txt,"[URW: Discrediting the West, Diplomacy, URW: N...","[URW: Discrediting the West, Diplomacy: The EU...","EU 'biggest loser' in Ukraine conflicts, Hunga..."
396,EN_UA_DEV_23.txt,"[URW: Amplifying war-related fears, URW: Distr...",[URW: Amplifying war-related fears: By continu...,What is the current trajectory of the evil emp...
397,EN_UA_DEV_24.txt,"[URW: Discrediting the West, Diplomacy, URW: N...",[URW: Negative Consequences for the West: Sanc...,Europe ‘Shot Itself in the Lungs’ With Sanctio...


In [8]:
df = df[df["High_Level_Narratives_List"].apply(lambda narratives: any("URW:" in narrative for narrative in narratives))]


In [9]:
def preprocess_multi_label(data, narratives):
    label_vectors = []
    for narratives_list in data['High_Level_Narratives_List']:
        vector = [1 if narrative in narratives_list else 0 for narrative in narratives]
        label_vectors.append(vector)
    data['Labels'] = label_vectors
    return data

In [10]:
df

Unnamed: 0,Document_ID,High_Level_Narratives_List,Sub_Narratives_List,Text
179,EN_UA_000923.txt,"[URW: Speculating war outcomes, URW: Discredit...","[URW: Speculating war outcomes: Other, URW: Di...",Boris Johnson demands Putin ‘steps back from t...
180,EN_UA_001032.txt,"[URW: Discrediting the West, Diplomacy]","[URW: Discrediting the West, Diplomacy: Diplom...",Russia-Ukraine war map: Where are Russian troo...
181,EN_UA_001052.txt,[URW: Blaming the war on others rather than th...,[URW: Blaming the war on others rather than th...,NATO ‘Cautiously Optimistic’ Amid Reports of R...
184,EN_UA_002668.txt,"[URW: Speculating war outcomes, URW: Amplifyin...","[URW: Speculating war outcomes: Other, URW: Sp...",Putin may ABANDON siege of Kyiv and try to bli...
186,EN_UA_003579.txt,[URW: Negative Consequences for the West],[URW: Negative Consequences for the West: Sanc...,International agencies call for urgent aid to ...
...,...,...,...,...
394,EN_UA_DEV_100028.txt,[URW: Negative Consequences for the West],[URW: Negative Consequences for the West: Othe...,European gas prices surge 20% as Russia's late...
395,EN_UA_DEV_216.txt,"[URW: Negative Consequences for the West, URW:...","[URW: Discrediting the West, Diplomacy: The EU...","EU 'biggest loser' in Ukraine conflicts, Hunga..."
396,EN_UA_DEV_23.txt,"[URW: Praise of Russia, URW: Amplifying war-re...",[URW: Distrust towards Media: Western media is...,What is the current trajectory of the evil emp...
397,EN_UA_DEV_24.txt,"[URW: Negative Consequences for the West, URW:...","[URW: Discrediting the West, Diplomacy: Diplom...",Europe ‘Shot Itself in the Lungs’ With Sanctio...


In [11]:
df = preprocess_multi_label(df, urw_narratives)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Labels'] = label_vectors


In [12]:
def calculate_class_weights(labels):
    label_sums = np.sum(labels, axis=0)
    total_samples = len(labels)
    class_weights = [total_samples / (len(label_sums) * count) for count in label_sums]
    return torch.tensor(class_weights, dtype=torch.float)

In [13]:
labels = np.array(df['Labels'].tolist(), dtype=np.float32)
class_weights = calculate_class_weights(labels)

In [14]:
def handle_rare_classes(data, labels, min_samples=15):
    label_sums = np.sum(labels, axis=0)
    rare_classes = np.where(label_sums < min_samples)[0]

    for rare_class in rare_classes:
        rare_indices = [i for i, lbl in enumerate(labels) if lbl[rare_class] == 1]
        if len(rare_indices) > 0:
            duplicate_data = data.iloc[rare_indices]
            data = pd.concat([data] + [duplicate_data] * (min_samples - len(rare_indices)), ignore_index=True)
            labels = np.vstack([labels] + [labels[rare_indices]] * (min_samples - len(rare_indices)))
    return data, labels


In [15]:
df, labels = handle_rare_classes(df, labels)

In [16]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'], labels, test_size=0.2, random_state=42)


In [17]:
train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "labels": train_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "labels": test_labels.tolist()})
datasets = DatasetDict({"train": train_dataset, "test": test_dataset})

In [18]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=labels.shape[1], problem_type="multi_label_classification"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Tokenize the dataset
tokenized_datasets = datasets.map(tokenize_function, batched=True)


Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

In [20]:
# def custom_loss_function(outputs, targets):
#     sigmoid = torch.nn.Sigmoid()
#     bce = torch.nn.BCEWithLogitsLoss(weight=class_weights)
#     return bce(outputs, targets)

In [21]:
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import EvalPrediction

def compute_metrics(p: EvalPrediction):
    # Apply sigmoid activation to model predictions (logits)
    sigmoid_preds = torch.sigmoid(torch.tensor(p.predictions)).numpy()

    # Convert probabilities to binary predictions using a threshold of 0.5
    preds = (sigmoid_preds > 0.5).astype(int)

    # Ground-truth labels
    labels = p.label_ids

    # Calculate weighted precision, recall, and F1-score
    precision = precision_score(labels, preds, average="weighted", zero_division=0)
    recall = recall_score(labels, preds, average="weighted", zero_division=0)
    f1 = f1_score(labels, preds, average="weighted", zero_division=0)

    # Return the computed metrics
    return {"precision": precision, "recall": recall, "f1": f1}


In [22]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [23]:

training_args = TrainingArguments(
    output_dir='./results',                    # Directory to store checkpoints and final model
    num_train_epochs=20,                       # Total number of training epochs
    learning_rate=2e-5,  # Fine-tuning learning rate
    per_device_train_batch_size=4,            # Batch size per device during training
    per_device_eval_batch_size=4,             # Batch size for evaluation
    evaluation_strategy='epoch',               # Evaluate at the end of each epoch
    save_strategy='epoch',                     # Save model at the end of each epoch
    load_best_model_at_end=True,               # Load the best model at the end of training
    metric_for_best_model='f1',                # Use F1 score to evaluate the best model
    greater_is_better=True,                    # Higher F1 is better
    logging_dir='./logs',                      # Directory for storing logs
    logging_steps=100,                         # Log every 100 steps
    save_total_limit=3,                        # Limit the total amount of checkpoints
    seed=42,
    weight_decay=0.01,# Seed for reproducibility
    fp16=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.47538,0.0,0.0,0.0
2,No log,0.472857,0.0,0.0,0.0
3,0.498100,0.406566,0.344609,0.188119,0.211306
4,0.498100,0.369636,0.584263,0.356436,0.407102
5,0.386600,0.317398,0.684818,0.564356,0.597344
6,0.386600,0.296281,0.881738,0.633663,0.706556
7,0.274600,0.274359,0.949245,0.663366,0.746257
8,0.274600,0.270079,0.962046,0.673267,0.755889
9,0.274600,0.258384,0.937106,0.722772,0.799107
10,0.206400,0.245026,0.916942,0.722772,0.791446


TrainOutput(global_step=860, training_loss=0.22525559691495672, metrics={'train_runtime': 335.7608, 'train_samples_per_second': 10.186, 'train_steps_per_second': 2.561, 'total_flos': 899912523018240.0, 'train_loss': 0.22525559691495672, 'epoch': 20.0})

In [26]:
def predict_with_threshold(trainer, dataset, threshold=0.7):
    predictions = trainer.predict(dataset)
    logits = predictions.predictions
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.tensor(logits))
    return (probs > threshold).int()

# Predict and return results
results = predict_with_threshold(trainer, tokenized_datasets["test"])
print("Predicted Labels:", results)

Predicted Labels: tensor([[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
        [1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0],
        [1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0],
        [1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [1, 0, 0, 1, 0, 0, 0, 1, 0, 