In [6]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction
from datasets import Dataset, DatasetDict
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

# Load the CSV file
data_path = 'data_with_text.csv'
data = pd.read_csv(data_path)
# Split the data into EN_CC and EN_UA
data['Type'] = data['Document_ID'].apply(lambda x: 'EN_CC' if x.startswith('EN_CC') else 'EN_UA')

def preprocess_data(data, narrative_type, narratives_list):
    """Preprocess data for a specific narrative type."""
    filtered_data = data[data['Type'] == narrative_type]
    
    # Assign labels (narratives or "Other")
    def assign_label(row):
        for narrative in narratives_list:
            if narrative in row['High_Level_Narratives_List']:
                return narrative
        return "Other"

    filtered_data['Label'] = filtered_data.apply(assign_label, axis=1)
    
    return filtered_data

# Define high-level narratives for CC and UA
cc_narratives = [
    "CC: Amplifying Climate Fears",
    "CC: Climate change is beneficial",
    "CC: Controversy about green technologies",
    "CC: Criticism of climate movement",
    "CC: Criticism of climate policies",
    "CC: Criticism of institutions and authorities",
    "CC: Downplaying climate change",
    "CC: Green policies are geopolitical instruments",
    "CC: Hidden plots by secret schemes of powerful groups",
    "CC: Questioning the measurements and science",
]

ua_narratives = [
    "URW: Amplifying war-related fears",
    "URW: Blaming the war on others rather than the invader",
    "URW: Discrediting Ukraine",
    "URW: Discrediting the West, Diplomacy",
    "URW: Distrust towards Media",
    "URW: Hidden plots by secret schemes of powerful groups",
    "URW: Negative Consequences for the West",
    "URW: Overpraising the West",
    "URW: Praise of Russia",
    "URW: Russia is the Victim",
    "URW: Speculating war outcomes",
]

# Preprocess EN_CC and EN_UA datasets
en_cc_data = preprocess_data(data, "EN_CC", cc_narratives)
en_ua_data = preprocess_data(data, "EN_UA", ua_narratives)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Label'] = filtered_data.apply(assign_label, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Label'] = filtered_data.apply(assign_label, axis=1)


In [7]:
# Map labels to integers for CC and UA
cc_label_to_id = {label: idx for idx, label in enumerate(cc_narratives + ["Other"])}
ua_label_to_id = {label: idx for idx, label in enumerate(ua_narratives + ["Other"])}

cc_id_to_label = {v: k for k, v in cc_label_to_id.items()}
ua_id_to_label = {v: k for k, v in ua_label_to_id.items()}

# Convert string labels to numeric labels
en_cc_data['Label'] = en_cc_data['Label'].map(cc_label_to_id)
en_ua_data['Label'] = en_ua_data['Label'].map(ua_label_to_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  en_cc_data['Label'] = en_cc_data['Label'].map(cc_label_to_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  en_ua_data['Label'] = en_ua_data['Label'].map(ua_label_to_id)


In [8]:
from collections import Counter

def calculate_class_weights(labels):
    label_counts = Counter(labels)
    total_samples = sum(label_counts.values())
    class_weights = {label: total_samples / (len(label_counts) * count) for label, count in label_counts.items()}
    return class_weights

# Calculate class weights for EN_CC
en_cc_labels = en_cc_data['Label'].tolist()
cc_class_weights = calculate_class_weights(en_cc_labels)

# Calculate class weights for EN_UA
en_ua_labels = en_ua_data['Label'].tolist()
ua_class_weights = calculate_class_weights(en_ua_labels)

# Convert class weights to tensors for PyTorch
cc_class_weights_tensor = torch.tensor([cc_class_weights[label] for label in sorted(cc_class_weights)], dtype=torch.float)
ua_class_weights_tensor = torch.tensor([ua_class_weights[label] for label in sorted(ua_class_weights)], dtype=torch.float)

# Define a custom loss function with class weights
def create_loss_function(class_weights_tensor):
    return torch.nn.CrossEntropyLoss(weight=class_weights_tensor)

# Custom loss functions for CC and UA
cc_loss_function = create_loss_function(cc_class_weights_tensor)
ua_loss_function = create_loss_function(ua_class_weights_tensor)


In [9]:
en_cc_data['Label'].value_counts()

10    75
3     27
4     21
2     19
5     15
0      7
1      4
6      4
8      3
9      1
Name: Label, dtype: int64

In [10]:
def handle_rare_classes(data, min_samples=2):
    label_counts = data['Label'].value_counts()
    rare_labels = label_counts[label_counts < min_samples].index
    
    # Duplicate rare class samples to ensure minimum count
    rare_data = data[data['Label'].isin(rare_labels)]
    for _ in range(min_samples - 1):
        data = pd.concat([data, rare_data])
    return data

en_cc_data = handle_rare_classes(en_cc_data)
en_ua_data = handle_rare_classes(en_ua_data)

# Split data into train/test sets
def split_data(df):
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df['Text'], df['Label'], test_size=0.2, random_state=42, stratify=df['Label']
    )
    
    train_data = Dataset.from_pandas(pd.DataFrame({'text': train_texts, 'label': train_labels}))
    test_data = Dataset.from_pandas(pd.DataFrame({'text': test_texts, 'label': test_labels}))
    
    return DatasetDict({"train": train_data, "test": test_data})

en_cc_dataset = split_data(en_cc_data)
en_ua_dataset = split_data(en_ua_data)


In [11]:
# Load RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# Tokenize datasets
en_cc_dataset = en_cc_dataset.map(tokenize_function, batched=True)
en_ua_dataset = en_ua_dataset.map(tokenize_function, batched=True)

def load_model_with_custom_loss(num_labels, loss_function):
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base',
        num_labels=num_labels
    )
    model.classifier.loss_fn = loss_function
    return model

cc_model = load_model_with_custom_loss(len(cc_narratives) + 1, cc_loss_function)  # +1 for "Other"
ua_model = load_model_with_custom_loss(len(ua_narratives) + 1, ua_loss_function)  # +1 for "Other"

Map:   0%|          | 0/141 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    f1 = f1_score(p.label_ids, preds, average="weighted")
    return {"f1": f1}

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,


)

# Train EN_CC model
cc_trainer = Trainer(
    model=cc_model,
    args=training_args,
    train_dataset=en_cc_dataset["train"],
    eval_dataset=en_cc_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  cc_trainer = Trainer(


In [13]:
cc_trainer.train()

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.8192353248596191, 'eval_f1': 0.2450980392156863, 'eval_runtime': 0.9218, 'eval_samples_per_second': 39.055, 'eval_steps_per_second': 5.424, 'epoch': 1.0}
{'loss': 2.1272, 'grad_norm': 4.997844219207764, 'learning_rate': 4.4444444444444447e-05, 'epoch': 1.11}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.770568609237671, 'eval_f1': 0.2450980392156863, 'eval_runtime': 0.9138, 'eval_samples_per_second': 39.395, 'eval_steps_per_second': 5.471, 'epoch': 2.0}
{'loss': 1.7904, 'grad_norm': 4.2739949226379395, 'learning_rate': 3.888888888888889e-05, 'epoch': 2.22}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.5790925025939941, 'eval_f1': 0.2450980392156863, 'eval_runtime': 0.9161, 'eval_samples_per_second': 39.296, 'eval_steps_per_second': 5.458, 'epoch': 3.0}
{'loss': 1.6059, 'grad_norm': 6.627928733825684, 'learning_rate': 3.3333333333333335e-05, 'epoch': 3.33}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.4174120426177979, 'eval_f1': 0.4437669376693767, 'eval_runtime': 0.915, 'eval_samples_per_second': 39.345, 'eval_steps_per_second': 5.465, 'epoch': 4.0}
{'loss': 1.4024, 'grad_norm': 9.865017890930176, 'learning_rate': 2.777777777777778e-05, 'epoch': 4.44}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.3948510885238647, 'eval_f1': 0.4839244839244839, 'eval_runtime': 0.9223, 'eval_samples_per_second': 39.034, 'eval_steps_per_second': 5.421, 'epoch': 5.0}
{'loss': 0.9872, 'grad_norm': 10.181580543518066, 'learning_rate': 2.2222222222222223e-05, 'epoch': 5.56}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.3551788330078125, 'eval_f1': 0.49409722222222224, 'eval_runtime': 0.9221, 'eval_samples_per_second': 39.043, 'eval_steps_per_second': 5.423, 'epoch': 6.0}
{'loss': 0.9009, 'grad_norm': 6.556413173675537, 'learning_rate': 1.6666666666666667e-05, 'epoch': 6.67}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.3347971439361572, 'eval_f1': 0.628747795414462, 'eval_runtime': 0.9059, 'eval_samples_per_second': 39.739, 'eval_steps_per_second': 5.519, 'epoch': 7.0}
{'loss': 0.6669, 'grad_norm': 4.660386085510254, 'learning_rate': 1.1111111111111112e-05, 'epoch': 7.78}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.2475507259368896, 'eval_f1': 0.5161290322580645, 'eval_runtime': 0.9174, 'eval_samples_per_second': 39.239, 'eval_steps_per_second': 5.45, 'epoch': 8.0}
{'loss': 0.5671, 'grad_norm': 6.02388858795166, 'learning_rate': 5.555555555555556e-06, 'epoch': 8.89}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.295204758644104, 'eval_f1': 0.5309283309283309, 'eval_runtime': 0.9124, 'eval_samples_per_second': 39.454, 'eval_steps_per_second': 5.48, 'epoch': 9.0}
{'loss': 0.4997, 'grad_norm': 4.068960189819336, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.232291579246521, 'eval_f1': 0.5859365849087544, 'eval_runtime': 0.9382, 'eval_samples_per_second': 38.372, 'eval_steps_per_second': 5.329, 'epoch': 10.0}
{'train_runtime': 459.261, 'train_samples_per_second': 3.07, 'train_steps_per_second': 0.196, 'train_loss': 1.1719401677449544, 'epoch': 10.0}


TrainOutput(global_step=90, training_loss=1.1719401677449544, metrics={'train_runtime': 459.261, 'train_samples_per_second': 3.07, 'train_steps_per_second': 0.196, 'total_flos': 371016566507520.0, 'train_loss': 1.1719401677449544, 'epoch': 10.0})