In [1]:
#importing libraries
import pandas as pd
import numpy as np
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction


In [2]:
df = pd.read_csv('data_with_text.csv')
df.head()

Unnamed: 0,Document_ID,High_Level_Narratives_List,Sub_Narratives_List,Text
0,EN_CC_100000.txt,"['CC: Controversy about green technologies', '...",['CC: Criticism of institutions and authoritie...,Pentagon plans to serve LAB-GROWN MEAT to troo...
1,EN_CC_100002.txt,['CC: Criticism of institutions and authoritie...,['CC: Criticism of institutions and authoritie...,Oxford Residents Mount Resistance Against the ...
2,EN_CC_100003.txt,"['CC: Criticism of climate movement', 'CC: Cri...",['CC: Criticism of institutions and authoritie...,"Fonda Heads To Canada For Oil Sands Protest, M..."
3,EN_CC_100004.txt,['CC: Criticism of institutions and authoritie...,['CC: Controversy about green technologies: Ot...,A Tesla Owner Just Exposed A Sick Secret About...
4,EN_CC_100005.txt,['CC: Criticism of climate movement'],['CC: Criticism of climate movement: Climate m...,Climate Crazies Fail in Attempt to Vandalize A...


In [3]:
# Convert the 'High_Level_Narratives_List' column from string to list
df['High_Level_Narratives_List'] = df['High_Level_Narratives_List'].apply(lambda x: eval(x) if isinstance(x, str) else x)

# Explode the dataframe to have one row per High Level Narrative
df = df.explode('High_Level_Narratives_List')

# Rename the column for clarity
df = df.rename(columns={"High_Level_Narratives_List": "High_Level_Narrative"})

df.head()


Unnamed: 0,Document_ID,High_Level_Narrative,Sub_Narratives_List,Text
0,EN_CC_100000.txt,CC: Controversy about green technologies,['CC: Criticism of institutions and authoritie...,Pentagon plans to serve LAB-GROWN MEAT to troo...
0,EN_CC_100000.txt,CC: Criticism of institutions and authorities,['CC: Criticism of institutions and authoritie...,Pentagon plans to serve LAB-GROWN MEAT to troo...
0,EN_CC_100000.txt,CC: Hidden plots by secret schemes of powerful...,['CC: Criticism of institutions and authoritie...,Pentagon plans to serve LAB-GROWN MEAT to troo...
1,EN_CC_100002.txt,CC: Criticism of institutions and authorities,['CC: Criticism of institutions and authoritie...,Oxford Residents Mount Resistance Against the ...
1,EN_CC_100002.txt,CC: Hidden plots by secret schemes of powerful...,['CC: Criticism of institutions and authoritie...,Oxford Residents Mount Resistance Against the ...


In [4]:
df = df[df["High_Level_Narrative"].str.startswith("URW")].reset_index(drop=True)
df.head()

Unnamed: 0,Document_ID,High_Level_Narrative,Sub_Narratives_List,Text
0,EN_UA_000923.txt,URW: Speculating war outcomes,"['URW: Discrediting the West, Diplomacy: The E...",Boris Johnson demands Putin ‘steps back from t...
1,EN_UA_000923.txt,"URW: Discrediting the West, Diplomacy","['URW: Discrediting the West, Diplomacy: The E...",Boris Johnson demands Putin ‘steps back from t...
2,EN_UA_001032.txt,"URW: Discrediting the West, Diplomacy","['URW: Discrediting the West, Diplomacy: Diplo...",Russia-Ukraine war map: Where are Russian troo...
3,EN_UA_001052.txt,URW: Blaming the war on others rather than the...,['URW: Blaming the war on others rather than t...,NATO ‘Cautiously Optimistic’ Amid Reports of R...
4,EN_UA_002668.txt,URW: Speculating war outcomes,"['URW: Speculating war outcomes: Other', 'URW:...",Putin may ABANDON siege of Kyiv and try to bli...


In [5]:
print(f'We have {len(df)} URW High Level Narratives after spliting multiple URW High Level Narratives in multiple rows')

We have 264 URW High Level Narratives after spliting multiple URW High Level Narratives in multiple rows


In [6]:
#in here we will add high level narrative to text with a | seperating high level narrative and text
df["Text"] = df["High_Level_Narrative"] + " | " + df["Text"]

df.head()

Unnamed: 0,Document_ID,High_Level_Narrative,Sub_Narratives_List,Text
0,EN_UA_000923.txt,URW: Speculating war outcomes,"['URW: Discrediting the West, Diplomacy: The E...",URW: Speculating war outcomes | Boris Johnson ...
1,EN_UA_000923.txt,"URW: Discrediting the West, Diplomacy","['URW: Discrediting the West, Diplomacy: The E...","URW: Discrediting the West, Diplomacy | Boris ..."
2,EN_UA_001032.txt,"URW: Discrediting the West, Diplomacy","['URW: Discrediting the West, Diplomacy: Diplo...","URW: Discrediting the West, Diplomacy | Russia..."
3,EN_UA_001052.txt,URW: Blaming the war on others rather than the...,['URW: Blaming the war on others rather than t...,URW: Blaming the war on others rather than the...
4,EN_UA_002668.txt,URW: Speculating war outcomes,"['URW: Speculating war outcomes: Other', 'URW:...",URW: Speculating war outcomes | Putin may ABAN...


In [7]:
# Function to filter sub-narratives that match the high-level narrative
def filter_sub_narratives(row):
    high_level = row["High_Level_Narrative"].strip()
    sub_narratives = eval(row["Sub_Narratives_List"])  # Convert string representation of list to actual list
    
    # Keep only sub-narratives that start with the high-level narrative
    filtered = [sub for sub in sub_narratives if sub.startswith(high_level)]
    return filtered

# Apply filtering function
df["Filtered_Sub_Narratives"] = df.apply(filter_sub_narratives, axis=1)
df['Filtered_Sub_Narratives']

0                 [URW: Speculating war outcomes: Other]
1      [URW: Discrediting the West, Diplomacy: The EU...
2      [URW: Discrediting the West, Diplomacy: Diplom...
3      [URW: Blaming the war on others rather than th...
4      [URW: Speculating war outcomes: Other, URW: Sp...
                             ...                        
259    [URW: Praise of Russia: Praise of Russian mili...
260    [URW: Amplifying war-related fears: By continu...
261    [URW: Negative Consequences for the West: Sanc...
262    [URW: Discrediting the West, Diplomacy: Diplom...
263                   [URW: Russia is the Victim: Other]
Name: Filtered_Sub_Narratives, Length: 264, dtype: object

In [8]:
# Get all unique sub-narratives
sub_narratives = list(set(label for sublist in df["Filtered_Sub_Narratives"] for label in sublist))
len(sub_narratives)

42

In [9]:
def preprocess_multi_label(data, narratives):
    label_vectors = []
    for narratives_list in data['Filtered_Sub_Narratives']:
        vector = [1 if narrative in narratives_list else 0 for narrative in narratives]
        label_vectors.append(vector)
    data['Labels'] = label_vectors
    return data

# Encode sub-level narratives
df = preprocess_multi_label(df, sub_narratives)
df.head()


Unnamed: 0,Document_ID,High_Level_Narrative,Sub_Narratives_List,Text,Filtered_Sub_Narratives,Labels
0,EN_UA_000923.txt,URW: Speculating war outcomes,"['URW: Discrediting the West, Diplomacy: The E...",URW: Speculating war outcomes | Boris Johnson ...,[URW: Speculating war outcomes: Other],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,EN_UA_000923.txt,"URW: Discrediting the West, Diplomacy","['URW: Discrediting the West, Diplomacy: The E...","URW: Discrediting the West, Diplomacy | Boris ...","[URW: Discrediting the West, Diplomacy: The EU...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,EN_UA_001032.txt,"URW: Discrediting the West, Diplomacy","['URW: Discrediting the West, Diplomacy: Diplo...","URW: Discrediting the West, Diplomacy | Russia...","[URW: Discrediting the West, Diplomacy: Diplom...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,EN_UA_001052.txt,URW: Blaming the war on others rather than the...,['URW: Blaming the war on others rather than t...,URW: Blaming the war on others rather than the...,[URW: Blaming the war on others rather than th...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,EN_UA_002668.txt,URW: Speculating war outcomes,"['URW: Speculating war outcomes: Other', 'URW:...",URW: Speculating war outcomes | Putin may ABAN...,"[URW: Speculating war outcomes: Other, URW: Sp...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
def handle_rare_classes(data, labels, min_samples=15):
    label_sums = np.sum(labels, axis=0)
    rare_classes = np.where(label_sums < min_samples)[0]

    for rare_class in rare_classes:
        rare_indices = [i for i, lbl in enumerate(labels) if lbl[rare_class] == 1]
        if len(rare_indices) > 0:
            duplicate_data = data.iloc[rare_indices]
            data = pd.concat([data] + [duplicate_data] * (min_samples - len(rare_indices)), ignore_index=True)
            labels = np.vstack([labels] + [labels[rare_indices]] * (min_samples - len(rare_indices)))
    return data, labels

# Apply rare class handling
labels = np.array(df['Labels'].tolist(), dtype=np.float32)
df, labels = handle_rare_classes(df, labels)


In [11]:
from sklearn.model_selection import train_test_split

# Use Combined_Text as input (High-Level + Text)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'], labels, test_size=0.2, random_state=42
)

# Convert to Hugging Face Dataset format
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "labels": train_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "labels": test_labels.tolist()})
datasets = DatasetDict({"train": train_dataset, "test": test_dataset})


In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=labels.shape[1], problem_type="multi_label_classification"
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Tokenize the dataset
tokenized_datasets = datasets.map(tokenize_function, batched=True)


Map:   0%|          | 0/1003 [00:00<?, ? examples/s]

Map:   0%|          | 0/251 [00:00<?, ? examples/s]

In [14]:
import torch
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import EvalPrediction

def compute_metrics(p: EvalPrediction):
    # Apply sigmoid activation to model predictions (logits)
    sigmoid_preds = torch.sigmoid(torch.tensor(p.predictions)).numpy()

    # Convert probabilities to binary predictions using a threshold of 0.5
    preds = (sigmoid_preds > 0.5).astype(int)

    # Ground-truth labels
    labels = p.label_ids

    # Calculate weighted precision, recall, and F1-score
    precision = precision_score(labels, preds, average="weighted", zero_division=0)
    recall = recall_score(labels, preds, average="weighted", zero_division=0)
    f1 = f1_score(labels, preds, average="weighted", zero_division=0)

    return {"precision": precision, "recall": recall, "f1": f1}


In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # Reduced from 20 for efficiency
    learning_rate=2e-5, 
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=1,
    seed=42,
    weight_decay=0.01,
    # fp16=True
)




In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()


  trainer = Trainer(


  0%|          | 0/2510 [00:00<?, ?it/s]

{'loss': 0.3443, 'grad_norm': 0.4358736276626587, 'learning_rate': 1.920318725099602e-05, 'epoch': 0.4}
{'loss': 0.1729, 'grad_norm': 0.3660593628883362, 'learning_rate': 1.8406374501992033e-05, 'epoch': 0.8}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.15750738978385925, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 4.5574, 'eval_samples_per_second': 55.075, 'eval_steps_per_second': 13.824, 'epoch': 1.0}
{'loss': 0.1551, 'grad_norm': 0.3340104818344116, 'learning_rate': 1.760956175298805e-05, 'epoch': 1.2}
{'loss': 0.1556, 'grad_norm': 0.3193029463291168, 'learning_rate': 1.6812749003984067e-05, 'epoch': 1.59}
{'loss': 0.1451, 'grad_norm': 0.2908485233783722, 'learning_rate': 1.601593625498008e-05, 'epoch': 1.99}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.14535662531852722, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 4.6239, 'eval_samples_per_second': 54.283, 'eval_steps_per_second': 13.625, 'epoch': 2.0}
{'loss': 0.138, 'grad_norm': 1.088565707206726, 'learning_rate': 1.5219123505976096e-05, 'epoch': 2.39}
{'loss': 0.1275, 'grad_norm': 0.5130642056465149, 'learning_rate': 1.4422310756972113e-05, 'epoch': 2.79}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.10890921205282211, 'eval_precision': 0.25699745547073793, 'eval_recall': 0.0916030534351145, 'eval_f1': 0.1332012457800709, 'eval_runtime': 4.5472, 'eval_samples_per_second': 55.199, 'eval_steps_per_second': 13.855, 'epoch': 3.0}
{'loss': 0.1106, 'grad_norm': 0.3714918792247772, 'learning_rate': 1.3625498007968127e-05, 'epoch': 3.19}
{'loss': 0.1025, 'grad_norm': 0.4251931309700012, 'learning_rate': 1.2828685258964144e-05, 'epoch': 3.59}
{'loss': 0.0959, 'grad_norm': 0.3303582966327667, 'learning_rate': 1.2031872509960161e-05, 'epoch': 3.98}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.08611315488815308, 'eval_precision': 0.5765714824238998, 'eval_recall': 0.4020356234096692, 'eval_f1': 0.45963643860392844, 'eval_runtime': 4.488, 'eval_samples_per_second': 55.927, 'eval_steps_per_second': 14.037, 'epoch': 4.0}
{'loss': 0.0837, 'grad_norm': 0.24056920409202576, 'learning_rate': 1.1235059760956175e-05, 'epoch': 4.38}
{'loss': 0.0789, 'grad_norm': 0.2558727562427521, 'learning_rate': 1.0438247011952192e-05, 'epoch': 4.78}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.07193789631128311, 'eval_precision': 0.7080676679714184, 'eval_recall': 0.544529262086514, 'eval_f1': 0.5918723887671324, 'eval_runtime': 4.5404, 'eval_samples_per_second': 55.281, 'eval_steps_per_second': 13.875, 'epoch': 5.0}
{'loss': 0.073, 'grad_norm': 0.24863848090171814, 'learning_rate': 9.641434262948209e-06, 'epoch': 5.18}
{'loss': 0.0702, 'grad_norm': 0.2795252799987793, 'learning_rate': 8.844621513944224e-06, 'epoch': 5.58}
{'loss': 0.0666, 'grad_norm': 0.2290727198123932, 'learning_rate': 8.04780876494024e-06, 'epoch': 5.98}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.06334521621465683, 'eval_precision': 0.7813953787236229, 'eval_recall': 0.6870229007633588, 'eval_f1': 0.7192369165727738, 'eval_runtime': 4.6157, 'eval_samples_per_second': 54.38, 'eval_steps_per_second': 13.649, 'epoch': 6.0}
{'loss': 0.0634, 'grad_norm': 0.3828502297401428, 'learning_rate': 7.250996015936256e-06, 'epoch': 6.37}
{'loss': 0.0598, 'grad_norm': 0.38201504945755005, 'learning_rate': 6.454183266932272e-06, 'epoch': 6.77}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.05758536979556084, 'eval_precision': 0.8573513019575695, 'eval_recall': 0.7837150127226463, 'eval_f1': 0.8041199757286696, 'eval_runtime': 4.5186, 'eval_samples_per_second': 55.548, 'eval_steps_per_second': 13.942, 'epoch': 7.0}
{'loss': 0.0563, 'grad_norm': 0.30093902349472046, 'learning_rate': 5.657370517928288e-06, 'epoch': 7.17}
{'loss': 0.0562, 'grad_norm': 0.3569957911968231, 'learning_rate': 4.860557768924303e-06, 'epoch': 7.57}
{'loss': 0.0544, 'grad_norm': 0.23548771440982819, 'learning_rate': 4.06374501992032e-06, 'epoch': 7.97}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.05394032225012779, 'eval_precision': 0.8907270441119376, 'eval_recall': 0.811704834605598, 'eval_f1': 0.8352217269853094, 'eval_runtime': 4.6006, 'eval_samples_per_second': 54.558, 'eval_steps_per_second': 13.694, 'epoch': 8.0}
{'loss': 0.0519, 'grad_norm': 0.24795736372470856, 'learning_rate': 3.2669322709163346e-06, 'epoch': 8.37}
{'loss': 0.0527, 'grad_norm': 0.17897090315818787, 'learning_rate': 2.470119521912351e-06, 'epoch': 8.76}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.05159544199705124, 'eval_precision': 0.8904191670865034, 'eval_recall': 0.8320610687022901, 'eval_f1': 0.8461285391671342, 'eval_runtime': 4.4797, 'eval_samples_per_second': 56.03, 'eval_steps_per_second': 14.063, 'epoch': 9.0}
{'loss': 0.049, 'grad_norm': 0.23260082304477692, 'learning_rate': 1.6733067729083665e-06, 'epoch': 9.16}
{'loss': 0.0506, 'grad_norm': 0.27713316679000854, 'learning_rate': 8.764940239043826e-07, 'epoch': 9.56}
{'loss': 0.0496, 'grad_norm': 0.23332971334457397, 'learning_rate': 7.968127490039842e-08, 'epoch': 9.96}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.05091176927089691, 'eval_precision': 0.9036980849455739, 'eval_recall': 0.8371501272264631, 'eval_f1': 0.8532962209857189, 'eval_runtime': 4.7068, 'eval_samples_per_second': 53.327, 'eval_steps_per_second': 13.385, 'epoch': 10.0}
{'train_runtime': 733.1059, 'train_samples_per_second': 13.682, 'train_steps_per_second': 3.424, 'train_loss': 0.09838276600457757, 'epoch': 10.0}


TrainOutput(global_step=2510, training_loss=0.09838276600457757, metrics={'train_runtime': 733.1059, 'train_samples_per_second': 13.682, 'train_steps_per_second': 3.424, 'total_flos': 2639951667302400.0, 'train_loss': 0.09838276600457757, 'epoch': 10.0})

# Azinja be bad ro kari nadashte bashid!

## ----------------------------------------------------------------------

In [9]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, accuracy_score, recall_score
import numpy as np

In [10]:
mlb = MultiLabelBinarizer(classes=unique_labels)
df["Encoded_Labels"] = mlb.fit_transform(df["Filtered_Sub_Narratives"]).tolist()
df.head()

Unnamed: 0,Document_ID,High_Level_Narrative,Sub_Narratives_List,Text,Filtered_Sub_Narratives,Encoded_Labels
0,EN_UA_000923.txt,URW: Speculating war outcomes,"['URW: Discrediting the West, Diplomacy: The E...",URW: Speculating war outcomes | Boris Johnson ...,[URW: Speculating war outcomes: Other],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,EN_UA_000923.txt,"URW: Discrediting the West, Diplomacy","['URW: Discrediting the West, Diplomacy: The E...","URW: Discrediting the West, Diplomacy | Boris ...","[URW: Discrediting the West, Diplomacy: The EU...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,EN_UA_001032.txt,"URW: Discrediting the West, Diplomacy","['URW: Discrediting the West, Diplomacy: Diplo...","URW: Discrediting the West, Diplomacy | Russia...","[URW: Discrediting the West, Diplomacy: Diplom...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,EN_UA_001052.txt,URW: Blaming the war on others rather than the...,['URW: Blaming the war on others rather than t...,URW: Blaming the war on others rather than the...,[URW: Blaming the war on others rather than th...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,EN_UA_002668.txt,URW: Speculating war outcomes,"['URW: Speculating war outcomes: Other', 'URW:...",URW: Speculating war outcomes | Putin may ABAN...,"[URW: Speculating war outcomes: Other, URW: Sp...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
import ast

df["Encoded_Labels"] = df["Encoded_Labels"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [12]:
from sklearn.utils import shuffle

df = shuffle(df, random_state=42).reset_index(drop=True)

train_size = int(0.8 * len(df))
# Convert to list explicitly
train_texts, train_labels = list(df["Text"][:train_size]), list(df["Encoded_Labels"][:train_size])
test_texts, test_labels = list(df["Text"][train_size:]), list(df["Encoded_Labels"][train_size:])


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the tokenizer and model
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [14]:
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=512)

In [15]:
class NarrativeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

In [16]:
train_dataset = NarrativeDataset(train_encodings, train_labels)
test_dataset = NarrativeDataset(test_encodings, test_labels)

# Define model
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=len(unique_labels))
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1
)



In [18]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, recall_score

def compute_metrics(pred):
    logits, labels = pred
    preds = (logits > 0.5).astype(int)  # Convert logits to binary predictions for multi-label classification
    labels = np.array(labels)  # Ensure labels are in array format

    # Compute macro-averaged metrics
    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    recall = recall_score(labels, preds, average="macro")

    return {"f1": f1, "accuracy": acc, "recall": recall}


In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [20]:
# Train the model
trainer.train()

  0%|          | 0/540 [00:00<?, ?it/s]

{'loss': 0.5459, 'grad_norm': 0.861284613609314, 'learning_rate': 4.9074074074074075e-05, 'epoch': 0.37}
{'loss': 0.3339, 'grad_norm': 0.5963953137397766, 'learning_rate': 4.814814814814815e-05, 'epoch': 0.74}


  0%|          | 0/7 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.23084905743598938, 'eval_f1': 0.0, 'eval_accuracy': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.9801, 'eval_samples_per_second': 54.077, 'eval_steps_per_second': 7.142, 'epoch': 1.0}
{'loss': 0.2499, 'grad_norm': 0.45457470417022705, 'learning_rate': 4.722222222222222e-05, 'epoch': 1.11}
{'loss': 0.1993, 'grad_norm': 0.34782975912094116, 'learning_rate': 4.62962962962963e-05, 'epoch': 1.48}
{'loss': 0.178, 'grad_norm': 0.32758286595344543, 'learning_rate': 4.5370370370370374e-05, 'epoch': 1.85}


  0%|          | 0/7 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.16114045679569244, 'eval_f1': 0.0, 'eval_accuracy': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.9701, 'eval_samples_per_second': 54.632, 'eval_steps_per_second': 7.216, 'epoch': 2.0}
{'loss': 0.1549, 'grad_norm': 0.2748515009880066, 'learning_rate': 4.4444444444444447e-05, 'epoch': 2.22}
{'loss': 0.1444, 'grad_norm': 0.2389254868030548, 'learning_rate': 4.351851851851852e-05, 'epoch': 2.59}
{'loss': 0.1464, 'grad_norm': 0.26790982484817505, 'learning_rate': 4.259259259259259e-05, 'epoch': 2.96}


  0%|          | 0/7 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.1468096822500229, 'eval_f1': 0.0, 'eval_accuracy': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.9744, 'eval_samples_per_second': 54.395, 'eval_steps_per_second': 7.184, 'epoch': 3.0}


KeyboardInterrupt: 