In [1]:
!pip install datasets



In [2]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from collections import Counter
import torch
from torch.nn import CrossEntropyLoss

In [3]:
data_path = '/content/drive/MyDrive/NLP_project/data_with_text.csv'
data = pd.read_csv(data_path)

In [4]:
def assign_three_way_label_no_ast(row):
    """
    Assign a label based on High_Level_Narratives_List:
    - "Other" if it contains "Other"
    - "CC" if it contains any CC-related narratives
    - "URW" if it contains any URW-related narratives
    """
    narratives = row.strip("[]").replace("'", "").split(",")  # Convert the string representation to a list
    narratives = [n.strip() for n in narratives]  # Clean up extra spaces
    if "Other" in narratives:
        return "Other"
    elif any(narrative.startswith("CC:") for narrative in narratives):
        return "CC"
    elif any(narrative.startswith("URW:") for narrative in narratives):
        return "URW"
    else:
        return "Unknown"  # For unexpected cases

In [5]:
# Apply the function to create the three-way label
data['Three_Way_Label'] = data['High_Level_Narratives_List'].apply(assign_three_way_label_no_ast)

# Map labels to integers
label_to_id = {"Other": 0, "CC": 1, "URW": 2}
data['Label'] = data['Three_Way_Label'].map(label_to_id)

# Calculate class weights for imbalanced dataset
label_counts = Counter(data['Label'])
total_samples = sum(label_counts.values())
class_weights = {label: total_samples / (len(label_counts) * count) for label, count in label_counts.items()}
class_weights_tensor = torch.tensor([class_weights[i] for i in range(len(label_to_id))], dtype=torch.float)

In [6]:
data['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,169
2,127
1,103


In [7]:
# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['Text'], data['Label'], test_size=0.2, random_state=42, stratify=data['Label']
)

In [8]:
train_dataset = Dataset.from_pandas(pd.DataFrame({'text': train_texts, 'label': train_labels}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': test_texts, 'label': test_labels}))
datasets = DatasetDict({"train": train_dataset, "test": test_dataset})

In [9]:
# Load tokenizer and tokenize datasets
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

encoded_datasets = datasets.map(tokenize_function, batched=True)

# Load the model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/319 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
class CustomRobertaForSequenceClassification(RobertaForSequenceClassification):
    def __init__(self, config, class_weights):
        super().__init__(config)
        self.class_weights = class_weights
        self.loss_fn = CrossEntropyLoss(weight=self.class_weights)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        # Remove unexpected arguments like 'num_items_in_batch'
        kwargs.pop('num_items_in_batch', None)

        # Call the parent forward method
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        logits = outputs.logits
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return {"loss": loss, "logits": logits, **outputs}



In [11]:
# Instantiate the custom model
model = CustomRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3, class_weights=class_weights_tensor)

Some weights of CustomRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'loss_fn.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Define compute metrics
def compute_metrics(eval_pred: EvalPrediction):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [13]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [14]:
training_args = TrainingArguments(
    output_dir='./results',                    # Directory to store checkpoints and final model
    num_train_epochs=10,                       # Total number of training epochs
    learning_rate=2e-5,  # Fine-tuning learning rate
    per_device_train_batch_size=10,            # Batch size per device during training
    per_device_eval_batch_size=10,             # Batch size for evaluation
    evaluation_strategy='epoch',               # Evaluate at the end of each epoch
    save_strategy='epoch',                     # Save model at the end of each epoch
    load_best_model_at_end=True,               # Load the best model at the end of training
    metric_for_best_model='f1',                # Use F1 score to evaluate the best model
    greater_is_better=True,                    # Higher F1 is better
    logging_dir='./logs',                      # Directory for storing logs
    logging_steps=100,                         # Log every 100 steps
    save_total_limit=3,                        # Limit the total amount of checkpoints
    seed=42,
     weight_decay=0.05,# Seed for reproducibility
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [15]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.083392,0.475,0.410963,0.440335,0.475
2,No log,0.896305,0.6375,0.630811,0.662182,0.6375
3,No log,0.641014,0.7125,0.716301,0.728121,0.7125
4,No log,0.635699,0.7125,0.704555,0.725186,0.7125
5,No log,0.544115,0.7875,0.787133,0.789904,0.7875
6,No log,0.478546,0.7875,0.788026,0.792708,0.7875
7,No log,0.530456,0.7125,0.713015,0.728046,0.7125
8,No log,0.523803,0.75,0.749234,0.761784,0.75
9,No log,0.755202,0.7125,0.712103,0.73605,0.7125
10,0.498100,0.650584,0.7625,0.762004,0.771007,0.7625


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=110, training_loss=0.4630756855010986, metrics={'train_runtime': 436.4833, 'train_samples_per_second': 7.308, 'train_steps_per_second': 0.252, 'total_flos': 839331802552320.0, 'train_loss': 0.4630756855010986, 'epoch': 10.0})

In [17]:
# Evaluate the model
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.4785463809967041, 'eval_accuracy': 0.7875, 'eval_f1': 0.7880261248185776, 'eval_precision': 0.7927083333333333, 'eval_recall': 0.7875, 'eval_runtime': 2.5146, 'eval_samples_per_second': 31.815, 'eval_steps_per_second': 3.181, 'epoch': 10.0}
