# Notebook for RoBERTa Classification
Manon Kooning, S5221838

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, RobertaConfig, RobertaPreTrainedModel, RobertaModel
from google.colab import files
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

Upload training data and do some preprocessing


In [None]:
print("Please upload your TRAIN file.") # modified_train.csv
uploaded_train = files.upload()
train_file = list(uploaded_train.keys())[0]
print(f"Train file {train_file} uploaded successfully.")

print("Please upload your TEST file.") # modified_test.csv
uploaded_test = files.upload()
test_file = list(uploaded_test.keys())[0]
print(f"Test file {test_file} uploaded successfully.")

# Load datasets
df_train = pd.read_csv(train_file, delimiter=',')
print("Train dataset loaded. Preview:")
print(df_train.head())

df_test = pd.read_csv(test_file, delimiter=',')
print("Test dataset loaded. Preview:")
print(df_test.head())

# Clean and preprocess
for df in [df_train, df_test]:
    for col in ['story', 'agency', 'event_sequencing', 'world_making']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df.dropna(subset=['body', 'story', 'agency', 'event_sequencing', 'world_making'], inplace=True)
    df['story'] = df['story'].astype(int)
    df['agency'] = df['agency'].astype(int)
    df['event_sequencing'] = df['event_sequencing'].astype(int)
    df['world_making'] = df['world_making'].astype(int)

print("Dataset filtered to relevant columns. Checking data types:")
print(df_train.dtypes)
print(df_test.dtypes)

Please upload your TRAIN file.


Saving modified_train.csv to modified_train.csv
Train file modified_train.csv uploaded successfully.
Please upload your TEST file.


Saving modified_test.csv to modified_test.csv
Test file modified_test.csv uploaded successfully.
Train dataset loaded. Preview:
         name  agency  event_sequencing  world_making  story  \
0  t1_c95k50u       0                 0             0      0   
1  t1_c95k75v       0                 0             0      0   
2  t1_c95mz3n       1                 1             0      1   
3  t1_c95tixb       0                 0             0      0   
4  t1_c95l4my       1                 1             0      1   

                                                body  
0  Give me time god damn it!   E: this was posted...  
1  I somewhat agree but I'm going to take a shot ...  
2  Jesus fuck I was a hundred words into a respon...  
3  So basically, whether incentives from assignin...  
4  Quick note on the hot coffee case.   * The cof...  
Test dataset loaded. Preview:
         name  agency  event_sequencing  world_making  story  \
0  t1_c9mqknk       0                 0             0      0   

In [None]:
# some functions
MAX_LEN = 512

def chunk_text(text):
    """
    Splits long tokenized text into chunks of at most MAX_LEN tokens.

    Args:
        text (str): Input string to be tokenized and split.

    Returns:
        list: A list of token ID chunks (list of lists).
    """
    tokens = tokenizer.encode(text, add_special_tokens=True)
    return [tokens[i:i + MAX_LEN] for i in range(0, len(tokens), MAX_LEN)]

def expand_dataset(texts, labels):
    """
    Prepares input data for BERT, splitting long texts into chunks
    and generating attention masks.

    Args:
        texts (list of str): List of input texts.
        labels (list): List of labels corresponding to the texts.

    Returns:
        Tuple:
            - dict with 'input_ids' and 'attention_mask'
            - list of labels expanded to match each text chunk
    """
    input_ids, attention_masks, expanded_labels = [], [], []
    for text, label in zip(texts, labels):
        chunks = chunk_text(text)
        for chunk in chunks:
            decoded = tokenizer.decode(chunk, skip_special_tokens=True)
            enc = tokenizer(decoded, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='pt')
            input_ids.append(enc['input_ids'].squeeze(0))
            attention_masks.append(enc['attention_mask'].squeeze(0))
            expanded_labels.append(label)
    return {'input_ids': input_ids, 'attention_mask': attention_masks}, expanded_labels


class CustomDataset(Dataset):
    """
    Custom dataset class for PyTorch's DataLoader.
    Stores encodings and labels in a format usable for training.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def compute_metrics_binary(eval_pred):
    """
    Computes accuracy, precision, recall, and F1 scores for binary classification tasks,
    along with micro, macro, and weighted F1 scores.

    Args:
        eval_pred: A tuple containing model logits and true labels.

    Returns:
        dict: Evaluation metrics including accuracy, precision, recall, and F1 scores.
    """
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), axis=1)

    # Compute precision, recall, and F1 for binary classification
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)

    # Compute accuracy
    acc = accuracy_score(labels, preds)

    # Compute micro, macro, and weighted F1 scores
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds, average='micro', zero_division=0)
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)
    precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted
    }

class WeightedBERTClassifier(RobertaPreTrainedModel):
    """
    RoBERTa-based text classifier with support for class-weighted loss.

    Uses a pre-trained RoBERTa model with a dropout and linear layer for classification.
    Applies weighted cross-entropy loss during training to handle class imbalance.

    Args:
        config (RobertaConfig): Model configuration.
        class_weights (torch.Tensor): Tensor of class weights for loss computation.

    Returns:
        dict: Contains 'logits' and optionally 'loss' if labels are provided.
    """
    def __init__(self, config, class_weights):
        super().__init__(config)
        self.bert = RobertaModel(config)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.class_weights = class_weights

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
            loss = loss_fn(logits, labels)

        return {'loss': loss, 'logits': logits} if loss is not None else {'logits': logits}

# Training the Story Classifier

In [None]:
train_texts = df_train['body'].tolist()
train_labels_story = df_train['story'].values
val_texts = df_test['body'].tolist()
val_labels_story = df_test['story'].values

# Compute class weights from training labels
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels_story), y=train_labels_story)
class_weights = torch.tensor(class_weights, dtype=torch.float)

print(f"Using provided train/test sets: {len(train_texts)} training samples, {len(val_texts)} validation samples.")

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
train_encodings, train_labels_expanded = expand_dataset(train_texts, train_labels_story)
val_encodings, val_labels_expanded = expand_dataset(val_texts, val_labels_story)
print("Tokenization complete with chunking!")

train_dataset = CustomDataset(train_encodings, train_labels_expanded)
val_dataset = CustomDataset(val_encodings, val_labels_expanded)
print("Datasets created.")

Using provided train/test sets: 434 training samples, 186 validation samples.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (967 > 512). Running this sequence through the model will result in indexing errors


Tokenization complete with chunking!
Datasets created.


In [None]:
# Instantiate model
config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
model_story = WeightedBERTClassifier.from_pretrained('roberta-base', config=config, class_weights=class_weights)

# Training args
args_story = TrainingArguments(
    output_dir="./story_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Trainer
trainer_story = Trainer(
    model=model_story,
    args=args_story,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_binary,
)

print("Starting training...")
trainer_story.train()
model_story.save_pretrained("./story_model")
tokenizer.save_pretrained("./story_model")
print("Training complete.")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of WeightedBERTClassifier were not initialized from the model checkpoint at roberta-base and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias'

Starting training...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmanonkooning[0m ([33mmanonkooning-university-of-groningen[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Micro,F1 Macro,F1 Weighted
1,No log,0.869687,0.252577,0.252577,1.0,0.403292,0.252577,0.201646,0.101862
2,No log,0.569266,0.768041,0.666667,0.163265,0.262295,0.768041,0.56234,0.710816
3,No log,0.754443,0.551546,0.349206,0.897959,0.502857,0.551546,0.547203,0.569148
4,No log,0.572534,0.778351,0.542857,0.77551,0.638655,0.778351,0.739402,0.789256
5,No log,0.504645,0.793814,0.57377,0.714286,0.636364,0.793814,0.746239,0.800611


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Training complete.


In [None]:
eval_results = trainer_story.evaluate()
print("Final Evaluation Results:", eval_results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Final Evaluation Results: {'eval_loss': 0.5725343823432922, 'eval_accuracy': 0.7783505154639175, 'eval_precision': 0.5428571428571428, 'eval_recall': 0.7755102040816326, 'eval_f1': 0.6386554621848739, 'eval_f1_micro': 0.7783505154639175, 'eval_f1_macro': 0.7394020805348162, 'eval_f1_weighted': 0.7892560772440659, 'eval_runtime': 4.916, 'eval_samples_per_second': 39.463, 'eval_steps_per_second': 5.085, 'epoch': 5.0}


# Traing the Agency model

In [None]:
df_train_agency = df_train[df_train['agency'].isin([0, 1])]
df_test_agency = df_test[df_test['agency'].isin([0, 1])]

train_texts = df_train_agency['body'].tolist()
train_labels_agency = df_train_agency['agency'].values
val_texts = df_test_agency['body'].tolist()
val_labels_agency = df_test_agency['agency'].values

print(f"Using provided train/test sets: {len(train_texts)} training samples, {len(val_texts)} validation samples.")

# Compute class weights
class_weights_agency = compute_class_weight('balanced', classes=np.unique(train_labels_agency), y=train_labels_agency)
class_weights_agency = torch.tensor(class_weights_agency, dtype=torch.float)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_enc_agency, train_lbls_agency = expand_dataset(train_texts, train_labels_agency)
val_enc_agency, val_lbls_agency = expand_dataset(val_texts, val_labels_agency)

train_dataset_agency = CustomDataset(train_enc_agency, train_lbls_agency)
val_dataset_agency = CustomDataset(val_enc_agency, val_lbls_agency)

# Model config and training
config_agency = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
model_agency = WeightedBERTClassifier.from_pretrained('roberta-base', config=config_agency, class_weights=class_weights_agency)

args_agency = TrainingArguments(
    output_dir="./agency_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir="./logs_agency",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer_agency = Trainer(
    model=model_agency,
    args=args_agency,
    train_dataset=train_dataset_agency,
    eval_dataset=val_dataset_agency,
    compute_metrics=compute_metrics_binary,
)

print("Starting training for agency...")
trainer_agency.train()
model_agency.save_pretrained("./agency_model")
tokenizer.save_pretrained("./agency_model")
print("Agency model training complete.")

Using provided train/test sets: 423 training samples, 181 validation samples.


Token indices sequence length is longer than the specified maximum sequence length for this model (967 > 512). Running this sequence through the model will result in indexing errors
Some weights of WeightedBERTClassifier were not initialized from the model checkpoint at roberta-base and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encode

Starting training for agency...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Micro,F1 Macro,F1 Weighted
1,No log,0.729352,0.285714,0.285714,1.0,0.444444,0.285714,0.222222,0.126984
2,No log,0.639401,0.492063,0.35,0.907407,0.505155,0.492063,0.491708,0.485945
3,No log,0.707784,0.57672,0.393443,0.888889,0.545455,0.57672,0.574707,0.587244
4,No log,0.512681,0.783069,0.610169,0.666667,0.637168,0.783069,0.741226,0.785822
5,No log,0.582506,0.767196,0.578125,0.685185,0.627119,0.767196,0.728944,0.772583


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Agency model training complete.


In [None]:
eval_results = trainer_agency.evaluate()
print("Final Evaluation Results:", eval_results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Final Evaluation Results: {'eval_loss': 0.5126806497573853, 'eval_accuracy': 0.783068783068783, 'eval_precision': 0.6101694915254238, 'eval_recall': 0.6666666666666666, 'eval_f1': 0.6371681415929203, 'eval_f1_micro': 0.783068783068783, 'eval_f1_macro': 0.7412255802304224, 'eval_f1_weighted': 0.7858216253607806, 'eval_runtime': 4.8001, 'eval_samples_per_second': 39.375, 'eval_steps_per_second': 5.0, 'epoch': 5.0}


# Training the Event Sequencing model

In [None]:
df_train_event = df_train[df_train['event_sequencing'].isin([0, 1])]
df_test_event = df_test[df_test['event_sequencing'].isin([0, 1])]

train_texts = df_train_event['body'].tolist()
train_labels_event = df_train_event['event_sequencing'].values
val_texts = df_test_event['body'].tolist()
val_labels_event = df_test_event['event_sequencing'].values

print(f"Using provided train/test sets: {len(train_texts)} training samples, {len(val_texts)} validation samples.")

# Compute class weights
class_weights_event = compute_class_weight('balanced', classes=np.unique(train_labels_event), y=train_labels_event)
class_weights_event = torch.tensor(class_weights_event, dtype=torch.float)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_enc_event, train_lbls_event = expand_dataset(train_texts, train_labels_event)
val_enc_event, val_lbls_event = expand_dataset(val_texts, val_labels_event)
print("Tokenization complete with chunking!")

train_dataset_event = CustomDataset(train_enc_event, train_lbls_event)
val_dataset_event = CustomDataset(val_enc_event, val_lbls_event)

print(f"Data split: {len(train_texts)} training samples, {len(val_texts)} validation samples.")

# Instantiate model
config_event = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
model_event = WeightedBERTClassifier.from_pretrained('roberta-base', config=config_event, class_weights=class_weights_event)

# Training args
args_event = TrainingArguments(
    output_dir="./event_sequencing_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir="./logs_event",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Trainer
trainer_event = Trainer(
    model=model_event,
    args=args_event,
    train_dataset=train_dataset_event,
    eval_dataset=val_dataset_event,
    compute_metrics=compute_metrics_binary,
)

print("Starting training for event_sequencing...")
trainer_event.train()
model_event.save_pretrained("./event_sequencing_model")
tokenizer.save_pretrained("./event_sequencing_model")
print("Event_sequencing model training complete.")


Using provided train/test sets: 411 training samples, 182 validation samples.


Token indices sequence length is longer than the specified maximum sequence length for this model (967 > 512). Running this sequence through the model will result in indexing errors


Tokenization complete with chunking!
Data split: 411 training samples, 182 validation samples.


Some weights of WeightedBERTClassifier were not initialized from the model checkpoint at roberta-base and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias'

Starting training for event_sequencing...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Micro,F1 Macro,F1 Weighted
1,No log,0.69388,0.365079,0.289941,1.0,0.449541,0.365079,0.349771,0.301733
2,No log,0.860256,0.306878,0.272222,1.0,0.427948,0.306878,0.274376,0.200435
3,No log,0.585687,0.661376,0.427184,0.897959,0.578947,0.661376,0.647881,0.681071
4,No log,0.55849,0.708995,0.465909,0.836735,0.59854,0.708995,0.685162,0.726869
5,No log,0.478022,0.714286,0.471264,0.836735,0.602941,0.714286,0.6899,0.73177


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Event_sequencing model training complete.


In [None]:
eval_results = trainer_event.evaluate()
print("Final Evaluation Results:", eval_results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Final Evaluation Results: {'eval_loss': 0.4780217707157135, 'eval_accuracy': 0.7142857142857143, 'eval_precision': 0.47126436781609193, 'eval_recall': 0.8367346938775511, 'eval_f1': 0.6029411764705882, 'eval_f1_micro': 0.7142857142857143, 'eval_f1_macro': 0.6899003403014098, 'eval_f1_weighted': 0.7317695673310646, 'eval_runtime': 4.8197, 'eval_samples_per_second': 39.214, 'eval_steps_per_second': 4.98, 'epoch': 5.0}


# Training the World Making model

In [None]:
from transformers import RobertaTokenizer, RobertaConfig

df_train_world = df_train[df_train['world_making'].isin([0, 1])]
df_test_world = df_test[df_test['world_making'].isin([0, 1])]

train_texts = df_train_world['body'].tolist()
train_labels_world = df_train_world['world_making'].values
val_texts = df_test_world['body'].tolist()
val_labels_world = df_test_world['world_making'].values

print(f"Using provided train/test sets: {len(train_texts)} training samples, {len(val_texts)} validation samples.")

# Compute class weights
class_weights_world = compute_class_weight('balanced', classes=np.unique(train_labels_world), y=train_labels_world)
class_weights_world = torch.tensor(class_weights_world, dtype=torch.float)

# Tokenization (RoBERTa)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_enc_world, train_lbls_world = expand_dataset(train_texts, train_labels_world)
val_enc_world, val_lbls_world = expand_dataset(val_texts, val_labels_world)
print("Tokenization complete with chunking!")

train_dataset_world = CustomDataset(train_enc_world, train_lbls_world)
val_dataset_world = CustomDataset(val_enc_world, val_lbls_world)

print(f"Data split: {len(train_texts)} training samples, {len(val_texts)} validation samples.")

# Instantiate model (RoBERTa)
config_world = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
model_world = WeightedBERTClassifier.from_pretrained('roberta-base', config=config_world, class_weights=class_weights_world)

# Training args
args_world = TrainingArguments(
    output_dir="./world_making_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir="./logs_world_making",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Trainer
trainer_world = Trainer(
    model=model_world,
    args=args_world,
    train_dataset=train_dataset_world,
    eval_dataset=val_dataset_world,
    compute_metrics=compute_metrics_binary,
)

print("Starting training for world_making...")
trainer_world.train()
model_world.save_pretrained("./world_making_model")
tokenizer.save_pretrained("./world_making_model")
print("World_making model training complete.")


Using provided train/test sets: 423 training samples, 180 validation samples.


Token indices sequence length is longer than the specified maximum sequence length for this model (967 > 512). Running this sequence through the model will result in indexing errors


Tokenization complete with chunking!
Data split: 423 training samples, 180 validation samples.


Some weights of WeightedBERTClassifier were not initialized from the model checkpoint at roberta-base and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias'

Starting training for world_making...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Micro,F1 Macro,F1 Weighted
1,No log,0.89619,0.925532,0.0,0.0,0.0,0.925532,0.480663,0.889738
2,No log,1.162697,0.925532,0.0,0.0,0.0,0.925532,0.480663,0.889738
3,No log,1.480745,0.925532,0.0,0.0,0.0,0.925532,0.480663,0.889738
4,No log,1.145422,0.925532,0.0,0.0,0.0,0.925532,0.480663,0.889738
5,No log,0.854093,0.925532,0.0,0.0,0.0,0.925532,0.480663,0.889738


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


World_making model training complete.


In [None]:
eval_results = trainer_world.evaluate()
print("Final Evaluation Results:", eval_results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Final Evaluation Results: {'eval_loss': 0.8961899280548096, 'eval_accuracy': 0.925531914893617, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_f1_micro': 0.925531914893617, 'eval_f1_macro': 0.48066298342541436, 'eval_f1_weighted': 0.8897378629364053, 'eval_runtime': 4.7601, 'eval_samples_per_second': 39.495, 'eval_steps_per_second': 5.042, 'epoch': 5.0}


# Labeling the new dataset

In [None]:
# Upload the file that needs to be labeled
print("Please upload your dataset file.")
uploaded = files.upload()
filename2 = list(uploaded.keys())[0]
print(f"File {filename2} uploaded successfully.")

Please upload your dataset file.


Saving label_test.csv to label_test.csv
File label_test.csv uploaded successfully.


In [None]:
df_input = pd.read_csv(filename2, delimiter=',')
print("Dataset loaded. Preview:")
print(df_input.head())

Dataset loaded. Preview:
         name  agency  event_sequencing  world_making  opt_comment  story  \
0  t1_cmdtn63     NaN               NaN           NaN          NaN    NaN   
1  t1_cmeg54f     NaN               NaN           NaN          NaN    NaN   
2  t1_cmenx3z     NaN               NaN           NaN          NaN    NaN   
3  t1_cmdamo2     NaN               NaN           NaN          NaN    NaN   
4  t1_cmdupvw     NaN               NaN           NaN          NaN    NaN   

                                                body  
0  I think we have more common ground than we tho...  
1  &gt; we need to train them to be on par with s...  
2                               That would be great.  
3  "The American legal system is founded on the i...  
4  In the encounter between Michael Brown and Dar...  


In [None]:
from tqdm import tqdm

def classify_chunked(sentences, model, tokenizer):
    """
    Predicts class labels for a list of sentences, chunking long texts.

    Args:
        sentences (list of str): Input texts.
        model (DistilBertForSequenceClassification): Fine-tuned DistilBERT model.
        tokenizer (DistilBertTokenizer): Tokenizer for the model.

    Returns:
        list of int: Predicted labels.
    """
    model.eval()
    predictions = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Use tqdm for the progress bar
    for sentence in tqdm(sentences, desc="Classifying sentences", unit="sentence"):
        tokens = tokenizer.encode(sentence, add_special_tokens=True)
        chunks = [tokens[i:i+512] for i in range(0, len(tokens), 512)]
        chunk_probs = []
        for chunk in chunks:
            chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
            encodings = tokenizer(chunk_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            encodings = {key: val.to(device) for key, val in encodings.items()}
            with torch.no_grad():
                outputs = model(**encodings)
                probs = torch.nn.functional.softmax(outputs.logits, dim=1)
                chunk_probs.append(probs.cpu().numpy())
        chunk_probs = np.vstack(chunk_probs)
        max_probs = chunk_probs.max(axis=0)
        prediction = np.argmax(max_probs)
        predictions.append(prediction)

    return predictions

# Filter out rows with empty or missing body
initial_len = len(df_input)
df_input = df_input[df_input['body'].notna() & (df_input['body'].str.strip() != "")]
filtered_len = len(df_input)
print(f"Removed {initial_len - filtered_len} rows with empty or missing 'body'.")

# Load models
tokenizer = BertTokenizer.from_pretrained("./story_model")
model_story = BertForSequenceClassification.from_pretrained("./story_model")
model_story.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

print("Classifying story...")
df_input['story'] = classify_chunked(df_input['body'].tolist(), model_story, tokenizer)

tokenizer = BertTokenizer.from_pretrained("./agency_model")
model_agency = BertForSequenceClassification.from_pretrained("./agency_model")
model_agency.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

print("Classifying agency...")
df_input['agency'] = classify_chunked(df_input['body'].tolist(), model_agency, tokenizer)

tokenizer = BertTokenizer.from_pretrained("./event_sequencing_model")
model_event_sequencing = BertForSequenceClassification.from_pretrained("./event_sequencing_model")
model_event_sequencing.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

print("Classifying event sequencing...")
df_input['event_sequencing'] = classify_chunked(df_input['body'].tolist(), model_event_sequencing, tokenizer)

tokenizer = BertTokenizer.from_pretrained("./world_making_model")
model_world_making = BertForSequenceClassification.from_pretrained("./world_making_model")
model_world_making.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

print("Classifying world making...")
df_input['world_making'] = classify_chunked(df_input['body'].tolist(), model_world_making, tokenizer)

# Save output
output_file = "classified_output.csv"
df_input.to_csv(output_file, sep=',', index=False)
print(f"Done! Results saved to {output_file}")

Classifying story...


Token indices sequence length is longer than the specified maximum sequence length for this model (635 > 512). Running this sequence through the model will result in indexing errors


Classifying agency...


Token indices sequence length is longer than the specified maximum sequence length for this model (635 > 512). Running this sequence through the model will result in indexing errors


Classifying event sequencing...


Token indices sequence length is longer than the specified maximum sequence length for this model (635 > 512). Running this sequence through the model will result in indexing errors


Classifying world making...


Token indices sequence length is longer than the specified maximum sequence length for this model (635 > 512). Running this sequence through the model will result in indexing errors


Done! Results saved to classified_output.csv
