In [None]:
!pip install datasets
!pip install googletrans==3.1.0a0

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from datasets import Dataset, DatasetDict
from googletrans import Translator

# Load Dataset
def load_english_data(filepath):
    """Load the English TSV dataset and turn its string labels into 0/1 integers.

    The file is read as tab-separated and must provide a ``text`` column and a
    ``level1`` column. ``level1`` values are stripped of leading/trailing
    whitespace and mapped with ``ABUSIVE -> 1`` and ``NOT ABUSIVE -> 0``.

    Rows whose ``level1`` value is missing or not in the mapping are reported
    and dropped. Otherwise they would become NaN, silently turning the label
    column into floats and breaking the integer class labels expected later.

    Args:
        filepath: Path to the tab-separated dataset file.

    Returns:
        A tuple ``(input_text, output_labels, data)`` where ``input_text`` is
        the list of texts, ``output_labels`` the aligned list of integer
        labels, and ``data`` the DataFrame (kept rows only) with the added
        ``label`` column.
    """
    data = pd.read_csv(filepath, sep="\t")

    # Only leading/trailing whitespace is removed; internal spaces are kept
    # because "NOT ABUSIVE" contains one.
    data['level1'] = data['level1'].str.strip()

    # Map labels to integers based on actual label values
    label_mapping = {"ABUSIVE": 1, "NOT ABUSIVE": 0}
    data['label'] = data['level1'].map(label_mapping)

    # Anything the mapping did not recognise is NaN at this point.
    unmapped = data['label'].isna()
    if unmapped.any():
        print(
            f"Dropping {int(unmapped.sum())} row(s) with unrecognised labels (English):",
            data.loc[unmapped, 'level1'].unique().tolist(),
        )
        data = data.loc[~unmapped].copy()

    # Guarantee integer class ids even if NaN forced the column to float.
    data['label'] = data['label'].astype(int)

    # Check if mapping worked (print a few rows)
    print(data[['level1', 'label']].head())

    # Structure data into input (text) and output (label)
    input_text = data['text'].tolist()
    output_labels = data['label'].tolist()

    return input_text, output_labels, data

# Read the English corpus: texts, integer labels and the underlying DataFrame
input_text, output_labels, data = load_english_data("dataset-en.tsv")

# First cut: hold out 20% as the test set, keeping the class balance (stratified)
(
    train_dev_texts,
    test_texts,
    train_dev_labels,
    test_labels,
) = train_test_split(
    input_text,
    output_labels,
    test_size=0.2,
    random_state=42,
    stratify=output_labels,
)

# Second cut: 12.5% of the remaining 80% becomes the dev set,
# i.e. 70% train / 10% dev / 20% test of the full corpus
(
    train_texts,
    val_texts,
    train_labels,
    val_labels,
) = train_test_split(
    train_dev_texts,
    train_dev_labels,
    test_size=0.125,
    random_state=42,
    stratify=train_dev_labels,
)

# Quick look at the first few training examples and their labels
print("Training Texts Sample:", train_texts[:3])
print("Training Labels Sample:", train_labels[:3])

        level1  label
0  NOT ABUSIVE      0
1  NOT ABUSIVE      0
2      ABUSIVE      1
3  NOT ABUSIVE      0
4  NOT ABUSIVE      0
Training Texts Sample: ["why is ankara and media in turkey dictating our presence in afghanistan? the tweets go back many years. it's the brotherhood and jihadists that are running 🇺🇸 the need to get rid of 🇮🇱 they hate jews always have!", '[USER] newsflash, asshole, muslims invented shit like making jews wear yellow stars to distinguish us and make it easier to identify and oppress within the first century that they existed', 'people of iran will turn the incoming presidential circus into a referendum on the legitimacy of the islamic republic. polls suggest about 80% say no. [USER] [USER] who are you negotiating with?! [USER] [USER] [USER] [USER] [URL]']
Training Labels Sample: [1, 1, 0]


Load and tokenize

In [None]:
# Name of the pretrained checkpoint whose tokenizer is loaded below
tokenizer_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# pad and truncate
def tokenize_data(texts, tokenizer, max_length=128):
    """Encode a list of texts with the given tokenizer.

    Args:
        texts: The texts to encode.
        tokenizer: Callable tokenizer; it is invoked once with ``texts`` and
            the options below.
        max_length: Maximum number of tokens per sequence (default 128).

    Returns:
        Whatever the tokenizer call returns for these options (PyTorch tensors
        are requested, with the attention mask included).
    """
    tokenizer_options = {
        "padding": True,                # pad the sequences to a common length
        "truncation": True,             # cut sequences longer than max_length
        "max_length": max_length,       # token limit per sequence
        "return_tensors": "pt",         # ask for PyTorch tensors
        "return_attention_mask": True,  # include the attention mask
    }
    return tokenizer(texts, **tokenizer_options)

# Encode the train, validation and test texts (in that order) with the same tokenizer
train_encodings, val_encodings, test_encodings = (
    tokenize_data(split_texts, tokenizer)
    for split_texts in (train_texts, val_texts, test_texts)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Wrap each English split (token ids, attention mask, labels) in a Hugging Face Dataset.

# training split
train_dataset = Dataset.from_dict(
    dict(
        input_ids=train_encodings['input_ids'],
        attention_mask=train_encodings['attention_mask'],
        labels=torch.tensor(train_labels),  # labels are passed as a tensor
    )
)

# validation split
val_dataset = Dataset.from_dict(
    dict(
        input_ids=val_encodings['input_ids'],
        attention_mask=val_encodings['attention_mask'],
        labels=torch.tensor(val_labels),  # labels are passed as a tensor
    )
)

# held-out test split
test_dataset = Dataset.from_dict(
    dict(
        input_ids=test_encodings['input_ids'],
        attention_mask=test_encodings['attention_mask'],
        labels=torch.tensor(test_labels),
    )
)

In [None]:
# Training configuration shared by the Trainer below.
training_args = TrainingArguments(
    # --- optimisation ---
    num_train_epochs=3,                   # passes over the training set
    learning_rate=5e-5,                   # optimiser learning rate
    weight_decay=0.01,                    # regularisation strength
    per_device_train_batch_size=32,       # training batch size per device
    per_device_eval_batch_size=16,        # evaluation batch size per device
    fp16=True,                            # mixed-precision training
    dataloader_num_workers=4,             # worker processes for data loading
    # --- evaluation / logging ---
    # NOTE(review): no eval_steps is given; presumably the evaluation interval
    # falls back to logging_steps - confirm against the installed transformers version.
    evaluation_strategy="steps",          # evaluate every N steps
    logging_steps=500,                    # log every 500 steps
    # --- checkpointing ---
    output_dir="./results",               # where checkpoints are written
    save_strategy="steps",                # checkpoint every N steps
    save_steps=1000,                      # N = 1000 for checkpoints
    save_total_limit=1,                   # cap on the number of checkpoints kept
    # --- model selection ---
    load_best_model_at_end=True,          # restore the best checkpoint after training
    metric_for_best_model="f1",           # "best" is judged by the f1 metric
)



In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a batch of predictions.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over axis 1 is the predicted class).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision/recall/F1 use binary averaging, with 0 substituted when a
        division by zero would occur.
    """
    gold = pred.label_ids
    predicted = np.argmax(pred.predictions, axis=1)  # highest-scoring class per row

    # Debugging aid: show the first ten predictions next to the gold labels
    print("Sample Predictions:", predicted[:10])
    print("Sample Labels:", gold[:10])

    prf_scores = precision_recall_fscore_support(
        gold, predicted, average="binary", zero_division=0
    )
    metrics = {
        "accuracy": accuracy_score(gold, predicted),
        "f1": prf_scores[2],
        "precision": prf_scores[0],
        "recall": prf_scores[1],
    }
    return metrics


## Train on the English Dataset

In [None]:
# Start from pretrained multilingual BERT with a 2-class classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2,
)

# Trainer wiring: model + arguments, English train/validation splits, metrics
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Fine-tune on the English training split
trainer.train()

# Persist the fine-tuned model so it can be reloaded from ./trained_model
trainer.save_model("./trained_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmargaretrose-bowen[0m ([33mmargaretrose-bowen-universit-di-trento[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss


In [None]:
# Called without a dataset, so the Trainer's own eval_dataset is scored
eval_results = trainer.evaluate()
print("Evaluation English dataset Results:", eval_results)

# Score the held-out English test split
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Evaluation on English Test Set (20% holdout):", test_results)


## Italian Dataset

In [None]:
# check for nan values with math
import math


# Load Italian dataset
def load_italian_data(filepath):
    """Load the Italian TSV dataset and turn its string labels into 0/1 integers.

    The file is read as tab-separated and must provide a ``text`` column and a
    ``level1`` column. ``level1`` values are stripped of leading/trailing
    whitespace before mapping. The positive class is accepted both as the
    truncated ``ABUS`` and as the full ``ABUSIVE``; the negative class is
    accepted as ``NOT-ABUSIVE`` or ``NOT ABUSIVE``.

    Rows whose label is missing or unrecognised are printed and then dropped,
    so the returned labels are always plain integers with no NaN.

    Args:
        filepath: Path to the tab-separated dataset file.

    Returns:
        A tuple ``(input_text, output_labels)``: the list of texts and the
        aligned list of integer labels.
    """
    data = pd.read_csv(filepath, sep="\t")

    # Clean and preprocess labels
    data['level1'] = data['level1'].str.strip()

    # The Italian file abbreviates the positive label to "ABUS"; the full
    # spelling is accepted as well so that it is not silently turned into NaN.
    label_mapping = {
        "ABUS": 1,
        "ABUSIVE": 1,
        "NOT-ABUSIVE": 0,
        "NOT ABUSIVE": 0,
    }
    data['label'] = data['level1'].map(label_mapping)

    # Report rows the mapping could not handle, then remove them.
    unmapped = data['label'].isna()
    if unmapped.any():
        print("Rows with NaN labels (Italian):")
        print(data[unmapped])
        data = data.loc[~unmapped].copy()

    # Guarantee integer class ids even if NaN forced the column to float.
    data['label'] = data['label'].astype(int)

    # Extract text and labels
    input_text = data['text'].tolist()
    output_labels = data['label'].tolist()

    return input_text, output_labels


# Read the Italian corpus: texts and their integer labels
italian_texts, italian_labels = load_italian_data("dataset-it.tsv")


# First cut: hold out 20% as the test set, stratified on the labels
(
    italian_train_dev_texts,
    italian_test_texts,
    italian_train_dev_labels,
    italian_test_labels,
) = train_test_split(
    italian_texts,
    italian_labels,
    test_size=0.2,
    random_state=42,
    stratify=italian_labels,
)

# Second cut: 12.5% of the remaining 80% becomes validation,
# i.e. 70% train / 10% validation / 20% test overall
(
    italian_train_texts,
    italian_val_texts,
    italian_train_labels,
    italian_val_labels,
) = train_test_split(
    italian_train_dev_texts,
    italian_train_dev_labels,
    test_size=0.125,
    random_state=42,
    stratify=italian_train_dev_labels,
)

In [None]:
# Encode the Italian train, validation and test texts (in that order)
italian_train_encodings, italian_val_encodings, italian_test_encodings = (
    tokenize_data(split_texts, tokenizer)
    for split_texts in (italian_train_texts, italian_val_texts, italian_test_texts)
)


# Plain-list copies of the train/validation token ids and attention masks
train_input_ids_list, train_attention_mask_list = (
    italian_train_encodings['input_ids'].tolist(),
    italian_train_encodings['attention_mask'].tolist(),
)
val_input_ids_list, val_attention_mask_list = (
    italian_val_encodings['input_ids'].tolist(),
    italian_val_encodings['attention_mask'].tolist(),
)

In [None]:
# Wrap each Italian split in a Hugging Face Dataset.

# training split (built from the list copies of the encodings)
italian_train_dataset = Dataset.from_dict(
    dict(
        input_ids=train_input_ids_list,
        attention_mask=train_attention_mask_list,
        labels=torch.tensor(italian_train_labels),
    )
)


# validation split (built from the list copies of the encodings)
italian_val_dataset = Dataset.from_dict(
    dict(
        input_ids=val_input_ids_list,
        attention_mask=val_attention_mask_list,
        labels=torch.tensor(italian_val_labels),
    )
)

# test split (built directly from the tokenizer output)
italian_test_dataset = Dataset.from_dict(
    dict(
        input_ids=italian_test_encodings['input_ids'],
        attention_mask=italian_test_encodings['attention_mask'],
        labels=torch.tensor(italian_test_labels),
    )
)

# Show the three datasets as a sanity check
for split_title, split_dataset in (
    ("Italian Training Dataset:", italian_train_dataset),
    ("Italian Validation Dataset:", italian_val_dataset),
    ("Italian Test Dataset:", italian_test_dataset),
):
    print(split_title, split_dataset)

In [None]:
# Sanity check kept from earlier debugging: list the distinct raw label
# strings found in the `level1` column of the `data` DataFrame
distinct_level1_values = data['level1'].unique()
print("Unique values in level1 column:", distinct_level1_values)

In [None]:
# Reload the Italian data and show which distinct label values it yields
italian_texts, italian_labels = load_italian_data("dataset-it.tsv")
distinct_italian_labels = set(italian_labels)
print("Italian unique labels:", distinct_italian_labels)


Trainer for evaluation: use the English-trained model to evaluate, zero-shot, on the Italian validation and test datasets.

In [None]:
# Reload the fine-tuned English model from the directory it was saved to
model = AutoModelForSequenceClassification.from_pretrained("./trained_model")


# Evaluation-only Trainer: just the model, tokenizer and metric function
trainer = Trainer(
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    model=model,
)


# Score the English-trained model on the Italian validation split, then the test split
eval_results_italian = trainer.evaluate(eval_dataset=italian_val_dataset)
italian_test_results = trainer.evaluate(eval_dataset=italian_test_dataset)

# Report the held-out test metrics
print("Evaluation on Italian Test Set (20% holdout):", italian_test_results)

# Report the zero-shot validation metrics
print("Zero-shot evaluation results on the Italian validation dataset:", eval_results_italian)
