In [None]:
# -------------------- IMPORTS --------------------
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.nn import CrossEntropyLoss

In [None]:
!pip install transformers datasets scikit-learn torch --quiet

In [None]:
# -------------------- DATA LOADING --------------------
df = pd.read_csv('hate_sample_for_finetune.csv')

In [None]:
df = df.dropna(subset=['hate_type_teacher'])

In [None]:
# Map text labels to integers
label_mapping = {label: idx for idx, label in enumerate(df['hate_type_teacher'].unique())}
df['hate_type_teacher_int'] = df['hate_type_teacher'].map(label_mapping)
print("Label mapping:", label_mapping)


Label mapping: {'political framing hate': 0, 'sarcasm-based hate': 1, 'meme-language hate': 2, 'humor-based hate': 3, 'metaphor-based hate': 4}


In [None]:
# -------------------- TRAIN-TEST SPLIT --------------------
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['clean_text'], df['hate_type_teacher_int'], test_size=0.1, random_state=42, stratify=df['hate_type_teacher_int']
)


In [None]:
# -------------------- TOKENIZATION --------------------
model_name = "cardiffnlp/twitter-roberta-base-hate"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_mapping), ignore_mismatched_sizes=True)
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=256)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# -------------------- CUSTOM DATASET --------------------
class HateSpeechDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.reset_index(drop=True)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels.iloc[idx]), dtype=torch.long)
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)


In [None]:
# -------------------- WEIGHTED TRAINER --------------------
# Compute class weights inversely proportional to frequency
label_counts = df['hate_type_teacher_int'].value_counts().sort_index()
class_weights = torch.tensor(1.0 / label_counts.values, dtype=torch.float)
class_weights = class_weights / class_weights.sum() * len(label_counts)  # Normalize
print("Class weights:", class_weights)

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

Class weights: tensor([1.6748, 0.1467, 1.0444, 1.5364, 0.5977])


In [None]:
# -------------------- TRAINING ARGS --------------------
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # you can reduce to 2 if GPU time is an issue
    per_device_train_batch_size=8,  # smaller batch for less memory
    per_device_eval_batch_size=8,
    eval_strategy="epoch", # Removed due to TypeError in current environment
    # logging_strategy="epoch",    # Removed due to TypeError in current environment
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to=[]
)

In [None]:
# -------------------- TRAINING --------------------
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.495937
2,No log,1.234332
3,1.353600,1.29663


TrainOutput(global_step=672, training_loss=1.2625169526963007, metrics={'train_runtime': 274.5595, 'train_samples_per_second': 19.548, 'train_steps_per_second': 2.448, 'total_flos': 706077535302144.0, 'train_loss': 1.2625169526963007, 'epoch': 3.0})

In [None]:

# -------------------- SAVE MODEL --------------------
model.save_pretrained('./twitter-roberta-hate-subtypes')
tokenizer.save_pretrained('./twitter-roberta-hate-subtypes')


('./twitter-roberta-hate-subtypes/tokenizer_config.json',
 './twitter-roberta-hate-subtypes/special_tokens_map.json',
 './twitter-roberta-hate-subtypes/vocab.json',
 './twitter-roberta-hate-subtypes/merges.txt',
 './twitter-roberta-hate-subtypes/added_tokens.json',
 './twitter-roberta-hate-subtypes/tokenizer.json')

In [None]:
# -------------------- EVALUATION --------------------
from sklearn.metrics import classification_report, accuracy_score

predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(axis=1)
true_labels = predictions.label_ids

print("\nâœ… Accuracy:", accuracy_score(true_labels, pred_labels))
print("\nðŸ“Š Classification Report:\n", classification_report(true_labels, pred_labels, target_names=label_mapping.keys()))


âœ… Accuracy: 0.5326633165829145

ðŸ“Š Classification Report:
                         precision    recall  f1-score   support

political framing hate       0.35      0.55      0.43        11
    sarcasm-based hate       0.79      0.60      0.68       127
    meme-language hate       0.32      0.44      0.37        18
      humor-based hate       0.20      0.33      0.25        12
   metaphor-based hate       0.29      0.39      0.33        31

              accuracy                           0.53       199
             macro avg       0.39      0.46      0.41       199
          weighted avg       0.61      0.53      0.56       199

