## Detecting Microaggressions Using Fine-Tuned hateBERT Trained on Subtle/Implicit Toxicity

Here, I'm trying to combine knowledge of explicit and implicit hate speech by taking a hateBERT model that was fine-tuned on Jigsaw Toxic Comments and training it further on the subtle hate speech column (`subtlety_layer`) of ISHate.

In [None]:
!pip install -q transformers
!pip install -q torchinfo
!pip install -U -q datasets
!pip install -q evaluate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#upload IShate

from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import numpy as np

import transformers
import evaluate

from torchinfo import summary

from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

# Parquet file name of each ISHate split on the Hugging Face Hub.
splits = {
    'train': 'ishate_train.parquet.gzip',
    'validation': 'ishate_dev.parquet.gzip',
    'test': 'ishate_test.parquet.gzip'
}

# Common location of the split files.
_ISHATE_BASE = "hf://datasets/BenjaminOcampo/ISHate/"

# Read the three splits into pandas frames (train, validation, test — in that order).
df_train, df_val, df_test = (
    pd.read_parquet(_ISHATE_BASE + splits[split_name])
    for split_name in ("train", "validation", "test")
)

In [None]:

def _hate_speech_with_subtlety(frame):
    """Return the rows of `frame` whose hateful_layer is "HS" and whose subtlety_layer is not NaN.

    The index of the result is reset (old index dropped).
    """
    is_hate_speech = frame["hateful_layer"] == "HS"
    has_subtlety_label = frame["subtlety_layer"].notna()
    return frame[is_hate_speech & has_subtlety_label].reset_index(drop=True)


# Apply the same filter to every split.
df_train_hs = _hate_speech_with_subtlety(df_train)
df_val_hs = _hate_speech_with_subtlety(df_val)
df_test_hs = _hate_speech_with_subtlety(df_test)



In [None]:
# Sanity check: class counts of the subtlety labels in the filtered training split.
# dropna=False makes any NaN label that survived the filter show up as its own row.
# NOTE(review): the captured output here has more Subtle than Non-Subtle rows, whereas the test
# report further down has only 39 Subtle out of 1687 — the splits look very differently
# distributed; confirm this is expected.

df_train_hs['subtlety_layer'].value_counts(dropna=False)


Unnamed: 0_level_0,count
subtlety_layer,Unnamed: 1_level_1
Subtle,10867
Non-Subtle,7691


In [None]:
# Wrap the filtered pandas frames in a Hugging Face DatasetDict, one entry per split.
_filtered_frames = {
    "train": df_train_hs,
    "validation": df_val_hs,
    "test": df_test_hs,
}
ishate_dataset = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in _filtered_frames.items()}
)

In [None]:
# Encode the string subtlety labels as integers.
# The encoder is fitted on the training split only and reused unchanged for validation and test.
label_encoder = LabelEncoder()
label_encoder.fit(ishate_dataset['train']['subtlety_layer'])

y_train_encoded = label_encoder.transform(ishate_dataset['train']['subtlety_layer'])
y_val_encoded = label_encoder.transform(ishate_dataset['validation']['subtlety_layer'])
y_test_encoded = label_encoder.transform(ishate_dataset['test']['subtlety_layer'])

# Attach the integer labels to each split as a 'label' column.
ishate_train_data = ishate_dataset['train'].add_column('label', y_train_encoded.tolist())
ishate_val_data = ishate_dataset['validation'].add_column('label', y_val_encoded.tolist())
ishate_test_data = ishate_dataset['test'].add_column('label', y_test_encoded.tolist())

In [None]:
# Sanity check: print the class-name -> integer-id mapping learned by the label encoder.

print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

{np.str_('Non-Subtle'): np.int64(0), np.str_('Subtle'): np.int64(1)}


In [None]:
#tokenize text

max_sequence_length = 128

def preprocess_ishate(data, tokenizer):
    """Tokenize a batch of examples for sequence classification.

    Args:
        data: Batch mapping with a 'cleaned_text' entry holding one text per example
            (this function is passed to `Dataset.map(..., batched=True)` in this notebook).
        tokenizer: Callable tokenizer; it is invoked directly with the list of texts.

    Returns:
        The tokenizer's output for the batch: sequences padded/truncated to
        `max_sequence_length`, with attention mask and token type ids requested,
        and `return_tensors="pt"`.
    """
    # Make sure every entry is usable by the tokenizer: a missing text (None) becomes "".
    text = ["" if item is None else item for item in data['cleaned_text']]
    # Call the tokenizer directly; `batch_encode_plus` is deprecated in favour of `__call__`.
    encoded = tokenizer(
            text,
            max_length=max_sequence_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors="pt"
        )

    return encoded


metric = evaluate.load('accuracy')

def compute_metrics(p):
    """Compute evaluation metrics from a (logits, labels) pair.

    Args:
        p: Pair `(predictions, labels)`; `predictions` holds per-class scores and is
            reduced with argmax over axis 1.

    Returns:
        Dict with the 'accuracy' entry produced by the `evaluate` accuracy metric plus a
        'macro_f1' entry (unweighted mean of the per-class F1 scores).
    """
    # sklearn is already a dependency of this notebook; imported locally to keep the cell self-contained.
    from sklearn.metrics import f1_score

    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    scores = metric.compute(predictions=predictions, references=labels)
    # Accuracy alone can look excellent when one class is rare; macro-F1 weights both classes equally.
    scores["macro_f1"] = float(f1_score(labels, predictions, average="macro", zero_division=0))
    return scores


Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
def fine_tune_classification_model(classification_model,
                                   tokenizer,
                                   train_data,
                                   dev_data,
                                   batch_size = 16,
                                   num_epochs = 2,
                                   learning_rate=2e-5,
                                   output_dir="FThatebert_ishate",
                                   seed=42):
    """
    Tokenize the train/dev data and fine-tune a classification model with the Hugging Face Trainer.

    Args:
        classification_model: Model handed to `Trainer(model=...)`.
        tokenizer: Tokenizer forwarded to `preprocess_ishate` for both datasets.
        train_data: Dataset used for training; mapped through `preprocess_ishate`.
        dev_data: Dataset evaluated at the end of every epoch; mapped the same way.
        batch_size: Per-device batch size for both training and evaluation.
        num_epochs: Number of training epochs.
        learning_rate: Learning rate passed to `TrainingArguments`.
        output_dir: Directory where the Trainer writes its per-epoch checkpoints.
        seed: Random seed passed to `TrainingArguments`.

    Returns:
        The `Trainer` after `train()` has completed, so callers can reuse it for `predict`.
    """

    tokenizer_kwargs = {'tokenizer': tokenizer}
    preprocessed_train_data = train_data.map(preprocess_ishate, batched=True, fn_kwargs=tokenizer_kwargs)
    preprocessed_dev_data = dev_data.map(preprocess_ishate, batched=True, fn_kwargs=tokenizer_kwargs)

    # Evaluate and checkpoint once per epoch; no external experiment tracker.
    # NOTE(review): best-checkpoint restoring is not enabled, so the trainer ends on the
    # last-epoch weights; the captured log shows validation loss rising in epoch 2 — confirm
    # that is the model you want.
    training_args = TrainingArguments(
      output_dir=output_dir,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      num_train_epochs=num_epochs,
      learning_rate=learning_rate,
      eval_strategy="epoch",
      save_strategy="epoch",
      seed=seed,
      report_to='none'
    )

    trainer = Trainer(
      model=classification_model,
      args=training_args,
      train_dataset=preprocessed_train_data,
      eval_dataset=preprocessed_dev_data,
      compute_metrics=compute_metrics
    )

    trainer.train()

    return trainer

In [None]:
#use hateBERT fine-tuned on Jigsaw toxic comments

from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Hub id of the starting checkpoint; model and tokenizer are both loaded from it.
model_checkpoint_name = "Jensvollends/hatebert-finetuned_v5"
# Load with a 2-class classification head.
# NOTE(review): presumably the checkpoint's existing head is reused as the starting point for
# the Subtle/Non-Subtle task — confirm what its two labels originally meant.
hatebert_FT_classification_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint_name, num_labels = 2)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)


# Smoke test before any ISHate training: score one sentence with the loaded model.
# With top_k=None the captured output lists a score for both LABEL_0 and LABEL_1.
pipe = pipeline("text-classification", model=hatebert_FT_classification_model, tokenizer=tokenizer, top_k=None)

text = "You are a kind person"
result = pipe(text)
print(result)

config.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


[[{'label': 'LABEL_0', 'score': 0.9996312856674194}, {'label': 'LABEL_1', 'score': 0.0003686983836814761}]]


In [None]:
# Fine-tune on ISHate: this call tokenizes the data, builds the Trainer AND runs training.
# The returned Trainer is kept so later cells can call .predict() on it.
hatebert_FT_trainer = fine_tune_classification_model(hatebert_FT_classification_model, tokenizer, ishate_train_data, ishate_val_data)

Map:   0%|          | 0/18558 [00:00<?, ? examples/s]

Map:   0%|          | 0/1687 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0746,0.097743,0.981624
2,0.0345,0.108709,0.97866


In [None]:
"""
Before moving on, save a checkpoint of the model you just trained in your Drive,
So that you can pick up where you left off later if needed
"""

# Modify this path to the location in your Drive where you want to save the model
# NOTE(review): this is a relative path and no Drive mount is visible in this notebook —
# presumably Drive is mounted under the working directory before this runs; confirm,
# otherwise the checkpoint is written to a local folder of the same name.
hatebert_FT_model_checkpoint_filepath = 'drive/MyDrive/w266_project/model_checkpoints_project/hatebert_FT_classification_model'

In [None]:
# Run this line only after you've trained the model
# `from_pt` is a loading option of `from_pretrained`, not a saving option, so it is not passed here.
hatebert_FT_classification_model.save_pretrained(hatebert_FT_model_checkpoint_filepath)
# Save the tokenizer next to the weights so the checkpoint folder can be reloaded on its own.
tokenizer.save_pretrained(hatebert_FT_model_checkpoint_filepath)

In [None]:
# Run this line only if you need to reload the model you trained earlier

# Not needed by this cell any more; kept only in case other cells rely on these names.
from transformers import T5TokenizerFast, T5Config, T5ForConditionalGeneration

# The checkpoint was created with AutoModelForSequenceClassification, so it must be reloaded with
# the same class. T5ForConditionalGeneration is a different architecture and cannot load it.
# NOTE: hatebert_FT_trainer keeps the model object it was built with; rebinding this variable does
# not change which model trainer.predict() uses.
hatebert_FT_classification_model = AutoModelForSequenceClassification.from_pretrained(hatebert_FT_model_checkpoint_filepath)

In [None]:
# Evaluate the fine-tuned model on the held-out ISHate test split.
from sklearn.metrics import accuracy_score, classification_report

# Tokenize the test split with the same preprocessing used for training.
preprocessed_test_data = ishate_test_data.map(
    preprocess_ishate,
    batched=True,
    fn_kwargs={'tokenizer': tokenizer},
)

# Predict with the trained Trainer and take the highest-scoring class per example.
predictions = hatebert_FT_trainer.predict(preprocessed_test_data)
preprocessed_test_pred = np.argmax(predictions.predictions, axis=1)

# Report overall accuracy and the per-class breakdown.
test_accuracy = accuracy_score(y_test_encoded, preprocessed_test_pred)
print(f"\nTest Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
test_report = classification_report(
    y_test_encoded,
    preprocessed_test_pred,
    target_names=label_encoder.classes_,
)
print(test_report)

Map:   0%|          | 0/1687 [00:00<?, ? examples/s]


Test Accuracy: 0.9751

Classification Report:
              precision    recall  f1-score   support

  Non-Subtle       0.99      0.99      0.99      1648
      Subtle       0.46      0.41      0.43        39

    accuracy                           0.98      1687
   macro avg       0.72      0.70      0.71      1687
weighted avg       0.97      0.98      0.97      1687



# Evaluation on Microaggressions Dataset

I haven't yet tried the data that I scraped from microaggressions.com. As Carlos has done, it would be good to balance that dataset and evaluate on it.

In [None]:
#same initial load and format steps as Carlos

micro_agg_url = "https://huggingface.co/spaces/khanak27/microaggressionsdetector/resolve/main/micro_agg.csv"
# Encodings to try, in order.
# 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, so it goes first.
# 'latin-1' maps every possible byte to a character and therefore never raises a decode error;
# it must be last, because any encoding listed after it could never be reached on decode failure.
# NOTE(review): latin-1 always "succeeds" but may mis-render punctuation written in another
# single-byte encoding (e.g. cp1252) — inspect the loaded text.
encodings_to_try = ['utf-8-sig', 'latin-1']

df_micro = None
for encoding in encodings_to_try:
    try:
        print(f"Trying encoding: {encoding}")
        df_micro = pd.read_csv(micro_agg_url, encoding=encoding)
        print(f"✅ Successfully loaded with {encoding} encoding")
        break
    except UnicodeDecodeError as e:
        print(f"❌ Failed with {encoding}: {str(e)[:100]}...")
        continue
    except Exception as e:
        # Non-decoding problems are reported and the next encoding is still attempted.
        print(f"❌ Other error with {encoding}: {str(e)[:100]}...")
        continue

# Last resort: decode as UTF-8 and substitute undecodable bytes instead of failing.
if df_micro is None:
    print("❌ Failed to load dataset with any encoding. Trying with error handling...")
    try:
        df_micro = pd.read_csv(micro_agg_url, encoding='utf-8', encoding_errors='replace')
        print("✅ Loaded with UTF-8 and error replacement")
    except Exception as e:
        print(f"❌ Final attempt failed: {e}")
        raise

# Structural overview of the loaded frame.
print(f"Dataset shape: {df_micro.shape}")
print(f"Columns: {list(df_micro.columns)}")
print(f"\nFirst few rows:")
display(df_micro.head(10))

print(f"\nData types:")
print(df_micro.dtypes)

print(f"\nMissing values:")
print(df_micro.isna().sum())

print(f"\nLabel distribution:")
print(df_micro['label'].value_counts().sort_index())

# Show the first three examples with their labels.
print(f"\nSample texts:")
for position in range(3):
    row = df_micro.iloc[position]
    print(f"{position+1}. Label {row['label']}: {row['speech']}")



# Expose the raw text under the column name that preprocess_ishate reads.
df_micro['cleaned_text'] = df_micro['speech']
def map_micro_labels_to_hate_speech(micro_label):
    """Translate a microaggression-dataset label into an ISHate subtlety class name.

    A label equal to 1 (microaggression) becomes 'Subtle'; every other value
    (normal speech) becomes 'Non-Subtle'.
    """
    return 'Subtle' if micro_label == 1 else 'Non-Subtle'

# Apply the mapping, then encode the class names with the encoder fitted on ISHate.
# NOTE(review): the ISHate training rows were all filtered to hateful_layer == "HS", so there
# 'Non-Subtle' means non-subtle hate speech, while here it stands for normal speech — the two
# label spaces do not line up; confirm this evaluation measures what is intended.
df_micro['mapped_label'] = df_micro['label'].apply(map_micro_labels_to_hate_speech)
y_micro_encoded = label_encoder.transform(df_micro['mapped_label'])

Trying encoding: utf-8
❌ Failed with utf-8: 'utf-8' codec can't decode byte 0xe2 in position 17: invalid continuation byte...
Trying encoding: latin-1
✅ Successfully loaded with latin-1 encoding
Dataset shape: (171, 2)
Columns: ['speech', 'label']

First few rows:


Unnamed: 0,speech,label
0,You're very articulate for someone like you.,1
1,Where are you really from?,1
2,You're not like other girls.,1
3,You must be good at math since you're Asian.,1
4,You're too pretty to be a software engineer.,1
5,Are you sure you want to lead this project?,1
6,You don't look gay.,1
7,You are a credit to your race.,1
8,"That's a strange name, is it foreign?",1
9,Do you even understand this topic?,1



Data types:
speech    object
label      int64
dtype: object

Missing values:
speech    0
label     0
dtype: int64

Label distribution:
label
0    87
1    84
Name: count, dtype: int64

Sample texts:
1. Label 1: You're very articulate for someone like you.
2. Label 1: Where are you really from?
3. Label 1: You're not like other girls.


In [None]:
# Build a Dataset from the microaggression texts and encoded labels, then tokenize it with the
# same preprocessing function and tokenizer used for ISHate.
_micro_columns = {
    'cleaned_text': df_micro['cleaned_text'].fillna('').tolist(),
    'label': y_micro_encoded.tolist(),
}
microaggressions_dataset = Dataset.from_dict(_micro_columns)

preprocessed_microaggressions_data = microaggressions_dataset.map(
    preprocess_ishate, batched=True, fn_kwargs={'tokenizer': tokenizer}
)

Map:   0%|          | 0/171 [00:00<?, ? examples/s]

In [None]:
#evaluation
# Imported here as well so this cell does not depend on an import made inside an earlier cell.
from sklearn.metrics import accuracy_score, classification_report

# Predict with the trained Trainer and take the highest-scoring class per example.
micro_predictions = hatebert_FT_trainer.predict(preprocessed_microaggressions_data)
y_micro_pred = np.argmax(micro_predictions.predictions, axis=1)

micro_accuracy = accuracy_score(y_micro_encoded, y_micro_pred)
print(f"\nMicroaggressions Accuracy: {micro_accuracy:.4f}")
print("\nClassification Report:")
# zero_division=0 reports 0.0 for a class that is never predicted (as in the captured run)
# instead of emitting an undefined-metric warning for each affected average.
print(classification_report(
    y_micro_encoded,
    y_micro_pred,
    target_names=['Normal Speech', 'Microaggression'],
    zero_division=0
))


Microaggressions Accuracy: 0.5088

Classification Report:
                 precision    recall  f1-score   support

  Normal Speech       0.51      1.00      0.67        87
Microaggression       0.00      0.00      0.00        84

       accuracy                           0.51       171
      macro avg       0.25      0.50      0.34       171
   weighted avg       0.26      0.51      0.34       171



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
