In [None]:
! pip install dataset

In [None]:
from datasets import load_dataset, Dataset
from transformers import ElectraTokenizer, ElectraForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy as np
import pandas as pd
import torch
import re

In [None]:
# Load the labelled tweets; later cells read the 'tweet' and 'class' columns.
df = pd.read_csv(filepath_or_buffer='labeled_data.csv')

In [None]:
def preprocess_data(text):
    """Normalise a raw tweet into lowercase, letters-and-whitespace-only text.

    Steps, in order: lowercase, drop URLs (``http...`` / ``www...``), drop
    ``@mentions``, then drop every character that is not an ASCII letter or
    whitespace. Whitespace is not collapsed, so removed tokens can leave
    consecutive spaces behind.

    Args:
        text: The raw tweet. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for empty CSV cells) are treated as empty.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    # Missing cells arrive as NaN/None and have no .lower(); map them to an
    # empty document instead of crashing the whole .apply().
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text

In [None]:
# Clean every tweet element-wise and keep the result alongside the raw text.
df['processed_tweet'] = df['tweet'].map(preprocess_data)

In [None]:
# Identity mapping over the three class ids; Series.map leaves NaN for any
# value of 'class' that is not one of these keys.
label_map = {class_id: class_id for class_id in (0, 1, 2)}
df['label'] = df['class'].map(label_map)

In [None]:
# Stratified 80/20 split on the label column, seeded for repeatability.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['processed_tweet'],
    df['label'],
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap each split as a Dataset with 'text' and 'label' columns.
train_data, test_data = (
    Dataset.from_dict({'text': split_texts, 'label': split_labels})
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)

In [None]:
# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-discriminator'
)

In [None]:
def tokenize_function(example):
    """Tokenize the 'text' field of a (batched) example.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        example: Mapping with a 'text' entry, as passed by ``Dataset.map``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    return tokenizer(example['text'], **encode_options)

In [None]:
# Tokenize both splits in batches (train first, then test).
train_data, test_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, test_data)
)

Map:   0%|          | 0/19826 [00:00<?, ? examples/s]

Map:   0%|          | 0/4957 [00:00<?, ? examples/s]

In [None]:
# Expose only the model inputs and the label, as torch tensors, on both splits.
for tokenized_split in (train_data, test_data):
    tokenized_split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Pretrained ELECTRA encoder with a freshly initialised 3-way classification head.
model = ElectraForSequenceClassification.from_pretrained(
    'google/electra-small-discriminator',
    num_labels=3,
)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, weighted F1 and weighted one-vs-one ROC AUC.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is indexed as
            ``(n_samples, n_classes)``; ``labels`` holds integer class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"`` and ``"roc_auc"``.
    """
    logits, labels = eval_pred
    # float64 keeps the softmax rows summing to 1 within sklearn's tolerance.
    logits = np.asarray(logits, dtype=np.float64)
    labels = np.asarray(labels)

    predictions = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')

    # Numerically stable softmax: multiclass roc_auc_score requires scores
    # that are probabilities summing to 1 per row.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(axis=1, keepdims=True)

    # Pass integer labels, not a one-hot matrix: a one-hot y_true is treated
    # as a multilabel target, in which case `multi_class` is ignored.
    class_ids = np.arange(logits.shape[1])
    roc_auc = roc_auc_score(
        labels,
        probabilities,
        multi_class='ovo',
        average='weighted',
        labels=class_ids,
    )
    return {"accuracy": acc, "f1": f1, "roc_auc": roc_auc}

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best accuracy when training finishes.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./electra_results",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluation / checkpoint cadence and best-model selection.
    # NOTE(review): newer transformers releases name this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)



In [None]:
# Wire the model, data, tokenizer and metric function into a Trainer.
# NOTE(review): eval_dataset is the held-out test split and training_args sets
# load_best_model_at_end=True, so checkpoint selection sees the test data —
# consider a separate validation split.
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc
1,0.5324,0.327543,0.899133,0.873488,0.928596


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc
1,0.5324,0.327543,0.899133,0.873488,0.928596
2,0.3255,0.302273,0.903167,0.876475,0.94447
3,0.2873,0.28819,0.905588,0.879659,0.947325


TrainOutput(global_step=1860, training_loss=0.36145744323730467, metrics={'train_runtime': 12407.9348, 'train_samples_per_second': 4.794, 'train_steps_per_second': 0.15, 'total_flos': 437467088881152.0, 'train_loss': 0.36145744323730467, 'epoch': 3.0})

In [None]:
# Evaluate on the Trainer's eval_dataset and print the resulting metrics dict.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.28818967938423157, 'eval_accuracy': 0.9055880572927174, 'eval_f1': 0.8796587090488337, 'eval_roc_auc': 0.9473246866441436, 'eval_runtime': 314.9173, 'eval_samples_per_second': 15.741, 'eval_steps_per_second': 0.492, 'epoch': 3.0}


In [None]:
def test_input(text):
    """Classify one piece of text with the fine-tuned model and print the result.

    The text is cleaned with ``preprocess_data`` before tokenization so that
    inference sees the same kind of input the model was trained on. The
    forward pass runs in eval mode with autograd disabled, and the model's
    previous train/eval mode is restored afterwards.

    Args:
        text: The raw input string to classify.

    Returns:
        None. Prints the original input, the predicted class name and the
        probability of each class.
    """
    class_labels = {0: 'Hate Speech', 1: 'Offensive Language', 2: 'Neither'}

    # Training used df['processed_tweet'], i.e. preprocess_data output; apply
    # the same cleaning here to avoid a train/inference mismatch.
    cleaned_text = preprocess_data(text)
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    # Eval mode makes the prediction deterministic (no dropout); no_grad
    # avoids building an autograd graph that is never used.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    probabilities = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]
    predicted_class = int(np.argmax(probabilities))

    print(f"Input: {text}")
    print(f"Predicted Class: {class_labels[predicted_class]}")
    print("Class Probabilities:")
    for cls, prob in zip(class_labels.values(), probabilities):
        print(f"  {cls}: {prob:.4f}")

In [None]:
# Smoke-test the classifier on a single hand-written sentence.
example_text = "Stop, you are not ugly but so pretty"
test_input(example_text)

Input: Stop, you are not ugly but so pretty
Predicted Class: Neither
Class Probabilities:
  Hate Speech: 0.0487
  Offensive Language: 0.0642
  Neither: 0.8871
