In [None]:
import pandas as pd
import numpy as np
import torch
import re
import emoji
from ftfy import fix_text

from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
print("Torch:", torch.__version__, "CUDA:", torch.version.cuda)
print("GPU detected:", torch.cuda.get_device_name(0))
print("CUDA OK?", torch.cuda.is_available())

Torch: 2.5.1 CUDA: 12.1
GPU detected: NVIDIA GeForce RTX 4090
CUDA OK? True


In [4]:
def clean_tweet(text):
    text = fix_text(text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^\w\s!?.,]", "", text)
    text = emoji.demojize(text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df = pd.read_csv(r"D:\University of Illinois Chicago\Classes\CS521\data files\consolidated_sarcasm_dataset.csv")
df['comment'] = df['comment'].astype(str).apply(clean_tweet)
df = df[['comment', 'label']]

In [5]:
label_map = {"non_sarcastic": 0, "sarcastic": 1}

df["label"] = (
    df["label"]
    .astype(str)          # make sure they are strings
    .str.strip()          # remove leading/trailing spaces
    .str.lower()          # normalise case
    .map(label_map)       # convert to 0 / 1
)

assert df["label"].isin([0, 1]).all(), "Unexpected label values!"

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["comment"],
    df["label"],
    test_size=0.20,
    stratify=df["label"],
    random_state=42,
)

train_dataset = [
    {"comment": t, "label": int(l)} for t, l in zip(train_texts.tolist(), train_labels.tolist())
]
val_dataset = [
    {"comment": t, "label": int(l)} for t, l in zip(val_texts.tolist(), val_labels.tolist())
]

from datasets import Dataset
train_dataset = Dataset.from_list(train_dataset)
val_dataset   = Dataset.from_list(val_dataset)

In [6]:
# pick one public model ID
model_name = "l3cube-pune/hing-bert"      # or "nirantk/hinglish-bert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModelForSequenceClassification.from_pretrained(
               model_name,
               num_labels=2
            ).to(device)

# tokenization — use the key you actually stored, here it's "comment"
def tokenize_fn(batch):
    return tokenizer(batch["comment"], truncation=True, padding=True)

train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset   = val_dataset.map(tokenize_fn, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hing-bert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1765/1765 [00:00<00:00, 16031.71 examples/s]
Map: 100%|██████████| 442/442 [00:00<00:00, 18798.24 examples/s]


In [7]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [8]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"  # Disable wandb or hub logging unless needed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [9]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4254,0.496456,0.751131,0.821577,0.74717,0.782609
2,0.3621,0.494845,0.791855,0.791246,0.886792,0.836299
3,0.1037,0.667554,0.789593,0.790541,0.883019,0.834225
4,0.1511,0.792956,0.785068,0.807971,0.841509,0.824399


TrainOutput(global_step=444, training_loss=0.29890357437837234, metrics={'train_runtime': 46.0975, 'train_samples_per_second': 153.154, 'train_steps_per_second': 9.632, 'total_flos': 460762957923600.0, 'train_loss': 0.29890357437837234, 'epoch': 4.0})

In [10]:
model_dir = "./sarcasm_hingbert_model"

# Save model + tokenizer
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

('./sarcasm_hingbert_model\\tokenizer_config.json',
 './sarcasm_hingbert_model\\special_tokens_map.json',
 './sarcasm_hingbert_model\\vocab.txt',
 './sarcasm_hingbert_model\\added_tokens.json',
 './sarcasm_hingbert_model\\tokenizer.json')

In [11]:
model_dir = "./sarcasm_hingbert_model"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)
model.eval()  # switch to inference mode

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
df_new = pd.read_csv(r"D:\University of Illinois Chicago\Classes\CS521\cs521\labeled_batches\labeled_batch_1.csv")

# Clean & map labels
label_map = {"non_sarcastic": 0, "sarcastic": 1}
df_new["label"] = df_new["sarcasm_label"].astype(str).str.strip().str.lower().map(label_map)
df_new = df_new.dropna(subset=["label"])

# Tokenize new comments
from torch.utils.data import DataLoader

def preprocess_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)

# Predict function
def predict_labels(comments):
    inputs = preprocess_texts(comments)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        preds = torch.argmax(probs, dim=1)
    return preds.cpu().numpy()

# Run prediction
#create the new columns name as comment instead of transliterated_comment
df_new.rename(columns={"Transliterated_Comment": "comment"}, inplace=True)
preds = predict_labels(df_new["comment"].tolist())
true = df_new["label"].values


In [None]:
acc  = accuracy_score(true, preds)
prec = precision_score(true, preds)
rec  = recall_score(true, preds)
f1   = f1_score(true, preds)

print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1 Score : {f1:.4f}")


Accuracy : 0.8529
Precision: 1.0000
Recall   : 0.8529
F1 Score : 0.9206
