In [41]:
import pandas as pd
import numpy as np
import torch
import re
import emoji
from ftfy import fix_text
from datasets import Dataset
from torch.utils.data import DataLoader

from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import re, regex
import html, emoji, contractions
from ftfy import fix_text
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

In [42]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [43]:
print("Torch:", torch.__version__, "CUDA:", torch.version.cuda)
print("GPU detected:", torch.cuda.get_device_name(0))
print("CUDA OK?", torch.cuda.is_available())

Torch: 2.5.1 CUDA: 12.1
GPU detected: NVIDIA GeForce RTX 4090
CUDA OK? True


# Basic Data Cleaning

In [44]:
DEVANAGARI_TO_ROMAN = True
LATIN_BLOCK= r'\p{Latin}'
DEVANAGARI_BLOCK = r'\p{Devanagari}'

In [45]:
censored_swear_words = {
    # Handles s!@#, s#!t, s**t, etc., variations of "shit"
    r'\b[Ss][^A-Za-z\s]{3,4}\b': 'shit',
    r'[Ss][\!\@\#\$\%\^\&\*]*[i1\|][\!\@\#\$\%\^\&\*]*[t]+': 'shit',
    r'[Ss][#!@$%^&*+\-=\[\]{};:\'",.<>?/\\|_~`]{2,}[Tt]\b': 'shit',


    # Handles f*ck, f@ck, f**k, f!ck, F*#k, F***, etc., variations of "fuck"
    r'f[\!\@\#\$\%\^\&\*]*[u][\!\@\#\$\%\^\&\*]*[c][\!\@\#\$\%\^\&\*]*[k]+': 'fuck',
    r'f[\!\@\#\$\%\^\&\*]*[\*]+[c][k]': 'fuck',                # Handles f*ck
    r'f[\!\@\#\$\%\^\&\*]*[\*]+[k]': 'fuck',                   # Handles f***
    r'f[\!\@\#\$\%\^\&\*]*[\*#\!]{1,}[c]?[kz]?': 'fuck',       # Handles F*#k, F***
    r'f[\!\@\#\$\%\^\&\*]*[\w@#$%^&*]+ing\b': 'fucking',        # Hnadles f#%^ing

    # Handles b*tch, b!tch, b1tch, etc., variations of "bitch"
    r'b[\!\@\#\$\%\^\&\*]*[i1\|][\!\@\#\$\%\^\&\*]*[t][\!\@\#\$\%\^\&\*]*[c][\!\@\#\$\%\^\&\*]*[h]+': 'bitch',

    # Handles "fuck" with various suffixes (e.g., "f***'z" -> "fucks")
    r'f[\!\@\#\$\%\^\&\*]*[u][\!\@\#\$\%\^\&\*]*[c][\!\@\#\$\%\^\&\*]*[k]+[\'’]?[zs]?': 'fucks',

    # Handles muthaf*ckin, motherf***ing, etc., variations of "motherfucking"
    r'm[ou]*th[\!\@\#\$\%\^\&\*]*[a][\!\@\#\$\%\^\&\*]*f[\!\@\#\$\%\^\&\*]*[u][\!\@\#\$\%\^\&\*]*[c][\!\@\#\$\%\^\&\*]*[k][\!\@\#\$\%\^\&\*]*[i][\!\@\#\$\%\^\&\*]*[n][\!\@\#\$\%\^\&\*]*': 'motherfucking',
}

In [46]:
hindi_swear_map = {

    # —— “madarchod” —— (mother‑****er)
    # Roman + common leet/censor + Devanagari
    r'\b(m[a@]d[ae]?[r]?[\W_]*ch[o0]d[a@]?|म[ाअ]*दर[चच]ोद|मादरचोद)\b': 'madarchod',

    # —— “bhenchod / behenchod” ——
    r'\b(b[h]*e+h[e]*n[\W_]*ch[o0]d[a@]?|भेंचोद|बह[ेे]न[चच]ोद|बहनचोद)\b': 'bhenchod',

    # —— “chutiya / ch**iya” ——
    r'\b(c[h]*u+t+i+y*a+|ch[u*]+t+iy?[ae]+|चूतिया|चुतिया|छूतिया)\b': 'chutiya',

    # —— “chu***” shortened form (e.g. chut** / ch****) ——
    r'\b(ch[\W_]*u[\W_]*t[\W_]*\*{2,})\b': 'chut***',

    # —— “gandu / gaandu / gaand” ——
    r'\b(g[a@]a*n+d[u]?[aei]?|गां*डू?|गाण्डू?)\b': 'gandu',

    # —— “gaand” stand‑alone  ——
    r'\b(ga+a+nd+|गांड|गांड|गाण्ड)\b': 'gaand',

    # —— “lauda / laudae / laude” —— 
    r'\b(l[a@]u+[d]+[aei]*|लौड़ा|लंड|लुंड)\b': 'lauda',

    # —— “lund” ——
    r'\b(l[u]n+d|लुंड|लंड)\b': 'lund',

    # —— “randi / r@ndi” ——
    r'\b(r[a@]n+d[iy]+|रंडी|रंड़ी)\b': 'randi',

    # —— “harami / haramzada / haramzadi” ——
    r'\b(h[a@]r+a+m[iy]*|हरामी)\b': 'harami',
    r'\b(h[a@]r+a+m[z$]?a+d[a@]|हरामज़ादा|हरामजादा)\b': 'haramzada',
    r'\b(h[a@]r+a+m[z$]?a+d[iy]|हरामज़ादी|हरामजादी)\b': 'haramzadi',

    # —— “kamine / kamina / kaminey” ——
    r'\b(k[a@]m+i+n[e]?[yie]*|कमीना|कमीने|कमीनी)\b': 'kamine',

    # —— “kutta / kutte / kuttiya” ——
    r'\b(k[u]t+t[a@e]*|कुत्ता|कुत्ते|कुते)\b': 'kutta',
    r'\b(k[u]t+t[i]y?a+|कुतिया|कुत्ती)\b': 'kutiya',

    # —— “saala / saale / sali” —— (brother‑in‑law, used as slur)
    r'\b(s+a+a*l+a+|साला|साले)\b': 'saala',
    r'\b(s+a+l+i+|साली)\b': 'sali',

    # —— “bakchod / bakchodi” ——
    r'\b(b[a@]k+ch[o0]d[iy]*|बकचोद|बकचोदी)\b' : 'bakchod',

    # —— milder insults ——
    r'\b(g[a@]dh[a@]a+|गधा|गधे)\b': 'gadha',

    # —— catch‑all “f**k” in Roman Hindi sentences ——
    r'\b(f[\W_]*u[\W_]*c[\W_]*k+)\b': 'fuck',

    # —— emojis or censor blocks containing Devanāgarī letters and stars
    r'[\u0900-\u097F]\*{2,}[\u0900-\u097F]*': 'censored_hindi_word',
}


In [47]:
ascii_emoticons = {
    ':-)': 'smiley_face',
    ':)': 'smiley_face',
    ':-D': 'grinning_face',
    ':D': 'grinning_face',
    ':-(': 'sad_face',
    ':(': 'sad_face',
    ':-P': 'playful_face',
    ':P': 'playful_face',
    ':-p': 'playful_face',
    ':p': 'playful_face',
    ';-)': 'winking_face',
    ';)': 'winking_face',
    ':-O': 'surprised_face',
    ':O': 'surprised_face',
    ':-o': 'surprised_face',
    ':o': 'surprised_face',
    r':-/': 'skeptical_face',
    r':/': 'skeptical_face',
    r':-\|': 'neutral_face',
    r':\|': 'neutral_face',
    '<3': 'heart',
    '^_^': 'happy_face',
    '-_-': 'expressionless_face',
    'o_O': 'confused_face',
    'O_o': 'confused_face',
    'o_o': 'confused_face',
    '>_<': 'frustrated_face',
    'O.o': 'confused_face',
    'o.O': 'confused_face',
    '0_o': 'confused_face',
    'o_0': 'confused_face',
    'xD': 'laughing_face',
    'XD': 'laughing_face',
}


In [48]:
def replace_emojis(text):
    return emoji.demojize(text) if isinstance(text, str) else text

In [49]:
def normalize_text(text):
    if not isinstance(text, str):
        return text
    try:
        # Attempt to encode to 'latin1' and decode to 'utf-8' to fix encoding issues
        clean_text = text.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Fallback: ignore encoding errors if they occur
        clean_text = text.encode('latin1', errors='ignore').decode('utf-8', errors='ignore')
    return clean_text

In [50]:
def decode_html_entities(text):
    """Decode HTML entities in text, such as '&amp;' to '&'."""
    if not isinstance(text, str):
        return text
    # Use html.unescape to decode all HTML entities
    clean_text = html.unescape(text)
    return clean_text

In [51]:
def remove_html_tags(text):
    if not isinstance(text, str):
        return text
    clean_text = re.sub(r'</?[^>]+>', '', text)
    return clean_text

In [52]:
def replace_urls(text):
    if not isinstance(text, str):
        return text
    # Regex pattern to match URLs
    url_pattern = r'(http[s]?://\S+|www\.\S+)'
    clean_text = re.sub(url_pattern, ' SomeWebLink ', text)
    return clean_text

In [53]:
def replace_emails(text):
    if not isinstance(text, str):
        return text
    # Pattern to match email addresses
    email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
    clean_text = re.sub(email_pattern, ' EMAILADDRESS ', text)
    return clean_text

In [54]:
def replace_censored_swear_words(text):
    if not isinstance(text, str):
        return text
    clean_text = text
    for pattern, replacement in censored_swear_words.items():
        # Do not add word boundaries if pattern already contains them
        if pattern.startswith(r'\b') and pattern.endswith(r'\b'):
            clean_pattern = pattern
        else:
            clean_pattern = r'\b' + pattern + r'\b'
        clean_text = re.sub(clean_pattern, replacement, clean_text, flags=re.IGNORECASE)
    return clean_text

In [55]:
def replace_hindi_swears(text: str) -> str:
    if not isinstance(text, str):
        return text
    out = text
    for pat, repl in hindi_swear_map.items():
        out = regex.sub(pat, repl, out, flags=regex.IGNORECASE)
    return out

In [56]:
def replace_mentions(text):
    """Replace mentions with 'SomeTaggedAccount', including when attached to other words."""
    if not isinstance(text, str):
        return text
    # Pattern to match @ followed by word characters
    # Replace mentions with spaces around to prevent word merging
    mention_pattern = r'@([A-Za-z0-9_]+)'
    clean_text = re.sub(mention_pattern, ' SomeTaggedAccount ', text)
    return clean_text

In [57]:
def replace_at_symbol(text):
    if not isinstance(text, str):
        return text
    # Replace '@' when used as 'at'
    # Conditions:
    # - '@' followed by a space
    # - '@' at the end of a string
    # - '@' followed by a number or time format
    # - '@' followed by a capitalized word (assuming location or time)
    clean_text = re.sub(r'@\s', 'at ', text)
    clean_text = re.sub(r'@$', 'at', clean_text)
    clean_text = re.sub(r'@(?=\d)', 'at ', clean_text)
    clean_text = re.sub(r'@ (?=[A-Z])', 'at ', clean_text)
    return clean_text

In [58]:
def remove_hashtags(text):
    if not isinstance(text, str):
        return text

    # Remove '#' symbol in various contexts
    clean_text = re.sub(r'(^|\s)#(\w*[^\s\w]?)', r'\1\2', text)
    clean_text = re.sub(r'(\w+)#([^\s\w]?)', r'\1\2', clean_text)
    clean_text = re.sub(r'([^\s\w])#', r'\1', clean_text)

    # Remove any remaining standalone '#' symbols
    clean_text = re.sub(r'(?<=\s)#(?=\s|$)|(?<=^)#(?=\s|$)', '', clean_text)

    return clean_text

In [59]:
def expand_contractions(text):
    if not isinstance(text, str):
        return text
    expanded_words = [contractions.fix(word) for word in text.split()]
    clean_text = ' '.join(expanded_words)
    return clean_text

In [60]:
def clean_unicode_punct(text: str) -> str:
    """
    Remove punctuation *except* ? ! . , and keep both Latin & Devanagari letters.
    Any other symbol is dropped.
    """
    if not isinstance(text, str):
        return text
    # keep letters, numbers, spaces and selected punctuation
    pattern = rf'[^\s{LATIN_BLOCK}{DEVANAGARI_BLOCK}0-9\?\!\.,]'
    return regex.sub(pattern, ' ', text)

In [61]:
def transliterate_devanagari(text: str) -> str:
    if not DEVANAGARI_TO_ROMAN or not isinstance(text, str):
        return text
    return transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)

In [62]:
def remove_emoji_colons(text):
    """Remove colons around emoji descriptions in the text."""
    if not isinstance(text, str):
        return text
    # Regex pattern to match emoji descriptions with colons
    pattern = r':([a-zA-Z0-9_+-]+):'
    # Replace matches with the emoji description without colons
    clean_text = re.sub(pattern, r'\1', text)
    return clean_text

In [63]:
def replace_ascii_emoticons(text):
    """Replace ASCII emoticons in text with their descriptions."""
    if not isinstance(text, str):
        return text

    clean_text = text
    for emoticon, description in ascii_emoticons.items():
        # Escape the emoticon pattern to handle special characters
        escaped_emoticon = re.escape(emoticon)
        # Build the regex pattern to match emoticons not part of words
        pattern = r'(?<!\w)' + escaped_emoticon + r'(?!\w)'
        # Replace the emoticon with the description, ignoring case
        clean_text = re.sub(pattern, f"{description}", clean_text, flags=re.IGNORECASE)
    return clean_text

In [64]:
def replace_trade_mark(text):
    """Replace 'trade_mark' with a single quote (')."""
    if not isinstance(text, str):
        return text
    # Replace 'trade_mark' with "'"
    clean_text = text.replace("trade_mark", "'")
    return clean_text

In [65]:
def remove_extra_whitespace(text):
    if not isinstance(text, str):
        return text
    clean_text = ' '.join(text.split())
    return clean_text

In [66]:
cleaning_functions = [
    replace_emojis,
    normalize_text,
    decode_html_entities,
    remove_html_tags,
    replace_urls,
    replace_emails,
    replace_censored_swear_words,
    replace_hindi_swears,
    replace_mentions,
    replace_at_symbol,
    remove_hashtags,
    expand_contractions,
    clean_unicode_punct,
    transliterate_devanagari,
    remove_emoji_colons,
    replace_ascii_emoticons,
    replace_trade_mark,
    remove_extra_whitespace
]

In [67]:
def clean_tweet(text: str) -> str:
    if not isinstance(text, str):
        return ''
    # ftfy first (handles mojibake)
    text = fix_text(text)
    for fn in cleaning_functions:
        text = fn(text)
    return text

# Data load and preprocess

In [68]:
df = pd.read_csv(r"D:\University of Illinois Chicago\Classes\CS521\data files\consolidated_sarcasm_dataset.csv")
df["comment"] = df["comment"].astype(str).apply(clean_tweet)
df = df[["comment", "label"]]

In [69]:
for t in df["comment"].sample(5):
    print("»", t)

» ha ha nain aisa kuch nahin hai
» superb again!! line mat kato kutton, ap qatar mein
» aab kaun karega aanhoni ko honi ? dhoni retirement cricket
» tum logon ka vikas concept badiya hai
» gandu ladki. aabhi to tum 18 ki bhi nahi hui thik se !


In [70]:
label_map = {"non_sarcastic": 0, "sarcastic": 1}

df["label"] = (
    df["label"]
    .astype(str)          # make sure they are strings
    .str.strip()          # remove leading/trailing spaces
    .str.lower()          # normalise case
    .map(label_map)       # convert to 0 / 1
)

assert df["label"].isin([0, 1]).all(), "Unexpected label values!"

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["comment"],
    df["label"],
    test_size=0.20,
    stratify=df["label"],
    random_state=42,
)

train_dataset = [
    {"comment": t, "label": int(l)} for t, l in zip(train_texts.tolist(), train_labels.tolist())
]
val_dataset = [
    {"comment": t, "label": int(l)} for t, l in zip(val_texts.tolist(), val_labels.tolist())
]


train_dataset = Dataset.from_list(train_dataset)
val_dataset   = Dataset.from_list(val_dataset)

# Model Training

In [71]:
# pick one public model ID
model_name = "l3cube-pune/hing-bert"      # or "nirantk/hinglish-bert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModelForSequenceClassification.from_pretrained(
               model_name,
               num_labels=2
            ).to(device)

# tokenization — use the key you actually stored, here it's "comment"
def tokenize_fn(batch):
    return tokenizer(batch["comment"], truncation=True, padding=True)

train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset   = val_dataset.map(tokenize_fn, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hing-bert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1765/1765 [00:00<00:00, 32341.12 examples/s]
Map: 100%|██████████| 442/442 [00:00<00:00, 27625.77 examples/s]


In [72]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [73]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"  # Disable wandb or hub logging unless needed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [74]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4485,0.487102,0.782805,0.766562,0.916981,0.835052
2,0.4018,0.474221,0.794118,0.782468,0.909434,0.841187
3,0.1284,0.638674,0.803167,0.792763,0.909434,0.8471
4,0.0955,0.753422,0.800905,0.812721,0.867925,0.839416


TrainOutput(global_step=444, training_loss=0.33638698600970945, metrics={'train_runtime': 38.3338, 'train_samples_per_second': 184.172, 'train_steps_per_second': 11.582, 'total_flos': 272104109010000.0, 'train_loss': 0.33638698600970945, 'epoch': 4.0})

# Save model

In [75]:
model_dir = "./sarcasm_hingbert_model"

# Save model + tokenizer
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

('./sarcasm_hingbert_model\\tokenizer_config.json',
 './sarcasm_hingbert_model\\special_tokens_map.json',
 './sarcasm_hingbert_model\\vocab.txt',
 './sarcasm_hingbert_model\\added_tokens.json',
 './sarcasm_hingbert_model\\tokenizer.json')

# model inference

In [76]:
model_dir = "./sarcasm_hingbert_model"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)
model.eval()  # switch to inference mode

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [77]:
df_new = pd.read_csv(
    r"D:\University of Illinois Chicago\Classes\CS521\cs521\labeled_batches\labeled_batch_30.csv"
)

df_new.rename(columns={"Transliterated_Comment": "comment"}, inplace=True)
label_map = {"non_sarcastic": 0, "sarcastic": 1}
df_new["label"] = df_new["sarcasm_label"].astype(str).str.strip().str.lower().map(label_map)
df_new = df_new.dropna(subset=["label"])

In [78]:
df_new["comment"] = df_new["comment"].astype(str).apply(clean_tweet)

In [79]:
BATCH_SIZE = 64

def predict_labels(text_series):
    preds_all = []
    loader = DataLoader(text_series.tolist(), batch_size=BATCH_SIZE, shuffle=False)
    with torch.no_grad():
        for batch_texts in loader:
            enc = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(device)

            logits = model(**enc).logits
            batch_preds = torch.argmax(logits, dim=1)
            preds_all.extend(batch_preds.cpu().numpy())
    return np.array(preds_all)

# Testing Results

In [80]:
preds = predict_labels(df_new["comment"])
true  = df_new["label"].values.astype(int)

acc  = accuracy_score(true, preds)
prec = precision_score(true, preds)
rec  = recall_score(true, preds)
f1   = f1_score(true, preds)

print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1 Score : {f1:.4f}")

Accuracy : 0.8537
Precision: 1.0000
Recall   : 0.8537
F1 Score : 0.9211
