In [1]:
!pip install -q torch datasets transformers scikit-learn accelerate pandas



In [None]:
!pip install -U "transformers>=4.40" "datasets>=2.19" "accelerate>=0.30"


In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# -------- CONFIG --------
BASE_MODEL  = "ai4bharat/indic-bert"   # IndicBERT
OUTPUT_DIR  = "outputs/initial"
MODEL_OUT   = os.path.join(OUTPUT_DIR, "model")

TRAIN_PATHS = ["../data/seed_labels.csv", "../data/expanded_seed.csv", "../data/uncertain_labels.csv"]  # optional if some missing
TEST_SIZE   = 0.15
SEED        = 42
MAX_LENGTH  = 128
BATCH_SIZE  = 25
EPOCHS      = 5
LR          = 2e-5
# ------------------------

torch.manual_seed(SEED)
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("Using base model:", BASE_MODEL)


Using base model: ai4bharat/indic-bert


In [3]:
# Load all available labeled sets
dfs = []
for p in TRAIN_PATHS:
    if os.path.exists(p):
        df = pd.read_csv(p)
        if set(["text","label"]).issubset(df.columns):
            dfs.append(df[["text","label"]])
        else:
            print(f"Skipping {p}: missing columns.")
    else:
        print(f"Not found: {p}")

if not dfs:
    raise ValueError("No labeled data found.")

df = pd.concat(dfs, ignore_index=True).dropna().copy()
df["text"]  = df["text"].astype(str)
df["label"] = df["label"].astype(int)
df = df.drop_duplicates(subset="text").reset_index(drop=True)
print("Total labeled rows:", len(df))
print("Class counts:", df["label"].value_counts().to_dict())

# Split
train_df, val_df = train_test_split(
    df, test_size=TEST_SIZE, random_state=SEED, stratify=df["label"]
)

print("Train size:", len(train_df), "Val size:", len(val_df))


Total labeled rows: 398
Class counts: {0: 199, 1: 199}
Train size: 338 Val size: 60


In [4]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def tokenize_batch(batch):
    return tokenizer(
        batch["text"],
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH
    )

train_ds = Dataset.from_pandas(train_df)
val_ds   = Dataset.from_pandas(val_df)

train_ds = train_ds.map(tokenize_batch, batched=True)
val_ds   = val_ds.map(tokenize_batch,   batched=True)

train_ds = train_ds.remove_columns([c for c in train_ds.column_names if c not in ("input_ids","attention_mask","label")])
val_ds   = val_ds.remove_columns(  [c for c in val_ds.column_names   if c not in ("input_ids","attention_mask","label")])

train_ds = train_ds.with_format("torch")
val_ds   = val_ds.with_format("torch")


Map:   0%|          | 0/338 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [5]:
id2label = {0: "non_hate", 1: "hate"}
label2id = {"non_hate": 0, "hate": 1}

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    acc  = accuracy_score(labels, preds)
    f1   = f1_score(labels, preds, average="binary", pos_label=1)
    prec = precision_score(labels, preds, average="binary", pos_label=1)
    rec  = recall_score(labels, preds, average="binary", pos_label=1)
    return {"accuracy": acc, "f1": f1, "precision": prec, "recall": rec}

# Optional: compute class weights (inverse freq)
counts = train_df["label"].value_counts()
cw = torch.tensor([1.0, 1.0])
for cls in [0,1]:
    if cls in counts:
        cw[cls] = len(train_df) / (2.0 * counts[cls])
cw = cw / cw.mean()  # normalize

class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        if self.class_weights is not None:
            loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        else:
            loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss


In [9]:
import transformers, os
print("Transformers:", transformers.__version__)

model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=50
    # NOTE: no evaluation_strategy/save_strategy/load_best_model_at_end here
)

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    class_weights=cw
)

trainer.train()
metrics = trainer.evaluate()
metrics


Transformers: 4.55.0


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


TypeError: WeightedTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'

In [None]:
os.makedirs(MODEL_OUT, exist_ok=True)
trainer.save_model(MODEL_OUT)
tokenizer.save_pretrained(MODEL_OUT)

print("Saved to:", MODEL_OUT)

from transformers import pipeline
clf = pipeline("text-classification", model=MODEL_OUT, tokenizer=MODEL_OUT, device=0 if torch.cuda.is_available() else -1)

examples = [
    "તને અહીં જીવવા ન દઈએ",         # potentially hateful
    "શું હાલ છે? બરાબર?",           # benign
    "Harami loko ne jawab aapo",      # potentially hate/abuse
]
print(clf(examples))
