In [1]:
!pip install -q torch datasets transformers pandas



In [2]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ======================= CONFIG =======================
# --- Paths ---
# Checkpoint directory of the initial model (train_initial must have run first).
MODEL_PATH = "../outputs/initial/model"
# CSV holding the pool of unlabeled texts to score.
UNLABELED_PATH = "../data/unlabeled_pool.csv"
# CSVs whose "text" values are already labeled and must be excluded from the pool.
LABELED_PATHS = [
    "../data/seed_labels.csv",
    "../data/expanded_seed.csv",
    "../data/uncertain_labels.csv",
]
# Destination CSV for the selected pseudo-labels.
OUT_PATH = "../data/pseudo_labels.csv"

# --- Selection ---
# Minimum top-class probability to keep a prediction (lower it if nothing passes).
CONF_THRESHOLD = 0.80
# If fewer rows than this pass the threshold, keep this many most-confident rows instead.
TOP_K_MIN = 50

# --- Inference ---
# Number of texts scored per forward pass.
BATCH_SIZE = 64
# Cap on how many pool rows are scored; None means use the full pool.
MAX_POOL = 1000
# Token limit per text; longer inputs are truncated by the tokenizer.
MAX_LENGTH = 128
# Seed shared by torch and the pool subsampling.
SEED = 42
# ======================================================

torch.manual_seed(SEED)
print("Config OK.")


Config OK.


In [3]:
# Restore the tokenizer and the sequence classifier from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the weights to the chosen device and switch the model to evaluation mode.
model.to(device)
model.eval()
print("Using device:", device)


Using device: cpu


In [4]:
# Gather every text that already has a label so it can be removed from the
# unlabeled pool before pseudo-labeling.
labeled_texts = set()
for p in LABELED_PATHS:
    if not os.path.exists(p):
        # Still best-effort (a label file may legitimately not exist), but report
        # it: a mistyped path would otherwise silently let labeled rows back in.
        print(f"Skipping {p}: file not found")
        continue
    df_l = pd.read_csv(p)
    if "text" not in df_l.columns:
        # Without a "text" column there is nothing to exclude from this file.
        print(f"Skipping {p}: no 'text' column")
        continue
    # Cast to str so membership tests match the str-cast pool texts.
    labeled_texts.update(df_l["text"].dropna().astype(str).tolist())
print("Known labeled examples:", len(labeled_texts))


Known labeled examples: 398


In [5]:
# Load the pool and keep only rows that actually have a text value.
pool_raw = pd.read_csv(UNLABELED_PATH)
has_text = pool_raw["text"].notna()
df_u = pool_raw[has_text].copy()
df_u["text"] = df_u["text"].astype(str)

# Remove texts that are already labeled, then collapse exact duplicates.
already_labeled = df_u["text"].isin(labeled_texts)
df_u = df_u[~already_labeled]
df_u = df_u.drop_duplicates("text")
df_u = df_u.reset_index(drop=True)

# Optionally subsample (seeded) so the pool stays at most MAX_POOL rows.
if MAX_POOL and len(df_u) > MAX_POOL:
    df_u = df_u.sample(n=MAX_POOL, random_state=SEED)
    df_u = df_u.reset_index(drop=True)

print(f"Unlabeled candidates after exclusions: {len(df_u)}")
df_u.head(3)


Unlabeled candidates after exclusions: 1000


Unnamed: 0,text
0,Very Bad Vedio
1,હિંમતવાન શાબાશ રોનકભાઈ
2,જે કલાકારો નું સન્માન થયું એ ભાજપના ઈસારે નાચવ...


In [6]:
@torch.no_grad()
def predict_batch(texts):
    """Score ``texts`` with the loaded classifier, ``BATCH_SIZE`` items at a time.

    Args:
        texts: Sequence of strings supporting ``len`` and slicing (the notebook
            passes a plain list).

    Returns:
        A pair ``(labels, confs)`` of equal-length Python lists: for each input,
        the index of the highest-probability class and that softmax probability.
    """
    pred_ids, pred_probs = [], []
    for start in range(0, len(texts), BATCH_SIZE):
        chunk = texts[start:start + BATCH_SIZE]
        # Pad to the longest item in the chunk and truncate at MAX_LENGTH tokens.
        encoded = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        encoded = encoded.to(device)
        logits = model(**encoded).logits
        # Softmax across the class dimension, then take the winning class per row.
        class_probs = torch.softmax(logits, dim=1)
        top_prob, top_idx = class_probs.max(dim=1)
        pred_ids.extend(top_idx.cpu().tolist())
        pred_probs.extend(top_prob.cpu().tolist())
    return pred_ids, pred_probs

print("Predict helper ready.")


Predict helper ready.


In [7]:
# Score the pool, keep confident predictions, and write them to OUT_PATH.
if len(df_u) == 0:
    # Write a header-only CSV so OUT_PATH exists even when nothing was scored.
    pd.DataFrame(columns=["text","label"]).to_csv(OUT_PATH, index=False, encoding="utf-8")
    print("No eligible examples to pseudo-label.")
else:
    texts = df_u["text"].tolist()
    labels, confs = predict_batch(texts)
    df_pl = pd.DataFrame({"text": texts, "label": labels, "conf": confs})

    # Threshold first
    df_high = df_pl[df_pl["conf"] >= CONF_THRESHOLD].copy()
    n_above = len(df_high)

    # Fallback to top-K if too few
    used_fallback = n_above < TOP_K_MIN
    if used_fallback:
        df_high = df_pl.sort_values("conf", ascending=False).head(TOP_K_MIN).copy()
        # Say so explicitly: the rows kept here may sit below CONF_THRESHOLD, and
        # the summary line below would otherwise read as if the threshold applied.
        lowest_kept = df_high["conf"].min()
        print(f"WARNING: only {n_above} rows reached conf >= {CONF_THRESHOLD}; "
              f"fell back to the top {len(df_high)} by confidence "
              f"(lowest kept conf={lowest_kept:.3f}).")

    # Save without the conf column
    df_high[["text","label"]].to_csv(OUT_PATH, index=False, encoding="utf-8")

    print(f"✅ Saved {len(df_high)} pseudo-labels to {OUT_PATH} "
          f"(threshold={CONF_THRESHOLD}, pool={len(df_u)})")
    # A heavily one-sided count here is worth reviewing before training on the file.
    print("Class counts:", df_high["label"].value_counts().to_dict())


✅ Saved 50 pseudo-labels to ../data/pseudo_labels.csv (threshold=0.8, pool=1000)
Class counts: {1: 50}


In [8]:
# Sanity check: read the saved pseudo-label file back and show its size and first rows.
peek = pd.read_csv(OUT_PATH)
print(peek.shape)
peek.head()


(50, 2)


Unnamed: 0,text,label
0,Rajakot ma tamara samaj ni dikari upar najar b...,1
1,Gunatinanade jo jalaram ne ashirvad aapya hoy ...,1
2,Kem aaje Ronak Bhai ma utsah no abhav dekhay c...,1
3,Mahadeve jer pithi e vathat e samaya hato a sa...,1
4,Modiji hai to tenson kaya hai bhro n bil sark...,1
