In [1]:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face Hub checkpoint identifiers for the two selectable models.
FAST_MODEL = "joeddav/distilbert-base-uncased-go-emotions-student"
ACCURATE_MODEL = "SamLowe/roberta-base-go_emotions"

# One tokenizer per checkpoint, keyed by the short name used as `model_type`.
tokenizers = {
    key: AutoTokenizer.from_pretrained(checkpoint)
    for key, checkpoint in (("fast", FAST_MODEL), ("accurate", ACCURATE_MODEL))
}

# Matching sequence-classification models under the same keys.
models = {
    key: AutoModelForSequenceClassification.from_pretrained(checkpoint)
    for key, checkpoint in (("fast", FAST_MODEL), ("accurate", ACCURATE_MODEL))
}

# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Place every model on the chosen device and switch it to evaluation mode.
for model in models.values():
    model.to(device)
    model.eval()

# Index -> label mapping, read from the fast model's config.
# NOTE(review): this is assumed to match the accurate model's mapping as well;
# nothing here verifies that -- confirm before relying on it for both models.
id2label = models["fast"].config.id2label

def predict_emotions(df, text_column="chunk", top_k=3, batch_size=32, model_type="fast"):
    """Add emotion predictions for every text in ``df[text_column]``.

    Texts are tokenized and scored in batches with the model stored under
    ``model_type`` in the module-level ``models`` / ``tokenizers`` dicts, on the
    module-level ``device``. Labels are decoded with the *selected* model's own
    ``config.id2label`` so that checkpoints with different label orderings are
    never mislabelled.

    Scores are per-label sigmoid probabilities when the model's config declares
    ``problem_type == "multi_label_classification"``; otherwise they are softmax
    probabilities across all labels.

    Args:
        df: DataFrame holding the texts. It is modified in place.
        text_column: Name of the column containing the input texts. Values are
            passed to the tokenizer unchanged.
        top_k: Number of highest-scoring emotions to keep per row. Values larger
            than the model's number of labels are clamped to that number.
        batch_size: Number of texts scored per forward pass.
        model_type: Key into ``models`` / ``tokenizers``.

    Returns:
        The same ``df`` object with two added (or overwritten) columns:
        ``Predicted_Emotion`` (the top-scoring label) and ``Top_3_Emotions``
        (dict mapping each kept label to its score rounded to 3 decimals; the
        column name is fixed regardless of ``top_k``).

    Raises:
        KeyError: If ``model_type`` is not a key of ``models``.
        ValueError: If ``top_k`` or ``batch_size`` is smaller than 1.
    """
    print(f"[predict_emotions] Using model: {model_type}")

    if model_type not in models:
        raise KeyError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(models)}"
        )
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model = models[model_type]
    tokenizer = tokenizers[model_type]

    # Decode with the selected model's label map, not a map borrowed from a
    # different checkpoint.
    label_map = model.config.id2label
    # Multi-label heads are trained with independent per-label sigmoids, so
    # softmax would distort their scores. The ranking is the same either way.
    multi_label = (
        getattr(model.config, "problem_type", None) == "multi_label_classification"
    )

    texts = df[text_column].tolist()
    predicted_labels = []
    top_emotions = []

    # An empty frame simply skips this loop and receives two empty columns.
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        )
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits

        if multi_label:
            probs = torch.sigmoid(logits)
        else:
            probs = torch.softmax(logits, dim=-1)

        # Clamp k so asking for more emotions than labels does not raise.
        k = min(top_k, probs.shape[-1])
        scores, indices = torch.topk(probs.cpu(), k, dim=-1)

        for row_scores, row_indices in zip(scores.tolist(), indices.tolist()):
            labels = [label_map[idx] for idx in row_indices]
            top_emotions.append(
                {label: round(score, 3) for label, score in zip(labels, row_scores)}
            )
            predicted_labels.append(labels[0])

    df["Predicted_Emotion"] = predicted_labels
    df["Top_3_Emotions"] = top_emotions

    print("[predict_emotions] Prediction complete.")
    return df


2025-06-11 16:35:59.469853: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749652559.491198  149937 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749652559.496714  149937 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749652559.511908  149937 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749652559.512007  149937 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749652559.512009  149937 computation_placer.cc:177] computation placer alr

In [None]:
# Score every row of `df` with the "accurate" model; the returned frame is the
# cell's displayed output.
predict_emotions(
    df,
    text_column="chunk",
    top_k=3,
    batch_size=32,
    model_type="accurate",
)