In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Hub checkpoint used for 28-way emotion classification.
model_name = "SamLowe/roberta-base-go_emotions"

# Restore the tokenizer and the sequence-classification model from the same
# checkpoint so the vocabulary and the weights stay in sync.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=28,  # one output logit per emotion
)

# Load the SetFit emotion dataset. Its label columns are kept until after
# preprocessing because they are needed to build real training targets.
setfit_dataset = load_dataset("SetFit/emotion")

# Position of each SetFit emotion class inside the 28-label GoEmotions ordering
# (anger=2, fear=14, joy=17, love=18, sadness=25, surprise=26).
# NOTE(review): assumes the dataset exposes the class name in a `label_text`
# column with exactly these six values -- confirm against the dataset card.
SETFIT_TO_GOEMOTIONS_INDEX = {
    "sadness": 25,
    "joy": 17,
    "love": 18,
    "anger": 2,
    "fear": 14,
    "surprise": 26,
}


# Preprocess the SetFit dataset
def preprocess_function(examples):
    """Tokenize a batch of texts and attach multi-hot emotion targets.

    Args:
        examples: A batch (dict of columns) containing ``text`` and
            ``label_text``.

    Returns:
        The tokenizer output for ``examples['text']`` (padded/truncated to 128
        tokens) with an extra ``labels`` entry: one list of 28 floats per
        example, 1.0 at the GoEmotions index of the example's emotion and 0.0
        everywhere else.

    Raises:
        KeyError: If ``label_text`` is missing from the batch or holds a class
            name that has no entry in ``SETFIT_TO_GOEMOTIONS_INDEX``.
    """
    inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

    num_labels = 28  # Number of emotions in the GoEmotions dataset

    # All-zero placeholder targets would only teach the model to predict "no
    # emotion" for every input, so build real multi-hot rows instead.
    labels = []
    for label_text in examples["label_text"]:
        row = [0.0] * num_labels
        row[SETFIT_TO_GOEMOTIONS_INDEX[label_text]] = 1.0
        labels.append(row)

    # Plain nested lists (not a torch.Tensor) are returned from the map function;
    # the tensor conversion is left to the dataset's output format.
    inputs["labels"] = labels
    return inputs


# Apply the preprocessing to the SetFit dataset, then drop every original
# column except 'text' (the label columns are no longer needed afterwards).
label_columns = [col for col in setfit_dataset['train'].column_names if col != 'text']
tokenized_setfit_dataset = setfit_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=label_columns,
)
setfit_dataset = setfit_dataset.remove_columns(label_columns)

# Prepare the dataset for PyTorch: only these three columns are returned, as tensors.
tokenized_setfit_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Split the dataset into training and validation subsets.
# Prefer a dedicated "validation" split so the "test" split is not consumed by
# per-epoch evaluation; fall back to "test" when no validation split exists.
train_dataset = tokenized_setfit_dataset["train"]
eval_split_name = "validation" if "validation" in tokenized_setfit_dataset else "test"
validation_dataset = tokenized_setfit_dataset[eval_split_name]

# Define training arguments
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results_custom_28_emotions",
    evaluation_strategy="epoch",  # run evaluation at the end of every epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Adjust as needed
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=2,  # keep at most two checkpoints on disk
)

# Define the Trainer for fine-tuning
# NOTE(review): the `tokenizer=` argument may be deprecated in recent
# transformers releases -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)

# Start fine-tuning
trainer.train()

# Evaluate the model on the validation dataset
results = trainer.evaluate()
print("Evaluation results:", results)

# Test the model with custom input
custom_inputs = ["I am thrilled with joy!", "Feeling sad and lonely today."]
encoded_inputs = tokenizer(custom_inputs, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Training may have left the model on an accelerator while the freshly
# tokenized tensors live on the CPU; put the inputs on the model's device so
# the forward pass does not fail with a device mismatch.
encoded_inputs = encoded_inputs.to(model.device)

# Make predictions with the model (eval mode, no gradient tracking)
model.eval()
with torch.no_grad():
    outputs = model(**encoded_inputs)

# Convert logits to independent per-label probabilities using sigmoid, and
# bring them back to the CPU for printing.
probabilities = torch.sigmoid(outputs.logits).cpu()

# Define the original emotion labels from GoEmotions; list position i names
# the i-th logit.
emotion_labels = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude", "grief", "joy", "love",
    "nervousness", "optimism", "pride", "realization", "relief", "remorse",
    "sadness", "surprise", "neutral"
]

# Print the probabilities for each emotion label
print("\nProbabilities for each emotion label for each input:")
for text, text_probabilities in zip(custom_inputs, probabilities.tolist()):
    print(f"\nText: {text}")
    for emotion, prob in zip(emotion_labels, text_probabilities):
        print(f"  {emotion}: {prob:.4f}")

# Apply a threshold to get predicted emotions (1 = predicted, 0 = not predicted)
threshold = 0.5  # Adjust as needed
predictions = (probabilities > threshold).int()

# Print the predicted emotions based on the threshold
print("\nPredicted emotions based on threshold:")
for text, text_predictions in zip(custom_inputs, predictions.tolist()):
    print(f"\nText: {text}")
    print("Predicted Emotions: ", end="")
    for emotion, pred in zip(emotion_labels, text_predictions):
        if pred == 1:
            print(f"{emotion}", end=", ")
    print()


# Output

Probabilities for each emotion label for each input:

Text: I am thrilled with joy!
  admiration: 0.0179
  amusement: 0.0121
  anger: 0.0035
  annoyance: 0.0060
  approval: 0.0134
  caring: 0.0076
  confusion: 0.0039
  curiosity: 0.0047
  desire: 0.0026
  disappointment: 0.0016
  disapproval: 0.0040
  disgust: 0.0008
  embarrassment: 0.0010
  excitement: 0.2538
  fear: 0.0018
  gratitude: 0.0104
  grief: 0.0009
  joy: 0.7898
  love: 0.0061
  nervousness: 0.0034
  optimism: 0.0043
  pride: 0.0042
  realization: 0.0045
  relief: 0.0117
  remorse: 0.0005
  sadness: 0.0024
  surprise: 0.0069
  neutral: 0.0327

Text: Feeling sad and lonely today.
  admiration: 0.0066
  amusement: 0.0044
  anger: 0.0059
  annoyance: 0.0139
  approval: 0.0089
  caring: 0.0079
  confusion: 0.0029
  curiosity: 0.0045
  desire: 0.0038
  disappointment: 0.1013
  disapproval: 0.0073
  disgust: 0.0037
  embarrassment: 0.0019
  excitement: 0.0018
  fear: 0.0033
  gratitude: 0.0020
  grief: 0.0229
  joy: 0.0064
  love: 0.0072
  nervousness: 0.0062
  optimism: 0.0021
  pride: 0.0004
  realization: 0.0105
  relief: 0.0018
  remorse: 0.0125
  sadness: 0.8918
  surprise: 0.0023
  neutral: 0.0243

Predicted emotions based on threshold:

Text: I am thrilled with joy!
Predicted Emotions: joy, 

Text: Feeling sad and lonely today.
Predicted Emotions: sadness, 