In [1]:
pip install transformers datasets torch scikit-learn numpy pandas


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at c:\programdata\anaconda3\lib\site-packages\vboxapi-1.0-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [4]:
import pandas as pd

# Column holding the input text, and the emotion label columns to model.
TEXT_COLUMN = "text"
SELECTED_LABELS = ["gratitude", "approval", "anger", "joy", "sadness"]

# Load the dataset, keep only the required columns, and drop rows with missing
# values. Written as a single chain (no inplace=True) so `df` is always derived
# from the freshly read file and re-running this cell gives the same result.
df = (
    pd.read_csv("goemotions.csv")
    .loc[:, [TEXT_COLUMN] + SELECTED_LABELS]
    .dropna()
)

print("\nDataset Preview:\n", df.head())



Dataset Preview:
                                                 text  gratitude  approval  \
0                                    That game hurt.          0         0   
1   >sexuality shouldn’t be a grouping category I...          0         0   
2     You do right, if you don't care then fuck 'em!          0         0   
3                                 Man I love reddit.          0         0   
4  [NAME] was nowhere near them, he was by the Fa...          0         0   

   anger  joy  sadness  
0      0    0        1  
1      0    0        0  
2      0    0        0  
3      0    0        0  
4      0    0        0  


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Tunable settings, named so they are easy to find and change in one place.
TEST_SIZE = 0.2       # fraction of rows held out for evaluation (80-20 split)
RANDOM_STATE = 42     # fixed seed so the split is reproducible
MAX_FEATURES = 5000   # cap on the TF-IDF vocabulary size, for speed

# Convert text to a list and the selected label columns to a 2-D NumPy array
text_data = df[TEXT_COLUMN].tolist()
labels = df[SELECTED_LABELS].values

# Split into train and test sets.
# NOTE(review): the split is row-wise. If the CSV contains the same text in
# more than one row, identical texts can end up in both train and test and
# inflate the scores -- TODO confirm and consider a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(
    text_data, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Convert text to TF-IDF features (faster than BERT). The vocabulary and IDF
# weights are fitted on the training split only and then applied to the test
# split, so no test-set statistics influence the features.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("\nTF-IDF Shape:", X_train_tfidf.shape)



TF-IDF Shape: (168980, 5000)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

# Multi-label setup: MultiOutputClassifier fits one logistic-regression model
# per label column of y_train. fit() returns the fitted estimator itself, so
# construction and training are chained.
clf = MultiOutputClassifier(
    estimator=LogisticRegression(max_iter=200)
).fit(X_train_tfidf, y_train)

# Label predictions for the held-out test features
y_pred = clf.predict(X_test_tfidf)


In [7]:
from sklearn.metrics import f1_score, hamming_loss

# Score the test-set predictions: Hamming loss over all label slots, and
# macro-averaged F1 (unweighted mean of the per-label F1 scores).
h_loss = hamming_loss(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")

# Report both metrics, one per line
print(f"\nF1 Score: {f1}", f"Hamming Loss: {h_loss}", sep="\n")



F1 Score: 0.2807858784826701
Hamming Loss: 0.04142975500059179


In [8]:
def predict_emotion(text):
    """Return the labels from SELECTED_LABELS that the classifier predicts for ``text``.

    The text is wrapped in a one-element list, transformed with the
    module-level ``vectorizer`` and passed to the module-level ``clf``.
    Every position of the single prediction row that equals 1 contributes
    the label at the same index of SELECTED_LABELS, in order.
    """
    features = vectorizer.transform([text])
    prediction_row = clf.predict(features)[0]  # one input text -> one row

    detected_emotions = []
    for position, flag in enumerate(prediction_row):
        if flag == 1:
            detected_emotions.append(SELECTED_LABELS[position])
    return detected_emotions

# Quick check: run the predictor on one sentence that was not in the dataset
sample_sentence = "I feel very happy and excited today!"
print(predict_emotion(sample_sentence))


['joy']
