In [20]:
!pip install datasets scikit-learn nltk numpy pandas skl2onnx onnxruntime --quiet

In [21]:
import re
import string
import numpy as np
import pandas as pd

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer

import string

STOPWORDS = {
    "a", "an", "the", "and", "or", "but", "if", "in", "on", "with", "to", "of",
    "for", "at", "by", "from", "up", "down", "out", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "can", "will", "just", "should", "now", "am", "is", "are", "was",
    "were", "be", "been", "being", "have", "has", "had", "do", "does", "did",
    "he", "she", "it", "they", "them", "his", "her", "its", "their", "you",
    "your", "i", "me", "my", "we", "us", "our"
}


SUFFIXES = {
    "ing", "ed", "ly", "s", "es", "er", "est",
    "ment", "ness", "ful", "less", "able"
}

dataset = load_dataset("go_emotions")



In [22]:
def lowercase(text):
    return text.lower()


def remove_punctuation(text):
    result = []
    for c in text:
        if c in string.punctuation and c not in {"!", "?"}:
            continue
        result.append(c)
    return "".join(result)


def tokenize(text):
    return text.split()


def remove_stopwords(tokens):
    clean = []
    for token in tokens:
        if token and token not in STOPWORDS:
            clean.append(token)
    return clean


def stem(tokens):
    stemmed = []
    for token in tokens:
        if len(token) <= 3:
            stemmed.append(token)
            continue

        for suffix in SUFFIXES:
            if len(token) > len(suffix) + 1 and token.endswith(suffix):
                token = token[:-len(suffix)]
                break

        stemmed.append(token)

    return stemmed


def preprocess(text):
    text = lowercase(text)
    text = remove_punctuation(text)
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = stem(tokens)
    return " ".join(tokens)

In [23]:
def preprocess_batch(batch):
    batch["processed_str"] = [preprocess(t) for t in batch["text"]]
    return batch

dataset = dataset.map(preprocess_batch, batched=True)

In [24]:
X_train = list(dataset["train"]["processed_str"])
X_val   = list(dataset["validation"]["processed_str"])
X_test  = list(dataset["test"]["processed_str"])


y_train = list(dataset["train"]["labels"])
y_val = list(dataset["validation"]["labels"])
y_test = list(dataset["test"]["labels"])

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

y_train = mlb.fit_transform(y_train)
y_val = mlb.transform(y_val)
y_test = mlb.transform(y_test)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))

X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf   = vectorizer.transform(X_val)
X_test_tfidf  = vectorizer.transform(X_test)

In [26]:
model = OneVsRestClassifier(
    LogisticRegression(
        max_iter=1000,
        n_jobs=-1
    )
)

model.fit(X_train_tfidf, y_train)

In [27]:
y_pred = model.predict(X_test_tfidf)

micro_f1 = f1_score(y_test, y_pred, average="micro")
macro_f1 = f1_score(y_test, y_pred, average="macro")

print("Micro F1:", micro_f1)
print("Macro F1:", macro_f1)

Micro F1: 0.40534117913319
Macro F1: 0.22449112370686894


In [28]:
emotion_names = dataset["train"].features["labels"].feature.names

def predict_emotions(text, threshold=0.2):
    processed = preprocess(text)
    vec = vectorizer.transform([processed])

    probs = model.predict_proba(vec)

    pred = (probs >= threshold).astype(int)

    indices = mlb.inverse_transform(pred)
    indices = indices[0]

    emotions = [emotion_names[i] for i in indices]
    return emotions

example = "I can't believe this happened, I am so happy!"
print(predict_emotions(example))

['joy', 'surprise']


## Exporting the model

In [29]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("tfidf", vectorizer),
    ("clf", model)
])

In [30]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

initial_type = [("input", StringTensorType([None, 1]))]

onnx_model = convert_sklearn(pipeline, initial_types=initial_type)

with open("emotion_classifier.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

In [31]:
vocab_items = sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1])

with open("vocab.txt", "w", encoding="utf-8") as f:
    for token, _ in vocab_items:
        f.write(token + "\n")

In [32]:
import numpy as np

np.savetxt("idf.txt", vectorizer.idf_, fmt="%.8f")