In [1]:
import pandas as pd
import numpy as np
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load dataset
df = pd.read_csv(
    "../../Datasets/emotion_data_merged_3.csv"
)  # Make sure to replace 'your_dataset.csv' with your actual file path

df = df.dropna()
# df = df[:10000]

In [3]:
df

Unnamed: 0,sentence,emotion
0,In a certain mill lived an old miller who had ...,neutral
1,"As they had been with him several years, he on...",neutral
2,"The third of the boys was, however, the drudge...",neutral
3,"Then all three went out together, and when the...",neutral
4,"Hans, however, went with them, and when it was...",neutral
...,...,...
524147,When I was informed that a short story I had w...,happiness
524148,When my friend got very low marks in field wor...,disgust
524149,When I was in a little fishing boat and we ran...,fear
524150,"I was \best man\"" at my brother's wedding a ye...",happiness


In [4]:
# Preprocessing: Convert sentences into a sequence of integers using CountVectorizer
# Note: This is a very naive approach and might not work well for HMM
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(df["sentence"]).toarray()

# Convert emotion labels to integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["emotion"])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Initialize and train HMM
# Note: We are using GaussianHMM as an example; this might need to be adapted based on your feature representation
model = hmm.GaussianHMM(n_components=7, covariance_type="diag", n_iter=50)
model.fit(X_train)

In [None]:
# Predict the sequence of emotions for the test set
y_pred = model.predict(X_test)

# Calculate and print the accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.07706346627935864
F1 Score: 0.02190251186295934


In [None]:
# Load the test data
group_test_df = pd.read_csv("../../Datasets/test_group.csv", sep="\t")

# Preprocess the sentences using the same CountVectorizer
X_group_test = vectorizer.transform(group_test_df["sentence"]).toarray()

# Use the trained HMM model to predict the emotions
group_test_pred = model.predict(X_group_test)

# Map the predicted integer labels back to emotion labels
group_test_pred_labels = label_encoder.inverse_transform(group_test_pred)

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame(
    {"id": group_test_df["id"], "predicted_emotion": group_test_pred_labels}
)

predictions_df

In [None]:
# Save the predictions to a CSV file
predictions_df.to_csv("predictions.csv", index=False)