In [73]:
# Load the original emotion dataset into a DataFrame
import pandas as pd

# lines=True: the file is parsed as line-delimited JSON (one record per line)
df = pd.read_json("../data/emotion_dataset.json", orient="records", lines=True)

# Preview the first rows (columns shown in the output: Emotion, Score, Tweet)
print(df.head())

  Emotion  Score                                              Tweet
0   anger  0.562  @xandraaa5 @amayaallyn6 shut up hashtags are c...
1   anger  0.750  it makes me so fucking irate jesus. nobody is ...
2   anger  0.417         Lol Adam the Bull with his fake outrage...
3   anger  0.354  @THATSSHAWTYLO passed away early this morning ...
4   anger  0.438  @Kristiann1125 lol wow i was gonna say really?...


In [74]:
#Preprocessing
import re

def preprocess(text):
    """Normalize a tweet for modelling.

    The text is lowercased, every ``@mention`` (an ``@`` followed by word
    characters) is replaced by the placeholder token ``user``, and leading and
    trailing whitespace is removed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The normalized tweet.
    """
    # Lowercase first, then mask the handles, then trim the edges.
    return re.sub(r'@\w+', 'user', text.lower()).strip()

# Keep only the modelling columns and swap the raw tweet text for its
# preprocessed form; `df` itself is left untouched.
df_preprocessed = df[['Tweet', 'Emotion', 'Score']].assign(
    Tweet=df['Tweet'].apply(preprocess)
)
print(df_preprocessed.head())


                                               Tweet Emotion  Score
0      user user shut up hashtags are cool #offended   anger  0.562
1  it makes me so fucking irate jesus. nobody is ...   anger  0.750
2         lol adam the bull with his fake outrage...   anger  0.417
3  user passed away early this morning in a fast ...   anger  0.354
4  user lol wow i was gonna say really?! haha hav...   anger  0.438


In [75]:
# Vectorize, Train, Predict and Test for Logistic Regression (with TF-IDF)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split: hold out 20% of the preprocessed tweets for testing; random_state is fixed at 42
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed['Tweet'], df_preprocessed['Emotion'], test_size=0.2, random_state=42)

# Vectorize: the TF-IDF vectorizer is fitted on the training tweets only,
# and the fitted vectorizer is then applied to the test tweets
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train: Emotion is the target label; max_iter is raised to 1000 explicitly
regression = LogisticRegression(max_iter=1000)
regression.fit(X_train_vec, y_train)

# Evaluate: per-class precision / recall / F1 on the held-out tweets
y_pred = regression.predict(X_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.71      0.69      0.70       639
        fear       0.67      0.76      0.71       724
         joy       0.86      0.80      0.83       587
     sadness       0.66      0.62      0.64       577

    accuracy                           0.72      2527
   macro avg       0.72      0.72      0.72      2527
weighted avg       0.72      0.72      0.72      2527



In [76]:
# Load GloVe embeddings, turn each Tweet into a vector, and one-hot encode Emotion

import numpy as np
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
from scipy import sparse
from sklearn.model_selection import train_test_split

def load_glove(path):
    """Load word vectors from a GloVe-style text file.

    Each usable line holds a token followed by its vector components,
    separated by whitespace. Blank or whitespace-only lines and lines that
    carry a token but no components are skipped, so a stray empty line no
    longer raises ``IndexError`` and no zero-length vector is ever stored.

    Parameters
    ----------
    path : str or path-like
        Location of the embedding file; it is read as UTF-8 text.

    Returns
    -------
    dict
        Maps each token (``str``) to a 1-D ``numpy.ndarray`` of floats.
        If a token appears more than once, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    emb = {}
    with open(path, encoding='utf8') as f:
        for line in f:
            parts = line.split()
            # Need a token plus at least one component; anything shorter is a
            # blank or malformed line and would otherwise crash on parts[0]
            # or yield an empty vector that breaks averaging later on.
            if len(parts) < 2:
                continue
            emb[parts[0]] = np.array(parts[1:], dtype=float)
    return emb

# Read the pretrained vectors into a {token: vector} dict.
# The file name indicates 100-d vectors, which is what tweet_to_vec's default dim=100 assumes.
glove = load_glove("../data/glove.6B.100d.txt")

# Turn each tweet into GloVe vector
def tweet_to_vec(tweet, embeddings, dim=100):
    """Represent a tweet as the mean of its known word vectors.

    Parameters
    ----------
    tweet : str
        Tweet text; it is tokenized by splitting on whitespace.
    embeddings : mapping
        Maps a token to its vector. Tokens not present in the mapping are ignored.
    dim : int, default 100
        Length of the zero vector returned when no token of the tweet is found
        in ``embeddings``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors that were found, or ``np.zeros(dim)``
        when none were.
    """
    found = [embeddings[token] for token in tweet.split() if token in embeddings]
    # With nothing to average, fall back to an all-zero vector of length `dim`.
    return np.mean(found, axis=0) if found else np.zeros(dim)

# Build one averaged GloVe vector per tweet and stack them row-wise into a 2-D array
X_glove = np.vstack([tweet_to_vec(text, glove) for text in df_preprocessed['Tweet']])

# One-hot encode Emotion
ohe = OneHotEncoder()
one_hot_encoder_em = ohe.fit_transform(df_preprocessed[['Emotion']])

In [77]:
#Fit, predict and test for Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.sparse import hstack
from scipy import sparse

# Combine GloVe with one-hot
# X_glove is wrapped in a CSR matrix before being column-stacked with the encoder output
# (presumably sparse, as OneHotEncoder returns by default — TODO confirm).
X_glove_sparse = sparse.csr_matrix(X_glove)
X_feat = hstack([X_glove_sparse, one_hot_encoder_em])

# Regression target: the Score column
y = df_preprocessed['Score']

# Train/Test split
# NOTE(review): this rebinds X_train, X_test, y_train and y_test (and y_pred below), which the
# logistic-regression cell also assigns — after this cell runs they refer to the regression data.
X_train, X_test, y_train, y_test = train_test_split(X_feat, y, test_size=0.2, random_state=42)

# Train and evaluate
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Report mean squared error and R² on the held-out 20%
print("MSE:", mean_squared_error(y_test, y_pred))
print("R² :", r2_score(y_test, y_pred))

MSE: 0.0330991314415823
R² : 0.07760563083657324
