<a href="https://colab.research.google.com/github/pairr/SentimentAnalysis/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the three pre-split CSV files. The paths assume the files were
# uploaded to the Colab runtime's /content directory.
train_df = pd.read_csv("/content/training.csv")
val_df   = pd.read_csv("/content/validation.csv")
test_df  = pd.read_csv("/content/test.csv")

# Each split provides a raw "text" column (features) and a "label" column (target).
X_train = train_df["text"]
y_train = train_df["label"]

X_val = val_df["text"]
y_val = val_df["label"]

X_test = test_df["text"]
y_test = test_df["label"]

# Encode the targets as consecutive integer ids. The training, evaluation and
# reporting cells below use `y_train_enc`, `y_val_enc` and
# `label_encoder.classes_`; without this step they raise NameError on a
# fresh kernel (Restart & Run All).
# The encoder is fitted on the training labels only; `transform` raises
# ValueError if validation/test contain a label never seen in training.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_val_enc = label_encoder.transform(y_val)
y_test_enc = label_encoder.transform(y_test)


In [None]:
# Feature extractor: lower-cased TF-IDF over unigrams and bigrams.
# max_df=0.95 / min_df=5 bound the document frequency of the terms kept.
tfidf_step = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 2),
    max_df=0.95,
    min_df=5,
)

# Classifier: logistic regression; class_weight="balanced" re-weights
# classes to compensate for the emotion imbalance.
clf_step = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    n_jobs=-1,
)

# Chain both steps so the raw text goes in and class predictions come out.
pipeline = Pipeline(steps=[("tfidf", tfidf_step), ("clf", clf_step)])


In [None]:
# Fit the TF-IDF vocabulary and the classifier on the training split only;
# the targets are the integer-encoded labels.
pipeline.fit(X=X_train, y=y_train_enc)


In [None]:
# Predict on the validation split and compute both summaries up front.
val_preds = pipeline.predict(X_val)

# The encoder's classes are turned into strings to serve as row labels.
class_labels = list(map(str, label_encoder.classes_))
val_report = classification_report(
    y_val_enc, val_preds, target_names=class_labels, digits=4
)
val_confusion = confusion_matrix(y_val_enc, val_preds)

print("VALIDATION RESULTS\n")
print(val_report)

print("Confusion matrix:\n")
print(val_confusion)


VALIDATION RESULTS

              precision    recall  f1-score   support

           0     0.9156    0.8873    0.9012       550
           1     0.9281    0.8807    0.9038       704
           2     0.7511    0.9494    0.8387       178
           3     0.8901    0.8836    0.8869       275
           4     0.8230    0.8113    0.8171       212
           5     0.7174    0.8148    0.7630        81

    accuracy                         0.8790      2000
   macro avg     0.8375    0.8712    0.8518      2000
weighted avg     0.8840    0.8790    0.8801      2000

Confusion matrix:

[[488  18  10  16  15   3]
 [ 21 620  39   7   9   8]
 [  0   7 169   1   1   0]
 [ 12   9   5 243   6   0]
 [  9   9   1   6 172  15]
 [  3   5   1   0   6  66]]


In [None]:
# Human-readable names for the encoded classes: position i names class id i,
# because classification_report applies `target_names` positionally to the
# sorted labels. The labels in these CSVs are the integers 0-5, so the list
# must follow the dataset's id order, not alphabetical order (which would
# mislabel every class except "surprise").
# NOTE(review): assumes the CSVs are the dair-ai "emotion" export, whose ids
# are 0=sadness, 1=joy, 2=love, 3=anger, 4=fear, 5=surprise -- confirm
# against the dataset card before relying on the per-class names.
emotion_names = ["sadness", "joy", "love", "anger", "fear", "surprise"]

In [None]:
# Same validation report as before, but with emotion names as row labels
# instead of the raw class ids.
named_report = classification_report(
    y_val_enc, val_preds, target_names=emotion_names, digits=4
)
print(named_report)

              precision    recall  f1-score   support

       anger     0.9156    0.8873    0.9012       550
        fear     0.9281    0.8807    0.9038       704
         joy     0.7511    0.9494    0.8387       178
        love     0.8901    0.8836    0.8869       275
     sadness     0.8230    0.8113    0.8171       212
    surprise     0.7174    0.8148    0.7630        81

    accuracy                         0.8790      2000
   macro avg     0.8375    0.8712    0.8518      2000
weighted avg     0.8840    0.8790    0.8801      2000

