In [2]:
# emotion_svm_classifier.py

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
df=pd.read_csv('emotions.csv')
dff = df.head(10000)

In [4]:
dff

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4
...,...,...
9995,i feel that shakespeare is a talented man of h...,1
9996,ill be attending college classes and ill have ...,1
9997,i am a girl and i am utterly dependent on my i...,0
9998,ive been feeling less inhibited,4


In [5]:
# Step 2: Text Cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", text)  # Remove URLs, mentions, hashtags
    text = re.sub(r"[^a-z\s]", "", text)                  # Remove punctuation and numbers
    return text.strip()

df["text"] = df["text"].apply(clean_text)

In [6]:
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(dff["text"])
y = dff["label"] 

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [9]:
# Try different kernels
kernels = ['linear', 'rbf', 'poly', 'sigmoid']
best_kernel = None
best_model = None
best_accuracy = 0.0

for kernel in kernels:
    print(f"\nTraining SVM with kernel = '{kernel}'...")
    model = SVC(kernel=kernel)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    
    print(f"Validation Accuracy for {kernel}: {acc:.4f}")
    
    if acc > best_accuracy:
        best_accuracy = acc
        best_kernel = kernel
        best_model = model

# Report best kernel
print(f"\n✅ Best kernel: {best_kernel} with accuracy = {best_accuracy:.4f}")



Training SVM with kernel = 'linear'...
Validation Accuracy for linear: 0.8345

Training SVM with kernel = 'rbf'...
Validation Accuracy for rbf: 0.7495

Training SVM with kernel = 'poly'...
Validation Accuracy for poly: 0.4995

Training SVM with kernel = 'sigmoid'...
Validation Accuracy for sigmoid: 0.8345

✅ Best kernel: linear with accuracy = 0.8345


In [10]:
# Evaluate on test set
y_test_pred = best_model.predict(X_test)
print("\n=== Test Set Results ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))


=== Test Set Results ===
Accuracy: 0.8345
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87       587
           1       0.81      0.93      0.86       679
           2       0.84      0.51      0.63       158
           3       0.89      0.79      0.84       273
           4       0.84      0.75      0.79       230
           5       0.87      0.55      0.67        73

    accuracy                           0.83      2000
   macro avg       0.85      0.74      0.78      2000
weighted avg       0.84      0.83      0.83      2000



In [11]:
tweet = "I just got a promotion at work and I'm so happy!"
best_model.predict(vectorizer.transform([clean_text(tweet)]))

array([1], dtype=int64)