In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load the dataset
df = pd.read_csv('data.csv')




In [2]:
# Preprocess the text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuations and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Remove stop words
    stop_words = stopwords.words('english')
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

df['text'] = df['text'].apply(preprocess_text)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text to numerical vectors
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['text'])
y = df['emotion']

In [4]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear support vector classifier
model = LinearSVC()
model.fit(X_train, y_train)


In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the testing set
y_pred = model.predict(X_test)

In [6]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print('Accuracy using Linear SVM:', accuracy)
print('Precision using Linear SVM:', precision)
print('Recall using Linear SVM:', recall)
print('F1-score using Linear SVM:', f1)

Accuracy using Linear SVM: 0.9000465983224604
Precision using Linear SVM: 0.8992366302005806
Recall using Linear SVM: 0.9000465983224604
F1-score using Linear SVM: 0.899027161380638


In [7]:
#Save the model as a pickle file
'''
import pickle
with open('model_lsvc.pkl', 'wb') as file:
    pickle.dump(model, file)
with open('vectorizer_lsvc.pkl', 'wb') as file:
    pickle.dump(tfidf, file)

'''


90

In [8]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))

[[ 536   18   19    5   37    2]
 [  17  453   16    0   28   17]
 [   9   11 1305   27   23    6]
 [   3    2   61  244    8    0]
 [  20   12   33    5 1203    4]
 [   2   19   13    1   11  122]]


In [9]:
from sklearn.metrics import classification_report
print("Classification Report of Linear SVM\n",classification_report(y_test,y_pred))

Classification Report of Linear SVM
               precision    recall  f1-score   support

       anger       0.91      0.87      0.89       617
        fear       0.88      0.85      0.87       531
       happy       0.90      0.94      0.92      1381
        love       0.87      0.77      0.81       318
     sadness       0.92      0.94      0.93      1277
    surprise       0.81      0.73      0.76       168

    accuracy                           0.90      4292
   macro avg       0.88      0.85      0.86      4292
weighted avg       0.90      0.90      0.90      4292

