In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed emotion dataset from its project-relative CSV file.
csv_path = 'final-after-preprocessing/after-preprocessing.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the loaded frame; long frames are truncated in the rendered output.
df

Unnamed: 0.1,Unnamed: 0,text,emotion,preprocessed_text
0,0,i didnt feel humiliated,sadness,feel humiliated
1,1,i can go from feeling so hopeless to so damned...,sadness,go feeling hopeless damned hopeful around some...
2,2,im grabbing a minute to post i feel greedy wrong,anger,grabbing minute post feel greedy wrong
3,3,i am feeling grouchy,anger,feeling grouchy
4,4,ive been feeling a little burdened lately wasn...,sadness,feeling little burdened lately sure
...,...,...,...,...
23649,23649,I received a letter from a distant friend.,joy,received letter distant friend
23650,23650,My parents were out and I was the eldest at ho...,fear,parent eldest home midnight male stranger phon...
23651,23651,Two years back someone invited me to be the tu...,anger,two year back someone invited tutor granddaugh...
23652,23652,I had taken the responsibility to do something...,sadness,taken responsibility something prepared howeve...


In [4]:
# Keep only the cleaned text (model input) and the emotion label (target).
kept_columns = ["preprocessed_text", "emotion"]
df = df.loc[:, kept_columns]

In [5]:
# Confirm that only the preprocessed_text and emotion columns remain.
df

Unnamed: 0,preprocessed_text,emotion
0,feel humiliated,sadness
1,go feeling hopeless damned hopeful around some...,sadness
2,grabbing minute post feel greedy wrong,anger
3,feeling grouchy,anger
4,feeling little burdened lately sure,sadness
...,...,...
23649,received letter distant friend,joy
23650,parent eldest home midnight male stranger phon...,fear
23651,two year back someone invited tutor granddaugh...,anger
23652,taken responsibility something prepared howeve...,sadness


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Train-Test split 80-20

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
texts = df["preprocessed_text"].values
emotions = df["emotion"].values
x_train, x_test, y_train, y_test = train_test_split(
    texts,
    emotions,
    test_size=0.2,
    random_state=42,
)


In [8]:

# Shapes of the four split arrays, in the order train-X, train-y, test-X, test-y.
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

((18923,), (18923,), (4731,), (4731,))

In [9]:
# Learn the vocabulary and IDF weights from the training texts only, then reuse
# the fitted vectorizer on the test texts so no test information leaks into it.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(x_train)
X_test = tfidf.transform(x_test)


In [10]:
# Inspect the TF-IDF training matrix; its repr reports shape, dtype and storage format.
X_train

<18923x12563 sparse matrix of type '<class 'numpy.float64'>'
	with 166464 stored elements in Compressed Sparse Row format>

In [11]:
# Inspect the TF-IDF test matrix; it was produced by the same fitted vectorizer as X_train.
X_test

<4731x12563 sparse matrix of type '<class 'numpy.float64'>'
	with 40284 stored elements in Compressed Sparse Row format>

# Label encoding 

In [12]:
# Map emotion strings to integer codes; the mapping is learned from the training labels
# and then applied unchanged to the test labels.
enc = LabelEncoder()
Y_train = enc.fit_transform(y_train)
Y_test = enc.transform(y_test)

In [13]:
# Class names known to the fitted encoder, kept for reading the integer codes back as emotions.
labels = enc.classes_
labels


array(['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'],
      dtype=object)

In [14]:
# Peek at the integer-encoded test labels.
Y_test

array([3, 4, 3, ..., 0, 3, 3])

# Machine Learning Algorithms

In [15]:
# Number of encoded test labels.
Y_test.shape

(4731,)

In [16]:
# Number of encoded training labels.
Y_train.shape

(18923,)

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import Pipeline

In [18]:
# Maps a model's display name to its score on the test split; each model cell adds one entry.
result = {}

## Logistic Regression

In [19]:
# Baseline: logistic regression with scikit-learn's default hyperparameters,
# trained on the TF-IDF features and the encoded labels.
log = LogisticRegression().fit(X_train, Y_train)

# Score on the held-out split and record it for the comparison table.
log_score = log.score(X_test, Y_test)
result['Logistic Regression'] = log_score
print(log_score)

0.8425280067638977


# Multinomial NB

In [20]:
# Baseline: multinomial Naive Bayes on the TF-IDF features.
nb = MultinomialNB().fit(X_train, Y_train)

# Score on the held-out split and record it for the comparison table.
nb_score = nb.score(X_test, Y_test)
result['Naive Bayes'] = nb_score
print(nb_score)

0.7178186429930248


# SVM

In [21]:
# Baseline: support vector classifier with scikit-learn's default settings.
svm = SVC().fit(X_train, Y_train)

# Score on the held-out split and record it for the comparison table.
svm_score = svm.score(X_test, Y_test)
result['SVM'] = svm_score
print(svm_score)

0.8437962375819066


# K-Nearest Neighbors

In [22]:
# Baseline: k-nearest-neighbours classifier that votes over the 7 closest training samples.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, Y_train)

# Score on the held-out split. The entry is labelled as KNN: this is a supervised
# nearest-neighbour classifier, not k-means clustering, so the comparison table
# must not call it "K means".
knn_score = knn.score(X_test, Y_test)
result['K-Nearest Neighbors'] = knn_score
print(knn_score)

0.766856901289368


# Decision Tree

In [23]:
# Baseline: decision tree. The seed is fixed (same value as the train/test split)
# because the tree builder permutes features randomly at each split, so an
# unseeded tree can give a different score on every run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, Y_train)

# Score on the held-out split and record it for the comparison table.
dt_score = dt.score(X_test, Y_test)
result['Decision Tree'] = dt_score
print(dt_score)

0.8258296343267808


In [24]:
# Comparison table: one row per model, with the best test score highlighted in
# green and the worst in pink.
result_df = (
    pd.DataFrame(list(result.items()), columns=['Model', 'Test Score'])
    .style
    .highlight_max(subset=['Test Score'], color='lightgreen')
    .highlight_min(subset=['Test Score'], color='pink')
)
display(result_df)

Unnamed: 0,Model,Test Score
0,Logistic Regression,0.842528
1,Naive Bayes,0.717819
2,SVM,0.843796
3,K means Clustering,0.766857
4,Decision Tree,0.82583


# Hyperparameter tuning for each model

# Logistic Regression

In [None]:
# Hyperparameter tuning with 5-fold cross-validation for the Logistic Regression model.

from sklearn.model_selection import GridSearchCV

# Inverse regularisation strengths tried for every penalty/solver pair.
c_values = [0.1, 1, 10, 100]

# Only valid penalty/solver pairs are searched. Of the solvers tried here, just
# liblinear supports the 'l1' penalty; lbfgs, newton-cg and newton-cholesky
# support 'l2' only. Crossing every penalty with every solver in a single grid
# makes the unsupported pairs fail to fit and wastes their cross-validation runs.
param_grid = [
    {'C': c_values, 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
    {'C': c_values, 'penalty': ['l2'], 'solver': ['lbfgs', 'newton-cg', 'newton-cholesky']},
]

# The fitted search keeps its own name so best_params_ / best_estimator_ stay
# available after the results table is built.
log_search = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,
    cv=5,
    return_train_score=False,
)
log_search.fit(X_train, Y_train)

# `grid` is the cross-validation results table, ordered best-ranked first.
grid = pd.DataFrame(log_search.cv_results_).sort_values(by='rank_test_score', ascending=True)
grid[['param_C', 'param_penalty', 'param_solver', 'mean_test_score', 'rank_test_score']].head(20)