In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('final-after-preprocessing/after-preprocessing.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,text,emotion,preprocessed_text
0,0,i didnt feel humiliated,sadness,feel humiliated
1,1,i can go from feeling so hopeless to so damned...,sadness,go feeling hopeless damned hopeful around some...
2,2,im grabbing a minute to post i feel greedy wrong,anger,grabbing minute post feel greedy wrong
3,3,i am feeling grouchy,anger,feeling grouchy
4,4,ive been feeling a little burdened lately wasn...,sadness,feeling little burdened lately sure
...,...,...,...,...
23649,23649,I received a letter from a distant friend.,joy,received letter distant friend
23650,23650,My parents were out and I was the eldest at ho...,fear,parent eldest home midnight male stranger phon...
23651,23651,Two years back someone invited me to be the tu...,anger,two year back someone invited tutor granddaugh...
23652,23652,I had taken the responsibility to do something...,sadness,taken responsibility something prepared howeve...


In [4]:
df= df[["preprocessed_text", "emotion"]]

In [5]:
df

Unnamed: 0,preprocessed_text,emotion
0,feel humiliated,sadness
1,go feeling hopeless damned hopeful around some...,sadness
2,grabbing minute post feel greedy wrong,anger
3,feeling grouchy,anger
4,feeling little burdened lately sure,sadness
...,...,...
23649,received letter distant friend,joy
23650,parent eldest home midnight male stranger phon...,fear
23651,two year back someone invited tutor granddaugh...,anger
23652,taken responsibility something prepared howeve...,sadness


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Train-Test split 80-20

In [7]:
# train test split
x_train,x_test,y_train,y_test = train_test_split(df.preprocessed_text.values, df.emotion.values, test_size = 0.2, random_state=42)


In [8]:

x_train.shape,y_train.shape,x_test.shape,y_test.shape

((18923,), (18923,), (4731,), (4731,))

In [9]:
tfidf = TfidfVectorizer()
tfidf.fit(x_train)
X_train = tfidf.transform(x_train)
X_test = tfidf.transform(x_test)


In [10]:
X_train

<18923x12563 sparse matrix of type '<class 'numpy.float64'>'
	with 166464 stored elements in Compressed Sparse Row format>

In [11]:
X_test

<4731x12563 sparse matrix of type '<class 'numpy.float64'>'
	with 40284 stored elements in Compressed Sparse Row format>

# Label encoding 

In [12]:
enc = LabelEncoder()
enc.fit(y_train)
Y_train = enc.transform(y_train)
Y_test = enc.transform(y_test)

In [13]:
labels = enc.classes_
labels


array(['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'],
      dtype=object)

In [14]:
Y_test

array([3, 4, 3, ..., 0, 3, 3])

# Machine Learning Algorithms

In [15]:
Y_test.shape

(4731,)

In [16]:
Y_train.shape

(18923,)

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import Pipeline

In [24]:
from sklearn.preprocessing import StandardScaler

# Instantiate StandardScaler
# Instantiate StandardScaler with with_mean=False for sparse matrices
scaler = StandardScaler(with_mean=False)

# Fit scaler to the training data and transform the training data
X_train = scaler.fit_transform(X_train)

# Transform the test data using the scaler fitted on the training data
X_test = scaler.transform(X_test)

In [25]:
result = {}

## Logistic Regression

In [26]:
# Instantiate logistic regression model
log = LogisticRegression()

# Fit the model
log.fit(X_train, Y_train)  # Assuming X_train and Y_train are the training data

# Evaluate the model
log_score = log.score(X_test, Y_test)  # Assuming X_test and Y_test are the test data

# Update the result dictionary
result['Logistic Regression'] = log_score

# Print the score
print(log_score)

0.7085182836609596


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Multinomial NB

In [27]:
nb = MultinomialNB()
nb.fit(X_train,Y_train)
nb_score = nb.score(X_test,Y_test)
result['Naive Bayes']=nb_score
print(nb_score)

0.5768336503910378


# SVM

In [28]:
svm = SVC()
svm.fit(X_train,Y_train)
svm_score = svm.score(X_test,Y_test)
result['SVM']=svm_score
print(svm_score)

0.6444726273515113


# K-means Clustering

In [29]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train,Y_train)
knn_score = knn.score(X_test,Y_test)
result['K means Clustering']=knn_score
print(knn_score)

0.6442612555485099


# Decision Tree

In [30]:
dt = DecisionTreeClassifier()
dt.fit(X_train,Y_train)
dt_score = dt.score(X_test,Y_test)
result['Decision Tree']=dt_score
print(dt_score)

0.82287042908476


In [31]:
# display results
result_df = []
for model,score in result.items():
    result_df.append([model,score])
    
result_df = pd.DataFrame(result_df, columns=['Model','Test Score'])
result_df = result_df.style
result_df = result_df.highlight_max(subset=['Test Score'], color = 'lightgreen')
result_df = result_df.highlight_min(subset=['Test Score'], color = 'pink')
display(result_df)

Unnamed: 0,Model,Test Score
0,Logistic Regression,0.708518
1,Naive Bayes,0.576834
2,SVM,0.644473
3,K means Clustering,0.644261
4,Decision Tree,0.82287


# Parameter tuning for each

# Logistic Regression

In [None]:
# Hyper parameter tuning and cross validation for the Logistic Regression model

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(LogisticRegression(max_iter=1000), {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'] 
}, cv=5, return_train_score=False)

grid.fit(X_train,Y_train)
grid = pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score', ascending=True)
grid[['param_C', 'param_penalty', 'param_solver', 'mean_test_score', 'rank_test_score']].tail(20)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

# Define hyperparameter distributions
param_dist = {
    'C': uniform(loc=0, scale=100),  # Continuous distribution for regularization parameter
    'penalty': ['l1', 'l2'],         # Penalty term
    'solver': ['liblinear', 'saga']  # Solver algorithm
}

# Instantiate logistic regression model
logistic_regression = LogisticRegression(max_iter=1000)

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=logistic_regression, param_distributions=param_dist,
                                   n_iter=10, cv=5, scoring='accuracy', random_state=42)

# Perform random search
random_search.fit(X_train, Y_train)

# Get the best parameters and best score
best_params = random_search.best_params_
best_score = random_search.best_score_

# Print the best parameters and best score
print("Best Parameters:", best_params)
print("Best Score:", best_score)


# Ensemble learning

In [21]:
from sklearn.ensemble import VotingClassifier

# Instantiate base models
logistic_regression = LogisticRegression()
svm = SVC()

# Define the ensemble classifier using voting
ensemble_classifier = VotingClassifier(estimators=[('lr', logistic_regression), ('svm', svm)], voting='hard')

# Fit the ensemble classifier on the training data
ensemble_classifier.fit(X_train, Y_train)

# Evaluate the ensemble classifier on the test data
ensemble_score = ensemble_classifier.score(X_test, Y_test)

# Update the result dictionary
result['Ensemble'] = ensemble_score
print("Ensemble Score:", ensemble_score)


Ensemble Score: 0.8456985838089199


In [22]:
from sklearn.ensemble import StackingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC

# # Instantiate base classifiers
# logistic_regression = LogisticRegression()
# svm = SVC()

# Define the stacking classifier with meta-classifier as Logistic Regression
stacking_classifier = StackingClassifier(estimators=[('lr', logistic_regression), ('svm', svm)],
                                          final_estimator=LogisticRegression())

# Fit the stacking classifier on the training data
stacking_classifier.fit(X_train, Y_train)

# Evaluate the stacking classifier on the test data
stacking_score = stacking_classifier.score(X_test, Y_test)

# Update the result dictionary
result['Stacking Classifier'] = stacking_score
print("Stacking Classifier Score:", stacking_score)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Stacking Classifier Score: 0.848869160853942
