In [1]:
import pandas as pd
import numpy as np
dataset = pd.read_csv('text_emotion.csv')

In [2]:
data = dataset[['sentiment', 'content']]

In [3]:
print(data['sentiment'].value_counts())

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64


In [4]:
classes = ['neutral', 'worry', 'happiness', 'sadness']
data = data[data['sentiment'].isin(classes)]

In [5]:
from sklearn import preprocessing
X = data['content'].values.tolist()
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(data['sentiment'].values.tolist())

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1, shuffle=True)

In [7]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
def pre_process(text):
    words = word_tokenize(text)
    words = [x for x in words if x.isalpha()]
    words = [x.lower() for x in words]
    words = [lemmatizer.lemmatize(x) for x in words]
    words = [x for x in words if x not in stop]
    return " ".join(words)


In [8]:
pre_process("I ate Something I don't know what it is... Why do I keep Telling things about food")

'ate something know keep telling thing food'

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word', max_features=10000, preprocessor=pre_process, ngram_range=(1,2))
vectorizer.fit(X)
X_train = vectorizer.transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()


In [10]:
print(len(X_train))
print(len(X_test))

24723
2748


In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(classification_report(y_test, y_pred, target_names=classes))
print('accuracy ', accuracy_score(y_pred, y_test))

              precision    recall  f1-score   support

     neutral       0.54      0.51      0.53       543
       worry       0.45      0.46      0.46       854
   happiness       0.39      0.31      0.35       516
     sadness       0.42      0.48      0.45       835

    accuracy                           0.45      2748
   macro avg       0.45      0.44      0.44      2748
weighted avg       0.45      0.45      0.45      2748

accuracy  0.447962154294032
