In [1]:
import pandas as pd
# Read the labelled comments into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Emotion_classify_Data.csv')
df.head(n=5)

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [2]:
# Count how many comments carry each emotion label.
df['Emotion'].value_counts()

Emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sn

In [7]:
# Encode the string labels as integers: anger -> 0, joy -> 1, fear -> 2.
df['Emotion_number'] = df['Emotion'].map(dict(anger=0, joy=1, fear=2))

In [8]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,Comment,Emotion,Emotion_numeber,Emotion_number
0,i seriously hate one subject to death but now ...,fear,2,2
1,im so full of life i feel appalled,anger,0,0
2,i sit here to write i start to dig out my feel...,fear,2,2
3,ive been really angry with r and i feel like a...,joy,1,1
4,i feel suspicious if there is no one outside l...,fear,2,2


In [9]:
# Remove the misspelled 'Emotion_numeber' column if it is present.
# No cell in this notebook creates that column (it only exists in a kernel
# where an earlier, since-edited cell was run), so an unconditional drop raises
# KeyError on Restart & Run All. errors='ignore' makes the cell safe to run on
# a fresh kernel and safe to re-run.
df = df.drop(columns=['Emotion_numeber'], errors='ignore')

In [10]:
# Preview the first five rows again after removing the misspelled column.
df.head()

Unnamed: 0,Comment,Emotion,Emotion_number
0,i seriously hate one subject to death but now ...,fear,2
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,2
3,ive been really angry with r and i feel like a...,joy,1
4,i feel suspicious if there is no one outside l...,fear,2


In [13]:
# Hold out 20% of the raw comments for evaluation. The split is stratified on
# the numeric label and seeded so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['Comment'],
    df['Emotion_number'],
    test_size=0.2,
    random_state=2022,
    stratify=df['Emotion_number'],
)
X_train.shape

(4749,)

In [14]:
# Number of comments held out for evaluation by the split above.
X_test.shape

(1188,)

In [16]:
# Trigram-only bag-of-words features classified with a random forest.
# The forest is seeded (same value as the train/test split) so the printed
# report is identical on every run; an unseeded forest gives slightly
# different scores each time, which makes comparing pipelines unreliable.
clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(3, 3))),
    ('classifier', RandomForestClassifier(random_state=2022)),
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.50      0.37      0.43       400
           1       0.61      0.25      0.36       400
           2       0.40      0.74      0.52       388

    accuracy                           0.45      1188
   macro avg       0.50      0.46      0.43      1188
weighted avg       0.51      0.45      0.43      1188



In [18]:
# Unigram + bigram counts classified with multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', MultinomialNB()),
])
# fit() returns the fitted pipeline, so training and prediction can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       400
           1       0.88      0.88      0.88       400
           2       0.86      0.86      0.86       388

    accuracy                           0.88      1188
   macro avg       0.88      0.88      0.88      1188
weighted avg       0.88      0.88      0.88      1188



In [19]:
# Unigram + bigram counts classified with a random forest.
# The forest is seeded (same value as the train/test split) so the printed
# report is identical on every run; an unseeded forest gives slightly
# different scores each time, which makes comparing pipelines unreliable.
clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier(random_state=2022)),
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.87      0.90       400
           1       0.86      0.96      0.91       400
           2       0.94      0.89      0.91       388

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



In [20]:
# TF-IDF weighted unigram features classified with a random forest.
# The forest is seeded (same value as the train/test split) so the printed
# report is identical on every run; an unseeded forest gives slightly
# different scores each time, which makes comparing pipelines unreliable.
clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(random_state=2022)),
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       400
           1       0.91      0.92      0.91       400
           2       0.91      0.92      0.92       388

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



In [21]:
# Load the spaCy pipeline once; preprocess() reuses it for every comment.
nlp = spacy.load('en_core_web_sm')


def preprocess(text):
    """Return the lemmas of `text` with punctuation and stop words removed.

    The text is run through the module-level `nlp` pipeline. Every token that
    is neither punctuation nor a stop word contributes its lemma, and the
    lemmas are joined with single spaces in their original order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_punct or tok.is_stop)
    ]
    return " ".join(kept_lemmas)


  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# Build the "preprocessed_comment" column by running preprocess() on every
# comment. This step is slow, so expect to wait while it runs.
df['preprocessed_comment'] = df['Comment'].apply(preprocess)

In [24]:
# Redo the 80/20 stratified split, this time on the preprocessed text. The
# seed is the same as for the raw-comment split.
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_comment'],
    df['Emotion_number'],
    test_size=0.2,
    random_state=2022,
    stratify=df['Emotion_number'],
)
X_train.shape

(4749,)

In [25]:
# Show the frame now that `preprocessed_comment` has been added next to the raw text.
df

Unnamed: 0,Comment,Emotion,Emotion_number,preprocessed_comment
0,i seriously hate one subject to death but now ...,fear,2,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,0,m life feel appalled
2,i sit here to write i start to dig out my feel...,fear,2,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,1,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,fear,2,feel suspicious outside like rapture happen
...,...,...,...,...
5932,i begun to feel distressed for you,fear,2,begin feel distressed
5933,i left feeling annoyed and angry thinking that...,anger,0,leave feel annoyed angry thinking center stupi...
5934,i were to ever get married i d have everything...,joy,1,marry d ready offer ve get club perfect good l...
5935,i feel reluctant in applying there because i w...,fear,2,feel reluctant apply want able find company kn...


In [26]:
# Unigram + bigram counts of the preprocessed text, classified with a random
# forest. The forest is seeded (same value as the train/test split) so the
# printed report is identical on every run; an unseeded forest gives slightly
# different scores each time, which makes comparing against the raw-text
# pipelines unreliable.
clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier(random_state=2022)),
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93       400
           1       0.94      0.95      0.95       400
           2       0.94      0.91      0.93       388

    accuracy                           0.94      1188
   macro avg       0.94      0.94      0.94      1188
weighted avg       0.94      0.94      0.94      1188



              precision    recall  f1-score   support

           0       0.92      0.92      0.92       400
           1       0.94      0.95      0.94       400
           2       0.92      0.92      0.92       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188

