In [1]:
# Import packages
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import neattext.functions as nfx

In [2]:
# Load the labelled training corpus from the CSV next to this notebook
df = pd.read_csv(filepath_or_buffer="model-training-dataset.csv")

# Preview the first five rows
df.head(5)

Unnamed: 0,text,label
0,homegirl baby funeral hate funerals shows blessed,sadness
1,babe hugggzzz babe naamazed babe despite nega ...,happiness
2,couldnt wait live missing wasnt painful enuf s...,sadness
3,maken stop moment hele project ziet zitten der...,surprise
4,krijg cadeautje tweep melike,surprise


In [3]:
# Keep the three negative emotions as-is; every other label becomes 'non-negative'
negative_labels = ['anger', 'fear', 'sadness']
keep_label = df['label'].isin(negative_labels)
df['label'] = df['label'].where(keep_label, 'non-negative')

In [4]:
# Persist the relabelled training data.
# index=False: the default would write the row index as an extra unnamed column,
# which comes back as a spurious 'Unnamed: 0' column whenever the file is re-read.
df.to_csv('negPos_data.csv', index=False)

In [5]:
# Number of rows per emotion label after relabelling
df.value_counts(subset='label')

label
non-negative    17432
anger            6400
fear             6400
sadness          6400
dtype: int64

In [6]:
# Row count per emotion label
myValues = df.value_counts('label')

# Total number of labelled rows. Uses Series.sum() rather than a variable named
# `sum`, which would shadow the builtin for the rest of the kernel session.
total_rows = myValues.sum()

# Share of each label, truncated (not rounded) to a whole percent
for label, count in myValues.items():
    percentage = int((count / total_rows) * 100)
    print(f'{label}: {percentage}%')


non-negative: 47%
anger: 17%
fear: 17%
sadness: 17%


In [7]:
# NLTK tokenizer / lemmatizer imports.
# NOTE(review): neither name is used in the cells visible here; the vectorizer
# below relies on its default tokenizer.
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# Bag-of-words features: accents stripped, lower-cased, English stop words removed,
# dropping terms found in more than half of the documents or in fewer than 10.
count_vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    max_df=0.5,
    min_df=10,
)

# Logistic regression with a raised iteration cap
logistic_regression = LogisticRegression(max_iter=1000)

# Chain both stages so raw text can be passed straight to fit / predict
pipe_lr1 = Pipeline(steps=[
    ('cv', count_vectorizer),
    ('lr', logistic_regression),
])

In [8]:
# Load the held-out test set and collapse its labels the same way as the training
# data: anger / fear / sadness are kept, everything else becomes 'non-negative'.
df2 = pd.read_csv('model-test-dataset.csv')
negative_labels = ['anger', 'fear', 'sadness']
df2['label'] = df2['label'].where(df2['label'].isin(negative_labels), 'non-negative')

# index=False: the default would write the row index as an extra unnamed column,
# which comes back as a spurious 'Unnamed: 0' column when the file is read again.
df2.to_csv('negPos_test_data.csv', index=False)

In [9]:
# Model input is the raw text column; the target is the collapsed emotion label
xfeatures, ylabels = df['text'], df['label']

# Train the vectorizer + classifier pipeline on the full training frame
pipe_lr1.fit(xfeatures, ylabels)

# Held-out evaluation data: the relabelled test CSV written earlier
test_data = pd.read_csv(filepath_or_buffer='negPos_test_data.csv')
x_test, y_test = test_data['text'], test_data['label']

In [10]:
# Predict every test text in a single batch call; the pipeline vectorises the
# whole column at once, which is far cheaper than one predict() call per row.
predictions = list(pipe_lr1.predict(test_data['text']))

# Add column 'result' to DataFrame
test_data['result'] = predictions

# True where the predicted label equals the reference label
is_correct = test_data['label'] == test_data['result']

# Print Overall Model's Accuracy (fraction of exact matches, truncated to a whole
# percent). Reuses the predictions above instead of predicting a second time.
print(f'Overall Model Score: {int(is_correct.mean()*100)}%')

# Rows per reference label, most frequent first (this fixes the print order)
emotionCount = test_data.value_counts('label')

# Correct predictions per reference label
correct_by_label = is_correct.groupby(test_data['label']).sum()

# For each emotion label, report the share of its rows that were predicted
# correctly, truncated to a whole percent.
for emotion in emotionCount.index:
    count_correct = int(correct_by_label[emotion])
    count_total = int(emotionCount[emotion])
    accuracy = int((count_correct / count_total) * 100)
    print(f"Model's Accuracy at Detecting {emotion}: {accuracy}%")

Overall Model Score: 89%
Model's Accuracy at Detecting non-negative: 92%
Model's Accuracy at Detecting sadness: 86%
Model's Accuracy at Detecting anger: 84%
Model's Accuracy at Detecting fear: 77%


In [11]:
import pickle

# Serialise the fitted pipeline. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open('negative_emotion_classifier.pkl', 'wb') as model_file:
    pickle.dump(pipe_lr1, model_file)