# Emotion Classification Using Naïve Bayes

In [1]:
import pandas as pd
import nltk
import math
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/majd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the workbook and keep only its first 30 rows.
df = pd.read_excel('Files/data.xlsx').head(30)

# Emotion labels; they are paired positionally with the sentence columns below.
emotions = ['Sadness', 'Joy', 'Fear', 'Anger', 'Surprise', 'Disgust']

# The sentence columns are taken from every second column starting at index 1.
# For the first six of them, show how the first sentence is tokenized.
for emotion_label, col_name in zip(emotions, df.columns[1::2][:6]):
    sentences = df[col_name].tolist()
    tokens = nltk.word_tokenize(sentences[0])
    print(f"{emotion_label}: {tokens}")
            

# Tally the non-missing cells of every sentence column and keep a running total.
total_num_sentences = 0
for col_name in df.columns[1::2]:
    # A cell counts as a sentence when pandas does not flag it as missing.
    num_sentences = sum(1 for cell in df[col_name] if not pd.isna(cell))
    # Report the per-column figure, then fold it into the grand total.
    print(f"Column '{col_name}' has {num_sentences} sentences.")
    total_num_sentences += num_sentences


# Per-emotion tally of non-empty sentences, every emotion starting at zero.
emotion_counts = dict.fromkeys(emotions, 0)

# Walk the sentence columns (every second column starting at index 1).
for col_name in df.columns[1::2]:
    # Drop the " Sentences" suffix so only the emotion part of the header remains.
    emotion_name = col_name.replace(" Sentences", "")
    # Every tracked emotion whose name occurs in the header shares this column.
    column_emotions = [label for label in emotions if label in emotion_name]
    # Columns that mention none of the tracked emotions are ignored.
    if not column_emotions:
        continue
    # Series.count() only counts the non-missing cells.
    non_empty_sentences = df[col_name].count()
    print(f"Column '{col_name}' has {non_empty_sentences} non-empty sentences.")
    # A shared column contributes its full sentence count to each of its emotions.
    for emotion in column_emotions:
        emotion_counts[emotion] += non_empty_sentences
        print(f" {non_empty_sentences} sentences added to '{emotion}' count")

# Sum of the per-emotion sentence counts. A sentence stored in a column shared
# by several emotions is counted once for each of those emotions.
total_count = sum(emotion_counts.values())

print(f"Total number of sentences (that belongs to 6 emotions): {total_count}")

# Running sum of the class probabilities, used as a sanity check below.
total_prob = 0

# P(emotion) is the emotion's share of the labelled sentences.
for emotion, count in emotion_counts.items():
    prob = count / total_count
    print(f"P({emotion})= {count} / {total_count} = {prob}")
    total_prob += prob

# The probabilities are shares of the same total, so this should be (about) 1.
print(f"Total probability: {total_prob}")


# Define the emotions to count
emotions = ['Sadness', 'Joy', 'Fear', 'Anger', 'Surprise', 'Disgust']

emotion_sentences_count=[]

# One bag per emotion, created up front and addressed by the emotion's index in
# `emotions`. Pre-allocating (instead of list.insert at the emotion index) keeps
# the bags aligned with `emotions` whatever the column order in the sheet, and
# lets a combined column be processed before the single-emotion columns.
emotions_lexicon_bag_of_words = [[] for _ in emotions]
emotions_sentences_bag_of_words = [[] for _ in emotions]

# Loop through each sentence column (every second column starting at index 1)
for col_name in df.columns[1::2]:

    # Extract the emotion name(s) from the column name
    emotion = col_name.replace(' Sentences', '')

    # Position of this sentence column; the lexicon is read from the column
    # immediately to its left.
    sentences_col_idx = df.columns.get_loc(col_name)
    lexicon = df.iloc[:, sentences_col_idx - 1].tolist()
    sentences = df.iloc[:, sentences_col_idx].tolist()

    # Tokenize every lexicon cell that holds text, dropping the comma tokens
    # that separate the entries.
    lex_tokens = []
    for lexeme in lexicon:
        if isinstance(lexeme, str):
            lex_tokens.extend(
                token for token in nltk.word_tokenize(lexeme) if token != ','
            )

    # Remove duplicate lexicon entries; dict.fromkeys keeps the first-seen
    # order, so the result is reproducible from run to run (a set is not).
    lex_tokens = list(dict.fromkeys(lex_tokens))

    # Tokenize every sentence cell that holds text (empty cells are skipped).
    sentences_tokens = []
    for sentence in sentences:
        if isinstance(sentence, str):
            sentences_tokens.extend(nltk.word_tokenize(sentence))

    # A header such as "A + B" (with or without spaces around '+') feeds the
    # bags of every emotion it names; a plain header names exactly one.
    # emotions.index raises ValueError for a name that is not in `emotions`.
    for sub_emotion in (part.strip() for part in emotion.split('+')):
        sub_emotion_index = emotions.index(sub_emotion)
        emotions_lexicon_bag_of_words[sub_emotion_index].extend(lex_tokens)
        emotions_sentences_bag_of_words[sub_emotion_index].extend(sentences_tokens)


# Lower-case every lexicon bag and drop repeated entries, keeping the order in
# which each word is first seen. dict.fromkeys de-duplicates in a single pass,
# instead of scanning a growing list for every word.
for i, bag in enumerate(emotions_lexicon_bag_of_words):
    emotions_lexicon_bag_of_words[i] = list(
        dict.fromkeys(word.lower() for word in bag)
    )


# Vocabulary size |V| for add-one (Laplace) smoothing: the number of DISTINCT
# tokens seen across the sentence bags of all emotions. Using the total token
# count here would over-smooth and the per-emotion likelihoods would no longer
# form a proper add-one estimate.
vocabulary = set()
for bag in emotions_sentences_bag_of_words:
    vocabulary.update(bag)
vocabulary_size = len(vocabulary)

# Per-emotion word likelihoods P(word | emotion).
# NOTE: despite its name, this dictionary holds likelihoods, not class priors;
# the name is kept because the classification code below reads it.
prior_probabilities = {emotion: {} for emotion in emotions}

# Loop through each emotion and its tokenized sentences
for emotion_index, emotion in enumerate(emotions):
    sentences_words_bag = emotions_sentences_bag_of_words[emotion_index]

    # Count the number of occurrences of each word
    word_counts = FreqDist(sentences_words_bag)

    # Add-one smoothing: (count + 1) / (tokens in this emotion + |V|)
    smoothing_denominator = len(sentences_words_bag) + vocabulary_size

    for word, frequency in word_counts.most_common():
        likelihood = (frequency + 1) / smoothing_denominator
        prior_probabilities[emotion][word] = likelihood


s_text = "As she hugged her daughter goodbye on the first day of college, she felt both sad to see her go and joyful knowing that she was embarking on a new and exciting chapter in her life."

s_tokens = nltk.word_tokenize(s_text)

# Words that occur in the training sentences of at least one emotion. Tokens
# outside this set carry no information about any emotion and are ignored.
known_words = set()
for word_likelihoods in prior_probabilities.values():
    known_words.update(word_likelihoods)

# Score every emotion in log space: log P(emotion) + sum of log P(token | emotion).
# Summing logs avoids the floating-point underflow that multiplying many small
# probabilities can cause.
log_posteriors = {}
for emotion_index, emotion in enumerate(emotions):
    # Start from the class prior; an emotion without any sentence can never win.
    class_count = emotion_counts[emotion]
    if class_count > 0:
        log_score = math.log(class_count / total_count)
    else:
        log_score = float('-inf')

    word_likelihoods = prior_probabilities[emotion]
    # Denominator of the add-one estimate for this emotion, needed for words
    # that are known to the model but never occurred with this emotion.
    smoothing_denominator = (
        len(emotions_sentences_bag_of_words[emotion_index]) + vocabulary_size
    )

    for token in s_tokens:
        if token not in known_words:
            continue
        likelihood = word_likelihoods.get(token)
        if likelihood is None:
            # Word never seen with this emotion: use its smoothed probability
            # (0 + 1) / denominator. Skipping it would reward the emotions
            # that match FEWER words of the text.
            likelihood = 1 / smoothing_denominator
        log_score += math.log(likelihood)

    log_posteriors[emotion] = log_score

# Back to probability space. Every score is shifted by the best one before
# exponentiating, so the largest value is exactly 1 and the sum below cannot
# be zero. These values are therefore rescaled, unnormalized posteriors.
best_log_posterior = max(log_posteriors.values())
posteriors_probabilities = {
    emotion: math.exp(log_posteriors[emotion] - best_log_posterior)
    for emotion in emotions
}

# Normalize the posterior probabilities by dividing by the sum of all probabilities
sum_posteriors = sum(posteriors_probabilities.values())
posteriors_normalized = {emotion: posteriors_probabilities[emotion] / sum_posteriors for emotion in emotions}

total_sum_prob = 0
# Print the normalized posterior probabilities for each emotion
print("Normalized posterior probabilities for each emotion:\n")
for emotion, probability in posteriors_normalized.items():
    print(f"P({emotion}|S) = {probability}")
    total_sum_prob+=probability

# Rounded to 10 decimals to hide floating-point noise in the check sum
# (rounding to 1000 decimals had no effect).
print(f"\nTotal Sum Probabilities = {round(total_sum_prob, 10)}")

# Determine the predicted emotion as the one with the highest posterior probability
predicted_emotion = max(posteriors_normalized, key=posteriors_normalized.get)

print(f"\nText: {s_text}")
# Print the predicted emotion
print(f"\nPredicted emotion: {predicted_emotion}")

Sadness: ['The', 'devastating', 'news', 'of', 'the', 'child', "'s", 'abduction', 'left', 'a', 'solemn', 'shadow', 'over', 'the', 'family', 'for', 'the', 'next', 'month', '.']
Joy: ['It', 'was', 'a', 'sunny', 'summer', 'morning', 'and', 'the', 'laughter', 'of', 'children', 'could', 'be', 'heard', 'from', 'the', 'pool', 'as', 'they', 'splashed', 'water', 'onto', 'each', 'other', '.']
Fear: ['As', 'he', 'walked', 'in', 'the', 'dead', 'of', 'night', 'he', 'could', 'hear', 'sudden', 'footsteps', 'echoing', 'from', 'the', 'alleyway', ',', 'a', 'shiver', 'went', 'down', 'his', 'spine', '.']
Anger: ['While', 'driving', 'his', 'family', 'to', 'a', 'restaurant', 'a', 'car', 'recklessly', 'changed', 'lanes', 'barely', 'missing', 'him', ',', 'his', 'face', 'flushed', 'red', 'with', 'fury', 'knowing', 'how', 'close', 'his', 'family', 'was', 'to', 'being', 'seriously', 'harmed', '.']
Surprise: ['She', 'was', 'startled', 'unexpectedly', 'as', 'everyone', 'sprang', 'from', 'their', 'hiding', 'spot', '