# Emotion Classification Using Logistic Regression

In [5]:
import pandas as pd
import nltk
import math
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.metrics import confusion_matrix
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/majd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
#Read the Excel sheet into a pandas DataFrame
df = pd.read_excel('Files/data.xlsx')

# Select the first 30 rows of the DataFrame
df = df.head(30)

# Define the emotions to count
emotions = ['Sadness', 'Joy', 'Fear', 'Anger', 'Surprise', 'Disgust']

# Initialize a dictionary to store the sentence counts for each emotion
emotion_sentences_counts = {emotion: 0 for emotion in emotions}

# Loop through each even-numbered column in the DataFrame
for i, col_name in enumerate(df.columns[1::2]):
    # Remove "Sentences" from the column name
    emotion_name = col_name.replace(" Sentences", "")
    # Check if the column contains any emotions to count
    column_emotions = []
    for emotion in emotions:
        if emotion in emotion_name:
            column_emotions.append(emotion)
    # If the column contains at least one emotion to count, add the number of non-empty sentences to the count
    if len(column_emotions) > 0:
        non_empty_sentences = df[col_name].dropna().count()
        for emotion in column_emotions:
            emotion_sentences_counts[emotion] += non_empty_sentences
        # Print the number of non-empty sentences for each emotion and column
        print(f"Column '{col_name}' has {non_empty_sentences} non-empty sentences.")
        for emotion in column_emotions:
            print(f" {non_empty_sentences} sentences added to '{emotion}' count")

# Get total count of sentences in "Sadness" + "Joy"
total_count = emotion_sentences_counts['Sadness'] + emotion_sentences_counts['Joy']

emotions_lexicon_bag_of_words = {emotion: [] for emotion in emotions}
emotions_sentences_bag_of_words = {emotion: [] for emotion in emotions}

# Loop through each emotion
for i, col_name in enumerate(df.columns[1::2]):

    # Extract the emotion name from the column name
    emotion = col_name.replace(' Sentences', '')
    # Get the index of the "Sentences" column for this emotion
    sentences_col_idx = df.columns.get_loc(col_name)
    # Get emotion lexicon from the "Lexicon" column
    lexicon = df.iloc[0:, sentences_col_idx-1].tolist()
    # Get the sentences from the "Sentences" column
    sentences = df.iloc[0:, sentences_col_idx].tolist()
    # Create an empty list to store the lexicon
    lex_tokens = []
    # Create an empty list to store the sentences tokens
    sentences_tokens = []
    # Loop through each lexicon col
    for lexeme in lexicon:
        # Check if the lexicon is a valid string
        if isinstance(lexeme, str):
            # Tokenize the words in each row "lexeme"
            tokens = nltk.word_tokenize(lexeme)
            # Remove commas
            tokens_without_commas = [token for token in tokens if token != ',']
            # Append to array
            lex_tokens.extend(tokens_without_commas)

    # Set to lowercase and remove duplicates lexicon
    lex_tokens = list(set(token.lower() for token in lex_tokens))
    # Loop through each sentence col
    for sentence in sentences:
        # Check if the sentence is a valid string
        if isinstance(sentence, str):
          # Tokenize the words in each row "sentence"
          tokens = nltk.word_tokenize(sentence)
          # Append to array
          sentences_tokens.append(tokens)
    # If the emotion contains more than one emotion, split it and add the words to the respective index arrays
    if '+' in emotion:
        sub_emotions = emotion.split(' + ')
        for sub_emotion in sub_emotions:
            emotions_lexicon_bag_of_words[sub_emotion].extend(lex_tokens)
            emotions_sentences_bag_of_words[sub_emotion].extend(sentences_tokens)
    else:
        emotions_lexicon_bag_of_words[emotion].extend(lex_tokens)
        emotions_sentences_bag_of_words[emotion].extend(sentences_tokens)

# Remove duplicates from "Sadness" and "Joy" lexicon
emotions_lexicon_bag_of_words['Sadness'] = list(set(emotions_lexicon_bag_of_words['Sadness']))
emotions_lexicon_bag_of_words['Joy'] = list(set(emotions_lexicon_bag_of_words['Joy']))

# Joy lexicon 
x1 = len(emotions_lexicon_bag_of_words['Joy'])
# Sadness lexicon 
x2 = len(emotions_lexicon_bag_of_words['Sadness'])
# Sadness sentences tokens + Sadness sentences tokens
x3 = math.log(len(emotions_sentences_bag_of_words['Joy']) + len(emotions_sentences_bag_of_words['Sadness']))
# y label
y = 1
# Weights vector
w = [0, 0, 0]
# Bias
b = 0

Column 'Sadness Sentences' has 30 non-empty sentences.
 30 sentences added to 'Sadness' count
Column 'Joy Sentences' has 30 non-empty sentences.
 30 sentences added to 'Joy' count
Column 'Fear Sentences' has 30 non-empty sentences.
 30 sentences added to 'Fear' count
Column 'Anger Sentences' has 30 non-empty sentences.
 30 sentences added to 'Anger' count
Column 'Surprise Sentences' has 29 non-empty sentences.
 29 sentences added to 'Surprise' count
Column 'Disgust Sentences' has 30 non-empty sentences.
 30 sentences added to 'Disgust' count
Column 'Sadness + Joy Sentences' has 30 non-empty sentences.
 30 sentences added to 'Sadness' count
 30 sentences added to 'Joy' count
Column 'Fear + Anger Sentences' has 30 non-empty sentences.
 30 sentences added to 'Fear' count
 30 sentences added to 'Anger' count
Column 'Surprise + Disgust Sentences' has 28 non-empty sentences.
 28 sentences added to 'Surprise' count
 28 sentences added to 'Disgust' count
Column 'Sadness + Joy + Fear Sentences'

In [7]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def logistic_regression_prob(x1, x2, x3, w, b):
    # Combine the features into a single input vector
    x = np.array([x1, x2, x3])

    # Compute the logit (linear combination of features and weights)
    logit = np.dot(x, w) + b

    # Compute the probability using the sigmoid function
    probability = sigmoid(logit)

    return probability

# Call the logistic regression classifier function
joy_probability = logistic_regression_prob(x1, x2, x3, w, b)

# Print the computed probability
print("Joy probability = " + str(joy_probability))
print("Sadness probability = " + str(1- joy_probability))


# Compute Cross Entropu Loss
def cross_entropy_loss(x1, x2, x3, w, b, y):

    return -1*(y*math.log(logistic_regression_prob(x1, x2, x3, w, b)))+((1-y)*(1-logistic_regression_prob(x1, x2, x3, w, b)))

loss_1 = cross_entropy_loss(x1, x2, x3, w, b, y)
loss_2 = cross_entropy_loss(x1, x2, x3, w, b, 0)
print("Loss when p(y = 1) = " + str(loss_1))
print("Loss when p(y = 0) = " + str(loss_2))

Joy probability = 0.5
Sadness probability = 0.5
Loss when p(y = 1) = 0.6931471805599453
Loss when p(y = 0) = 0.5


In [8]:
# Read the Excel sheet into a pandas DataFrame
df = pd.read_excel('Files/data.xlsx')

# Select the next validation rows from index 30 to 40
df = df.head(40).tail(10)

# Define the emotions to count
emotions = ['Sadness', 'Joy', 'Fear', 'Anger', 'Surprise', 'Disgust']

# Initialize a dictionary to store the sentence counts for each emotion
emotion_sentences_counts = {emotion: 0 for emotion in emotions}

# Loop through each even-numbered column in the DataFrame
for i, col_name in enumerate(df.columns[1::2]):
    # Remove "Sentences" from the column name
    emotion_name = col_name.replace(" Sentences", "")
    # Check if the column contains any emotions to count
    column_emotions = []
    for emotion in emotions:
        if emotion in emotion_name:
            column_emotions.append(emotion)
    # If the column contains at least one emotion to count, add the number of non-empty sentences to the count
    if len(column_emotions) > 0:
        non_empty_sentences = df[col_name].dropna().count()
        for emotion in column_emotions:
            emotion_sentences_counts[emotion] += non_empty_sentences
        # Print the number of non-empty sentences for each emotion and column
        print(f"Column '{col_name}' has {non_empty_sentences} non-empty sentences.")
        for emotion in column_emotions:
            print(f" {non_empty_sentences} sentences added to '{emotion}' count")

# Get total count of sentences in "Sadness" + "Joy"
total_count = emotion_sentences_counts['Sadness'] + emotion_sentences_counts['Joy']

emotions_lexicon_bag_of_words = {emotion: [] for emotion in emotions}
emotions_sentences_bag_of_words = {emotion: [] for emotion in emotions}

print("------------------------------------------")
# Loop through each emotion
for i, col_name in enumerate(df.columns[1::2]):

    # Extract the emotion name from the column name
    emotion = col_name.replace(' Sentences', '')
    # Get the index of the "Sentences" column for this emotion
    sentences_col_idx = df.columns.get_loc(col_name)
    # Get emotion lexicon from the "Lexicon" column
    lexicon = df.iloc[0:, sentences_col_idx-1].tolist()
    # Get the sentences from the "Sentences" column
    sentences = df.iloc[0:, sentences_col_idx].tolist()
    # Create an empty list to store the lexicon
    lex_tokens = []
    # Create an empty list to store the sentences tokens
    sentences_tokens = []
    # Loop through each lexicon col
    for lexeme in lexicon:
        # Check if the lexicon is a valid string
        if isinstance(lexeme, str):
            # Tokenize the words in each row "lexeme"
            tokens = nltk.word_tokenize(lexeme)
            # Remove commas
            tokens_without_commas = [token for token in tokens if token != ',']
            # Append to array
            lex_tokens.extend(tokens_without_commas)

    # Set to lowercase and remove duplicates lexicon
    lex_tokens = list(set(token.lower() for token in lex_tokens))
    # Loop through each sentence col
    for sentence in sentences:
        # Check if the sentence is a valid string
        if isinstance(sentence, str):
          # Tokenize the words in each row "sentence"
          tokens = nltk.word_tokenize(sentence)
          # Append to array
          sentences_tokens.append(tokens)
    # If the emotion contains more than one emotion, split it and add the words to the respective index arrays
    if '+' in emotion:
        sub_emotions = emotion.split(' + ')
        for sub_emotion in sub_emotions:
            emotions_lexicon_bag_of_words[sub_emotion].extend(lex_tokens)
            emotions_sentences_bag_of_words[sub_emotion].extend(sentences_tokens)
    else:
        emotions_lexicon_bag_of_words[emotion].extend(lex_tokens)
        emotions_sentences_bag_of_words[emotion].extend(sentences_tokens)


# Remove duplicates from "Sadness" and "Joy" lexicon
emotions_lexicon_bag_of_words['Sadness'] = list(set(emotions_lexicon_bag_of_words['Sadness']))
emotions_lexicon_bag_of_words['Joy'] = list(set(emotions_lexicon_bag_of_words['Joy']))

# Joy lexicon 
x1 = len(emotions_lexicon_bag_of_words['Joy'])
# Sadness lexicon 
x2 = len(emotions_lexicon_bag_of_words['Sadness'])
# Sadness sentences tokens + Sadness sentences tokens
x3 = math.log(len(emotions_sentences_bag_of_words['Joy']) + len(emotions_sentences_bag_of_words['Sadness']))
# y label
y = 1
# Weights vector
w = [0, 0, 0]
# Bias
b = 0

# Compute Cross Entropy Loss
def cross_entropy_loss(x1, x2, x3, w, b, y):

    return -1*(y*math.log(logistic_regression_prob(x1, x2, x3, w, b)))+((1-y)*(1-logistic_regression_prob(x1, x2, x3, w, b)))


def stochastic_gradient_descent(x1, x2, x3, y, learning_rate, iterations_num):
    # Initialize weights and bias
    w = np.zeros(3)
    b = 0

    for i in range(iterations_num):
        # Compute the predicted probability
        prob = logistic_regression_prob(x1, x2, x3, w, b)

        # Compute the gradients
        gradient_w = (prob - y) * np.array([x1, x2, x3])
        gradient_b = prob - y

        # Update the weights and bias
        w -= learning_rate * gradient_w
        b -= learning_rate * gradient_b

    return w, b

# Learning rates
learning_rates= [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]

# Number of iterations
iterations_num = 10000

best_probability = 0
lowest_loss = float('inf')
best_learning_rate = None
best_w = None
best_b = None
learning_rate_losses = []

# Iterate through learning rates
for learning_rate in learning_rates:
    # Perform stochastic gradient descent
    w, b = stochastic_gradient_descent(x1, x2, x3, y, learning_rate, iterations_num)
    # Compute the probability using the obtained weights and bias
    probability = logistic_regression_prob(x1, x2, x3, w, b)
    # Compute the cross-entropy loss
    loss = cross_entropy_loss(x1, x2, x3, w, b, y)
    # Update the best results if the current probability is higher
    if probability > best_probability:
        best_probability = probability
        best_learning_rate = learning_rate
        best_w = w
        best_b = b
    # Update the lowest loss and corresponding learning rate if the current loss is lower
    if loss < lowest_loss:
        lowest_loss = loss
        best_learning_rate = learning_rate
        best_w = w
        best_b = b
    # Append the learning rate and validation loss to the list
    learning_rate_losses.append((learning_rate, probability, loss))

print("1. Best Learning Rate:" + str(best_learning_rate))
print("2. Lowest Validation Loss:"+ str(lowest_loss) + " for learning rate:" + str(best_learning_rate))
print("3. Best Probability:" + str(best_probability) + " for learning rate:" + str( best_learning_rate))
print("4. Best Weight Vector:", best_w)
print("5. Best Bias:", best_b)
print("6. List of all Learning Rates and Validation Losses:")
for rate, probability, loss in learning_rate_losses:
    print("   Learning Rate:" + str("{:.5f}".format(rate)) + " Probability:" + str("{:.5f}".format(probability)) + " Validation Loss:" + str(loss))

Column 'Sadness Sentences' has 10 non-empty sentences.
 10 sentences added to 'Sadness' count
Column 'Joy Sentences' has 10 non-empty sentences.
 10 sentences added to 'Joy' count
Column 'Fear Sentences' has 10 non-empty sentences.
 10 sentences added to 'Fear' count
Column 'Anger Sentences' has 10 non-empty sentences.
 10 sentences added to 'Anger' count
Column 'Surprise Sentences' has 10 non-empty sentences.
 10 sentences added to 'Surprise' count
Column 'Disgust Sentences' has 10 non-empty sentences.
 10 sentences added to 'Disgust' count
Column 'Sadness + Joy Sentences' has 10 non-empty sentences.
 10 sentences added to 'Sadness' count
 10 sentences added to 'Joy' count
Column 'Fear + Anger Sentences' has 10 non-empty sentences.
 10 sentences added to 'Fear' count
 10 sentences added to 'Anger' count
Column 'Surprise + Disgust Sentences' has 10 non-empty sentences.
 10 sentences added to 'Surprise' count
 10 sentences added to 'Disgust' count
Column 'Sadness + Joy + Fear Sentences'

In [9]:
# Read the Excel sheet into a pandas DataFrame
df = pd.read_excel('Files/data.xlsx')

# Select the next test rows from index 40 to 50
df = df.tail(10)

# Define the emotions to count
emotions = ['Sadness', 'Joy', 'Fear', 'Anger', 'Surprise', 'Disgust']

# Counting the number of sentences for each emotion
emotion_sentences_counts = {emotion: 0 for emotion in emotions}

# Loop through each even-numbered column in the DataFrame
for i, col_name in enumerate(df.columns[1::2]):
    # Remove "Sentences" from the column name
    emotion_name = col_name.replace(" Sentences", "")
    # Check if the column contains any emotions to count
    column_emotions = []
    for emotion in emotions:
        if emotion in emotion_name:
            column_emotions.append(emotion)
    # If the column contains at least one emotion to count, add the number of non-empty sentences to the count
    if len(column_emotions) > 0:
        non_empty_sentences = df[col_name].dropna().count()
        for emotion in column_emotions:
            emotion_sentences_counts[emotion] += non_empty_sentences
        # Print the number of non-empty sentences for each emotion and column
        print(f"Column '{col_name}' has {non_empty_sentences} non-empty sentences.")
        for emotion in column_emotions:
            print(f" {non_empty_sentences} sentences added to '{emotion}' count")

# Get total count of sentences in "Sadness" + "Joy"
total_count = emotion_sentences_counts['Sadness'] + emotion_sentences_counts['Joy']
# Store our lexicons and sentences
emotions_lexicon_bag_of_words = {emotion: [] for emotion in emotions}
emotions_sentences_bag_of_words = {emotion: [] for emotion in emotions}

# Loop through each emotion
for i, col_name in enumerate(df.columns[1::2]):

    # Extract the emotion name from the column name
    emotion = col_name.replace(' Sentences', '')
    # Get the index of the "Sentences" column for this emotion
    sentences_col_idx = df.columns.get_loc(col_name)
    # Get emotion lexicon from the "Lexicon" column
    lexicon = df.iloc[0:, sentences_col_idx-1].tolist()
    # Get the sentences from the "Sentences" column
    sentences = df.iloc[0:, sentences_col_idx].tolist()
    # Create an empty list to store the lexicon
    lex_tokens = []
    # Create an empty list to store the sentences tokens
    sentences_tokens = []
    # Loop through each lexicon col
    for lexeme in lexicon:
        # Check if the lexicon is a valid string
        if isinstance(lexeme, str):
            # Tokenize the words in the each row "lexeme"
            tokens = nltk.word_tokenize(lexeme)
            # Remove commas
            tokens_without_commas = [token for token in tokens if token != ',']
            # Append to array
            lex_tokens.extend(tokens_without_commas)

    # Set to lowercase and remove duplicates lexicon
    lex_tokens = list(set(token.lower() for token in lex_tokens))
    # Loop through each sentence col
    for sentence in sentences:
        # Check if the sentence is a valid string
        if isinstance(sentence, str):
          # Tokenize the words in each row "sentence"
          tokens = nltk.word_tokenize(sentence)
          # Append to array
          sentences_tokens.append(tokens)
    # If the emotion contains more than one emotion, split it and add the words to the respective index arrays
    if '+' in emotion:
        sub_emotions = emotion.split(' + ')
        for sub_emotion in sub_emotions:
            emotions_lexicon_bag_of_words[sub_emotion].extend(lex_tokens)
            emotions_sentences_bag_of_words[sub_emotion].extend(sentences_tokens)
    else:
        emotions_lexicon_bag_of_words[emotion].extend(lex_tokens)
        emotions_sentences_bag_of_words[emotion].extend(sentences_tokens)

# Remove duplicates from "Sadness" and "Joy" lexicon
emotions_lexicon_bag_of_words['Sadness'] = list(set(emotions_lexicon_bag_of_words['Sadness']))
emotions_lexicon_bag_of_words['Joy'] = list(set(emotions_lexicon_bag_of_words['Joy']))
# Define the ground truth labels
y_true = [1] * 29 + [0] * 29
# Define the predicted labels
y_pred = []

print("\nPOSITIVE SENTENCES\n")
for sentence in emotions_sentences_bag_of_words['Joy']:
    frequency_distribution = nltk.FreqDist(sentence)
    positive_count = sum(frequency_distribution[word] for word in emotions_lexicon_bag_of_words['Joy'])
    negative_count = sum(frequency_distribution[word] for word in emotions_lexicon_bag_of_words['Sadness'])
    sentence_length = math.log(len(sentence))
    w = [0.2325, 0.235, 0.01023586]
    b = 0.0025
    probability = logistic_regression_prob(positive_count, negative_count, sentence_length, w, b)
    if(probability > 0.65):
      y_pred.append(1)
    else:
      y_pred.append(0)
    print("Sentence:", sentence)
    print("Length:", sentence_length)
    print("Positive Count:", positive_count)
    print("Negative Count:", negative_count)
    print("Probability:", probability)
    print("---------")

print("------------------------------------------")

print("\nNEGATIVE SENTENCES\n")
for sentence in emotions_sentences_bag_of_words['Sadness']:
    frequency_distribution = nltk.FreqDist(sentence)
    positive_count = sum(frequency_distribution[word] for word in emotions_lexicon_bag_of_words['Joy'])
    negative_count = sum(frequency_distribution[word] for word in emotions_lexicon_bag_of_words['Sadness'])
    sentence_length = math.log(len(sentence))
    # Weights vector
    w = [0.2325, 0.235, 0.01023586]
    b = 0.0025
    probability = logistic_regression_prob(negative_count, positive_count, sentence_length, w, b)
    if(probability > 0.65):
      y_pred.append(0)
    else:
      y_pred.append(1)
    print("Sentence:", sentence)
    print("Length:", sentence_length)
    print("Positive Count:", positive_count)
    print("Negative Count:", negative_count)
    print("Probability:", probability)
    print("---------")



# Compute the confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)



# Extract the individual elements of the confusion matrix
tn, fp, fn, tp = cm.ravel()

# Compute evaluation metrics based on the confusion matrix
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)


Column 'Sadness Sentences' has 10 non-empty sentences.
 10 sentences added to 'Sadness' count
Column 'Joy Sentences' has 10 non-empty sentences.
 10 sentences added to 'Joy' count
Column 'Fear Sentences' has 10 non-empty sentences.
 10 sentences added to 'Fear' count
Column 'Anger Sentences' has 10 non-empty sentences.
 10 sentences added to 'Anger' count
Column 'Surprise Sentences' has 10 non-empty sentences.
 10 sentences added to 'Surprise' count
Column 'Disgust Sentences' has 10 non-empty sentences.
 10 sentences added to 'Disgust' count
Column 'Sadness + Joy Sentences' has 10 non-empty sentences.
 10 sentences added to 'Sadness' count
 10 sentences added to 'Joy' count
Column 'Fear + Anger Sentences' has 10 non-empty sentences.
 10 sentences added to 'Fear' count
 10 sentences added to 'Anger' count
Column 'Surprise + Disgust Sentences' has 9 non-empty sentences.
 9 sentences added to 'Surprise' count
 9 sentences added to 'Disgust' count
Column 'Sadness + Joy + Fear Sentences' ha