# Detecting Emotions in Text with GloVe Embeddings
**Name**: Meagan Choo-Kang

In [18]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from keras.utils import to_categorical
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC
import pandas as pd

from keras import layers
from keras.layers import LSTM, Activation, Dropout, Dense, Input, MaxPooling1D, Conv1D, Flatten
from tqdm import tqdm
from keras.models import Model
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer, confusion_matrix, accuracy_score, precision_score, recall_score, precision_recall_curve

In [19]:
!wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
!unzip -q glove.6B.zip

--2024-04-19 13:46:19--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’

glove.6B.zip          7%[>                   ]  65.74M  5.14MB/s    eta 1m 47s ^C
[glove.6B.zip]
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of glove.6B.zip or
        glove.6B.zip.zip, and cannot find glove.6B.zip.ZIP, period.


# Preprocess Dataset

In [21]:
# Stop words copied from: https://github.com/ketanvaidya25/IMDb-Movie-Sentiment-Analysis/blob/main/IMDb_Movie_Sentiment_Analysis.ipynb
stopwords = [ "@", "im", "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because",
             "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
             "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
             "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
             "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
             "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should",
             "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
             "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
             "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
             "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
             "your", "yours", "yourself", "yourselves" ]

In [22]:

def remove_stopwords(df, category):
  df["removed_stopwords_" + category] = df[category].apply(lambda x : ' '.join([word for word in x.split() if word not in (stopwords)]))
  return df

def remove_tags(string):
    result = re.sub('<.*?>','',string)
    return result

In [23]:
# Read dataset
df = pd.read_csv('tweet_emotions.csv')

# Filter sentiments to just 8
emotions = ["anger", "worry", "happiness", "love", "sadness", "surprise"]
df = df[df['sentiment'].isin(emotions)]

# Get the labels for the targets so we can keep track of their values
# when targets get one-hot-encoded
target_labels = sorted(list(df["sentiment"].unique()))

#Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
categorical_columns = df['sentiment'].to_numpy().reshape(-1, 1)

# Apply one-hot encoding to the categorical columns
one_hot_encoded = encoder.fit_transform(categorical_columns)

#Create a DataFrame with the one-hot encoded columns
one_hot_df = pd.DataFrame(one_hot_encoded, columns=target_labels)

# Concatenate the one-hot encoded dataframe with the original dataframe
df_encoded = pd.concat([df.reset_index(drop=True), one_hot_df.reset_index(drop=True)], axis=1)

# Clean up content by removing unnecessary words
# For future, would also be nice to remove twitter username
df_encoded['content'] = df_encoded['content'].str.lower()
df_clean = remove_stopwords(df_encoded, "content")
df_clean['clean_content']= df_clean['removed_stopwords_content'].apply(lambda cw : remove_tags(cw))
df_clean['clean_content'] = df_clean['clean_content'].str.replace('[{}]'.format(string.punctuation), ' ')
display(df_clean.head())

# Define targets and features
x = df_clean['clean_content']
y = df_clean[emotions]

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.1, random_state = 45)

Unnamed: 0,tweet_id,sentiment,content,anger,happiness,love,sadness,surprise,worry,removed_stopwords_content,clean_content
0,1956967666,sadness,layin n bed with a headache ughhhh...waitin o...,0.0,0.0,0.0,1.0,0.0,0.0,layin n bed headache ughhhh...waitin call...,layin n bed headache ughhhh...waitin call...
1,1956967696,sadness,funeral ceremony...gloomy friday...,0.0,0.0,0.0,1.0,0.0,0.0,funeral ceremony...gloomy friday...,funeral ceremony...gloomy friday...
2,1956968477,worry,re-pinging @ghostridah14: why didn't you go to...,0.0,0.0,0.0,0.0,0.0,1.0,re-pinging @ghostridah14: didn't go prom? bc b...,re-pinging @ghostridah14: didn't go prom? bc b...
3,1956968487,sadness,"i should be sleep, but im not! thinking about ...",0.0,0.0,0.0,1.0,0.0,0.0,"sleep, not! thinking old friend want. married ...","sleep, not! thinking old friend want. married ..."
4,1956968636,worry,hmmm. http://www.djhero.com/ is down,0.0,0.0,0.0,0.0,0.0,1.0,hmmm. http://www.djhero.com/,hmmm. http://www.djhero.com/


## Prepare Embedding Layer with GloVe

In [24]:
# set up GloVe
path_to_glove_file = "glove.6B.100d.txt"

with open(path_to_glove_file, 'r', encoding='UTF-8') as f:
  words = set()
  word_to_vec_map = {}
  for line in f:
    w_line = line.split()
    curr_word = w_line[0]
    word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)

print("Found %s word vectors." % len(word_to_vec_map))

Found 400000 word vectors.


In [25]:
#Create word-to-index dictionary using Tokenizer
# key = word, value = index of word
token = Tokenizer()

# trains tokenizer
token.fit_on_texts(x)

# converts sentence to numeric form
seq = token.texts_to_sequences(x)

# add padding to make the lengths all the same
maxLen = 1000
pad_seq = pad_sequences(seq,maxlen=maxLen)

# dictionary mapping words to their index
words_to_index = token.word_index

In [26]:
# convert features/training data to their index form
X_train_indices = token.texts_to_sequences(X_train)

# add padding to ensure same length
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
print(X_train_indices.shape)


(22474, 1000)


## Create embedding layer values

In [27]:
# define embedding matrix
vocab_len = len(words_to_index)+1
embed_vector_len = word_to_vec_map['moon'].shape[0]

# words that are not in the GloVe dictionary = 0
emb_matrix = np.zeros((vocab_len, embed_vector_len))

# find words in GloVe for embedding matrix
for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index, :] = embedding_vector

# define embedding layer
# maps words to their embedding vectors from the embedding matrix
embedding_layer = keras.layers.Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)


# Create Model

In [28]:
def createModel(input_shape):

  X_indices = Input(input_shape)

  # Embedding layer
  embeddings = embedding_layer(X_indices)

  # Convolutional layer
  X = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(embeddings)

  # Max pooling layer
  X = MaxPooling1D(pool_size=2)(X)

  # Hidden layers
  X = LSTM(128, return_sequences=True)(X)

  X = Dropout(0.5)(X)

  X = LSTM(128, return_sequences=True)(X)

  X = Dropout(0.5)(X)

  # Flatten to make the model shape compatible
  X = Flatten()(X)

  # Output layer
  X = Dense(6, activation='softmax')(X)

  model = Model(inputs=X_indices, outputs=X)

  return model

In [29]:
# build model
model = createModel(maxLen)
adam = keras.optimizers.Adam(learning_rate = 0.01)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=[keras.metrics.CategoricalAccuracy()])
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1000)]            0         
                                                                 
 embedding_2 (Embedding)     (None, 1000, 100)         3421600   
                                                                 
 conv1d_1 (Conv1D)           (None, 1000, 32)          9632      
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 500, 32)           0         
 g1D)                                                            
                                                                 
 lstm_2 (LSTM)               (None, 500, 128)          82432     
                                                                 
 dropout_2 (Dropout)         (None, 500, 128)          0         
                                                           

# Train Model

In [30]:
history = model.fit(X_train_indices, Y_train, batch_size=120, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Get Predictions & Results

In [31]:
# set up testing data
X_test_indices = token.texts_to_sequences(X_test)
X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

# get predictions
y_val_pred = model.predict(X_test_indices)

# clean up predictions
y_pred_clean = []
for rowIndex in range(len(y_val_pred)):
  # find maximum value in each row to determine predicted emotion and set it to one
  # while rest of predictions are 0
  newRow =  [0 for element in range(6)]
  max_value = max(y_val_pred[rowIndex])
  max_index = np.where(y_val_pred[rowIndex] == max_value)[0][0]
  newRow[max_index] = 1
  y_pred_clean.append(newRow)

# get performance metrics
f1, precision, recall = f1_score(Y_test, y_pred_clean, average=None), precision_score(Y_test, y_pred_clean, average=None), recall_score(Y_test, y_pred_clean, average=None)
print("f1 score: ", f1)
print("precision: ", precision)
print("recall: ", recall)


f1 score:  [0.         0.54291939 0.44665461 0.42809365 0.22661397 0.        ]
precision:  [0.         0.42788462 0.42367067 0.59259259 0.35390947 0.        ]
recall:  [0.         0.74255066 0.47227533 0.33507853 0.16666667 0.        ]


  _warn_prf(average, modifier, msg_start, len(result))


# Results
|Results|  anger  | worry      | happiness | love    |sadness | surprise |
| ------|------| -----------| ----------| ------------| --------|---------|
|f1 score:| 0.  | 0.54291939 |0.44665461 | 0.42809365 | 0.22661397 | 0. |
precision:| 0.  | 0.42788462 |0.42367067 |0.59259259 | 0.35390947   | 0. |  
recall:   |0.   | 0.74255066 |0.47227533 | 0.33507853 | 0.16666667 |0. |       