# EMOTION DETECTION - ISEAR (International Survey on Emotion Antecedents and Reactions)

## Importing required libraries

In [74]:
import pandas as pd
import numpy as np

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Bidirectional, LSTM, Dropout
from tensorflow.keras.layers import *

import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
nltk.download('punkt_tab')

from sklearn.model_selection import cross_val_score

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/patricijamarijanovic/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Loading the dataset

In [75]:
# The CSV has no header row (data starts on the first line), so the two
# columns are named explicitly right after loading.
df = pd.read_csv('isear.csv', header=None).set_axis(["emotion", "text"], axis="columns")
df.head()

Unnamed: 0,emotion,text
0,joy,[ On days when I feel close to my partner and ...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


7 jedinstvenih emocija - joy, fear, anger, sadness, disgust, shame, guilt

In [76]:
# List the distinct emotion labels present in the dataset.
print(df["emotion"].unique())

['joy' 'fear' 'anger' 'sadness' 'disgust' 'shame' 'guilt']


In [77]:
# Summarize the frame: number of entries, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7652 entries, 0 to 7651
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   emotion  7652 non-null   object
 1   text     7652 non-null   object
dtypes: object(2)
memory usage: 119.7+ KB


In [78]:
# Discard the rows whose text is the "[ No response.]" placeholder.
# A boolean mask keeps every other row and leaves the surviving index labels unchanged.
is_no_response = df['text'] == '[ No response.]'
df = df[~is_no_response]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7575 entries, 0 to 7651
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   emotion  7575 non-null   object
 1   text     7575 non-null   object
dtypes: object(2)
memory usage: 177.5+ KB


## Tokenization

- proces razbijanja teksta na manje jedinice (tokene)
- omogućava računalima da bolje razumiju i obrađuju tekstualne podatke
- primjer: "i am sad." --> "i", "am", "sad", "."

In [79]:
# Work on the free-text column only.
emotion_arr = df.loc[:, "text"]

# Show the raw statement stored under index label 0.
emotion_arr.loc[0]

'[ On days when I feel close to my partner and other friends.  \nWhen I feel at peace with myself and also experience a close \ncontact with people whom I regard greatly.]'

In [80]:

# Tokenize every statement, turning each text into a list of tokens.
emotion_arr = list(map(word_tokenize, emotion_arr))
print(emotion_arr[0])

['[', 'On', 'days', 'when', 'I', 'feel', 'close', 'to', 'my', 'partner', 'and', 'other', 'friends', '.', 'When', 'I', 'feel', 'at', 'peace', 'with', 'myself', 'and', 'also', 'experience', 'a', 'close', 'contact', 'with', 'people', 'whom', 'I', 'regard', 'greatly', '.', ']']


## Padding
- usklađivanje duljina rečenica unutar skupa podataka

In [81]:
def padding(arr, length=100, pad_token=""):
    """Pad or truncate a token list so the result has exactly ``length`` items.

    Args:
        arr: List of tokens. A list shorter than ``length`` is extended
            IN PLACE with ``pad_token``; this side effect is kept for
            backward compatibility with the original implementation.
        length: Target number of tokens (default 100, the sentence length
            used throughout this notebook).
        pad_token: Filler appended to short lists (default: empty string).

    Returns:
        A new list holding the first ``length`` items of the (possibly
        extended) ``arr``.

    Raises:
        ValueError: If ``length`` is negative.
    """
    if length < 0:
        raise ValueError("length must be non-negative")

    missing = length - len(arr)
    if missing > 0:
        arr.extend([pad_token] * missing)
    # Slicing both truncates over-long inputs and returns a fresh list.
    return arr[:length]

# Bring every tokenized statement to the common fixed length.
emotion_arr = [padding(tokens) for tokens in emotion_arr]


print(emotion_arr[0])

['[', 'On', 'days', 'when', 'I', 'feel', 'close', 'to', 'my', 'partner', 'and', 'other', 'friends', '.', 'When', 'I', 'feel', 'at', 'peace', 'with', 'myself', 'and', 'also', 'experience', 'a', 'close', 'contact', 'with', 'people', 'whom', 'I', 'regard', 'greatly', '.', ']', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


## Word embeddings using GloVe
- pretvaranje riječi u numerički oblik
- GloVe vektori
    - unaprijed trenirani
    - svaka riječ predstavljena pomoću vektora od 50 brojeva
    - slične riječi imaju slične vektore (npr. vektori za "king" i "queen" su slični jer su semantički povezani)
- GloVe datoteka
    - Svaka linija u GloVe datoteci izgleda ovako: apple 0.123 0.456 -0.789 ... 50 brojeva ...

In [82]:
vocab_f = "glove.6B.50d.txt"

# Map every word in the GloVe file to its float32 coefficient vector.
# Each line has the form: <word> <coef_1> <coef_2> ... separated by single spaces.
embeddings_index = {}
# Open explicitly as UTF-8: without it open() falls back to the platform's
# default encoding, which can fail or mis-decode non-ASCII words.
with open(vocab_f, encoding="utf-8") as file:
    for line in file:
        values = line.rstrip().split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# the embedding vector of the word 'happy'
embeddings_index['happy']

array([ 0.092086,  0.2571  , -0.58693 , -0.37029 ,  1.0828  , -0.55466 ,
       -0.78142 ,  0.58696 , -0.58714 ,  0.46318 , -0.11267 ,  0.2606  ,
       -0.26928 , -0.072466,  1.247   ,  0.30571 ,  0.56731 ,  0.30509 ,
       -0.050312, -0.64443 , -0.54513 ,  0.86429 ,  0.20914 ,  0.56334 ,
        1.1228  , -1.0516  , -0.78105 ,  0.29656 ,  0.7261  , -0.61392 ,
        2.4225  ,  1.0142  , -0.17753 ,  0.4147  , -0.12966 , -0.47064 ,
        0.3807  ,  0.16309 , -0.323   , -0.77899 , -0.42473 , -0.30826 ,
       -0.42242 ,  0.055069,  0.38267 ,  0.037415, -0.4302  , -0.39442 ,
        0.10511 ,  0.87286 ], dtype=float32)

In [83]:
# Embed each token of every statement in emotion_arr.
# The lookup is done on the lower-cased token; any token that has no GloVe
# entry (this includes the '' padding token) is replaced by 50 zeros.
embedded_emotion_arr = [
    [
        embeddings_index[key] if key in embeddings_index else [0]*50
        for key in map(str.lower, sentence)
    ]
    for sentence in emotion_arr
]

print(embedded_emotion_arr[0][0])

[-0.61201   0.98226   0.11539   0.014623  0.23873  -0.067035  0.30632
 -0.64742  -0.38517  -0.03691   0.094788  0.57631  -0.091557 -0.54825
  0.25255  -0.14759   0.13023   0.21658  -0.30623   0.30028  -0.23471
 -0.17927   0.9518    0.54258   0.31172  -0.51038  -0.65223  -0.48858
  0.13486  -0.40132   2.493    -0.38777  -0.26456  -0.49414  -0.3871
 -0.20983   0.82941  -0.46253   0.39549   0.014881  0.79485  -0.79958
 -0.16243   0.013862 -0.53536   0.52536   0.019818 -0.16353   0.30649
  0.81745 ]


## One-hot encoding - emotion categories
- Svaka riječ (ili kategorija) se predstavlja kao vektor pun nula, osim na jednoj poziciji koja je 1

In [84]:
# one-hot encoding from sklearn
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': an unknown emotion is ignored instead of raising an error
enc = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input, so reshape the labels into a single column.
emotion_column = df["emotion"].to_numpy().reshape(-1, 1)
Y = enc.fit_transform(emotion_column).toarray()  # one-hot encoded categories
X = np.asarray(embedded_emotion_arr)  # embedded text

# list of categories, in the same order as the one-hot vectors
print(enc.categories_)


[array(['anger', 'disgust', 'fear', 'guilt', 'joy', 'sadness', 'shame'],
      dtype=object)]


## Splitting the dataset
- na train set, test set

In [85]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## Creating the model

Bidirectional LSTM
- koristi se za obradu sekvencijalnih podataka (npr. rečenica)
- Bidirectional - obrađuje sekvencu u oba smjera (naprijed i unatrag)

In [86]:

# Defining the BiLSTM model
def model(X, Y, input_size1, input_size2, output_size, epochs=32, batch_size=128):
    """Build, compile and train a bidirectional-LSTM classifier.

    Args:
        X: Training inputs passed to ``fit``.
        Y: Training targets passed to ``fit`` (one-hot vectors, since the
            loss is categorical cross-entropy).
        input_size1: Number of words per sentence (sequence length).
        input_size2: Length of each word-embedding vector.
        output_size: Number of output classes.
        epochs: Number of training epochs (default 32).
        batch_size: Training batch size (default 128).

    Returns:
        The trained ``Sequential`` model.
    """
    # Imported here so the function does not depend on the wildcard import
    # at the top of the notebook to provide ``Input``.
    from tensorflow.keras.layers import Input

    m = Sequential()  # layers are stacked one after another

    # Declare the input shape with an explicit Input layer. Passing
    # `input_shape` to the LSTM wrapped in Bidirectional is deprecated and
    # makes Keras emit a warning.
    m.add(Input(shape=(input_size1, input_size2)))

    # Bidirectional LSTM layer with 100 units
    m.add(Bidirectional(LSTM(100)))

    # Dropout layer with a 50% rate to reduce overfitting
    m.add(Dropout(0.5))

    # Fully connected output layer; softmax because there are several emotion classes
    m.add(Dense(output_size, activation='softmax'))

    # Categorical cross-entropy loss (suitable for multiclass classification)
    m.compile("Adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # Model training
    m.fit(X, Y, epochs=epochs, batch_size=batch_size)

    return m


In [87]:
# Train on the training split: 100 words per sentence, embedding vectors of length 50, 7 output classes.
moj_model = model(X_train, Y_train, 100, 50, 7)

  super().__init__(**kwargs)


Epoch 1/32
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 105ms/step - accuracy: 0.1833 - loss: 1.9270
Epoch 2/32
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 106ms/step - accuracy: 0.3339 - loss: 1.7203
Epoch 3/32
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 171ms/step - accuracy: 0.3711 - loss: 1.6468
Epoch 4/32
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 196ms/step - accuracy: 0.3966 - loss: 1.5722
Epoch 5/32
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 181ms/step - accuracy: 0.4186 - loss: 1.5354
Epoch 6/32
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 182ms/step - accuracy: 0.4438 - loss: 1.4819
Epoch 7/32
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 180ms/step - accuracy: 0.4398 - loss: 1.4673
Epoch 8/32
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 175ms/step - accuracy: 0.4662 - loss: 1.4277
Epoch 9/32
[1m48/48[0m [32m━━━━━━━━━━

In [88]:
# Model testing: evaluate on the held-out test split.
# The result lists the loss followed by the accuracy metric the model was compiled with.
moj_model.evaluate(X_test, Y_test)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5076 - loss: 1.5461


[1.5275814533233643, 0.5168316960334778]

## Prediction

In [89]:
def preprocess_input_text(text):
    """Convert raw text into the embedded form used for prediction.

    The text is tokenized, padded/truncated with ``padding`` and every
    token is replaced by its GloVe vector (case-insensitive lookup).
    Tokens without a GloVe entry, including padding tokens, become a
    vector of 50 zeros.

    Args:
        text: The input sentence as a string.

    Returns:
        A NumPy array with one embedding row per padded token.
    """
    tokenized_text = word_tokenize(text)
    padded_text = padding(tokenized_text)

    embedded_text = []
    # Iterate over the padded/truncated tokens, not the raw token list, so
    # that over-long inputs are actually cut to the fixed sentence length.
    for word in padded_text:
        key = word.lower()
        if key in embeddings_index:
            embedded_text.append(embeddings_index[key])
        else:
            embedded_text.append([0]*50)
    return np.array(embedded_text)

In [90]:
# Example sentence to classify, run through the same preprocessing function defined above.
input_text = "i am feeling very happy today"
processed_input_text = preprocess_input_text(input_text)


In [91]:
# perform prediction
# Wrap the single sample in a leading axis so it is passed as a batch of one.
input_batch = np.expand_dims(processed_input_text, axis=0)
predicted_probs = moj_model.predict(input_batch)
# Position of the largest probability in the prediction.
predicted_emotion_index = predicted_probs.argmax()

print(predicted_probs)
print(predicted_emotion_index)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[[2.8278702e-03 6.1375536e-03 7.7215210e-04 1.1797990e-03 9.0221101e-01
  8.0782562e-02 6.0890615e-03]]
4


In [92]:
# index --> emotion: look the predicted index up in the encoder's category list
predicted_emotion = enc.categories_[0][predicted_emotion_index]

In [93]:
# Report the predicted label together with the full probability vector.
print("Predicted Emotion:", predicted_emotion)
print("Predicted Emotion Probabilities:", predicted_probs)

Predicted Emotion: joy
Predicted Emotion Probabilities: [[2.8278702e-03 6.1375536e-03 7.7215210e-04 1.1797990e-03 9.0221101e-01
  8.0782562e-02 6.0890615e-03]]
