# Initializing Notebook

## Installing Dependencies 

❗❗Restart runtime after installing dependencies and before importing libraries to use updated libraries




In [None]:
%pip install tensorflow_datasets 
%pip install tf-models-official
%pip install transformers
%pip install tensorflow
%pip install scikit-learn
%pip install seaborn
%pip install emoji
%pip install contractions
%pip install sentencepiece
%pip install nltk
%pip install matplotlib
%pip install wordcloud
%pip install plotly
%pip install tqdm


## Importing Libraries & Setting Seed

In [None]:
import sys
import seaborn
import sklearn
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from functools import partial
import PIL
import PIL.Image
import pandas as pd

# %tensorflow_version 2.x
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
import tensorflow as tf
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_recall_fscore_support

from keras import backend as K

from transformers import AutoTokenizer

from collections import Counter
import collections
import re
import unicodedata
import emoji
import contractions
import tqdm

### Setting Global Seed

In [None]:
 # NOTE: use the same seed everywhere to keep training/validation/testing consistent
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Suppress pandas' 'SettingWithCopyWarning' by switching off the chained-assignment check
pd.options.mode.chained_assignment = None

# Data Handling

### Loading the dataset

In [None]:
# Load GoEmotions through TensorFlow Datasets; the result is indexed by split name further down
ds = tfds.load('goemotions')
# TODO: look up how tfds can deliver the train/validation/test splits directly

### Cleaning the Data

We remove all the data which has multiple labels assigned. 

In [None]:
def remove_multilabels(ds, split):
  '''
  Return the rows of one split that carry exactly one label.

  ds    -- mapping from split name to a dataset accepted by tfds.as_dataframe
  split -- key of the split to convert and filter

  Every column except 'comment_text' is treated as a label indicator; a row
  is kept when those indicators sum to 1.
  '''
  frame = tfds.as_dataframe(ds[split])
  label_columns = frame.columns.difference(['comment_text'])
  labels_per_row = frame[label_columns].sum(axis=1)
  single_label = labels_per_row == 1
  return frame[single_label]


In [None]:
def merge_columns(df, split):
  '''
  Collapse the fine-grained emotion columns into the coarser taxonomy.

  df    -- frame holding one indicator column per original emotion
  split -- name of the split; accepted for call compatibility but not used

  Returns a NEW frame in which each target column has absorbed its source
  columns (element-wise addition) and the source columns are removed. The
  input frame is left untouched, so the function can safely be re-run on the
  same input and does not trigger pandas' chained-assignment warning.
  '''
  # target column -> columns folded into it (order matches the original cell)
  merge_map = {
      'love': ['caring'],
      'approval': ['admiration'],
      'disapproval': ['disgust', 'anger', 'annoyance'],
      'surprise': ['curiosity'],
      'fear': ['nervousness'],
      'sadness': ['embarrassment', 'grief', 'remorse', 'disappointment'],
      'joy': ['amusement', 'excitement', 'relief'],
  }

  merged = df.copy()
  absorbed = []
  for target, sources in merge_map.items():
    for source in sources:
      merged[target] = merged[target] + merged[source]
    absorbed.extend(sources)

  return merged.drop(columns=absorbed)

In [None]:
# Filter each split down to single-label rows, then merge the emotion columns
ds_train, ds_valid, ds_test = (
    merge_columns(remove_multilabels(ds, split_name), split_name)
    for split_name in ('train', 'validation', 'test')
)

Using the str.decode() method to convert any bytes objects in the column to UTF-8 encoded strings. This is necessary if the 'comment_text' column contains data that has been encoded as bytes and needs to be converted to strings before being processed further.

In [None]:
# Decode bytes comments to UTF-8 strings.
# The loop variable is deliberately NOT called `ds`: that name holds the loaded
# tfds dataset, and rebinding it here would break a re-run of the cells above.
# Values that are already `str` are passed through, so the cell is re-runnable.
for split_frame in (ds_train, ds_valid, ds_test):
    split_frame['comment_text'] = split_frame['comment_text'].map(
        lambda text: text.decode("utf-8") if isinstance(text, bytes) else text
    )

### Data Exploration and Analysis

In [None]:
print(ds_train.columns)

In [None]:
ds_train.head()

In [None]:
# Label set used in the rest of the notebook (the columns left after merging)
GE_taxonomy = [
    'approval', 'confusion', 'desire', 'disapproval', 'fear',
    'gratitude', 'joy', 'love', 'neutral', 'optimism',
    'pride', 'realization', 'sadness', 'surprise',
]

# Number of labels, reported once for reference
num_labels = len(GE_taxonomy)
print(f'Total Number or Labels: {num_labels}')

# Data Analysis

In [None]:
# Column sums of everything except the comment text, largest first
sorted_columns = ds_train.drop(columns='comment_text').sum().sort_values(ascending=False)

# One bar per column, labelled with the column name
bar_positions = range(len(sorted_columns))
fig, ax = plt.subplots()
ax.bar(bar_positions, sorted_columns.values)
ax.set_xlabel('Label')
ax.set_ylabel('Count')
ax.set_xticks(bar_positions)
ax.set_xticklabels(sorted_columns.index, rotation=90)
plt.show()

We can see how imbalanced the dataset is in terms of label frequency;
the pride emotion, for instance, is almost insignificant.

In [None]:
print(sorted_columns)

Let's check the word frequency for each emotion label in the dataset.


# ➤ Experiment Variation 2 
## *Run only 1 of the variation cells*

As a part of the experiment we will use the NLTK Library to manipulate the content of the corpus with famous preprocessing techniques and observe any differences in order to conclude a hypothesis.

#### We will do the following:

* Stemming
* Lemmatization
* Removing of stop words

## ➢ Variation 2 - a
In this variation the data is cleaned and preprocessed using the NLTK library
for lemmatization, stemming, and stop-word removal.

In [None]:
# Import Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the vocab
nltk.download("punkt")
# Stemmer instance used by stemming()
stemmer = PorterStemmer()

# Lemmatizer instance used by lemmatize_text()
lemmatizer = nltk.stem.WordNetLemmatizer()
nltk.download('stopwords')
nltk.download('wordnet')
# English stop-word list, used when stop words are filtered out of the cleaned text
sw_nltk = stopwords.words('english')
print(sw_nltk)

In [None]:
# Building a preprocessing function to clean text
def preprocess_corpus_1b(x):
  """Normalize one raw comment string.

  Steps, in order: space out punctuation, convert emojis to text with
  emoji.demojize, expand contractions, lowercase, replace a fixed list of
  text emoticons with word tokens, replace runs of three or more dots with
  the token "dots", strip every character outside [A-Za-z!?_], drop
  stand-alone "s" tokens, and collapse whitespace.

  x -- the comment as a str
  Returns the cleaned str.
  """
  # Adding a space between words and punctuation
  x = re.sub(r'([a-zA-Z\[\]])([,;.!?])', r'\1 \2', x)
  x = re.sub(r'([,;.!?])([a-zA-Z\[\]])', r'\1 \2', x)

  # Demojize, expand contractions, lowercase
  x = emoji.demojize(x)
  x = contractions.fix(x)
  x = x.lower()

  # Text emoticons and slang, applied in this order
  emoticon_rules = (
      (r"<3", " love_heart "),
      # word boundaries: do not rewrite "xd" occurring inside a longer word
      (r"\bxd\b", " smiling_face_with_open_mouth_and_tightly_closed_eyes "),
      (r":\)", " smiling_face "),
      # carets escaped: an unescaped "^" is a start-of-string anchor, so "^_^" never matched
      (r"\^_\^", " smiling_face "),
      (r"\*_\*", " star_struck "),
      (r":\(", " frowning_face "),
      (r":\^\(", " frowning_face "),
      (r";\(", " frowning_face "),
      (r":\/", " confused_face "),
      (r";\)", " wink "),
      (r">__<", " unamused "),
      (r"\b([xo]+x*)\b", " xoxo "),
      (r"\b(n+a+h+)\b", " nah "),
  )
  for pattern, replacement in emoticon_rules:
    x = re.sub(pattern, replacement, x)

  # Ellipses become the token "dots". No leading \b: the punctuation-spacing
  # step above puts a space before the dots, so a boundary never occurs there.
  x = re.sub(r"[.]{3,}", " dots ", x)
  # Remove special characters and numbers (replaced by a space)
  x = re.sub(r"[^A-Za-z!?_]+", " ", x)
  # Drop stand-alone "s" tokens
  x = re.sub(r"\b([s])\b *", "", x)
  # Collapse repeated spaces and trim
  x = re.sub(r" +", " ", x)
  return x.strip()

In [None]:

#Lemmatizer using NLTK WordNetLemmatizer
def lemmatize_text(text):
  """Lemmatize every whitespace-separated word of `text`.

  The lemmatizer works on single words, so the sentence is split, each word
  is lemmatized on its own, and the words are joined back with single spaces.

  text -- a str of space-separated words
  Returns the lemmatized str.
  """
  return ' '.join(lemmatizer.lemmatize(word) for word in text.split())
#Stemming using PorterStemmer
def stemming(text):
  """Stem every whitespace-separated word of `text`.

  The stemmer works on single words, so the sentence is split, each word is
  stemmed on its own, and the words are joined back with single spaces.

  text -- a str of space-separated words
  Returns the stemmed str.
  """
  return ' '.join(stemmer.stem(word) for word in text.split())

I made multiple columns here so it's easier to compare each step of the process.

In [None]:
# Applying the preprocessing function on the dataset

datasets = [ds_train, ds_valid, ds_test]
# Set membership is O(1); the stop-word list itself is left unchanged
stop_words = set(sw_nltk)
for dataset in datasets:
  # cleaning of data
  dataset["wash_text"] = dataset["comment_text"].apply(preprocess_corpus_1b)
  # removal of stop words
  dataset["prep_text"] = dataset["wash_text"].apply(
      lambda x: ' '.join(word for word in x.split() if word not in stop_words))
  # lemmatization of words
  dataset["short_text"] = dataset["prep_text"].apply(lemmatize_text)
  # stemming of words (previously this step called lemmatize_text a second time)
  dataset["clean_text"] = dataset["short_text"].apply(stemming)

# Preview of data
display(ds_train[['comment_text', 'clean_text']].sample(5))



Checking max number of words in a sentences in order to apply padding - \<pad>

In [None]:
# Longest cleaned sentence, counted in whitespace-separated words, over all three splits
max_length = (
    pd.concat([ds_train['clean_text'], ds_test['clean_text'], ds_valid['clean_text']])
    .str.split()
    .str.len()
    .max()
)
print(f'Maximum Length of a sentence after preprocessing: {max_length}')

## ➢ Variation 2 - b


Here **NO** tokenization techniques are applied; the data is only cleaned as preprocessing.

In [None]:
# Building a preprocessing function to clean text
def preprocess_corpus_1a(x):
  """Normalize one raw comment string.

  Steps, in order: space out punctuation, convert emojis to text with
  emoji.demojize, expand contractions, lowercase, replace a fixed list of
  text emoticons with word tokens, replace runs of three or more dots with
  the token "dots", strip every character outside [A-Za-z!?_], drop
  stand-alone "s" tokens, and collapse whitespace.

  x -- the comment as a str
  Returns the cleaned str.
  """
  # Adding a space between words and punctuation
  x = re.sub(r'([a-zA-Z\[\]])([,;.!?])', r'\1 \2', x)
  x = re.sub(r'([,;.!?])([a-zA-Z\[\]])', r'\1 \2', x)

  # Demojize, expand contractions, lowercase
  x = emoji.demojize(x)
  x = contractions.fix(x)
  x = x.lower()

  # Text emoticons and slang, applied in this order
  emoticon_rules = (
      (r"<3", " love_heart "),
      # word boundaries: do not rewrite "xd" occurring inside a longer word
      (r"\bxd\b", " smiling_face_with_open_mouth_and_tightly_closed_eyes "),
      (r":\)", " smiling_face "),
      # carets escaped: an unescaped "^" is a start-of-string anchor, so "^_^" never matched
      (r"\^_\^", " smiling_face "),
      (r"\*_\*", " star_struck "),
      (r":\(", " frowning_face "),
      (r":\^\(", " frowning_face "),
      (r";\(", " frowning_face "),
      (r":\/", " confused_face "),
      (r";\)", " wink "),
      (r">__<", " unamused "),
      (r"\b([xo]+x*)\b", " xoxo "),
      (r"\b(n+a+h+)\b", " nah "),
  )
  for pattern, replacement in emoticon_rules:
    x = re.sub(pattern, replacement, x)

  # Ellipses become the token "dots". No leading \b: the punctuation-spacing
  # step above puts a space before the dots, so a boundary never occurs there.
  x = re.sub(r"[.]{3,}", " dots ", x)
  # Remove special characters and numbers (replaced by a space)
  x = re.sub(r"[^A-Za-z!?_]+", " ", x)
  # Drop stand-alone "s" tokens
  x = re.sub(r"\b([s])\b *", "", x)
  # Collapse repeated spaces and trim
  x = re.sub(r" +", " ", x)
  return x.strip()

In [None]:
# Applying the preprocessing function on the dataset
# Add a 'clean_text' column, derived from 'comment_text', to every split
datasets = [ds_train, ds_valid, ds_test]
for dataset in datasets:
    raw_comments = dataset["comment_text"]
    dataset["clean_text"] = raw_comments.map(preprocess_corpus_1a)

# Show five random rows, raw next to cleaned
preview_columns = ['comment_text', 'clean_text']
display(ds_train[preview_columns].sample(5))

checking max number of words in a sentences in order to apply padding - \<pad>

In [None]:
# Longest cleaned sentence, counted in whitespace-separated words, over all three splits
max_length = (
    pd.concat([ds_train['clean_text'], ds_test['clean_text'], ds_valid['clean_text']])
    .str.split()
    .str.len()
    .max()
)
print(f'Maximum Length of a sentence after cleaning: {max_length}')

## Word Frequency for each label

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the stop-word corpus used below
nltk.download('stopwords')

# English stop-word list used by the word-frequency analysis
sw_nltk = stopwords.words('english')

print(sw_nltk)

In [None]:
ds_train2 = ds_train.drop('comment_text', axis = 1)

# Words ignored when counting. Built as a local set so that re-running this
# cell does not keep appending to the shared `sw_nltk` list.
ignored_words = set(sw_nltk) | {'!', '?', 'name'}

# Count the frequency of each word for each label.
# Iterate over the label list explicitly: slicing `columns[1:]` skipped the
# first label and picked up the text columns as if they were labels.
label_counts = {}
for label in GE_taxonomy:
    label_text = ds_train2.loc[ds_train2[label] == 1, 'clean_text']
    counts = Counter(w for w in ' '.join(label_text.values).split() if w not in ignored_words)
    label_counts[label] = dict(counts.most_common(5))  # keep only the top 5 most frequent words

# Plot the word frequency for each label on a grid sized to the label count
n_cols = 4
n_rows = -(-len(label_counts) // n_cols)  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 5 * n_rows), squeeze=False)
grid = axes.ravel()
for ax, (label, top_words) in zip(grid, label_counts.items()):
    ax.bar(list(top_words.keys()), list(top_words.values()))
    ax.set_title(label)
    ax.tick_params(axis='x', rotation=45)
# Hide the grid cells that have no label to show
for ax in grid[len(label_counts):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()


We can see which words are the most common for each label, and it is quite evident how distinctly the words map to their respective emotions.

## Data Vectorization

### Creating Train, Validation and Test Variables

The splits can be automated here to generate multiple train and validate ratios

In [None]:
# Creating train, validation and test variables
# Inputs are the cleaned sentences; targets are the GE_taxonomy columns as float arrays
X_train, y_train = ds_train['clean_text'], ds_train[GE_taxonomy].to_numpy().astype(float)
X_valid, y_valid = ds_valid['clean_text'], ds_valid[GE_taxonomy].to_numpy().astype(float)
X_test, y_test = ds_test['clean_text'], ds_test[GE_taxonomy].to_numpy().astype(float)

### Tokenizing the Data

We use the pretrained [RoBERTa tokenizer](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaTokenizer)

it uses \<PAD> , \<s> and \<UNK> tokens too

It is important to notice the use of [max_length](https://colab.research.google.com/drive/1GYb7qo-QCeegdTK5PCNdUy2hSWUSQI93#scrollTo=sNOEReSSYjo2&line=3&uniqifier=1) here in order to pad the sentences well

In [None]:
# Importing and using pretrained tokenizer

from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def tokenize_split(texts, length):
    """Tokenize one split into padded, fixed-length TensorFlow tensors.

    texts  -- pandas Series of cleaned sentences
    length -- number of tokens every sequence is padded (or truncated) to
    Returns the tokenizer output with input ids, attention mask and token type ids.
    """
    return tokenizer(
        text = texts.to_list(),
        add_special_tokens = True,
        max_length = length,
        truncation = True,
        padding = 'max_length',
        return_tensors = 'tf',
        return_token_type_ids = True,
        return_attention_mask = True,
        verbose = True)


# `max_length` was measured in whitespace-separated words, but the tokenizer
# budget is counted in sub-word tokens plus the special tokens. Re-measure it in
# tokens so that `truncation=True` does not silently cut sentences short.
# It stays capped at the tokenizer's own maximum sequence length.
longest_encoding = max(
    len(ids)
    for split in (X_train, X_valid, X_test)
    for ids in tokenizer(split.to_list(), add_special_tokens=True)['input_ids']
)
max_length = int(min(longest_encoding, tokenizer.model_max_length))
print(f'Maximum Length of a sentence in tokens: {max_length}')

# Tokenizing train, validation and test data
train_token = tokenize_split(X_train, max_length)
valid_token = tokenize_split(X_valid, max_length)
test_token = tokenize_split(X_test, max_length)

## PreProcessing the Dataset!

The following preprocess **only normalizes** the corpus by:

* Adding space between punctuations
* Converting emojis to text (demojizing)
* Expanding contractions
* Turning the corpus to Lowercase 
* Removing special characters and numbers replace by space 
* Removing double spaces




# Building the Model

### Importing RoBERTa model and config

We finally import the [Roberta Model](https://huggingface.co/docs/transformers/model_doc/roberta) and its config file

In [None]:
from transformers import RobertaConfig 
from transformers import TFRobertaModel

In [None]:
model_name = "roberta-base"
configuration = RobertaConfig.from_pretrained(model_name, output_hidden_states=False)
# Pass the configuration by keyword. Positional arguments after the model name
# are forwarded to the model constructor instead of being used as the config.
transformer_model = TFRobertaModel.from_pretrained(model_name, config=configuration)

# ➤ Experiment Variation '3' ---

for this experiment we will finetune and change hyperparameters to compare scoring metrics


We will change
* Weight Initializer
* Activation Function



## ➢ Experiment 3 - a



* Weight Initializer:
> All weight initialized to 0


* Activation Function:
> Sigmoid

we use the **Sigmoid** function
\begin{equation}
\sigma(x) = \frac{1}{1+e^{-x}}
\end{equation}

In [None]:
# Activation of the output layer; read by create_model
ACTIVATION = "sigmoid"

# Initializer that generates tensors initialized to 0; create_model passes it
# as the kernel initializer of the output layer
INITIALIZER = tf.keras.initializers.Zeros()

## ➢ Experiment 3 - b


* Weight Initializer:
> Truncated Normal with Standard Deviation
* Activation Function:
> Softmax

**Softmax** Function

\begin{equation}
\mathrm{softmax}(z)_i = \frac{e^{z_i}}{\sum_{j=1}^K e^{z_j}} \text{ for } i = 1, 2, \dots, K
\end{equation}

where $z = (z_1, z_2, \dots, z_K)$ are the input values and $\mathrm{softmax}(z)_i$ represents the $i$-th output of the softmax function.

In [None]:
# Activation of the output layer; read by create_model
ACTIVATION = "softmax"
# Truncated-normal kernel initializer; its stddev comes from the RoBERTa configuration's initializer_range
INITIALIZER = tf.keras.initializers.TruncatedNormal(stddev=configuration.initializer_range)

## Compiling the model

We use Keras to build our Network layers
Here we also set the batch size variable.


In [None]:
from keras.layers import Activation, Dense, Dropout, InputLayer
from keras import layers
# Batch size applied when the tf.data datasets are batched
BATCH = 128

We also make use of the attention mask since we are padding our sentences.

The Dropout layer randomly sets input units to 0 with a defined frequency at each step during training time, which helps prevent overfitting. Inputs not set to 0 are scaled up by 1/(1 - rate) such that the sum over all inputs is unchanged.

In [None]:
# function for creating RoBERTa based model
def create_model(nb_labels):
  """Build the Keras classifier: RoBERTa main layer -> dropout -> dense output.

  nb_labels -- number of labels in the data (units of the output layer)

  Reads the module-level `transformer_model`, `configuration`, `max_length`,
  `ACTIVATION` and `INITIALIZER`. Returns an uncompiled tf.keras Model that
  takes a dict with 'input_ids', 'attention_mask' and 'token_type_ids'.
  """
  # Load the MainLayer
  roberta = transformer_model.layers[0]

  # Build the model inputs, each of fixed length `max_length`
  input_ids = layers.Input(shape=(max_length,), name='input_ids', dtype='int32')
  attention_mask = layers.Input(shape=(max_length,), name='attention_mask', dtype='int32')
  token_type_ids = layers.Input(shape=(max_length,), name='token_type_ids', dtype='int32')
  inputs = {'input_ids': input_ids, 'attention_mask': attention_mask, 'token_type_ids': token_type_ids}

  # Use the RoBERTa main layer as a Keras layer; element 1 of its output is
  # taken as the sentence representation (presumably the pooled output)
  roberta_model = roberta(inputs)[1]
  dropout = layers.Dropout(configuration.hidden_dropout_prob, name='pooled_output')
  # No explicit `training` flag: Keras then applies dropout while fitting and
  # disables it for predict/evaluate. Forcing training=False turned the layer
  # into a no-op.
  pooled_output = dropout(roberta_model)

  emotion = layers.Dense(units=nb_labels, activation=ACTIVATION, kernel_initializer=INITIALIZER, name='emotion')(pooled_output)

  # And combine it all in a model object
  model = tf.keras.models.Model(inputs=inputs, outputs=emotion, name='roBERTa_Label')

  return model

In [None]:
# Creating a model instance
model = create_model(num_labels)

# Take a look at the model
model.summary()

We can observe the roberta configuration below and can tune it accordingly

In [None]:
print(configuration)

In [None]:
# Creating RoBERTa compatible inputs with Input Ids, attention masks and token Ids 

# The three tensors the model expects, picked out of each tokenizer output
_INPUT_NAMES = ('input_ids', 'attention_mask', 'token_type_ids')
train = {name: train_token[name] for name in _INPUT_NAMES}
val = {name: valid_token[name] for name in _INPUT_NAMES}
test = {name: test_token[name] for name in _INPUT_NAMES}


In [None]:
# Creating TF datasets
# `train` is a dict of three tensors, so len(train) is 3 and made the shuffle
# buffer almost useless; the buffer now spans every training example.
train_tensor = tf.data.Dataset.from_tensor_slices((train, y_train)).shuffle(len(y_train)).batch(BATCH)
# Validation and test data are left in their original order so that outputs
# stay aligned with y_valid / y_test.
val_tensor = tf.data.Dataset.from_tensor_slices((val, y_valid)).batch(BATCH)
test_tensor = tf.data.Dataset.from_tensor_slices((test, y_test)).batch(BATCH)
print(BATCH)

# ➤ Experiment Variation '4' ---

For this experiment we will tune the:
* Loss Function
* Number of Epochs
* Optimizer

## ➢ Experiment Variation 4 - a
* Loss:
> Custom Binary Cross Entropy 
* Number of Epochs:
> 10
* Optimizer:
> Adam
>>Learning Rate: 5.e-05

**Adam Optimizer**


---



\begin{equation}
\theta_{t+1} = \theta_t - \frac{\eta}{\sqrt{\hat{v}_t} + \epsilon} \hat{m}_t
\end{equation}

and the equations for computing the first and second moments are:

\begin{equation}
\hat{m}_t = \frac{m_t}{1-\beta_1^t}, \quad \hat{v}_t = \frac{v_t}{1-\beta_2^t}
\end{equation}


---



The weighted_loss function calculates the binary cross-entropy loss between y_true and y_pred. However, the loss is weighted using the weights parameter. The weights parameter is a 2D array, where the first column contains the weights for negative samples (when y_true is 0) and the second column contains the weights for positive samples (when y_true is 1).

In [None]:
# Number of Epochs
EPOCHS = 10

# Function for calculating multilabel class weights
def calculating_class_weights(y_true):
    """Compute balanced class weights separately for every label column.

    y_true -- 2-D array of 0/1 targets, one column per label
    Returns an array of shape (number of labels, 2); row i holds the weights
    for value 0 and value 1 of label i, as returned by compute_class_weight.
    """
    number_dim = np.shape(y_true)[1]
    # scikit-learn documents `classes` as an ndarray and recent releases
    # reject a plain list, so build the array once up front
    binary_classes = np.array([0., 1.])
    weights = np.empty([number_dim, 2])
    for i in range(number_dim):
        weights[i] = compute_class_weight('balanced', classes=binary_classes, y=y_true[:, i])
    return weights

class_weights = calculating_class_weights(y_train)

# Custom loss function for multilabel

def get_weighted_loss(weights):
    """Return a loss function computing class-weighted binary cross-entropy.

    weights -- 2-D array; column 0 is applied where y_true is 0 and column 1
               where y_true is 1
    """
    negative_weights = weights[:, 0]
    positive_weights = weights[:, 1]

    def weighted_loss(y_true, y_pred):
        # the exponents select the negative weight for 0-targets and the positive weight for 1-targets
        per_label_weight = (negative_weights ** (1 - y_true)) * (positive_weights ** (y_true))
        weighted = per_label_weight * K.binary_crossentropy(y_true, y_pred)
        return K.mean(weighted, axis=-1)

    return weighted_loss

LOSS = get_weighted_loss(class_weights)

In [None]:
# Set an optimizer: Adam with a fixed learning rate (weight_decay is passed as None)
OPTIMIZER = tf.keras.optimizers.experimental.Adam(
    learning_rate=5.e-05,
    weight_decay=None
    )


# Compile the model with the weighted loss defined above and accuracy as the reported metric
model.compile(
    optimizer = OPTIMIZER,
    loss = LOSS,
    metrics=["accuracy"] 
    )


## ➢ Experiment Variation 4 - b
* Loss:
> Custom Binary Cross Entropy 
* Number of Epochs:
> 8
* Optimizer:
> AdamW
>>Learning Rate: 5.e-05
>>>Weight Decay: 0.004



**AdamW Optimizer**


---

\begin{equation}
\theta_{t+1} = \theta_t - \eta \left( \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon} + \lambda \theta_t \right)
\end{equation}

and the equations for computing the first and second moments are:

\begin{equation}
\hat{m}_t = \frac{m_t}{1-\beta_1^t}, \quad \hat{v}_t = \frac{v_t}{1-\beta_2^t}
\end{equation}

where $m_t$ and $v_t$ are the first and second moments of the gradients respectively.
$\theta_t$ is the model parameter at time step $t$, $\eta$ is the learning rate, $\lambda$ is the weight-decay coefficient, and $\epsilon$ is a small constant for numerical stability.

---



Custom loss function to adjust class weights to avoid class imbalance problem

The weighted_loss function calculates the binary cross-entropy loss between y_true and y_pred. However, the loss is weighted using the weights parameter. The weights parameter is a 2D array, where the first column contains the weights for negative samples (when y_true is 0) and the second column contains the weights for positive samples (when y_true is 1).

In [None]:
# Function for calculating multilabel class weights
def calculating_class_weights(y_true):
    """Compute balanced class weights separately for every label column.

    y_true -- 2-D array of 0/1 targets, one column per label
    Returns an array of shape (number of labels, 2); row i holds the weights
    for value 0 and value 1 of label i, as returned by compute_class_weight.
    """
    number_dim = np.shape(y_true)[1]
    # scikit-learn documents `classes` as an ndarray and recent releases
    # reject a plain list, so build the array once up front
    binary_classes = np.array([0., 1.])
    weights = np.empty([number_dim, 2])
    for i in range(number_dim):
        weights[i] = compute_class_weight('balanced', classes=binary_classes, y=y_true[:, i])
    return weights

class_weights = calculating_class_weights(y_train)

# Custom loss function for multilabel

def get_weighted_loss(weights):
    """Return a loss function computing class-weighted binary cross-entropy.

    weights -- 2-D array; column 0 is applied where y_true is 0 and column 1
               where y_true is 1
    """
    negative_weights = weights[:, 0]
    positive_weights = weights[:, 1]

    def weighted_loss(y_true, y_pred):
        # the exponents select the negative weight for 0-targets and the positive weight for 1-targets
        per_label_weight = (negative_weights ** (1 - y_true)) * (positive_weights ** (y_true))
        weighted = per_label_weight * K.binary_crossentropy(y_true, y_pred)
        return K.mean(weighted, axis=-1)

    return weighted_loss


Setting Training Variables

In [None]:
# Number of epochs handed to model.fit
EPOCHS = 8

# Set an optimizer: AdamW with the learning rate and weight decay of this variation
OPTIMIZER = tf.keras.optimizers.experimental.AdamW(
    learning_rate=5.e-05,
    weight_decay = 0.004
    )

# Set loss: weighted binary cross-entropy built from the training-label class weights
LOSS = get_weighted_loss(class_weights)

# Compile the model with that optimizer and loss, reporting accuracy
model.compile(
    optimizer = OPTIMIZER,
    loss = LOSS,
    metrics=["accuracy"] 
    )


# Training the Data

In [None]:
# Summary of the layers of our model
model.summary()

In [None]:
# Train on the batched training dataset for EPOCHS epochs, with the
# validation dataset passed as validation data; the result of fit() is kept in `history`
history = model.fit(train_tensor, 
                    epochs = EPOCHS,
                    validation_data=val_tensor
                    )

# Evaluate the Model

In [None]:
# Predict on the dict of tokenized test inputs
y_pred_proba = model.predict(test)
# NOTE(review): this bare expression makes the cell display the whole input dict — looks like leftover debugging
test

For the group project we will be using argmax function here in order to make the model multi-class and not multi-label.

In [None]:

# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    """Turn predicted probabilities into 0/1 labels.

    y_pred_proba -- array (or array-like) of probabilities
    threshold    -- an entry becomes 1 when it is strictly greater than this value
    Returns an array with the shape and dtype of the input, holding 1 where the
    probability exceeds the threshold and 0 elsewhere.
    """
    probabilities = np.asarray(y_pred_proba)
    # one vectorised comparison instead of a Python loop over every element
    return (probabilities > threshold).astype(probabilities.dtype)

In [None]:
# Generate labels
y_pred_labels = proba_to_labels(y_pred_proba)

In [None]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    """Build a table of precision, recall and F1 per emotion plus a macro average.

    y_true        -- 2-D array of true 0/1 labels, one column per emotion
    y_pred_labels -- 2-D array of predicted 0/1 labels, same layout
    emotions      -- sequence of emotion names, in column order

    Returns a DataFrame with columns "Precision", "Recall" and "F1" (rounded to
    two decimals), indexed by the emotion names followed by 'MACRO-AVERAGE'.
    """
    # Accept any sequence (list, tuple, Index, ...), not just a list
    emotions = list(emotions)
    rows = []

    # Per emotion evaluation: each column is scored as its own binary problem.
    # zero_division=0 scores undefined cases as 0 without emitting a warning.
    for i in range(len(emotions)):
        p, r, f1_score, _ = precision_recall_fscore_support(
            y_true[:, i], y_pred_labels[:, i], average="binary", zero_division=0)
        rows.append((round(p, 2), round(r, 2), round(f1_score, 2)))

    # Macro evaluation over all emotions at once
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(
        y_true, y_pred_labels, average="macro", zero_division=0)
    rows.append((round(macro_p, 2), round(macro_r, 2), round(macro_f1_score, 2)))

    # A plain DataFrame is returned so that it can be plotted directly; the
    # former `style.background_gradient` call was dropped because its result
    # was discarded.
    df_results = pd.DataFrame(rows, columns=["Precision", "Recall", "F1"],
                              index=emotions + ['MACRO-AVERAGE'])
    return df_results

# Results in F1 score

In [None]:
# Annotated heatmap of the per-emotion and macro-average precision/recall/F1 table
seaborn.heatmap(model_eval(y_test, y_pred_labels, GE_taxonomy),annot=True, cmap = 'viridis') 
# For reference: the research paper had F1 = 0.46

\begin{equation}
\text{Precision} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}}
\end{equation}

where True Positives are the number of positive instances that are correctly predicted by the model, and False Positives are the number of negative instances that are incorrectly predicted as positive by the model.

\begin{equation}
\text{Recall} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}
\end{equation}

where True Positives are the number of positive instances that are correctly predicted by the model, and False Negatives are the number of positive instances that are incorrectly predicted as negative by the model.

\begin{equation}
\text{F1 score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}
\end{equation}

The F1 score provides a balance between precision and recall, making it a useful metric when both false positives and false negatives are important considerations.