# Balancing Model

---

## Imports

In [1]:
import itertools
import os
import random
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import torch

from tensorflow import keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from transformers import AutoTokenizer, TFAutoModel
from transformers import logging
from tqdm import tqdm


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.10.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
# `logging` here is transformers.logging (imported above): only show error-level messages.
logging.set_verbosity_error()

In [3]:
# Fixed token length per example: used as the tokenizer's max_length (with truncation
# and "max_length" padding) and as the default input shape of the classification model.
MAX_SEQUENCE_LENGTH = 200

In [4]:
# Enable memory growth on every GPU that TensorFlow can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as error:
    # best effort: report the problem and keep going
    print(error)

In [5]:
# Report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

---

## Data Loading

In [6]:
# Load the dataset; the cells below read its "set", "goemotion", "code" and "text" columns.
dataset = pd.read_parquet("../data/clean_data.parquet")

---

## Code 

### Balancing 

In [7]:
import random

from textblob import TextBlob
from textblob.translate import NotTranslated


# Random generator used to pick the intermediate language of a back-translation.
SR = random.SystemRandom()

# Language codes that may be picked as the intermediate language.
LANGUAGES = [
    "es", "de", "fr", "ar", "te", "hi", "ja", "fa",
    "sq", "bg", "nl", "gu", "ig", "kk", "mt", "ps",
]

# Cache of finished back-translations, keyed by (message, intermediate language).
TRANSLATIONS = {}


def data_augmentation(message: str, language: str = "en", aug_range: int = 1) -> list:
    """
    Generate paraphrases of a message through back-translation: translate it
    to a randomly chosen language and then translate the result back

    Finished back-translations are stored in TRANSLATIONS, keyed by
    (message, intermediate language), and reused on later calls.

    :param message: message to be translated (bytes-like input is decoded as UTF-8)
    :param language: original language of the message
    :param aug_range: number of back-translation attempts to make
    :return: list of new messages; attempts that raise NotTranslated add nothing,
        so the list can be shorter than aug_range
    """
    if hasattr(message, "decode"):
        message = message.decode("utf-8")

    results = []
    for _ in range(aug_range):
        blob = TextBlob(message)
        pivot = SR.choice(LANGUAGES)
        cache_key = (message, pivot)
        try:
            # only call the translator when this (message, language) pair is new
            if cache_key not in TRANSLATIONS:
                forward = blob.translate(from_lang=language, to=pivot)
                backward = forward.translate(from_lang=pivot, to=language)
                TRANSLATIONS[cache_key] = str(backward)
        except NotTranslated:
            # nothing usable came back for this attempt
            continue
        results.append(TRANSLATIONS[cache_key])

    return results


def apply_balancing(data: pd.DataFrame, target: str, augmented: bool) -> pd.DataFrame:
    """
    Apply balancing to the training rows of the dataset based on the selected strategy

    Every class with fewer training rows than the target amount is topped up
    with rows sampled from that class. With ``augmented=True`` the sampled rows
    get their text replaced by a back-translation (see ``data_augmentation``).
    Rows of the other sets are passed through untouched. The result is cached
    as a parquet file under ``../data`` and loaded from there on later calls.

    :param data: dataset to be balanced (uses the columns "set", "goemotion" and "text")
    :param target: target strategy (avg or max)
    :param augmented: flag if we should apply augmentation (back-and-forth translation)
    :return: balanced dataframe with two extra columns, "augmented" and "for_balance"
    :raises AssertionError: if augmentation is requested with a strategy other than "avg"
    :raises ValueError: if the target strategy is neither "avg" nor "max"
    """
    # local imports: the retry loop needs both and the notebook never imports them
    import time
    import urllib.error

    assert (augmented and target == "avg") or not augmented

    # get the file name used to cache the balanced data
    if augmented:
        file_name = f"balanced_augmented_{target}.parquet"
    else:
        file_name = f"balanced_{target}.parquet"
    file_path = f"../data/{file_name}"

    # if the file exists load and return it
    if os.path.exists(file_path):
        return pd.read_parquet(file_path)

    # select the training data
    train = data.loc[lambda f: f["set"] == "train"].copy()
    train["augmented"] = False
    train["for_balance"] = False

    # calculate the representation of each class
    class_representation = train.goemotion.value_counts()

    # choose what is the target amount of each class based on the strategy
    if target == "max":
        target_value = class_representation.max()
    elif target == "avg":
        target_value = int(class_representation.mean())
    else:
        raise ValueError(f"unknown balancing target {target!r}; expected 'avg' or 'max'")

    final = []
    for emotion, representation in class_representation.items():
        # get the data with that emotion
        edata = train.loc[lambda f: f["goemotion"] == emotion]
        to_generate = target_value - representation

        # classes already at (or above) the target are kept as they are
        if to_generate <= 0:
            final.append(edata)
            continue

        # without augmentation, do a simple sampling (with replacement) of the data
        # NOTE(review): rows duplicated here keep for_balance=False - confirm this is intended
        if not augmented:
            final.append(pd.concat([edata, edata.sample(to_generate, replace=True)]))
            continue

        # sample some text from the original data; only sample with replacement
        # when more rows are needed than the class has
        sampled = edata.sample(to_generate, replace=(to_generate > representation)).reset_index(drop=True)

        # materialise the texts first so the loop never reads values it is rewriting
        originals = sampled["text"].to_list()
        for idx, original_text in tqdm(zip(sampled.index, originals), total=sampled.shape[0]):
            # try the translation up to 3 times; if all fail the row keeps its original text
            for _ in range(3):
                try:
                    new_text = data_augmentation(original_text)[0]
                except IndexError:
                    # no translation came back, try again
                    continue
                except urllib.error.URLError:
                    # wait a little before retrying after a connection problem
                    time.sleep(3)
                else:
                    sampled.loc[idx, "text"] = new_text
                    sampled.loc[idx, "augmented"] = True
                    break

        sampled["for_balance"] = True
        final.append(pd.concat([edata, sampled]))

    # concatenate with the rows that are not part of the training set
    rest = data.loc[lambda f: f["set"] != "train"].assign(augmented=False, for_balance=False)
    balanced = pd.concat(final + [rest])

    # restore the upper-case tags; literal replacement, independent of the pandas regex default
    balanced["text"] = (
        balanced["text"]
        .str.replace("[name]", "[NAME]", regex=False)
        .str.replace("[religion]", "[RELIGION]", regex=False)
    )

    # export the final result
    balanced.to_parquet(file_path)

    return balanced

In [8]:
# Build (or load from the parquet cache) the "avg"-balanced dataset with back-translation.
apply_balancing(dataset, "avg", True)

Unnamed: 0,code,goemotion,ekman,sentiment,set,text,clean_text,augmented,for_balance
72,edpom9u,neutral,neutral,neutral,train,The good kind,good kind,False,False
84,ef55x08,neutral,neutral,neutral,train,GG. We played as best as we could. Utah just a...,gg played best could utah better team,False,False
106,edxzrwk,neutral,neutral,neutral,train,[NAME] looking like [NAME] out there,[NAME] looking like [NAME],False,False
108,ef9yhnp,neutral,neutral,neutral,train,>Allow insurance to not pay for treatment of d...,allow insurance pay treatment disease known va...,False,False
208,ees5bt3,neutral,neutral,neutral,train,Shadow of Mordor 3 looks pretty lit,shadow mordor 3 look pretty lit,False,False
...,...,...,...,...,...,...,...,...,...
63807,eezc65u,neutral,neutral,neutral,test,The essay is optional.,essay optional,False,False
63808,edduyro,neutral,neutral,neutral,test,Waiting for both of these things is torture,waiting thing torture,False,False
63809,edy4kl7,neutral,neutral,neutral,test,Easy just include [NAME] to continue to tormen...,easy include [NAME] continue torment [NAME],False,False
63810,efbiugo,neutral,neutral,neutral,test,Daddy issues [NAME],daddy issue [NAME],False,False


### Clean Data 

In [9]:
import string

import contractions
import emoji

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# English stop words from NLTK, kept as a set; clean_content drops them when rm_stop_words=True.
STOP_WORDS = set(stopwords.words("english"))
# Tokenizer that clean_content applies when tokenize=True.
TOKENIZER = TweetTokenizer()
# Single punctuation characters; clean_content drops tokens equal to one of them when ponctuation=True.
PUNCUATION_LIST = list(string.punctuation)


def reinsert_tags(text: str, tag_split: str, tag_prefix: str = "") -> str:
    """
    Turn every stretch of text enclosed by a pair of separators into an
    upper-case tag written between []

    The text is cut at each occurrence of the separator; every second piece
    (the one that follows an opening separator) becomes ``[PREFIXPIECE]``.
    Runs of spaces are collapsed and the result is stripped.

    :param text: text to be adjusted
    :param tag_split: separator that surrounds each tag
    :param tag_prefix: prefix to put in front of the tag content
    :return: tagged text
    """
    pieces = text.split(tag_split)

    # pieces at odd positions sit between an opening and a closing separator
    rebuilt = [
        f" [{tag_prefix}{piece.upper().strip()}] " if position % 2 else piece
        for position, piece in enumerate(pieces)
    ]

    collapsed = re.sub(" +", " ", "".join(rebuilt))
    return collapsed.strip()


def clean_content(
    text, 
    fix_contraction: bool = True, 
    tag_emoji: bool = True,
    tagged_items: list = None,
    handles: bool = True, 
    case: bool = True,
    links: bool = True,
    non_char: bool = True,
    rm_stop_words: bool = True,
    lemmatization: bool = True,
    tokenize: bool = True,
    ponctuation: bool = True,
    unmapped_emoji: bool = True
) -> str:
    """
    Apply data cleaning to text given a range of options
    
    :param text: text to be cleaned
    :param fix_contraction: True if we want to expand contractions
    :param tag_emoji: True if we want to tag emojis by name
    :param tagged_items: List of items that are tagged in the current text
        (defaults to ["name", "religion"])
    :param handles: True if we want to remove twitter like handles
    :param case: True if we want to normalize to lower case
    :param links: True if we want to remove websites, links and html entities
    :param non_char: True if we want to remove the characters / ; [ ] = #
    :param rm_stop_words: True if we want to remove stop words
    :param lemmatization: True if we want to apply lemmatization
    :param tokenize: True if we want to apply twitter tokenization
    :param ponctuation: True if we want to remove ponctuation tokens
    :param unmapped_emoji: True if we want to remove characters with a code point of 2000 or above
    :return: clean text
    """
    # build the default per call so no list object is shared between calls
    if tagged_items is None:
        tagged_items = ["name", "religion"]

    clean_text = text
    
    # replaces contractions with full word versions
    if fix_contraction:
        clean_text = contractions.fix(clean_text)
    
    # replace each known emoji with its name wrapped in " _emoji_ " markers
    if tag_emoji:
        clean_text = "".join(
            emoji.EMOJI_DATA[c]["en"].replace(":", " _emoji_ ") if c in emoji.EMOJI_DATA else c
            for c in clean_text
        )
    
    # remove @handles (and one whitespace character after them)
    if handles:
        clean_text = re.sub(r"@\w+\s?", "", clean_text)
    
    # convert to lowercase
    if case:
        clean_text = clean_text.lower()

    if links:
        # remove links http:// or https://
        clean_text = re.sub(r"https?:\/\/\S+", "", clean_text)
    
        # remove links beginning with www. and ending with .com, or of the form word.com;
        # [a-z]* (not [a-z]?) so that the whole of "www.google.com" is removed
        clean_text = re.sub(r"www\.[a-z]*\.?(com)+|[a-z]+\.(com)", "", clean_text)
    
        # remove html reference characters such as &amp;
        clean_text = re.sub(r"&[a-z]+;", "", clean_text)
        
    # wrap tagged items such as [name] in " _tag_ " markers so they survive the
    # bracket removal below; they are turned back into [TAGS] at the end
    for tag in tagged_items:
        clean_text = re.sub(fr"\[{tag}\]", f" _tag_ {tag} _tag_ ", clean_text)  
    
    # remove the characters "/", ";" "[", "]" "=", "#"
    if non_char:
        clean_text = re.sub(r"[/;\[\]=#]", "", clean_text)  
    
    # split on whitespace and optionally drop stop words
    words = clean_text.split()
    if rm_stop_words:
        words = [word for word in words if word not in STOP_WORDS]
    
    # apply lemmatization (one lemmatizer for the whole text)
    if lemmatization:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
    
    # concatenate the text again
    clean_text = " ".join(words)
    
    # apply tokenization
    if tokenize:
        tokens = TOKENIZER.tokenize(clean_text)
    else:
        tokens = clean_text.split(" ")
    
    # drop tokens that are a single punctuation character
    if ponctuation:
        tokens = [token for token in tokens if token not in PUNCUATION_LIST]
    clean_text = " ".join(tokens)
        
    # clean emojis that were not mapped by the library: every character with a
    # code point of 2000 or above is dropped
    if unmapped_emoji:
        clean_text = "".join(char for char in clean_text if ord(char) < 2000)
        
    # add the tags for emojis
    if tag_emoji:
        clean_text = reinsert_tags(clean_text, "_emoji_", "EMOJI_")
    
    # re-insert tags
    if len(tagged_items) > 0:
        clean_text = reinsert_tags(clean_text, "_tag_")
        
    return clean_text

In [10]:
# Pick one random text from the dataset and print it before and after cleaning.
text = dataset.text.sample(1).values[0]
print(text)
print(clean_content(text))

regret unfollowing him...
regret unfollowing him ...


In [11]:
# Clean a sentence that ends with an emoji, using every default option.
clean_content("No. Or at least that’s what I suspect🤔")

'no least suspect [EMOJI_THINKING_FACE]'

In [12]:
# Same sentence with only emoji tagging and the removal of high code point
# characters switched on; the tagged items are given in upper case because
# the text is not lower-cased here.
clean_content(
    "No. Or at least that’s what I suspect🤔", 
    fix_contraction=False, 
    tag_emoji=True,
    tagged_items=["NAME", "RELIGION"],
    handles=False, 
    case=False,
    links=False,
    non_char=False,
    rm_stop_words=False,
    lemmatization=False,
    tokenize=False,
    ponctuation=False,
    unmapped_emoji=True
)

'No. Or at least thats what I suspect [EMOJI_THINKING_FACE]'

### Tokenization

In [13]:
# Frequency table of every emoji character found in the dataset texts, ordered
# from most to least frequent. "pct" is the share of all emoji occurrences and
# "cum_pct" the running total of that share, so filtering on
# cum_pct <= threshold keeps the most frequent emojis first (the cumulative
# share is only meaningful once the rows are ordered by count).
EMOJIS_FOUND = (
    pd.DataFrame(
        [
            (c,) 
            for text in dataset["text"].to_list()
            for c in text
            if c in emoji.EMOJI_DATA 
        ],
        columns=["emoji"]
    )
    .assign(count=1)
    .groupby(["emoji"], as_index=False)["count"]
    .sum()
    # stable sort keeps the groupby (emoji) order among equal counts
    .sort_values("count", ascending=False, kind="stable", ignore_index=True)
    .assign(pct=lambda f: f["count"] / f["count"].sum())
    .assign(cum_pct=lambda f: f["pct"].cumsum())
)


def apply_tokenization(
    t_model: TFAutoModel, 
    tokenizer: AutoTokenizer, 
    data: pd.DataFrame, 
    emoji_tagging: bool = False, 
    emoji_threshold: float = 0.8
) -> dict:
    """
    Tokenize the train, validation and test sets and collect, for each of
    them, everything the classification model needs

    The tokenizer is first extended with the [NAME] and [RELIGION] tokens
    (plus, optionally, a selection of emojis) and the embedding matrix of the
    transformer is resized to the new vocabulary size.

    :param t_model: transformer model
    :param tokenizer: tokenizer object
    :param data: input data to be used for training
    :param emoji_tagging: flag if the emojis should be added as tokens
    :param emoji_threshold: emojis of EMOJIS_FOUND with cum_pct up to this value are added
    :return: dictionary keyed by set name holding "tokens" (tokenizer output),
        "inputs" (input ids, token type ids, attention mask) and "labels"
    """
    # collect the extra tokens and register them on tokenizer and model
    extra_tokens = ["[NAME]", "[RELIGION]"]
    if emoji_tagging:
        selected = EMOJIS_FOUND.loc[lambda f: f["cum_pct"] <= emoji_threshold]
        extra_tokens.extend(selected.emoji.to_list())

    tokenizer.add_tokens(extra_tokens)
    t_model.resize_token_embeddings(len(tokenizer))

    processed = {}
    for group in tqdm(["train", "validation", "test"]):
        subset = data.loc[lambda f: f["set"] == group]

        # one row per (code, text) and one count column per emotion; the
        # "none" column is dropped when it is present
        wide = subset.pivot_table(
            index=["code", "text"],
            columns=["goemotion"],
            values="set",
            aggfunc="count",
            fill_value=0
        )
        wide = wide.reset_index().drop(columns=["none"], errors="ignore")

        # tokenize the texts, truncated/padded to MAX_SEQUENCE_LENGTH
        encoded = tokenizer(
            wide.text.to_list(),
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors="tf"
        )

        processed[group] = {
            "tokens": encoded,
            # same order as the inputs of the classification model
            "inputs": [encoded.input_ids, encoded.token_type_ids, encoded.attention_mask],
            # every column after "code" and "text" is a label column
            "labels": wide.iloc[:, 2:].values,
        }

    return processed

### Model 

In [14]:
def create_cls_model(
    t_model: TFAutoModel,
    trainable: str = "all",
    head: str = "none",
    dropout: float = 0.3,
    label_smoothing: float = 0.1,
    hidden_size: int = 256, 
    hidden_layers: list = None,
    dropout_layers: list = None,
    num_filters: list = None,
    kernel_sizes: list = None,
    learning_rate: float = 0.00005,
    epsilon: float = 1e-08,
    num_classes: int = 28,
    max_sequence_length: int = MAX_SEQUENCE_LENGTH,
):
    """
    Build and compile a classification model on top of a pre-trained transformer
    
    :param t_model: transformer model
    :param trainable: select which parts of the transformer model are trainable (all, last, none)
    :param head: type of head to be applied (none, dense, mlp, cnn)
    :param dropout: dropout value to be selected
    :param label_smoothing: label smoothing to be applied
    :param hidden_size: number of nodes for head=dense (and for the dense layer of head=cnn)
    :param hidden_layers: number of nodes per layer for head=mlp (default [256, 128, 64])
    :param dropout_layers: dropout rate for each hidden layer for head=mlp (default [0.3, 0.3, 0.3])
    :param num_filters: number of filters to use for head=cnn (default [100, 100, 50, 25])
    :param kernel_sizes: kernel sizes to use for head=cnn (default [2, 3, 4, 5])
    :param learning_rate: learning rate applied for Adam
    :param epsilon: epsilon selected for Adam
    :param num_classes: number of classes to predict
    :param max_sequence_length: maximum sequence length selected
    :return: compiled keras model taking [input ids, token type ids, attention mask]
    :raises NotImplementedError: if ``trainable`` or ``head`` is not one of the supported values
    """
    # list defaults are created per call instead of being shared between calls
    hidden_layers = [256, 128, 64] if hidden_layers is None else hidden_layers
    dropout_layers = [0.3, 0.3, 0.3] if dropout_layers is None else dropout_layers
    num_filters = [100, 100, 50, 25] if num_filters is None else num_filters
    kernel_sizes = [2, 3, 4, 5] if kernel_sizes is None else kernel_sizes

    # fail early on an unknown head instead of hitting an undefined variable later
    if head not in ("none", "dense", "mlp", "cnn"):
        raise NotImplementedError(f"unknown head {head!r}")

    # set the model to be trainable
    if trainable == "all":
        t_model.trainable = True
    elif trainable == "last":
        # the highest number found after "layer_._" in the weight names
        last_layer_num = max(
            [
                int(w.name[w.name.index("layer_._"):].split("/")[0].replace("layer_._", ""))
                for w in t_model.weights if "layer_._" in w.name
            ]
        )
        # NOTE(review): this writes the private `_trainable` attribute of every
        # weight outside that layer - confirm it freezes them as intended
        for w in t_model.weights:
            if f"layer_._{last_layer_num}" not in w.name:
                w._trainable = False
    elif trainable == "none":
        t_model.trainable = False
    else:
        raise NotImplementedError
    
    # extract input ids, token ids and the attention mask
    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name="input_ids_layer")
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name="token_type_ids_layer")
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name="attention_mask_layer")
    
    # encode this into the model
    model_inputs = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}      
    output = t_model(model_inputs)
    
    # if no head was selected, pass the pooler token (output[1]) to a dropout layer
    if head == "none":
        pooler_token = output[1]
        hidden = tf.keras.layers.Dropout(dropout)(pooler_token)
        
    # if a dense head was selected, add a hidden layer with the selected hidden size
    elif head == "dense":
        pooler_token = output[1]
        hidden = tf.keras.layers.Dense(hidden_size, activation="relu", name="hidden_layer")(pooler_token)
        hidden = tf.keras.layers.Dropout(dropout)(hidden)
    
    # if multi-layer perceptron was selected, stack dense + dropout layers on the pooler token
    elif head == "mlp":
        hidden = output[1]
        last = len(hidden_layers) - 1
        for i, size in enumerate(hidden_layers):
            # layer names must be unique inside a keras model; the first layer
            # keeps the name used by the other heads
            layer_name = "hidden_layer" if i == 0 else f"hidden_layer_{i}"
            hidden = tf.keras.layers.Dense(size, activation="relu", name=layer_name)(hidden)
            # the dropout after the final dense layer always uses the last configured rate
            rate = dropout_layers[-1] if i == last else dropout_layers[i]
            hidden = tf.keras.layers.Dropout(rate)(hidden)
    
    # if cnn was selected, get the token embeddings (output[0]) and create a cnn network
    else:
        token_embeddings = output[0]
        cnn_outputs = []
        for filters, kernel_size in zip(num_filters, kernel_sizes):
            conv_layer = tf.keras.layers.Conv1D(
                filters=filters, kernel_size=kernel_size, activation='relu'
            )(token_embeddings)
            max_pool = tf.keras.layers.GlobalMaxPooling1D()(conv_layer)
            cnn_outputs.append(max_pool)
        cnn_concat = tf.keras.layers.concatenate(cnn_outputs)
        
        hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cnn_concat)
        hidden = tf.keras.layers.Dropout(dropout)(hidden)
    
    # with the final hidden layer, add a dense layer for the classification task
    classification = tf.keras.layers.Dense(num_classes, activation="softmax", name="classification_layer")(hidden)
    
    # instantiate the classification model
    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])
    
    # compile using the learning rate and selected epsilon
    classification_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon),
        loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing), 
        metrics=[tfa.metrics.F1Score(num_classes=num_classes, average="macro", threshold=0.2)]
    )
    
    return classification_model

### Evaluation 

In [15]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score


# Sorted emotion labels of the dataset; evaluate_model uses them to select and
# name the label columns.
TARGET_NAMES = sorted(dataset["goemotion"].unique())


def evaluate_model(p_set: str, proba: pd.DataFrame, dataset: pd.DataFrame, threshold: float = 0.2) -> dict:
    """
    Evaluate the model results
    
    :param p_set: which prediction set to use (test, validation)
    :param proba: predicted probabilities, indexed by code with one column per emotion
    :param dataset: the complete dataset
    :param threshold: a label counts as predicted when its probability is above this value
    :return: model evaluation values: "predictions", "f1_macro", "f1_micro",
        "roc_auc", "confusion_matrix", "classification_report",
        "misclassification" and "misclassification_examples"
    """
    outputs = dict()
    df = dataset.loc[lambda f: f["set"] == p_set]
    
    # get the one-hot-encoded values (one row per code, one column per emotion)
    pv = df.pivot_table(index=["code"], columns="goemotion", values="set", aggfunc="count", fill_value=0)
    
    # align the probabilities with the pivot and generate the predictions
    # NOTE(review): predictions follow pv.columns while the truth matrix below
    # follows TARGET_NAMES - the two orders must be the same for the metrics to line up
    proba = proba.loc[pv.index, pv.columns]
    predictions = (proba.values > threshold).astype(int)
    pred = (
        proba.reset_index()
        .rename(columns={"index": "code"})
        .melt(id_vars=["code"], var_name="goemotion", value_name="proba")
    )
    pred["flag"] = pred["proba"] > threshold
    outputs["predictions"] = pred
    
    # calculate metrics against the true label matrix
    truth = pv[TARGET_NAMES].values
    outputs["f1_macro"] = f1_score(truth, predictions, average="macro")
    outputs["f1_micro"] = f1_score(truth, predictions, average="micro")
    outputs["roc_auc"] = roc_auc_score(truth, proba.values, average="macro", multi_class="ovo")
    # the confusion matrix only compares the first maximum of each row
    outputs["confusion_matrix"] = confusion_matrix(
        np.argmax(truth, axis=1), np.argmax(predictions, axis=1)
    )
    outputs["classification_report"] = classification_report(
        truth, predictions, target_names=TARGET_NAMES
    )
    
    # rows whose true emotion was not flagged have no match in the left join
    df = df.merge(pred.loc[lambda f: f["flag"] == 1], on=["code", "goemotion"], how="left")
    misclass = df[df["flag"].isnull()]
    outputs["misclassification"] = misclass
    
    # get up to three distinct misclassified examples per label
    examples = dict()
    for label in TARGET_NAMES:
        missed = misclass[misclass["goemotion"] == label]
        if missed.shape[0] > 0:
            examples[label] = missed.sample(3, replace=True).drop_duplicates().text.to_list()
    outputs["misclassification_examples"] = examples
    
    return outputs

### Scheduler 

In [16]:
def scheduler_10(epoch, lr):
    """
    Keep the learning rate unchanged on every epoch

    :param epoch: index of the epoch (not used)
    :param lr: current learning rate
    :return: the same learning rate
    """
    return lr


def scheduler_05(epoch, lr):
    """
    Halve the learning rate on every epoch after epoch 0

    :param epoch: index of the epoch
    :param lr: current learning rate
    :return: learning rate to use for this epoch
    """
    return lr * 0.5 if epoch > 0 else lr

    
def scheduler_02(epoch, lr):
    """
    Multiply the learning rate by 0.2 on every epoch after epoch 0

    :param epoch: index of the epoch
    :param lr: current learning rate
    :return: learning rate to use for this epoch
    """
    return lr * 0.2 if epoch > 0 else lr

    
def scheduler_exp(epoch, lr):
    """
    Decay the learning rate by a factor of exp(-0.1) on every epoch after epoch 0

    The factor is computed with ``math.exp`` so that a plain Python float is
    returned instead of a tensor.

    :param epoch: index of the epoch
    :param lr: current learning rate
    :return: learning rate to use for this epoch, as a float
    """
    # local import: the notebook does not import math at the top
    import math

    if epoch > 0:
        return lr * math.exp(-0.1)
    return lr

### Pipeline 

In [17]:
import gc
import json
import os
import pickle
import shutil
import ipynbname
from pathlib import Path


# Value returned by ipynbname.name() - presumably the name of this notebook; confirm against its use.
NB_FNAME = ipynbname.name()

# Learning rate schedule functions by key ("1.0", "0.5", "0.2" or "exp").
SCHEDULERS = {"1.0": scheduler_10, "0.5": scheduler_05, "0.2": scheduler_02, "exp": scheduler_exp}


def limit_mem():
    """
    Close the current TF1-compat keras session and install a new one whose
    GPU options allow memory growth
    """
    backend = tf.compat.v1.keras.backend
    backend.get_session().close()

    session_config = tf.compat.v1.ConfigProto()
    session_config.gpu_options.allow_growth = True
    backend.set_session(tf.compat.v1.Session(config=session_config))
    


def run_model_experiment(grid: dict, dataset: pd.DataFrame) -> None:
    """Train, evaluate and archive one experiment described by ``grid``.

    One classifier is trained on the (possibly merged) label space and, for
    every label group with more than one member, one extra classifier is
    trained on the members of that group only. The group classifiers' outputs
    are multiplied by the probability the first classifier assigned to the
    group, giving one probability column per original label, which is then
    passed to ``evaluate_model`` for the validation and test sets.

    Args:
        grid: Experiment settings. Keys read here: ``balancing``, ``augment``,
            ``minority_shuffling``, ``clean_data``, ``model``,
            ``emoji_tagging``, ``trainable``, ``head``, ``dropout`` (optional,
            defaults to 0.3), ``label_smoothing``, ``learning_rate``,
            ``scheduler`` (a key of ``SCHEDULERS``), ``batch_size`` and
            ``epochs``.
        dataset: Source data. This function reads the columns ``goemotion``,
            ``set`` (compared against "train", "validation" and "test") and
            ``code``; the helpers it calls may need more.

    Raises:
        NotImplementedError: If ``grid["clean_data"]`` is truthy.
        FileExistsError: If the timestamped experiment folder already exists
            (``mkdir`` is called with ``exist_ok=False``).
        KeyError: If a required ``grid`` key is missing.

    Side effects:
        Creates ``../experiments/<timestamp>/`` and writes into it the
        training histories, raw predictions, grid parameters, metrics,
        confusion matrices, predictions, misclassifications, the data used
        for the experiment, and a copy of the running notebook.
    """
    # select the appropriate dataset based on the balancing parameter
    if grid["balancing"] != "none":
        data_for_exp = apply_balancing(dataset, grid["balancing"], grid["augment"])
    else:
        data_for_exp = dataset.copy()

    # select the combinations to execute
    # default: every label forms its own single-member group
    combinations = {k: (k,) for k in data_for_exp.goemotion.unique()}
    if len(grid["minority_shuffling"]) > 0:
        combinations = grid["minority_shuffling"]

    # apply cleaning to the dataset
    if grid["clean_data"]:
        raise NotImplementedError

    # get the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(grid["model"])
    try:
        t_model = TFAutoModel.from_pretrained(grid["model"])
    except OSError:
        # retry asking transformers to load from PyTorch weights
        t_model = TFAutoModel.from_pretrained(grid["model"], from_pt=True)

    # get how many model replacements we should make
    # replaces[0] maps every member label to its group key (the top-level model);
    # each following entry is an identity map over the members of one
    # multi-member group (one sub-model per such group)
    replaces = [{v: k for k, vl in combinations.items() for v in vl}]
    for k, vl in combinations.items():
        if len(vl) > 1:
            replaces.append({v: v for v in vl})
            
    # create the folder
    # the name is a second-resolution timestamp; exist_ok=False makes a clash fail loudly
    folder = Path("../experiments/" + pd.to_datetime("today").strftime("%Y%m%dT%H%M%S"))
    folder.mkdir(exist_ok=False, parents=True)

    # run the model loop
    # NOTE(review): the same t_model instance is handed to create_cls_model on every
    # iteration; if training updates it in place, later models start from weights
    # already tuned by earlier ones - TODO confirm this is intended.
    predictions = dict()
    for i, comb in enumerate(replaces):
        # select the data to be used for training
        # keep train rows whose label is covered by comb plus every non-train row;
        # labels outside comb become "none", the rest are renamed through comb
        data_for_training = (
            data_for_exp.loc[lambda f: ((f["goemotion"].isin(comb)) & (f["set"] == "train")) | (f["set"] != "train")]
            .assign(goemotion=lambda f: f["goemotion"].apply(lambda x: "none" if x not in comb else x))
            .assign(goemotion=lambda f: f["goemotion"].replace(comb))
        )

        # apply tokenization
        processed = apply_tokenization(t_model, tokenizer, data_for_training, grid["emoji_tagging"])

        # create the model
        cls_model = create_cls_model(
            t_model, 
            trainable=grid["trainable"], 
            head=grid["head"], 
            dropout=grid.get("dropout", 0.3),
            label_smoothing=grid["label_smoothing"],
            learning_rate=grid["learning_rate"],
            num_classes=data_for_training.loc[lambda f: f["set"] == "train"].goemotion.nunique(),
        )
        
        # get the indexes that make the validation set
        # positions (after reset_index) of the validation codes that carry at least one
        # label other than "none"
        # assumes processed["validation"] rows follow the same per-code order as this
        # pivot table - TODO confirm against apply_tokenization
        indexes = (
            data_for_training.loc[lambda f: f["set"] == "validation"]
            .pivot_table(index="code", columns=["goemotion"], values="set", aggfunc="count", fill_value=0)
            .drop(columns=["none"], errors="ignore")
            .sum(axis=1)
            .reset_index()
            .loc[lambda f: f[0] > 0]
            .index
        )
        val_inputs = [tf.convert_to_tensor(tensor.numpy()[indexes]) for tensor in processed["validation"]["inputs"]]
        val_labels = processed["validation"]["labels"][indexes]

        # fit the model
        cb_scheduler = tf.keras.callbacks.LearningRateScheduler(SCHEDULERS[grid["scheduler"]])
        cb_earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
        model_history = cls_model.fit(
            processed["train"]["inputs"],
            processed["train"]["labels"],
            validation_data=(val_inputs, val_labels),
            batch_size=grid["batch_size"],
            epochs=grid["epochs"],
            callbacks=[cb_scheduler, cb_earlystop],
        )
        
        # save the history data
        with open(folder / f"history_{i}.pkl", "wb") as f:
            pickle.dump(model_history.history, f)

        # run the probability calculation
        # predictions are stored per model, keyed by the sorted tuple of its output classes
        classes = sorted(list(set(comb.values())))
        predictions[tuple(classes)] = dict()
        for p_set in ["validation", "test"]:
            # the pivot table is used only for its "code" index, which labels the rows
            index = (
                data_for_training.loc[lambda f: f["set"] == p_set]
                .pivot_table(index="code", columns=["goemotion"], values="set", aggfunc="count", fill_value=0)
                .index
            )
            predictions[tuple(classes)][p_set] = pd.DataFrame(
                cls_model.predict(processed[p_set]["inputs"]), index=index, columns=classes,
            )
    
    # export the base predictions
    with open(folder / "original_predictions.pkl", "wb") as f:
        pickle.dump(predictions, f)
    
    # ensure for the post predictions that we have the expected values
    # the first stored entry belongs to the top-level model; each multi-member group
    # column is replaced by its members' columns, scaled row-wise by the group column
    base = predictions[list(predictions)[0]]
    for k, vl in combinations.items():
        if len(vl) > 1:
            values = predictions[tuple(sorted(vl))]
            for p_set in ["validation", "test"]:
                base[p_set] = pd.concat(
                    [base[p_set].drop(columns=k), values[p_set].multiply(base[p_set][k], axis=0)], axis=1
                )

    # run the model evaluation on validation
    val_res = evaluate_model("validation", base["validation"], data_for_exp)
    test_res = evaluate_model("test", base["test"], data_for_exp)
    
    # save the model val_res
    with open(folder / "grid_params.json", "w") as f:
        json.dump(grid, f)
    
    for res, name in [(val_res, "validation"), (test_res, "test")]:
        # these three entries are excluded from the JSON file and pickled separately below
        with open(folder / f"metrics_{name}.json", "w") as f:
            json.dump(
                {
                    r: v 
                    for r, v in res.items() 
                    if r not in ["confusion_matrix", "predictions", "misclassification"]
                }, 
                f
            )
        with open(folder / f"confusion_matrix_{name}.pkl", "wb") as f:
            pickle.dump(res["confusion_matrix"], f)

        res["predictions"].to_pickle(folder / f"prediction_{name}.pkl")
        res["misclassification"].to_pickle(folder / f"misclassification_{name}.pkl")
    
    data_for_exp.to_pickle(folder / "data_for_exp.pkl")
    
    # archive the notebook that produced this run next to its results
    shutil.copyfile(os.path.abspath(f"{NB_FNAME}.ipynb"), folder / f"{NB_FNAME}.ipynb")

---

## Grid Space 

In [18]:
# Search space: each key lists the candidate values for one experiment option.
GRID = {
    "model": ["bert-base-uncased", "vinai/bertweet-base", "flboehm/reddit-bert-text_10"],
    "trainable": ["all", "last", "none"],
    "head": ["none", "dense", "cnn", "mlp"],
    "dropout": [0.3],
    "label_smoothing": [0, 0.1],
    "epochs": [10],
    "batch_size": [8, 16],
    "learning_rate": [0.00005],
    "emoji_tagging": [False, True],
    "clean_data": [False, True],
    "balancing": ["none", "avg"],
    "augment": [False, True],
    # label groupings: no grouping, a seven-group mapping, and a mapping that
    # merges five label pairs and leaves the remaining labels on their own
    "minority_shuffling": [
        dict(),
        {
            "neutral": ["neutral"],
            "anger": ["anger", "annoyance", "disapproval"],
            "disgust": ["disgust"],
            "fear": ["fear", "nervousness"],
            "joy": [
                "joy", "amusement", "approval", "excitement", "gratitude", "love",
                "optimism", "relief", "pride", "admiration", "desire", "caring",
            ],
            "sadness": ["sadness", "disappointment", "embarrassment", "grief", "remorse"],
            "surprise": ["surprise", "realization", "confusion", "curiosity"],
        },
        {
            "grief_sadness": ("grief", "sadness"),
            "nervousness_fear": ("nervousness", "fear"),
            "pride_approval": ("pride", "approval"),
            "realization_surprise": ("realization", "surprise"),
            "relief_joy": ("relief", "joy"),
            "neutral": ("neutral",),
            "optimism": ("optimism",),
            "anger": ("anger",),
            "desire": ("desire",),
            "love": ("love",),
            "disapproval": ("disapproval",),
            "amusement": ("amusement",),
            "caring": ("caring",),
            "excitement": ("excitement",),
            "curiosity": ("curiosity",),
            "embarrassment": ("embarrassment",),
            "disgust": ("disgust",),
            "gratitude": ("gratitude",),
            "annoyance": ("annoyance",),
            "confusion": ("confusion",),
            "disappointment": ("disappointment",),
            "admiration": ("admiration",),
            "remorse": ("remorse",),
        },
    ],
}

EARLY_STOP = 0.55

# Cartesian product of all option values, one dict per combination.
keys, values = zip(*GRID.items())
grid_space = [dict(zip(keys, v)) for v in itertools.product(*values)]

# Keep only the combinations that make sense together.
valid_space = list()
for g in grid_space:
    uses_grouping = len(g["minority_shuffling"]) > 0
    rejected = (
        # cleaning is only explored together with emoji tagging
        (g["clean_data"] and not g["emoji_tagging"])
        # augmentation is only explored together with balancing
        or (g["augment"] and g["balancing"] == "none")
        # label grouping is explored without balancing and without augmentation
        or (uses_grouping and g["balancing"] != "none")
        or (uses_grouping and g["augment"])
    )
    if not rejected:
        valid_space.append(g)

# The estimate uses 10000 * 2 seconds per search, converted to days.
expected_days = len(valid_space) * 10000 * 2 / (24 * 60 * 60)
print(len(valid_space), "searches created - expected time:", expected_days, "days")
for _ in range(10):
    random.shuffle(valid_space)

2160 searches created - expected time: 500.0 days


In [19]:
# Seven-group label mapping: each key is a group name and each value is the
# tuple of fine-grained labels assigned to that group.
EKMAN_SAMPLING = {
    "neutral": ("neutral",),
    "anger": ("anger", "annoyance", "disapproval"),
    "disgust": ("disgust",),
    "fear": ("fear", "nervousness"),
    "joy": (
        "joy", "amusement", "approval", "excitement", "gratitude", "love",
        "optimism", "relief", "pride", "admiration", "desire", "caring",
    ),
    "sadness": ("sadness", "disappointment", "embarrassment", "grief", "remorse"),
    "surprise": ("surprise", "realization", "confusion", "curiosity"),
}

# Grouping 1: two merged label pairs; every other label stays in a group of
# its own. Key order is kept as written (merged pairs first).
SELECTED_SAMPLING_1 = {
    "grief_sadness": ("grief", "sadness"),
    "pride_admiration": ("pride", "admiration"),
    **{
        label: (label,)
        for label in (
            "nervousness", "fear", "approval", "realization", "surprise", "relief",
            "joy", "neutral", "optimism", "anger", "desire", "love",
            "disapproval", "amusement", "caring", "excitement", "curiosity", "embarrassment",
            "disgust", "gratitude", "annoyance", "confusion", "disappointment", "remorse",
        )
    },
}

# Grouping 2: three merged label pairs; every other label stays in a group of
# its own. Key order is kept as written (merged pairs first).
SELECTED_SAMPLING_2 = {
    "grief_sadness": ("grief", "sadness"),
    "pride_admiration": ("pride", "admiration"),
    "nervousness_fear": ("nervousness", "fear"),
    **{
        label: (label,)
        for label in (
            "approval", "realization", "surprise", "relief", "joy", "neutral",
            "optimism", "anger", "desire", "love", "disapproval", "amusement",
            "caring", "excitement", "curiosity", "embarrassment", "disgust", "gratitude",
            "annoyance", "confusion", "disappointment", "remorse",
        )
    },
}

# Grouping 3: three merged label pairs (grief/sadness, pride/admiration and
# annoyance/anger); every other label stays in a group of its own.
# Each key is a group name and each value is the tuple of labels in that group.
SELECTED_SAMPLING_3 = {
    "grief_sadness": ("grief", "sadness"),
    "pride_admiration": ("pride", "admiration"),
    "annoyance_anger": ("annoyance", "anger"),
    "nervousness": ("nervousness",),
    # the separating comma after this entry was missing, which made the whole
    # cell fail to parse
    "fear": ("fear",),
    "approval": ("approval",),
    "realization": ("realization",),
    "surprise": ("surprise",),
    "relief": ("relief",),
    "joy": ("joy",),
    "neutral": ("neutral",),
    "optimism": ("optimism",),
    "desire": ("desire",),
    "love": ("love",),
    "disapproval": ("disapproval",),
    "amusement": ("amusement",),
    "caring": ("caring",),
    "excitement": ("excitement",),
    "curiosity": ("curiosity",),
    "embarrassment": ("embarrassment",),
    "disgust": ("disgust",),
    "gratitude": ("gratitude",),
    "confusion": ("confusion",),
    "disappointment": ("disappointment",),
    "remorse": ("remorse",),
}

# Grouping 4: four merged label pairs; every other label stays in a group of
# its own. Key order is kept as written (merged pairs first).
SELECTED_SAMPLING_4 = {
    "grief_sadness": ("grief", "sadness"),
    "pride_admiration": ("pride", "admiration"),
    "annoyance_anger": ("annoyance", "anger"),
    "nervousness_fear": ("nervousness", "fear"),
    **{
        label: (label,)
        for label in (
            "approval", "realization", "surprise", "relief", "joy", "neutral",
            "optimism", "desire", "love", "disapproval", "amusement", "caring",
            "excitement", "curiosity", "embarrassment", "disgust", "gratitude", "confusion",
            "disappointment", "remorse",
        )
    },
}

---

## Experiments 

In [None]:
# Fix the NumPy and TensorFlow seeds before any experiment runs.
np.random.seed(33)
tf.random.set_seed(33)

# Options shared by every run below; only the backbone model and the label
# grouping change from one run to the next.
shared_settings = {
    "trainable": "all",
    "head": "none",
    "scheduler": "0.2",
    "dropout": 0.3,
    "label_smoothing": 0,
    "epochs": 10,
    "batch_size": 16,
    "learning_rate": 0.00005,
    "emoji_tagging": True,
    "clean_data": False,
    "balancing": "none",
    "augment": False,
}

# (backbone model, label grouping) for each run, in execution order.
planned_runs = [
    ("bert-base-uncased", SELECTED_SAMPLING_2),
    ("roberta-base", SELECTED_SAMPLING_1),
    ("roberta-base", SELECTED_SAMPLING_2),
]

for model_name, sampling in planned_runs:
    # "model" first and "minority_shuffling" last keeps the key order of the
    # grid that is written to grid_params.json
    grid = {"model": model_name, **shared_settings, "minority_shuffling": sampling}

    # reset the TensorFlow session state and collect garbage until nothing
    # more is freed before starting the next run
    limit_mem()
    tf.keras.backend.clear_session()
    while gc.collect():
        continue
    run_model_experiment(grid, dataset)




100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.50it/s]


Epoch 1/10
Epoch 2/10
Epoch 3/10

---