In [None]:
import os

# Environment tweaks to quiet TensorFlow's GPU-related log output. They are set
# in this first cell, i.e. before the TensorFlow import further down.
# NOTE(review): CUDA_VISIBLE_DEVICES="-1" presumably hides every GPU from the
# process, so training would run on CPU only -- confirm that is intended and
# not just a side effect of silencing the warnings.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Raise absl's log threshold to ERROR to hide this warning emitted while saving:
# WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_1_layer_call_fn,
# lstm_cell_1_layer_call_and_return_conditional_losses, lstm_cell_2_layer_call_fn,
# lstm_cell_2_layer_call_and_return_conditional_losses while saving (showing 5 of 5).
# These functions will not be directly callable after loading.
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

In [None]:
# Weights & Biases experiment tracking.
# Very useful for managing wandb runs: https://stackoverflow.com/questions/71106179/log-two-model-runs-with-keras-wandb
import wandb
from wandb.keras import WandbCallback
# Relies on `os` imported in the first cell. Presumably suppresses wandb's console output.
os.environ["WANDB_SILENT"] = "true"

#### Embedding Training (Logistic Regression and BiLSTM)

In [None]:
import numpy as np 
import pandas as pd
from pathlib import Path 
import ast

import tensorflow as tf
import keras_tuner as kt

import tensorflow_datasets as tfds
import tensorflow_text as tf_text

import datetime
import io

# Turn off tensorflow_datasets progress bars.
tfds.disable_progress_bar()
# W&B project name passed to every wandb.init() call in the experiment cells below.
wandb_project_name = "formula_embedding_training"

In [None]:
def save_token_embeddings(model, encoder_int_tokens, embedding_layer_name, class_task_str, emb_dim_str):
    """Export a model's embedding matrix as two TSV files (vectors + metadata).

    Writes ``embedding_vecs_binary/<class_task_str>_<emb_dim_str>_vectors.tsv``
    (one tab-separated vector per line) and the matching ``..._metadata.tsv``
    (one vocabulary token per line, in the same order). Vocabulary index 0 is
    skipped because it is the padding entry.

    Args:
        model: object exposing ``get_layer(name).get_weights()``; the first
            returned array is indexed by vocabulary position.
        encoder_int_tokens: object exposing ``get_vocabulary()``.
        embedding_layer_name: name handed to ``model.get_layer``.
        class_task_str: task label used as the file-name prefix.
        emb_dim_str: embedding size, as a string, used in the file name.
    """
    out_dir = Path("embedding_vecs_binary/")
    # Create the target directory on demand so a fresh checkout does not fail.
    out_dir.mkdir(parents=True, exist_ok=True)

    file_stem = class_task_str + "_" + emb_dim_str + "_"
    vec_path = out_dir / (file_stem + "vectors.tsv")
    meta_path = out_dir / (file_stem + "metadata.tsv")

    # Resolve everything that can fail *before* opening (and truncating) the files.
    weights = model.get_layer(embedding_layer_name).get_weights()[0]
    vocab = encoder_int_tokens.get_vocabulary()

    # Context managers close both files even if writing a row raises.
    with open(vec_path, "w", encoding="utf-8") as out_v, \
            open(meta_path, "w", encoding="utf-8") as out_m:
        for index, word in enumerate(vocab):
            if index == 0:
                continue  # skip 0, it's padding.
            out_v.write("\t".join(str(x) for x in weights[index]) + "\n")
            out_m.write(word + "\n")

#### Import Data and Preprocess Dataframe

In [None]:
def preprocess_data(corpus,
                    irrelevant_features=["mtype",]):
    """Clean the raw corpus frame in place and return it.

    Steps, all applied to ``corpus`` itself:
      1. drop the columns listed in ``irrelevant_features``;
      2. turn the stringified token lists in ``type_tokens`` / ``tokens`` into
         single space-separated strings;
      3. parse ``mtype_one_hot`` / ``labels`` from their string form into
         Python lists;
      4. remove rows whose ``tokens`` string is empty or a lone space.

    Args:
        corpus: DataFrame with the columns named above (plus the ones to drop).
        irrelevant_features: column names to remove; the default is not mutated.

    Returns:
        The same ``corpus`` object, so the call can be used either for its
        side effect or for its return value.
    """
    # drop irrelevant columns (list() keeps the shared default untouched)
    corpus.drop(columns=list(irrelevant_features), inplace=True)

    def cell_str_to_list(cell_val):
        # Safe literal parsing of e.g. "[0, 1, 0]" -> [0, 1, 0].
        return ast.literal_eval(cell_val)

    def process_cell(cell_str):
        # Strip the surrounding brackets and collapse doubled backslashes.
        stripped_f_str = cell_str[1:-1].replace("\\\\", "\\")
        f_list = stripped_f_str.split(",")
        # Remove the quotes and spaces left over from the list repr.
        f_list = [token.replace("'", "").replace(" ", "") for token in f_list]
        # Escaped braces are mapped to plain braces.
        f_list = ["{" if token == "\\{" else token for token in f_list]
        f_list = ["}" if token == "\\}" else token for token in f_list]
        return " ".join(f_list)

    corpus["type_tokens"] = corpus["type_tokens"].map(process_cell)
    corpus["tokens"] = corpus["tokens"].map(process_cell)
    corpus["mtype_one_hot"] = corpus["mtype_one_hot"].map(cell_str_to_list)
    corpus["labels"] = corpus["labels"].map(cell_str_to_list)

    # Remove rows without any token. This must happen in place: rebinding the
    # local name (as before) left the caller's frame unfiltered.
    # Assumes index labels are unique (true for a freshly read CSV).
    has_tokens = (corpus["tokens"].str.len() > 0) & (corpus["tokens"] != " ")
    corpus.drop(index=corpus.index[~has_tokens], inplace=True)
    return corpus

In [None]:
# Load the tokenised corpus; the path is relative to the notebook's working directory.
data_p = Path("../data/") / "multi_class_unbalanced_data_TOKENIZED_V2.csv"
data = pd.read_csv(data_p)
# preprocess_data modifies `data` in place; its return value is not used here.
preprocess_data(data)
data.head()

In [None]:
# Split sizes: the first LARGE_TEST_SIZE examples form the evaluation split and
# everything after them is used for training.
# NOTE(review): 106523 is presumably the total number of rows -- confirm against len(data).
LARGE_TEST_SIZE = 10650
LARGE_TRAIN_SIZE = 106523 - LARGE_TEST_SIZE
# Number of classes handed to the model factories in the experiment cells.
NUM_CLASSES = 40

# Token strings and label vectors as two aligned tf.data sources.
labels_array = np.array(data["labels"].to_list())
dataset1_tokens = tf.data.Dataset.from_tensor_slices(data["tokens"], name="data")
labels_ds = tf.data.Dataset.from_tensor_slices(labels_array, name="label")

# Pair every token string with its label, then split by position.
dataset1_tokens_l = tf.data.Dataset.zip((dataset1_tokens, labels_ds))
train_dataset1 = dataset1_tokens_l.skip(LARGE_TEST_SIZE)
test_dataset1 = dataset1_tokens_l.take(LARGE_TEST_SIZE)

#### Setup and Data Preparation

In [None]:
# Peek at five raw (text, label) pairs from the still-unbatched training split.
# The loop variable `example_token` stays bound afterwards and is reused by a later cell.
for example_token,  label in train_dataset1.take(5):
    print("text: ", example_token.numpy())
    print("label: ", label.numpy())

In [None]:
# Shuffle buffer size and batch size used when the datasets are batched below.
BUFFER_SIZE = 2000
BATCH_SIZE = 64
# Number of whole batches per split. Integer floor division yields Python ints
# (np.floor produced floats), which is the type Keras documents for step counts.
STEPS_PER_EPOCH = LARGE_TRAIN_SIZE // BATCH_SIZE
VAL_STEPS_PER_EPOCH = LARGE_TEST_SIZE // BATCH_SIZE

In [None]:
# Batch both splits (the incomplete final batch is dropped) and prefetch; only the
# training split is shuffled, with a BUFFER_SIZE-element buffer, before batching.
# NOTE: both names are reassigned from themselves, so re-running this cell without
# first re-running the dataset-construction cell would batch already-batched data.
test_dataset1 = test_dataset1.batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
train_dataset1 = train_dataset1.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

#### Text Encoding (integer indices)

In [None]:
def create_encoder(output_mode_str, n_grams, vocab_size=1000):
    """Build a TextVectorization layer and adapt it on the training text.

    The layer applies no standardisation and splits on whitespace, so the
    pre-tokenised, space-joined strings are used as they are.

    Args:
        output_mode_str: forwarded as ``output_mode`` (this notebook uses ``"int"``).
        n_grams: forwarded as ``ngrams``.
        vocab_size: forwarded as ``max_tokens``; defaults to 1000, the value
            that used to be hard-coded here.

    Returns:
        The adapted ``tf.keras.layers.TextVectorization`` layer.

    Note:
        Reads the notebook-global ``train_dataset1``; that cell must have run first.
    """
    encoder = tf.keras.layers.TextVectorization(
        standardize=None,
        output_mode=output_mode_str,
        ngrams=n_grams,
        split="whitespace",
        max_tokens=vocab_size)
    # Strip the label so the vocabulary is learned from the text only: (text, label) -> text.
    encoder.adapt(train_dataset1.map(lambda tokens, label: tokens))
    return encoder

In [None]:
# Integer-index encoder with n-gram size 1, adapted on the training text.
encoder_int_tokens = create_encoder("int", 1)

# Preview the learned vocabulary (first 500 entries at most).
vocab_tokens = np.array(encoder_int_tokens.get_vocabulary())
vocab_tokens[:500]

In [None]:
# Sanity check: encode one raw example and print it next to its integer ids.
# `example_token` is the value left bound by the last iteration of the preview loop above.
encoded_example = encoder_int_tokens(example_token).numpy()
print(example_token)
print(encoded_example)

#### Experiment 1: Logistic Regression (LogReg), embedding dim = 64

In [None]:
# Hyper-parameters for experiment 1 (64-dimensional embedding).
NUM_EPOCHS = 150
optimal_lr = 0.0001
optimal_emb_dims = 64
# dp1 / dp2: presumably dropout rates -- confirm in create_model1_LR.
dp1 = 0.2
dp2 = 0.3
# Vocabulary size plus one, handed to the model factory.
tokens_input_len = len(encoder_int_tokens.get_vocabulary()) + 1

# NOTE(review): `mc_u` is not imported in any cell visible in this notebook --
# confirm the import exists, otherwise this raises NameError on a fresh kernel.
model1 = mc_u.create_model1_LR(NUM_CLASSES, optimal_emb_dims, dp1, dp2, encoder_int_tokens, tokens_input_len)
# NOTE(review): the loss is configured with from_logits=True while Recall() uses
# its default threshold -- verify both agree with the model's output activation.
model1.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(optimal_lr),
              metrics=["accuracy", tf.keras.metrics.Recall()])

In [None]:
# Hyper-parameters and run metadata logged to Weights & Biases for this run.
configs_dict = {
    "learning_rate": optimal_lr,
    "emb_dim": optimal_emb_dims,
    "dp1": dp1,
    "dp2": dp2,
    "algorithm": "LogReg",
    "configuration": "multi_only_tokens",
    "optimizer": "adam",
    "loss": "binary_crossentropy",
    "epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,  # log the value actually used to batch the datasets
    "vectorizer": "int",
    "dataset": "multi_class_unbalanced_data_TOKENIZED_V2"
}

# Using the run as a context manager finishes it even when training raises or
# the cell is interrupted, so no W&B run is left open.
with wandb.init(project=wandb_project_name, reinit=True, config=configs_dict) as run:
    history = model1.fit(train_dataset1,
                         epochs=NUM_EPOCHS,
                         validation_data=test_dataset1,
                         validation_steps=VAL_STEPS_PER_EPOCH,
                         callbacks=[WandbCallback()])

In [None]:
# Show the layer / parameter summary of the model trained in experiment 1.
model1.summary()

In [None]:
# take model -> save embedding vecs
# NOTE(review): `mc_u` is not imported in any visible cell, and a function named
# `save_token_embeddings` is also defined earlier in this notebook -- confirm which one is intended.
mc_u.save_token_embeddings(model1, encoder_int_tokens, "embedding", "multilabel", str(optimal_emb_dims))

#### Experiment 2: Logistic Regression (LogReg), embedding dim = 128

In [None]:
# Hyper-parameters for experiment 2 (128-dimensional embedding).
NUM_EPOCHS = 150
optimal_lr = 0.0001
optimal_emb_dims = 128
# dp1 / dp2: presumably dropout rates -- confirm in create_model1_LR.
dp1 = 0.2
dp2 = 0.3
# Vocabulary size plus one, handed to the model factory.
tokens_input_len = len(encoder_int_tokens.get_vocabulary()) + 1

# NOTE(review): `mc_u` is not imported in any cell visible in this notebook --
# confirm the import exists, otherwise this raises NameError on a fresh kernel.
model2 = mc_u.create_model1_LR(NUM_CLASSES, optimal_emb_dims, dp1, dp2, encoder_int_tokens, tokens_input_len)
# NOTE(review): the loss is configured with from_logits=True while Recall() uses
# its default threshold -- verify both agree with the model's output activation.
model2.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(optimal_lr),
              metrics=["accuracy", tf.keras.metrics.Recall()])

In [None]:
# Hyper-parameters and run metadata logged to Weights & Biases for this run.
configs_dict = {
    "learning_rate": optimal_lr,
    "emb_dim": optimal_emb_dims,
    "dp1": dp1,
    "dp2": dp2,
    "algorithm": "LogReg",
    "configuration": "multi_only_tokens",
    "optimizer": "adam",
    "loss": "binary_crossentropy",
    "epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,  # log the value actually used to batch the datasets
    "vectorizer": "int",
    "dataset": "multi_class_unbalanced_data_TOKENIZED_V2"
}

# Using the run as a context manager finishes it even when training raises or
# the cell is interrupted, so no W&B run is left open.
with wandb.init(project=wandb_project_name, reinit=True, config=configs_dict) as run:
    history = model2.fit(train_dataset1,
                         epochs=NUM_EPOCHS,
                         validation_data=test_dataset1,
                         validation_steps=VAL_STEPS_PER_EPOCH,
                         callbacks=[WandbCallback()])

In [None]:
# Show the layer / parameter summary of the model trained in experiment 2.
model2.summary()

In [None]:
# take model -> save embedding vecs
# NOTE(review): `mc_u` is not imported in any visible cell, and a function named
# `save_token_embeddings` is also defined earlier in this notebook -- confirm which one is intended.
mc_u.save_token_embeddings(model2, encoder_int_tokens, "embedding", "multilabel", str(optimal_emb_dims))

#### Experiment 3: RNN (BiLSTM), embedding dim = 64

In [None]:
# Hyper-parameters for experiment 3 (64-dimensional embedding).
NUM_EPOCHS = 50
optimal_lr = 0.0001
optimal_emb_dims = 64
# Sizes forwarded to the model factory; presumably the LSTM and dense layer widths -- confirm in create_model1_RNN.
lstm_units = 64
dense_units = 64
# Vocabulary size plus one, handed to the model factory.
tokens_input_len = len(encoder_int_tokens.get_vocabulary()) + 1

# NOTE(review): `mc_u` is not imported in any cell visible in this notebook --
# confirm the import exists, otherwise this raises NameError on a fresh kernel.
model3 = mc_u.create_model1_RNN(NUM_CLASSES, optimal_emb_dims, lstm_units, dense_units, encoder_int_tokens, tokens_input_len)
# NOTE(review): the loss is configured with from_logits=True while Recall() uses
# its default threshold -- verify both agree with the model's output activation.
model3.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(optimal_lr),
              metrics=["accuracy", tf.keras.metrics.Recall()])

In [None]:
# Hyper-parameters and run metadata logged to Weights & Biases for this run.
configs_dict = {
    "learning_rate": optimal_lr,
    "emb_dim": optimal_emb_dims,
    "lstm_units": lstm_units,
    "dense_units": dense_units,
    "algorithm": "BiLstm",
    "configuration": "multi_only_tokens",
    "optimizer": "adam",
    "loss": "binary_crossentropy",
    "epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,  # log the value actually used to batch the datasets
    "vectorizer": "int",
    "dataset": "multi_class_unbalanced_data_TOKENIZED_V2"
}

# Using the run as a context manager finishes it even when training raises or
# the cell is interrupted, so no W&B run is left open.
with wandb.init(project=wandb_project_name, reinit=True, config=configs_dict) as run:
    history = model3.fit(train_dataset1,
                         epochs=NUM_EPOCHS,
                         validation_data=test_dataset1,
                         validation_steps=VAL_STEPS_PER_EPOCH,
                         callbacks=[WandbCallback()])

In [None]:
# Show the layer / parameter summary of the model trained in experiment 3.
model3.summary()

In [None]:
# take model -> save embedding vecs
# NOTE(review): `mc_u` is not imported in any visible cell, and a function named
# `save_token_embeddings` is also defined earlier in this notebook -- confirm which one is intended.
mc_u.save_token_embeddings(model3, encoder_int_tokens, "embedding", "multilabel", str(optimal_emb_dims))

#### Experiment 4: RNN (BiLSTM), embedding dim = 128

In [None]:
# Hyper-parameters for experiment 4 (128-dimensional embedding).
NUM_EPOCHS = 50
optimal_lr = 0.0001
optimal_emb_dims = 128
# Sizes forwarded to the model factory; presumably the LSTM and dense layer widths -- confirm in create_model1_RNN.
lstm_units = 64
dense_units = 64
# Vocabulary size plus one, handed to the model factory.
tokens_input_len = len(encoder_int_tokens.get_vocabulary()) + 1

# NOTE(review): `mc_u` is not imported in any cell visible in this notebook --
# confirm the import exists, otherwise this raises NameError on a fresh kernel.
model4 = mc_u.create_model1_RNN(NUM_CLASSES, optimal_emb_dims, lstm_units, dense_units, encoder_int_tokens, tokens_input_len)
# NOTE(review): the loss is configured with from_logits=True while Recall() uses
# its default threshold -- verify both agree with the model's output activation.
model4.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(optimal_lr),
              metrics=["accuracy", tf.keras.metrics.Recall()])

In [None]:
# Hyper-parameters and run metadata logged to Weights & Biases for this run.
configs_dict = {
    "learning_rate": optimal_lr,
    "emb_dim": optimal_emb_dims,
    "lstm_units": lstm_units,
    "dense_units": dense_units,
    "algorithm": "BiLstm",
    "configuration": "multi_only_tokens",
    "optimizer": "adam",
    "loss": "binary_crossentropy",
    "epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,  # log the value actually used to batch the datasets
    "vectorizer": "int",
    "dataset": "multi_class_unbalanced_data_TOKENIZED_V2"
}

# Using the run as a context manager finishes it even when training raises or
# the cell is interrupted, so no W&B run is left open.
with wandb.init(project=wandb_project_name, reinit=True, config=configs_dict) as run:
    history = model4.fit(train_dataset1,
                         epochs=NUM_EPOCHS,
                         validation_data=test_dataset1,
                         validation_steps=VAL_STEPS_PER_EPOCH,
                         callbacks=[WandbCallback()])

In [None]:
# Show the layer / parameter summary of the model trained in experiment 4.
model4.summary()

In [None]:
# take model -> save embedding vecs
# NOTE(review): `mc_u` is not imported in any visible cell, and a function named
# `save_token_embeddings` is also defined earlier in this notebook -- confirm which one is intended.
mc_u.save_token_embeddings(model4, encoder_int_tokens, "embedding", "multilabel", str(optimal_emb_dims))