In [None]:
import os

# Quieten TensorFlow's GPU / C++ logging. Both variables are set before
# `tensorflow` is imported in a later cell, so keep this cell first.
# NOTE(review): CUDA_VISIBLE_DEVICES="-1" hides every CUDA device, which
# presumably also forces CPU execution - confirm that is intended.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Raise absl's verbosity threshold to ERROR to hide this warning, which is
# emitted when a model is saved:
# WARNING:absl:Found untraced functions such as _update_step_xla, lstm_cell_1_layer_call_fn,
# lstm_cell_1_layer_call_and_return_conditional_losses, lstm_cell_2_layer_call_fn,
# lstm_cell_2_layer_call_and_return_conditional_losses while saving (showing 5 of 5).
# These functions will not be directly callable after loading.
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

In [None]:
# Background on managing several wandb runs from one notebook:
# https://stackoverflow.com/questions/71106179/log-two-model-runs-with-keras-wandb
import wandb
from wandb.keras import WandbCallback
# Ask wandb to keep its own console output out of the notebook
# (relies on `os` imported in the first cell).
os.environ["WANDB_SILENT"] = "true"

#### RNN: Formula Label Prediction (multi-label, all features)

In [None]:
import numpy as np 
import pandas as pd
from pathlib import Path 
import ast

import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_text as tf_text

import datetime

# Turn off tensorflow_datasets progress bars.
tfds.disable_progress_bar()
# Weights & Biases project name; the experiment cells pass it to wandb.init().
wandb_project_name = "multi_label_formula_classification"

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Draw the training and validation curves of one metric on the current axes.

    Args:
        history: Object with a ``history`` mapping from metric name to a
            per-epoch sequence of values (e.g. the return value of
            ``model.fit``).
        metric: Name of the training metric. The validation curve is read
            from the key ``"val_" + metric``.
    """
    val_metric = "val_"+metric
    curves = history
    # Training curve first, then validation, so the legend order matches.
    plt.plot(curves.history[metric])
    plt.plot(curves.history[val_metric], "")
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])

#### Import Data and Preprocess Dataframe

In [None]:
def preprocess_data(corpus,
                    irrelevant_features=["mtype",]):
    """Clean the raw corpus frame in place and return it.

    Steps, all applied to ``corpus`` itself:
      1. drop the columns listed in ``irrelevant_features``;
      2. turn the stringified token lists in ``type_tokens`` / ``tokens``
         into single space-separated strings;
      3. parse ``mtype_one_hot`` / ``labels`` from their string form into
         Python lists;
      4. remove rows whose ``tokens`` string is empty or a single space.

    Args:
        corpus: DataFrame with the columns named above. It is modified in
            place because callers invoke this function without using the
            return value.
        irrelevant_features: Column labels to drop. The default list is
            never mutated.

    Returns:
        The same ``corpus`` object, for convenience.
    """
    # drop irrelevant columns
    corpus.drop(irrelevant_features, inplace=True, axis=1)

    def cell_str_to_list(cell_val):
        # Cells hold Python-literal lists serialised as text.
        return ast.literal_eval(cell_val)

    # Escaped braces are mapped to plain braces after tokenisation.
    brace_aliases = {"\\{": "{", "\\}": "}"}

    def process_cell(cell_str):
        # Strip the surrounding brackets and collapse doubled backslashes.
        inner = cell_str[1:-1].replace("\\\\", "\\")
        # Split on commas, then remove quotes and spaces from every piece.
        pieces = [piece.replace("'", "").replace(" ", "") for piece in inner.split(",")]
        return " ".join(brace_aliases.get(piece, piece) for piece in pieces)

    corpus["type_tokens"] = corpus["type_tokens"].map(process_cell)
    corpus["tokens"] = corpus["tokens"].map(process_cell)
    corpus["mtype_one_hot"] = corpus["mtype_one_hot"].map(cell_str_to_list)
    corpus["labels"] = corpus["labels"].map(cell_str_to_list)

    # Remove rows without tokens. This must modify the caller's frame:
    # rebinding the local name (as the previous version did) silently
    # discarded the filter. Rows are dropped by index label, so this assumes
    # unique labels (true for the default index produced by read_csv).
    has_tokens = (corpus["tokens"].str.len() > 0) & (corpus["tokens"] != " ")
    corpus.drop(index=corpus.index[~has_tokens], inplace=True)
    return corpus


In [None]:
#print(os.getcwd())
# Tokenized corpus, addressed relative to the notebook's working directory.
data_p = Path("../data/") / "multi_class_unbalanced_data_TOKENIZED_V1.csv"
data = pd.read_csv(data_p)
# preprocess_data modifies `data` in place (drops columns, rewrites cells);
# its return value is not used here. Re-running this cell is safe because
# `data` is re-read from disk just above.
preprocess_data(data)
data.head()

In [None]:
# Longest sequence, counted in space-separated tokens, for each text column
# (type tokens first, then tokens).
for column_name in ("type_tokens", "tokens"):
    print(data[column_name].map(lambda cell: len(cell.split(" "))).max())

In [None]:
# Hard-coded row counts for the "ordinary" datasets (train = total - test).
# Only the SMALL_* values are used in the cells visible here.
# NOTE(review): these are not derived from `data`; confirm they still match
# the CSV that is actually loaded.
SMALL_TRAIN_SIZE = 24620 - 2460
SMALL_TEST_SIZE = 2460
LARGE_TRAIN_SIZE = 106523 - 10650
LARGE_TEST_SIZE = 10650
# Number of label classes.
NUM_CLASSES = 40

# Stack the per-row Python lists into 2-D arrays so they can be sliced row-wise.
labels_array = np.array(data["labels"].to_list())
m_type_array = np.array(data["mtype_one_hot"].to_list())
# Each element is ((tokens, type_tokens, mtype_one_hot), labels).
data_as_ds = tf.data.Dataset.from_tensor_slices((data["tokens"],data["type_tokens"],m_type_array))
labels_ds = tf.data.Dataset.from_tensor_slices(labels_array)
data_as_ds = tf.data.Dataset.zip((data_as_ds, labels_ds))

# Split in file order, without shuffling: the first SMALL_TEST_SIZE rows are
# the test set and everything after them is the training set.
test_dataset = data_as_ds.take(SMALL_TEST_SIZE)
train_dataset = data_as_ds.skip(SMALL_TEST_SIZE)

In [None]:
# Display the nested structure, shapes and dtypes of one dataset element.
data_as_ds.element_spec

##### Setup and Data Preparation

In [None]:
# Print the first five training examples. train_dataset is batched only in a
# later cell, so each element here is a single example.
# The loop variables stay defined after the loop; later cells reuse
# `example_token` and `example_type` (the fifth example) to demo the encoders.
for (example_token, example_type, example_m_type), label in train_dataset.take(5):
    print("text: ", example_token.numpy())
    print("type: ", example_type.numpy())
    print("m_type: ", example_m_type.numpy())
    print("label: ", label.numpy())

In [None]:
BUFFER_SIZE = 2000
BATCH_SIZE = 64

# NOTE: this cell rebinds train_dataset / test_dataset to their batched
# versions, so run it exactly once after the cell that creates the split
# (running it twice would batch the batches).
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.AUTOTUNE)


def _batches_per_epoch(batched_dataset, fallback_rows):
    """Return the number of batches in ``batched_dataset`` as a Python int.

    Uses the dataset's own cardinality so the step count always matches the
    data that was actually loaded. If the cardinality is unknown or infinite
    (reported as a negative value) or zero, falls back to the hard-coded row
    count divided by BATCH_SIZE, rounded down.
    """
    known_batches = int(batched_dataset.cardinality().numpy())
    if known_batches > 0:
        return known_batches
    return fallback_rows // BATCH_SIZE


# Integer step counts (Keras documents steps_per_epoch / validation_steps as
# integers; np.floor produced floats here before).
STEPS_PER_EPOCH = _batches_per_epoch(train_dataset, SMALL_TRAIN_SIZE)
VAL_STEPS_PER_EPOCH = _batches_per_epoch(test_dataset, SMALL_TEST_SIZE)

##### Text Encoding

In [None]:
# For the "int" encoders: create_encoder uses these as the fixed
# `output_sequence_length` of the token / type-token sequences.
TYPE_TOKENS_MAX_SEQ_LEN = 260
TOKENS_MAX_SEQ_LEN = 260

# For the other encoders: create_encoder uses these as `max_tokens`
# (one value per text feature, plus one for the bigram count encoder).
TYPE_TOKENS_PAD_TO_MAX_TOKENS = 80
TOKENS_PAD_TO_MAX_TOKENS = 200
BIGRAM_PAD_TO_MAX_TOKENS = 350

In [None]:
def create_encoder(output_mode_str, n_grams, mode):
    """Build a ``TextVectorization`` layer and adapt it to one text feature.

    The layer never standardizes its input and splits on whitespace. Its
    size settings depend on the output mode:

    * ``"int"``: sequences are padded/truncated to ``TOKENS_MAX_SEQ_LEN`` or
      ``TYPE_TOKENS_MAX_SEQ_LEN`` and the vocabulary is capped at 200.
    * ``"count"`` with ``n_grams == 2``: vocabulary capped at, and output
      padded to, ``BIGRAM_PAD_TO_MAX_TOKENS``.
    * anything else: vocabulary capped at, and output padded to,
      ``TOKENS_PAD_TO_MAX_TOKENS`` or ``TYPE_TOKENS_PAD_TO_MAX_TOKENS``.

    The vocabulary is learned from the module-level ``train_dataset``.

    Args:
        output_mode_str: Value for the layer's ``output_mode``.
        n_grams: Value for the layer's ``ngrams`` (``None`` for no n-grams).
        mode: ``"token"`` to adapt on the first input feature, ``"type"`` to
            adapt on the second.

    Returns:
        The adapted ``tf.keras.layers.TextVectorization`` layer.

    Raises:
        ValueError: If ``mode`` is neither ``"token"`` nor ``"type"``.
            Previously this surfaced as an ``UnboundLocalError`` or, for the
            bigram count encoder, as a silently un-adapted layer.
    """
    # Position of each text feature inside the (tokens, type_tokens, mtype)
    # input tuple of a dataset element.
    feature_position = {"token": 0, "type": 1}
    if mode not in feature_position:
        raise ValueError(f'mode must be "token" or "type", got {mode!r}')
    column = feature_position[mode]

    if output_mode_str == "int":
        size_options = {
            "output_sequence_length": TOKENS_MAX_SEQ_LEN if mode == "token" else TYPE_TOKENS_MAX_SEQ_LEN,
            "max_tokens": 200,
        }
    else:
        if output_mode_str == "count" and n_grams == 2:
            vocab_cap = BIGRAM_PAD_TO_MAX_TOKENS
        elif mode == "token":
            vocab_cap = TOKENS_PAD_TO_MAX_TOKENS
        else:
            vocab_cap = TYPE_TOKENS_PAD_TO_MAX_TOKENS
        # pad_to_max_tokens is a boolean flag; the width itself comes from
        # max_tokens. (The int passed before only worked because it is truthy.)
        size_options = {"pad_to_max_tokens": True, "max_tokens": vocab_cap}

    encoder = tf.keras.layers.TextVectorization(
        standardize=None,
        output_mode=output_mode_str,
        ngrams=n_grams,
        split="whitespace",
        **size_options)

    # Keep only the selected text feature: (inputs, label) -> inputs[column].
    encoder.adapt(train_dataset.map(lambda inputs, label: inputs[column]))
    return encoder


    

##### Representation 1: Use integer indices encoding

In [None]:
# Integer-index encoders, one per text feature.
encoder_int_tokens = create_encoder("int", None, "token")
encoder_int_types = create_encoder("int", None, "type")

# Learned vocabularies and their sizes.
vocab_tokens = np.array(encoder_int_tokens.get_vocabulary())
vocab_types = np.array(encoder_int_types.get_vocabulary())
vocab_size_tokens = len(vocab_tokens)
vocab_size_types = len(vocab_types)

print("tokens (voc size): ", vocab_size_tokens)
print("types (voc size): ", vocab_size_types)

In [None]:
# Encode the example left over from the preview loop and show, for each text
# feature, the raw tensor, its encoding and the encoding's shape.
encoded_example_token = encoder_int_tokens(example_token).numpy()
encoded_example_types = encoder_int_types(example_type).numpy()

for _heading, _raw, _encoded in (
    ("tokens: ", example_token, encoded_example_token),
    ("types: ", example_type, encoded_example_types),
):
    print(_heading)
    print(_raw)
    print(_encoded)
    print(_encoded.shape)

##### Representation 2: Count Vectorizer

In [None]:
# Count-vector encoders, one per text feature.
encoder_count_tokens = create_encoder("count", None, "token")
encoder_count_types = create_encoder("count", None, "type")

# Learned vocabularies and their sizes (these overwrite the values computed
# for the integer encoders).
vocab_tokens = np.array(encoder_count_tokens.get_vocabulary())
vocab_types = np.array(encoder_count_types.get_vocabulary())
vocab_size_tokens = len(vocab_tokens)
vocab_size_types = len(vocab_types)

print("tokens (voc size): ", vocab_size_tokens)
print("types (voc size): ", vocab_size_types)

In [None]:
# Encode the example left over from the preview loop with the count encoders
# and show, for each text feature, the raw tensor, its encoding and shape.
encoded_example_token = encoder_count_tokens(example_token).numpy()
encoded_example_types = encoder_count_types(example_type).numpy()

for _heading, _raw, _encoded in (
    ("tokens: ", example_token, encoded_example_token),
    ("types: ", example_type, encoded_example_types),
):
    print(_heading)
    print(_raw)
    print(_encoded)
    print(_encoded.shape)

##### Model (Bidirectional LSTM with one layer)

 **Define and compile model**

In [None]:
def create_model(tokens_encoder, types_encoder, tokens_size, types_size, input_type, add_inp_emb_dim=1,
                 num_classes=40, sem_type_dim=3):
    """Build the three-input bidirectional-LSTM classifier (uncompiled).

    The two string inputs are vectorized by their encoders, concatenated
    with the numeric sem-type input along axis 1, embedded, passed through
    one bidirectional LSTM and a ReLU dense layer, and projected to
    ``num_classes`` outputs without activation (i.e. logits).

    Example:
        model = create_model(encoder_int_tokens, encoder_int_types,
                             TOKENS_MAX_SEQ_LEN, TYPE_TOKENS_MAX_SEQ_LEN, "int")

    Args:
        tokens_encoder: Adapted vectorization layer for the token strings.
        types_encoder: Adapted vectorization layer for the type-token strings.
        tokens_size: Contribution of the token input to the embedding's
            ``input_dim``.
        types_size: Contribution of the type input to the embedding's
            ``input_dim``.
        input_type: ``"float"`` or ``"int"``; selects float32 or int64 for
            the sem-type input so it can be concatenated with the encoder
            outputs.
        add_inp_emb_dim: Extra slack added to the embedding's ``input_dim``.
        num_classes: Width of the output layer. Defaults to the previously
            hard-coded 40.
        sem_type_dim: Length of the sem-type input vector. Defaults to the
            previously hard-coded 3.

    Returns:
        A ``tf.keras.Model`` taking ``[tokens, types, sem_type]``.

    Raises:
        ValueError: If ``input_type`` is not ``"float"`` or ``"int"``
            (previously an ``UnboundLocalError`` at the concatenation).
    """
    sem_type_dtypes = {"float": tf.float32, "int": tf.int64}
    if input_type not in sem_type_dtypes:
        raise ValueError(f'input_type must be "float" or "int", got {input_type!r}')

    embedding_input_dim = tokens_size + types_size + add_inp_emb_dim + sem_type_dim

    tokens_model_input = tf.keras.layers.Input(dtype=tf.string, shape=(1,))
    tokens_vectorized = tokens_encoder(tokens_model_input)

    types_model_input = tf.keras.layers.Input(dtype=tf.string, shape=(1,))
    types_vectorized = types_encoder(types_model_input)

    sem_type_model_input = tf.keras.layers.Input(dtype=sem_type_dtypes[input_type], shape=(sem_type_dim,))

    merged = tf.keras.layers.Concatenate(axis=1)([tokens_vectorized, types_vectorized, sem_type_model_input])

    # NOTE(review): every entry of `merged` is used as an embedding index,
    # including the sem-type values and, with count encoders, the counts -
    # confirm this is the intended representation.
    embedded = tf.keras.layers.Embedding(input_dim=embedding_input_dim,
                                         output_dim=64,
                                         # use masking to handle the variable sequence lengths
                                         mask_zero=True)(merged)
    bilstmed = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(embedded)
    densed1 = tf.keras.layers.Dense(64, activation="relu")(bilstmed)
    # No activation: the model emits logits.
    model_output = tf.keras.layers.Dense(num_classes)(densed1)

    model = tf.keras.models.Model(inputs=[tokens_model_input, types_model_input, sem_type_model_input], outputs=model_output)
    return model


 **Train the model**

##### Experiment 1: Use integer indices for encoding tokens

In [None]:
### TENSORBOARD USAGE
#log_dir = "4_nlp_rnns/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
#tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

In [None]:
# Sizes that feed the embedding's input_dim for the integer-encoded inputs.
tokens_input_len = TOKENS_MAX_SEQ_LEN
type_input_len = TYPE_TOKENS_MAX_SEQ_LEN

model = create_model(encoder_int_tokens, encoder_int_types, tokens_input_len, type_input_len, "int")

# Disabled sanity check, kept for reference: predict on a sample formula
# using the untrained model. It was previously stored in a bare string
# literal, which was echoed as the cell output and contained invalid "\c"
# escape sequences.
# sample_text = r"( A \cup B ) \cap (C \cup D )"
# predictions = model.predict(np.array([sample_text]))
# print(predictions[0])

In [None]:
# Print the layer-by-layer overview of the model built in the previous cell.
model.summary()

In [None]:
LEARNING_RATE = 1e-3
NUM_EPOCHS = 70

# The model emits logits, hence from_logits=True.
# NOTE(review): with 40-wide multi-hot targets, check which metric Keras
# resolves the string "accuracy" to - it may not be a per-label accuracy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [None]:
# Hyper-parameters logged to Weights & Biases for this run.
configs_dict = {
    "learning_rate": LEARNING_RATE,
    "algorithm": "BiLstm",
    "configuration": "small-ordinary-unbalanced-all-inputs",
    "optimizer": "adam",
    "loss": "binary_crossentropy",
    "epochs": NUM_EPOCHS,
    # Log the value actually used for batching instead of a second literal.
    "batch_size": BATCH_SIZE,
    "vectorizer": "int",
    "dataset": "multi_class_unbalanced_data_TOKENIZED_V1"
}

run = wandb.init(project=wandb_project_name, reinit=True, config=configs_dict)

# Always close the wandb run, even if training raises or is interrupted;
# otherwise the run stays open and the next wandb.init() has to clean it up.
try:
    history = model.fit(train_dataset,
                        epochs=NUM_EPOCHS,
                        validation_data=test_dataset,
                        # steps_per_epoch is left unset here, so each epoch
                        # runs over the full training dataset.
                        # validation_steps must be an integer.
                        validation_steps=int(VAL_STEPS_PER_EPOCH),
                        callbacks=[WandbCallback()])  # or [tensorboard_callback]
finally:
    run.finish()

In [None]:
# Evaluate on test_dataset, the same dataset that was passed to fit() as
# validation data.
test_loss, test_acc = model.evaluate(test_dataset)

for _caption, _value in (("Test Loss:", test_loss), ("Test Accuracy:", test_acc)):
    print(_caption, _value)

In [None]:
# Accuracy (left) and loss (right) curves for the run above.
# Each panel is (metric name, y-axis limits as passed to plt.ylim).
plt.figure(figsize=(16,8))
for _panel, (_metric, _y_limits) in enumerate((("accuracy", (None, 1)), ("loss", (0, None))), start=1):
    plt.subplot(1, 2, _panel)
    plot_graphs(history, _metric)
    plt.ylim(*_y_limits)
# Render the figure explicitly so the cell does not echo the tuple returned
# by the last plt.ylim() call.
plt.show()

In [None]:

# Disabled example, kept for reference: predict on a sample text without
# padding. It was previously stored in a bare string literal, which was
# echoed as the cell output.
# predictions = model.predict((np.array([example_token]), np.array([example_type])))
# print(predictions[0])

##### Experiment 2: Use count vectors for encoding tokens

In [None]:
# Sizes that feed the embedding's input_dim for the count-encoded inputs.
tokens_input_len = TOKENS_PAD_TO_MAX_TOKENS
type_input_len = TYPE_TOKENS_PAD_TO_MAX_TOKENS

# Author's open note on this call: "maybe add 30".
model = create_model(
    tokens_encoder=encoder_count_tokens,
    types_encoder=encoder_count_types,
    tokens_size=tokens_input_len,
    types_size=type_input_len,
    input_type="float",
)
# Disabled sanity check: predict on a sample formula using the untrained model.
#predictions = model.predict(np.array([sample_text]))
#print(predictions[0])

In [None]:
LEARNING_RATE = 1e-3
NUM_EPOCHS = 80

# The model emits logits, hence from_logits=True.
# NOTE(review): with 40-wide multi-hot targets, check which metric Keras
# resolves the string "accuracy" to - it may not be a per-label accuracy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [None]:
# Hyper-parameters logged to Weights & Biases for this run.
configs_dict = {
    "learning_rate": LEARNING_RATE,
    "algorithm": "BiLstm",
    "configuration": "small-ordinary-unbalanced-all-inputs",
    "optimizer": "adam",
    "loss": "binary_crossentropy",
    "epochs": NUM_EPOCHS,
    # Log the value actually used for batching instead of a second literal.
    "batch_size": BATCH_SIZE,
    "vectorizer": "count",
    "dataset": "multi_class_unbalanced_data_TOKENIZED_V1"
}
run = wandb.init(project=wandb_project_name, reinit=True, config=configs_dict)

# Always close the wandb run, even if training raises or is interrupted;
# otherwise the run stays open and the next wandb.init() has to clean it up.
try:
    history = model.fit(train_dataset,
                        epochs=NUM_EPOCHS,
                        validation_data=test_dataset,
                        # Step counts must be integers.
                        steps_per_epoch=int(STEPS_PER_EPOCH),
                        validation_steps=int(VAL_STEPS_PER_EPOCH),
                        callbacks=[WandbCallback()])  # or [tensorboard_callback]
finally:
    run.finish()

In [None]:
# Evaluate on test_dataset, the same dataset that was passed to fit() as
# validation data.
test_loss, test_acc = model.evaluate(test_dataset)

for _caption, _value in (("Test Loss:", test_loss), ("Test Accuracy:", test_acc)):
    print(_caption, _value)

In [None]:
# Accuracy (left) and loss (right) curves for the run above.
# Each panel is (metric name, y-axis limits as passed to plt.ylim).
plt.figure(figsize=(16,8))
for _panel, (_metric, _y_limits) in enumerate((("accuracy", (None, 1)), ("loss", (0, None))), start=1):
    plt.subplot(1, 2, _panel)
    plot_graphs(history, _metric)
    plt.ylim(*_y_limits)
# Render the figure explicitly so the cell does not echo the tuple returned
# by the last plt.ylim() call.
plt.show()

In [None]:
# Disabled example, kept for reference: predict on a sample text without
# padding. It was previously stored in a bare string literal, which was
# echoed as the cell output.
# predictions = model.predict(np.array([sample_text]))
# print(predictions[0])