### Import Libraries

In [None]:
import datasets
import pandas
import transformers
import tensorflow as tf
import keras_tuner
import numpy as np

### Set up Tokenizer
This object is used to convert sentences from a string to a list of integers

In [None]:
# use the tokenizer from DistilRoBERTa
# (module-level object: tokenize() calls it, and train()/search() read its
# vocab_size to size the embedding layer)
tokenizer = transformers.AutoTokenizer.from_pretrained("distilroberta-base")

### Data Prep
These functions are used to load and transform the data into the appropriate format

In [None]:
def tokenize(examples):
    """Runs the tokenizer over the "text" field of the given examples.

    Each text is truncated or padded to exactly 64 tokens, so every
    resulting "input_ids" sequence has the same length."""
    texts = examples["text"]
    fixed_length_options = {
        "truncation": True,
        "max_length": 64,
        "padding": "max_length",
    }
    return tokenizer(texts, **fixed_length_options)

In [None]:
def padToZero(example):
    """Swaps token ids 0 and 1 in "input_ids"; all other ids are untouched.

    The Embedding layer expects the mask value to be zero, but the
    tokenizer's pad value is 1, so the two ids trade places."""
    token_ids = np.array(example["input_ids"])
    is_zero = token_ids == 0
    is_one = token_ids == 1
    # XOR with 1 flips the lowest bit, turning 0 into 1 and 1 into 0
    flipped = token_ids ^ 1
    return {"input_ids": np.where(is_zero | is_one, flipped, token_ids)}

tokenize() and padToZero() are used in the following function.
This function is used by both train() and search()

In [None]:
def prepare_data(train_path, dev_path):
    """Loads the train and dev CSVs and converts them to Tensorflow datasets.

    Returns a (train_dataset, dev_dataset, labels) tuple, where labels is
    the list of column names after the first column of the training CSV."""
    # Huggingface datasets are used so the tokenizer can be mapped over them
    csv_files = {"train": train_path, "validation": dev_path}
    splits = datasets.load_dataset("csv", data_files=csv_files)

    # every column except the first is treated as a label
    labels = splits["train"].column_names[1:]

    def gather_labels(example):
        """Collects the label columns of one example into a list of floats"""
        # floats rather than ints because F1Score requires floats
        label_values = [float(example[name]) for name in labels]
        return {"labels": label_values}

    # in order: gather the labels, tokenize the text, swap pad id 1 with 0
    splits = splits.map(gather_labels)
    splits = splits.map(tokenize, batched=True)
    splits = splits.map(padToZero)

    # conversion options shared by both splits; only training is shuffled
    tf_options = dict(columns='input_ids', label_cols="labels", batch_size=8)
    train_dataset = splits["train"].to_tf_dataset(shuffle=True, **tf_options)
    dev_dataset = splits["validation"].to_tf_dataset(**tf_options)

    return train_dataset, dev_dataset, labels

### Train

The final model I ended up with is a two-layer bidirectional RNN with GRU units.
The first layer is an embedding layer that transforms each integer in the input to a vector of size 256. 
`mask_zero = True` is there to make sure that the padding is ignored. 
The embedding layer is followed by the first bidirectional RNN layer. This layer returns a sequence.
This is followed by the second bidirectional RNN layer. I initially used LSTM units but I could not get the dev score to get above 0.80.
I tried reducing the complexity of the model, but in the end using GRUs helped break through the 0.80 score.
Having a dropout > 0.5 was also crucial for this.
Finally, the last layer is the output layer, which is used for multi-label (multi-class, non-exclusive) classification.
To allow for non-exclusivity, the sigmoid activation function was used (along with the proper loss function).

In [None]:
def train(model_path = "model.keras", train_path = "train.csv", dev_path = "dev.csv"):
    """Trains the two-layer bidirectional GRU classifier and saves the best epoch.

    The model is fit on the data at train_path and validated on the data at
    dev_path. The epoch with the highest validation F1 is written to
    model_path, and training stops early once the validation F1 has not
    improved for 5 epochs (50 epochs at most)."""
    train_dataset, dev_dataset, labels = prepare_data(train_path, dev_path)

    # token ids -> vectors of size 256; id 0 is masked out as padding
    embedding = tf.keras.layers.Embedding(
        input_dim=tokenizer.vocab_size + 1,
        output_dim=256,
        mask_zero=True,
    )
    # the first recurrent layer returns the whole sequence for the second one
    first_recurrent = tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(units=256, return_sequences=True)
    )
    dropout = tf.keras.layers.Dropout(0.56)
    second_recurrent = tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(units=160)
    )
    # one sigmoid output per label, so labels are predicted independently
    output = tf.keras.layers.Dense(units=len(labels), activation='sigmoid')

    model = tf.keras.Sequential(
        [embedding, first_recurrent, dropout, second_recurrent, output]
    )

    # specify compilation hyperparameters
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0006)
    f1_metric = tf.keras.metrics.F1Score(average="micro", threshold=0.5)
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.binary_focal_crossentropy,
        metrics=[f1_metric],
    )

    # both callbacks watch the validation F1, where higher is better
    watch_val_f1 = dict(monitor="val_f1_score", mode="max")
    callbacks = [
        # keep only the best-scoring model on disk
        tf.keras.callbacks.ModelCheckpoint(
            filepath=model_path, save_best_only=True, **watch_val_f1
        ),
        # give up after 5 epochs without improvement
        tf.keras.callbacks.EarlyStopping(patience=5, **watch_val_f1),
    ]

    # fit the model to the training data, monitoring F1 on the dev data
    model.fit(
        train_dataset,
        epochs=50,
        validation_data=dev_dataset,
        callbacks=callbacks,
    )

If I had more time I would have kept tuning the learning rate. 
This was very important to getting a decent generalization score, and with more time could have resulted in a better score.
I also think the model could have benefited from a learning rate schedule, since the model kept getting its best dev score within 2 or 3 epochs of training and then the dev score would drop while the training score kept improving. I know that the typical "fix" for overfitting is to reduce the model complexity, but somehow it seemed that I could not reach (let alone break) a dev score of 0.80 without making the model more complex. I tried adding as much dropout as I could initially, but I could not break a dev score of 0.78. Only with this current iteration of the model did I obtain 0.80. Further tuning allowed reaching 0.83.

I ended up using binary_focal_crossentropy as the loss function in hopes of improving the dev score. I forgot to switch back to binary_crossentropy, but it did not seem to hurt. From my understanding, binary_focal_crossentropy focuses on (weights more heavily) the more difficult samples.

Two callbacks are used. ModelCheckpoint is used to save the model with the best validation score.
EarlyStopping is used to stop training if the validation score does not see improvement.
With the current iteration of the model, I found the best value for patience was 5.

### Hyperparameter Search
I went through several iterations of this function. 
Its current state is what I had after getting my best dev score (though it is basically what I had to get me to my final model architecture).
Random search was used in an attempt to find the best hyperparameters. I was hoping that once I had narrowed the ranges down I would switch to a grid search, but I never got to that point.

In [None]:
def search(train_path = "train.csv", dev_path = "dev.csv"):
    """Runs a Keras Tuner random search over the model's hyperparameters.

    Up to 100 trials are run, each trained for at most 50 epochs and stopped
    early once the validation F1 has not improved for 4 epochs. The tuner
    maximizes 'val_f1_score' and uses the project 'RandomSearch_7' in the
    current directory. Nothing is returned."""
    train_dataset, dev_dataset, labels = prepare_data(train_path, dev_path)

    def build_model(hp):
        """Builds and compiles one candidate model from the hyperparameters in hp"""

        def layer_width(name):
            # the embedding and both GRU layers share the same search range
            return hp.Int(name, min_value=64, max_value=256, step=32)

        # the 'bilstm_*' hyperparameter names are left as they are even
        # though the recurrent layers below are GRUs
        embedding_dim = layer_width('emb_size')
        first_gru_units = layer_width('bilstm_0')
        second_gru_units = layer_width('bilstm_1')
        dropout_rate = hp.Float(
            'dropout_0', min_value=0.5, max_value=0.64, step=0.02
        )
        learning_rate = hp.Float(
            'learning_rate', min_value=0.000001, max_value=0.0005, step=0.000005
        )

        # same layer stack as train(), with the sizes taken from hp
        embedding = tf.keras.layers.Embedding(
            input_dim=tokenizer.vocab_size + 1,
            output_dim=embedding_dim,
            mask_zero=True,
        )
        first_recurrent = tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(units=first_gru_units, return_sequences=True)
        )
        dropout = tf.keras.layers.Dropout(dropout_rate)
        second_recurrent = tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(units=second_gru_units)
        )
        output = tf.keras.layers.Dense(units=len(labels), activation='sigmoid')

        candidate = tf.keras.Sequential(
            [embedding, first_recurrent, dropout, second_recurrent, output]
        )

        # specify compilation hyperparameters
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        f1_metric = tf.keras.metrics.F1Score(average="micro", threshold=0.5)
        candidate.compile(
            optimizer=optimizer,
            loss=tf.keras.losses.binary_focal_crossentropy,
            metrics=[f1_metric],
        )

        return candidate

    # higher validation F1 is better
    objective = keras_tuner.Objective('val_f1_score', direction='max')
    tuner = keras_tuner.RandomSearch(
        build_model,
        objective=objective,
        max_trials=100,
        executions_per_trial=1,
        directory='.',
        project_name='RandomSearch_7',
        overwrite=False,
    )

    # cut a trial short after 4 epochs without improvement
    stop_when_stalled = tf.keras.callbacks.EarlyStopping(
        monitor='val_f1_score', mode='max', patience=4
    )

    tuner.search(
        train_dataset,
        validation_data=dev_dataset,
        epochs=50,
        callbacks=[stop_when_stalled],
    )


### Predict
This function is used to load in a .csv file, prepare the data, and classify the sentences in the data using the model generated by train().

In [None]:
def predict(model_path = "model.keras", input_path = "test-ref.csv"):
    """Classifies the sentences in a CSV file and writes submission.zip.

    The model saved at model_path is loaded and run over the rows of
    input_path. Every column after the first is overwritten with the 0/1
    predictions, and the resulting table is written as submission.csv
    inside submission.zip."""
    classifier = tf.keras.models.load_model(model_path)

    # Pandas is used here so the predictions can be assigned straight back
    # into the label columns
    frame = pandas.read_csv(input_path)

    # build the input features with the same steps as in train()
    features = (
        datasets.Dataset.from_pandas(frame)
        .map(tokenize, batched=True)
        .map(padToZero)
        .to_tf_dataset(columns='input_ids', batch_size=8)
    )

    # a label is predicted when its sigmoid output exceeds 0.5
    probabilities = classifier.predict(features)
    frame.iloc[:, 1:] = np.where(probabilities > 0.5, 1, 0)

    # write the Pandas dataframe to a zipped CSV file
    zip_options = dict(method='zip', archive_name='submission.csv')
    frame.to_csv("submission.zip", index=False, compression=zip_options)

## Train & Predict
The following cell expects the "train.csv" and "dev.csv" to exist in the same directory that the notebook is executed from.
The file paths can be changed by updating the path arguments.
train() will generate a Keras model in the specified path. This file will then be used by predict() which loads in said model to do predictions on the file specified by the `input_path` argument. All input files are expected to be `.csv` files.

In [None]:
# path of the saved model: train() writes it and predict() loads it
mode_path = "model.keras"

# train on train.csv, validate on dev.csv, and save the best model to mode_path
train(model_path = mode_path, train_path = "train.csv", dev_path = "dev.csv")

# classify test-ref.csv with the saved model (predict() writes submission.zip)
predict(model_path = mode_path, input_path = "test-ref.csv")