In [None]:
import sys
# Put the parent directory on the import path.
# NOTE(review): presumably this is what lets the local `arjun` package imported in a later cell resolve — confirm against the repo layout.
sys.path.append("..")

In [None]:
# import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from arjun import ArjunModel, ArjunTokenizer

In [None]:
# config
# The tokenizer and the pretrained weights are read from the same directory;
# they are kept as two constants so either can be repointed independently.
TOKENIZER_PATH = "../resources/vac-arjun-small"
PRETRAINED_MODEL_PATH = "../resources/vac-arjun-small"

In [None]:
# read tokenizer
# Load the tokenizer stored under TOKENIZER_PATH. The bare `len(tokenizer)` is the
# cell's displayed output (presumably the vocabulary size — confirm against ArjunTokenizer).
tokenizer = ArjunTokenizer.from_pretrained(TOKENIZER_PATH)
len(tokenizer)

In [None]:
# load pretrained model
# Load the pretrained network from PRETRAINED_MODEL_PATH and print its layer summary;
# a later cell selects layers from this model by positional index.
model = ArjunModel.from_pretrained(PRETRAINED_MODEL_PATH)
model.summary()

# Poem Classification

In [None]:
# read data
MAX_LEN = 256
data_path = "<enter_your_data_path_here>"
df = pd.read_parquet(data_path) # or use pd.read_csv for csv files

# The id of the leading "[MT]" marker is the same for every row, so look it up
# once here rather than once per row inside the apply.
mt_token_id = tokenizer.convert_tokens_to_ids("[MT]")


def encode_text(text):
    """Turn one text into the id sequence stored in ``df["tokens"]``.

    The result is the "[MT]" id followed by the tokenizer's ``input_ids`` for
    ``text``. The tokenizer is asked to pad/truncate to ``MAX_LEN - 1`` so that,
    with the prepended id, each row is ``MAX_LEN`` long — the length the
    classifier's ``Input(shape=(MAX_LEN,))`` expects.
    NOTE(review): this relies on ArjunTokenizer honouring ``max_length`` /
    ``padding`` / ``truncation`` as requested — confirm.

    Args:
        text: A single value from the ``text`` column.

    Returns:
        list: ``[mt_token_id] + input_ids``.
    """
    encoded = tokenizer(
        text,
        add_special_tokens=False,
        max_length=MAX_LEN-1,
        padding="max_length",
        truncation=True,
        return_attention_mask=False,
        return_token_type_ids=False,
    )
    return [mt_token_id] + encoded["input_ids"]


df["tokens"] = df["text"].apply(encode_text)

In [None]:
# process data
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=20)

# Features: the per-row token-id lists stacked into one array per split.
X_train, X_test = (
    np.array(part["tokens"].tolist()) for part in (df_train, df_test)
)

# Targets: the label column of each split as an array.
y_train, y_test = (
    np.array(part["label"]) for part in (df_train, df_test)
)

In [None]:
# define poem classification model
# Re-wrap part of the pretrained network as a standalone sub-model that runs from
# the input of the layer at index 2 to the output of the layer at index 4.
# NOTE(review): the indices are positional, so they depend on the layer order shown
# by model.summary() — confirm they still bracket the encoder if the checkpoint changes.
encoder_model = tf.keras.Model(inputs=[model.get_layer(index=2).input], outputs=[model.get_layer(index=4).output])

def get_poem_classification_model(num_classes=None):
    """Build the poem classifier: pretrained encoder followed by a softmax head.

    The encoder output at sequence position 0 (the slot holding the id that the
    data cell prepends to every row) is fed to a single Dense softmax layer.

    Args:
        num_classes: Width of the softmax layer. When omitted it falls back to
            ``df.label.nunique()``, which is what this function always used.
            NOTE(review): with a sparse categorical loss this assumes the labels
            are the integers ``0 .. num_classes - 1`` — confirm for your data.

    Returns:
        tf.keras.Model mapping a ``(MAX_LEN,)`` id sequence to class probabilities.
    """
    if num_classes is None:
        # Original behaviour: size the head from the labels present in the data.
        num_classes = df.label.nunique()

    encoder_input = tf.keras.layers.Input(shape=(MAX_LEN, ))
    y = encoder_model(encoder_input)
    # Keep only the vector at sequence position 0 as the pooled representation.
    y = tf.keras.layers.Lambda(lambda x: x[:,0,:])(y)
    y = tf.keras.layers.Dense(num_classes, activation="softmax")(y)

    final_model = tf.keras.models.Model(encoder_input, y)
    return final_model

# Build the classifier once and print its layer summary as the cell output.
pc_model = get_poem_classification_model()
pc_model.summary()

In [None]:
# macro F1 callback
class MacroF1(tf.keras.callbacks.Callback):
    """Score macro-averaged F1 on a held-out set after every epoch and stop early.

    After each epoch the callback runs ``self.model.predict`` on ``X_test``,
    converts the predictions to class ids, computes ``f1_score(..., average="macro")``
    and stores it in ``logs["test_macro_f1"]``. Training is halted once the score
    has failed to improve for ``patience`` consecutive epochs. The callback does
    not save or restore model weights.

    Args:
        X_test: Inputs passed to ``self.model.predict``.
        y_test: Ground-truth labels. Must be a NumPy array when predictions are
            3-D, because it is flattened with ``reshape``.
        patience: Number of consecutive non-improving epochs tolerated before stopping.
        batch_size: Batch size used for the prediction pass.
        mode: ``"min"`` treats a lower score as better; any other value
            (e.g. ``"max"``) treats a higher score as better.
        ignore_label: Label value dropped from scoring when predictions are 3-D.
    """

    def __init__(self, X_test, y_test, patience, batch_size, mode, ignore_label=-100.0):
        super(MacroF1, self).__init__()
        self.X_test = X_test
        self.y_test = y_test
        self.patience = patience
        self.batch_size = batch_size
        self.mode = mode
        self.ignore_label = ignore_label
        self._reset_state()

    def _reset_state(self):
        """Initialise the improvement test and the early-stopping counters."""
        minimise = self.mode == "min"
        self._is_better = np.less if minimise else np.greater
        # In "max" mode the baseline stays at 0, so a score of exactly 0 never
        # counts as an improvement.
        self.best = np.inf if minimise else 0
        self.wait = 0
        self.stopped_epoch = 0

    def on_train_begin(self, logs=None):
        """Start every ``fit`` call from a clean state so the callback can be reused."""
        self._reset_state()

    def on_epoch_end(self, epoch, logs=None):
        """Compute macro F1 for this epoch, log it, and update the stopping state."""
        preds = self.model.predict(self.X_test, verbose=0, batch_size=self.batch_size)
        y_test = self.y_test

        if preds.ndim == 3:
            # Per-position predictions: flatten both sides, then drop every
            # position whose label equals ignore_label.
            pred_class = np.argmax(preds, axis=-1).reshape(-1)
            y_test = y_test.reshape(-1)

            keep = y_test != self.ignore_label
            y_test = y_test[keep]
            pred_class = pred_class[keep]
        elif preds.ndim == 1 or preds.shape[1] == 1:
            # Single score per example: threshold at 0.5. The ndim == 1 check
            # guards the shape[1] lookup against flat prediction vectors.
            pred_class = np.where(preds.reshape(-1) > 0.5, 1, 0)
        else:
            # One score per class: pick the highest.
            pred_class = np.argmax(preds, axis=1)

        macro_f1 = f1_score(y_test, pred_class, average="macro")
        # `logs` defaults to None, so only write to it when a dict was supplied.
        if logs is not None:
            logs["test_macro_f1"] = macro_f1

        if self._is_better(macro_f1, self.best):
            self.best = macro_f1
            self.wait = 0
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.stopped_epoch = epoch
                self.model.stop_training = True


In [None]:
# compile and train
BATCH_SIZE = 32
optimizer = tf.keras.optimizers.experimental.AdamW(learning_rate=5e-5)
pc_model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Two stopping rules run side by side, each with a patience of 10 epochs:
#   - `es` watches val_accuracy and is configured with restore_best_weights=True;
#   - `macro_f1_callback` watches macro F1 on the same held-out split and only
#     sets stop_training.
es = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=10,
    restore_best_weights=True,
)
macro_f1_callback = MacroF1(
    X_test=X_test,
    y_test=y_test,
    patience=10,
    batch_size=BATCH_SIZE,
    mode="max",
)

# The held-out split doubles as Keras validation data, so val_accuracy and the
# macro-F1 score are both measured on (X_test, y_test).
pc_model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[macro_f1_callback, es],
)