### Complete TensorFlow mixed-precision implementation with Bert

- *1. Use RoBERTa in TensorFlow 2.1*
- *2. Add the sentiment word frequency: positive, negative, neutral*
- *3. Use learning-rate warmup*
- *4. Expand data in word_counts.py*
- *5. Use joblib.Parallel*

In [1]:
import numpy as np
import pandas as pd
from math import ceil, floor
import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.initializers import TruncatedNormal
from sklearn import model_selection
# from transformers import RobertaConfig, TFRobertaPreTrainedModel, TFRobertaMainLayer
from transformers import BertConfig, TFBertPreTrainedModel, TFBertMainLayer
from tokenizers import BertWordPieceTokenizer
import matplotlib.pyplot as plt
from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed
import os

import logging
# Only let TensorFlow log records of level ERROR and above through.
tf.get_logger().setLevel(logging.ERROR)
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
    
# Graph-optimizer switches: enable JIT compilation and the
# "auto_mixed_precision" rewrite for the whole process.
tf.config.optimizer.set_jit(True)
tf.config.optimizer.set_experimental_options(
    {"auto_mixed_precision": True})

In [2]:
# read csv files
# Rows containing any missing value are dropped from the training frame in place.
train_df = pd.read_csv('../input/my-data/train_process.csv')
train_df.dropna(inplace=True)

# The tweet text is copied into a "selected_text" column so the test frame
# exposes the same columns that TweetSentimentDataset.create reads.
test_df = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
test_df.loc[:, "selected_text"] = test_df.text.values

submission_df = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')

print("train shape =", train_df.shape)
print("test shape  =", test_df.shape)

# # merge word_counts_prediction.csv
# word_counts_prediction = pd.read_csv("../input/my-data/word_counts_prediction.csv")
# train_df = pd.merge(train_df, word_counts_prediction, on="textID")
# test_df = pd.merge(test_df, word_counts_prediction, on="textID")

# set some global variables
# NOTE(review): PATH names a roberta-large directory, while the tokenizer vocab
# is bert-base-uncased and the model classes used below are the BERT ones —
# confirm that these weights are the ones intended for BertQAModel.
PATH = "../input/huggingfacetransformermodels/model_classes/roberta/roberta-large-tf2-model/"
# Fixed length every encoded example is padded or truncated to.
MAX_SEQUENCE_LENGTH = 128
# add_special_tokens=False: preprocess() inserts the [CLS]/[SEP] ids itself.
TOKENIZER = BertWordPieceTokenizer(f"../input/bert-base-uncased/vocab.txt", lowercase=True, add_special_tokens=False)

# Per-sentiment word lists; preprocess() counts how many tweet tokens fall in
# each list and feeds the three counts to the model as extra input tokens.
# Some words ("day", "like") appear in more than one list.
sentiment_dict = {"positive": ["good", "happy", "love", "day", "thanks", "great", "fun", "nice", "hope", "thank"],
                  "negative": ["miss", "sad", "sorry", "bad", "hate", "sucks", "sick", "like", "feel", "bored"],
                  "neutral": ["get", "go", "day", "work", "going", "quot", "lol", "got", "like", "today"]}

# let's take a look at the data
train_df.head(10)


train shape = (27480, 4)
test shape  = (3534, 4)


Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,sooo sad,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","sons of * * * * ,",negative
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive
7,50e14c0bb8,Soooo high,Soooo high,neutral
8,e050245fbd,Both of you,Both of you,neutral
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,wow . . . u just became cooler .,positive


```
I. Set up preprocessing and dataset/datagenerator
```


In [3]:
def preprocess(tweet, selected_text, sentiment):
    """
    Encode one (tweet, selected_text, sentiment) record as fixed-length
    model inputs.

    Will be used in tf.data.Dataset.from_generator(...), which passes the
    three strings as UTF-8 byte strings.

    Sequence layout:
        [CLS] sentiment pos_count neg_count neu_count [SEP] tweet tokens [SEP] [PAD]...

    Args:
        tweet: UTF-8 encoded bytes of the tweet text.
        selected_text: UTF-8 encoded bytes of the labelled span.
        sentiment: UTF-8 encoded bytes, one of the keys of ``sentiment_map``
            ("positive", "negative", "neutral").

    Returns:
        Tuple ``(input_ids, attention_mask, input_type_ids, offsets,
        target_start, target_end, tweet, selected_text, sentiment)``. The
        first four are lists of length MAX_SEQUENCE_LENGTH; ``target_start``
        and ``target_end`` are inclusive token indices into ``input_ids``;
        the strings are the decoded (and, for the two texts, lower-cased and
        whitespace-normalised) inputs.

    Raises:
        KeyError: if ``sentiment`` is not a key of ``sentiment_map``.
    """

    # The original strings have been converted to
    # byte strings, so we need to decode it
    tweet = tweet.decode('utf-8')
    selected_text = selected_text.decode('utf-8')
    sentiment = sentiment.decode('utf-8')

    # Clean up the strings a bit
    tweet = " ".join(str(tweet).lower().split())
    selected_text = " ".join(str(selected_text).lower().split())

    # Tokenize the tweet once; tokens, ids and character offsets all come
    # from the same encoding.
    enc = TOKENIZER.encode(tweet)
    tokens_a = enc.tokens
    input_ids_orig, offsets = enc.ids, list(enc.offsets)

    # Locate the first occurrence of the selected-text tokens inside the
    # tweet tokens. target_end is exclusive at this point.
    selected_tokens = TOKENIZER.encode(selected_text).tokens
    target_start, target_end = None, None
    if selected_tokens:
        span_len = len(selected_tokens)
        for index, token in enumerate(tokens_a):
            if token == selected_tokens[0] and tokens_a[index:index + span_len] == selected_tokens:
                target_start = index
                target_end = index + span_len
                break

    if target_start is None:
        # Empty selection or no token-level match: label the whole tweet
        # instead of failing on arithmetic with None below.
        target_start, target_end = 0, len(tokens_a)

    # add sentiment word frequency
    sentiment_frequency = [
        str(sum(1 for token in tokens_a if token in sentiment_dict[key]))
        for key in ("positive", "negative", "neutral")
    ]
    enc_sentiment = TOKENIZER.encode(" ".join(sentiment_frequency))

    # add and pad data (hardcoded for BERT)
    # --> [CLS] sentiment counts [SEP] input_ids [SEP] [PAD]
    sentiment_map = {
        'positive': 3893,
        'negative': 4997,
        'neutral': 8699,
    }

    # Build the prefix first and measure it, so masks, offsets and targets
    # stay aligned even if a count is split into more than one word piece.
    prefix_ids = [101, sentiment_map[sentiment]] + enc_sentiment.ids + [102]
    prefix_len = len(prefix_ids)

    input_ids = prefix_ids + input_ids_orig + [102]
    input_type_ids = [0] * prefix_len + [1] * (len(input_ids_orig) + 1)
    attention_mask = [1] * len(input_ids)
    offsets = [(0, 0)] * prefix_len + offsets + [(0, 0)]

    # Shift into the full sequence; the end becomes inclusive (hence the -1).
    target_start += prefix_len
    target_end += prefix_len - 1
    if target_end < target_start:
        # Only reachable for an empty tweet; keep the span well-formed.
        target_end = target_start

    padding_length = MAX_SEQUENCE_LENGTH - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([0] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        input_type_ids = input_type_ids + ([0] * padding_length)
        offsets = offsets + ([(0, 0)] * padding_length)
    elif padding_length < 0:
        # Truncate to MAX_SEQUENCE_LENGTH - 1 entries and re-append [SEP].
        input_ids = input_ids[:padding_length - 1] + [102]
        attention_mask = attention_mask[:padding_length - 1] + [1]
        input_type_ids = input_type_ids[:padding_length - 1] + [1]
        offsets = offsets[:padding_length - 1] + [(0, 0)]
        # Clamp targets that fell into the truncated tail.
        if target_start >= MAX_SEQUENCE_LENGTH:
            target_start = MAX_SEQUENCE_LENGTH - 1
        if target_end >= MAX_SEQUENCE_LENGTH:
            target_end = MAX_SEQUENCE_LENGTH - 1

    return (
        input_ids, attention_mask, input_type_ids, offsets,
        target_start, target_end, tweet, selected_text, sentiment,
    )

class TweetSentimentDataset(tf.data.Dataset):
    """
    Builds the tf.data pipeline that feeds preprocess() output to the model.

    Calling the class does not produce an instance of it: __new__ returns the
    dataset created by tf.data.Dataset.from_generator. Use create() to get a
    cached, optionally shuffled, batched and prefetched dataset from a
    DataFrame.
    """
    
    # dtype of each element of the tuple returned by preprocess(), in order:
    # input_ids, attention_mask, input_type_ids, offsets,
    # target_start, target_end, tweet, selected_text, sentiment.
    OUTPUT_TYPES = (
        tf.dtypes.int32,  tf.dtypes.int32,   tf.dtypes.int32, 
        tf.dtypes.int32,  tf.dtypes.float32, tf.dtypes.float32,
        tf.dtypes.string, tf.dtypes.string,  tf.dtypes.string,
        # tf.dtypes.string,
    )
    
    # Shape of each element, in the same order as OUTPUT_TYPES.
    OUTPUT_SHAPES = (
        (MAX_SEQUENCE_LENGTH,),   (MAX_SEQUENCE_LENGTH,), (MAX_SEQUENCE_LENGTH,), 
        (MAX_SEQUENCE_LENGTH, 2), (),                     (),
        (),                       (),                     (),
        # (),
    )
    
    # Plain Python generator handed to from_generator: yields one
    # preprocess() tuple per (tweet, selected_text, sentiment) triple.
    # It takes no self/cls; it is looked up on the class and passed as a
    # bare function.
    def _generator(tweet, selected_text, sentiment):
        for tw, st, se in zip(tweet, selected_text, sentiment):
            yield preprocess(tw, st, se)
    
    # Returns the from_generator dataset itself (not an instance of this class).
    def __new__(cls, tweet, selected_text, sentiment):
        return tf.data.Dataset.from_generator(
            cls._generator,
            output_types=cls.OUTPUT_TYPES,
            output_shapes=cls.OUTPUT_SHAPES,
            args=(tweet, selected_text, sentiment)
        )
    
    @staticmethod
    def create(dataframe, batch_size, shuffle_buffer_size=-1):
        """
        Build the dataset for `dataframe`.

        Reads the `text`, `selected_text` and `sentiment` columns. The
        pipeline order is cache -> (optional) shuffle -> batch -> prefetch.
        Passing shuffle_buffer_size=-1 (the default) skips shuffling.
        """
        dataset = TweetSentimentDataset(
            dataframe.text.values, 
            dataframe.selected_text.values, 
            dataframe.sentiment.values,
            # dataframe.predicted_selection.values,
        )

        dataset = dataset.cache()
        if shuffle_buffer_size != -1:
            dataset = dataset.shuffle(shuffle_buffer_size)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        
        # d = next(iter(dataset))
        # print("Writing example in %d" % (len(dataframe)))
        # for i in range(5):
        #     print("*** Example ***")
        #     print("tokens: %s" % " ".join(TOKENIZER.encode(d[6].numpy()[i].decode("utf-8")).tokens))
        #     print("tokens_predicted_selection: %s" % " ".join(TOKENIZER.encode(d[9].numpy()[i].decode("utf-8")).tokens))
        #     print("input_ids: %s" % " ".join([str(x) for x in d[0].numpy()[i]]))
        #     print("input_mask: %s" % " ".join([str(x) for x in d[1].numpy()[i]]))
        #     print("segment_ids: %s" % " ".join([str(x) for x in d[2].numpy()[i]]))
        #     print("selected_text: %s" % d[7].numpy()[i].decode("utf-8"))
        #     print("idx_start: %d" % d[4].numpy()[i])
        #     print("idx_end: %d" % d[5].numpy()[i])
        
        return dataset
    
def generate_fold_data(data, num_folds):
    """
    Tag every row of `data` with the stratified fold it is validated in.

    The frame is split with StratifiedKFold (stratified on `sentiment`,
    shuffled, random_state=42). Each fold's validation rows get that fold's
    number in a `kfold` column; the folds are stacked in fold order and the
    index is reset.

    Args:
        data: DataFrame with at least `text` and `sentiment` columns.
        num_folds: number of splits passed to StratifiedKFold.

    Returns:
        A new DataFrame with the rows of `data` plus an integer `kfold`
        column. `data` itself is not modified.
    """
    kfold = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

    # .assign returns a fresh frame, so no column is written onto an iloc
    # slice, and the pieces are concatenated once instead of inside the loop.
    fold_frames = [
        data.iloc[valid_idx].assign(kfold=fold_num)
        for fold_num, (_, valid_idx) in enumerate(
            kfold.split(X=data.text, y=data.sentiment.values))
    ]

    save_data = pd.concat(fold_frames, axis=0).reset_index(drop=True)
    # print(save_data.shape)
    # save_data.to_csv("train_5folds.csv", index=False)
    return save_data
    


```
II. Set up transformer model and functions
```

In [4]:
# class RoBertQAModel(TFRobertaPreTrainedModel):
class BertQAModel(TFBertPreTrainedModel):
    """
    BERT encoder with a span-extraction head.

    The last NUM_HIDDEN_STATES hidden states of the encoder are concatenated
    along the feature axis, passed through dropout and projected by a dense
    layer to `config.num_labels` logits per token, which call() splits into
    start and end logits.
    """
    
    # Dropout rate applied to the concatenated hidden states. Read when the
    # instance is constructed, so override it on the class before building.
    DROPOUT_RATE = 0.1
    # How many of the final hidden states are concatenated in call().
    NUM_HIDDEN_STATES = 2
    
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        
        # self.robert = TFRobertaMainLayer(config, name="robert")
        # NOTE(review): the main layer is named "robert"; if pretrained
        # weights are matched by layer name this may prevent them from being
        # loaded — confirm against the checkpoint's layer names.
        self.robert = TFBertMainLayer(config, name="robert")
        self.concat = L.Concatenate()
        self.dropout = L.Dropout(self.DROPOUT_RATE)
        # The head is kept in float32 explicitly.
        self.qa_outputs = L.Dense(
            config.num_labels, 
            kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
            dtype='float32',
            name="qa_outputs")
        
    @tf.function
    def call(self, inputs, **kwargs):
        """
        Return (start_logits, end_logits) for `inputs`.

        The unpacking below expects the main layer to return three values,
        the third being the per-layer hidden states (the visible callers
        build the config with output_hidden_states=True). The split into two
        followed by squeeze(axis=-1) requires the dense layer to emit exactly
        two logits per token.
        """
        # outputs: Tuple[sequence, pooled, hidden_states]
        _, _, hidden_states = self.robert(inputs, **kwargs)
        
        # hidden_states[-1], hidden_states[-2], ... concatenated feature-wise.
        hidden_states = self.concat([
            hidden_states[-i] for i in range(1, self.NUM_HIDDEN_STATES+1)
        ])
        
        # Dropout is only active when the caller passes training=True.
        hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False))
        logits = self.qa_outputs(hidden_states)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)
        
        return start_logits, end_logits
    
    
def train(model, dataset, loss_fn, optimizer, current_step, loss_step, data_len, fold_num):
    """
    Train `model` for one pass over `dataset`.

    Args:
        model: called as model(inputs, training=True); must return
            (start_logits, end_logits).
        dataset: iterable of batches; elements 0-2 are the model inputs and
            elements 4-5 the start/end targets.
        loss_fn: loss applied separately to the start and end predictions.
        optimizer: must provide get_scaled_loss / get_unscaled_gradients
            (a loss-scale optimizer).
        current_step: tf.Variable step counter, incremented once per batch.
        loss_step: list that receives the running mean loss after every batch.
        data_len: number of batches, used only as the progress-bar total.
        fold_num: shown in the progress-bar description.

    Returns:
        None. Results are reported through `current_step` and `loss_step`.
    """
    
    @tf.function
    def train_step(model, inputs, y_true, loss_fn, optimizer, current_step):
        with tf.GradientTape() as tape:
            y_pred = model(inputs, training=True)
            # Total loss is the sum of the start and end losses.
            loss  = loss_fn(y_true[0], y_pred[0])
            loss += loss_fn(y_true[1], y_pred[1])
            # Scale inside the tape so the gradients below are scaled too.
            scaled_loss = optimizer.get_scaled_loss(loss)
    
        scaled_gradients = tape.gradient(scaled_loss, model.trainable_variables)
        gradients = optimizer.get_unscaled_gradients(scaled_gradients)
        # The learning-rate schedule call is disabled, so `current_step` is
        # currently unused inside this step and the rate stays constant.
        # optimizer.learning_rate = learning_rate_decay(learning_rate, num_train_steps, num_warmup_steps, current_step)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return loss, y_pred

    epoch_loss = 0.
    tk0 = tqdm(dataset, total=data_len, desc="Training: " + str(fold_num))
    for batch_num, sample in enumerate(tk0):
        current_step.assign_add(1)
        loss, y_pred = train_step(
            model, sample[:3], sample[4:6], loss_fn, optimizer, current_step)

        # Running mean of the loss over the batches seen in this call.
        epoch_loss += loss
        loss_step.append(epoch_loss/(batch_num+1))
        # print(
        #     f"training ... batch {batch_num+1:03d} : "
        #     f"train loss {epoch_loss/(batch_num+1):.3f} ",
        #     end='\r')
        
def predict(model, dataset, loss_fn, optimizer, data_len, fold_num):
    """
    Run `model` over every batch of `dataset` and collect the outputs.

    `loss_fn` and `optimizer` are accepted but not used. `data_len` is only
    the progress-bar total and `fold_num` only appears in its description.

    Returns:
        (pred_start, pred_end, text, selected_text, sentiment, offset):
        the logits and offsets as numpy arrays and the three string fields
        as lists of Python str, all in dataset order.
    """

    @tf.function
    def predict_step(model, inputs):
        return model(inputs)

    def to_numpy(*args):
        # String tensors become lists of decoded str; the rest plain arrays.
        converted = []
        for tensor in args:
            values = tensor.numpy()
            if tensor.dtype == tf.string:
                values = [raw.decode('utf-8') for raw in values]
            converted.append(values)
        return converted

    # Empty accumulators that each batch is appended to along axis 0.
    all_start = tf.zeros([0, MAX_SEQUENCE_LENGTH], dtype=tf.dtypes.float32)
    all_end = tf.zeros([0, MAX_SEQUENCE_LENGTH], dtype=tf.dtypes.float32)
    all_offset = tf.zeros([0, MAX_SEQUENCE_LENGTH, 2], dtype=tf.dtypes.int32)
    all_text = tf.zeros([0,], dtype=tf.dtypes.string)
    all_selected = tf.zeros([0,], dtype=tf.dtypes.string)
    all_sentiment = tf.zeros([0,], dtype=tf.dtypes.string)

    progress = tqdm(dataset, total=data_len, desc="Validating or Testing: " + str(fold_num))
    for sample in progress:
        # Elements 0-2 of a batch are the model inputs.
        batch_start, batch_end = predict_step(model, sample[:3])

        all_start = tf.concat((all_start, batch_start), axis=0)
        all_end = tf.concat((all_end, batch_end), axis=0)
        all_offset = tf.concat((all_offset, sample[3]), axis=0)
        all_text = tf.concat((all_text, sample[6]), axis=0)
        all_selected = tf.concat((all_selected, sample[7]), axis=0)
        all_sentiment = tf.concat((all_sentiment, sample[8]), axis=0)

    pred_start, pred_end, text, selected_text, sentiment, offset = to_numpy(
        all_start, all_end, all_text, all_selected, all_sentiment, all_offset)

    return pred_start, pred_end, text, selected_text, sentiment, offset


def decode_prediction(pred_start, pred_end, text, offset, sentiment):
    """
    Turn per-token start/end scores into predicted text spans.

    Args:
        pred_start: per-example sequence of start scores (one per token).
        pred_end: per-example sequence of end scores (one per token).
        text: list of tweet strings the offsets refer to.
        offset: per-example sequence of (char_start, char_end) pairs, one
            per token.
        sentiment: list of sentiment strings.

    Returns:
        List of strings, one per example. Neutral tweets and tweets with
        fewer than two words are returned unchanged, as is any tweet whose
        decoded span comes out empty.
    """

    def decode(pred_start, pred_end, text, offset):
        # Join the character slices of tokens pred_start..pred_end
        # (inclusive). A space is inserted only BETWEEN two selected tokens
        # that are not adjacent in the text, never after the last one, so the
        # result carries no trailing space.
        pieces = []
        for i in range(pred_start, pred_end + 1):
            pieces.append(text[offset[i][0]:offset[i][1]])
            if i < pred_end and offset[i][1] < offset[i + 1][0]:
                pieces.append(" ")
        return "".join(pieces)

    decoded_predictions = []
    for i in range(len(text)):
        if sentiment[i] == "neutral" or len(text[i].split()) < 2:
            decoded_text = text[i]
        else:
            idx_start = np.argmax(pred_start[i])
            idx_end = np.argmax(pred_end[i])
            # An end before the start collapses to a single-token span.
            if idx_start > idx_end:
                idx_end = idx_start
            decoded_text = str(decode(idx_start, idx_end, text[i], offset[i]))
            # Tokens with a (0, 0) offset decode to nothing; fall back to
            # the whole tweet.
            if len(decoded_text) == 0:
                decoded_text = text[i]
        decoded_predictions.append(decoded_text)

    return decoded_predictions

def jaccard(str1, str2):
    """
    Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the score is
    |A & B| / |A | B| over the resulting word sets.

    Returns:
        A float in [0, 1]. Two strings that both contain no words are
        treated as identical and score 1.0 instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

@tf.function
def learning_rate_decay(init_lr, num_train_steps, num_warmup_steps, current_step):
    """
    Learning rate for `current_step`: linear warmup, then linear decay.

    Args:
        init_lr: peak learning rate.
        num_train_steps: number of steps over which the rate decays to 0.0.
        num_warmup_steps: number of warmup steps; a falsy value disables
            warmup.
        current_step: the step to evaluate the schedule at.

    Returns:
        init_lr * current_step / num_warmup_steps while current_step is below
        num_warmup_steps, otherwise the polynomial (power=1.0) decay value.
    """
    # Implements linear decay of the learning rate.
    learning_rate = tf.keras.optimizers.schedules.PolynomialDecay(
                    init_lr, num_train_steps, end_learning_rate=0.0, power=1.0)(current_step)

    if num_warmup_steps:
        global_steps_int = tf.cast(current_step, tf.dtypes.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.dtypes.int32)

        global_steps_float = tf.cast(global_steps_int, tf.dtypes.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.dtypes.float32)

        # Fraction of the warmup completed, scaled onto init_lr.
        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        if global_steps_int < warmup_steps_int:
            learning_rate = warmup_learning_rate
        else:
            # No-op assignment: keeps `learning_rate` assigned in both
            # branches of this tensor-valued condition.
            learning_rate = learning_rate
        
    return learning_rate

```
III. Run it all: 

model.create() -> dataset.create() -> train(train) ->
       -> predict(val).decode() -> predict(test).decode() -> submit
```

# Sequential K-fold driver: trains on the folds one after another in this
# process, keeps the test predictions of each fold's best epoch and writes
# the submission file.
num_folds = 5
num_epochs = 3
batch_size = 50
learning_rate = 4e-5
num_train_steps = int(len(train_df) / batch_size * num_epochs)
num_warmup_steps = int(num_train_steps * 0.1)

optimizer = tf.keras.optimizers.Adam(learning_rate)
optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
    optimizer, 'dynamic')

# Use the BERT classes that this notebook actually imports and defines; the
# RoBERTa names previously referenced here are neither imported nor defined.
config = BertConfig(output_hidden_states=True, num_labels=2)
BertQAModel.DROPOUT_RATE = 0.2
BertQAModel.NUM_HIDDEN_STATES = 2
model = BertQAModel.from_pretrained(PATH, config=config)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

kfold = model_selection.StratifiedKFold(
    n_splits=num_folds, shuffle=True, random_state=42)

# initialize test predictions
test_preds_start = np.zeros((len(test_df), MAX_SEQUENCE_LENGTH), dtype=np.float32)
test_preds_end = np.zeros((len(test_df), MAX_SEQUENCE_LENGTH), dtype=np.float32)

# Batch count of the test set, used as the progress-bar total.
num_test_batches = ceil(len(test_df) / batch_size)

for fold_num, (train_idx, valid_idx) in enumerate(
        kfold.split(X=train_df.text, y=train_df.sentiment.values)):
    print("\nfold %02d" % (fold_num+1))

    # train() and predict() require the batch count and a fold label.
    num_train_batches = ceil(len(train_idx) / batch_size)
    num_eval_batches = ceil(len(valid_idx) / batch_size)

    loss_step = []
    global_step = tf.Variable(0, name="global_step")
    train_dataset = TweetSentimentDataset.create(
        train_df.iloc[train_idx], batch_size, shuffle_buffer_size=2048)
    valid_dataset = TweetSentimentDataset.create(
        train_df.iloc[valid_idx], batch_size, shuffle_buffer_size=-1)
    test_dataset = TweetSentimentDataset.create(
        test_df, batch_size, shuffle_buffer_size=-1)

    best_score = float('-inf')
    for epoch_num in range(num_epochs):
        print("\nepoch %03d" % (epoch_num+1))
        # train for an epoch
        train(model, train_dataset, loss_fn, optimizer, global_step, loss_step,
              num_train_batches, fold_num+1)

        # loss_step and global_step both accumulate across epochs, so the
        # two sequences have the same length.
        plt.plot(list(range(global_step.numpy())), loss_step)
        plt.show()

        # predict validation set and compute jaccardian distances
        pred_start, pred_end, text, selected_text, sentiment, offset = \
            predict(model, valid_dataset, loss_fn, optimizer,
                    num_eval_batches, fold_num+1)

        selected_text_pred = decode_prediction(
            pred_start, pred_end, text, offset, sentiment)
        jaccards = []
        for i in range(len(selected_text)):
            jaccards.append(
                jaccard(selected_text[i], selected_text_pred[i]))

        score = np.mean(jaccards)
        print(f"valid jaccard epoch {epoch_num+1:03d}: {score}"+" "*15)

        if score > best_score:
            best_score = score
            # requires you to have 'fold-{fold_num}' folder in PATH:
            # model.save_pretrained(PATH+f'fold-{fold_num}')
            # or
            # model.save_weights(PATH + f'fold-{fold_num}.h5')

            # predict test set
            test_pred_start, test_pred_end, test_text, _, test_sentiment, test_offset = \
                predict(model, test_dataset, loss_fn, optimizer,
                        num_test_batches, fold_num+1)

    # add epoch's best test preds to test preds arrays
    test_preds_start += test_pred_start
    test_preds_end += test_pred_end

    # reset model, as well as session and graph (to avoid OOM issues?)
    session = tf.compat.v1.get_default_session()
    graph = tf.compat.v1.get_default_graph()
    del session, graph, model
    model = BertQAModel.from_pretrained(PATH, config=config)
    # Only the first fold is run; remove this `break` to train all folds.
    break

# decode test set and add to submission file
selected_text_pred = decode_prediction(
    test_preds_start, test_preds_end, test_text, test_offset, test_sentiment)

submission_df.loc[:, 'selected_text'] = selected_text_pred
submission_df.to_csv("submission.csv", index=False)

In [5]:
# Hyper-parameters for the joblib-driven run below.
# num_folds is how many folds are actually trained (run(0) .. run(num_folds-1))
# and also the number of joblib jobs; it is independent of how many folds the
# data is split into further down.
num_folds = 1
num_epochs = 3
batch_size = 50
learning_rate = 4e-5
# Only referenced by the (currently disabled) learning-rate schedule.
num_train_steps = int(len(train_df) / batch_size * num_epochs)
num_warmup_steps = int(num_train_steps * 0.1)
    
# Despite the "5folds" name the data is split into 4 stratified folds here;
# every row carries its validation fold number in the `kfold` column.
data_df_5folds = generate_fold_data(train_df, 4)

def run(fold):
    """
    Train and evaluate one fold, returning its best-epoch test predictions.

    Rows of `data_df_5folds` whose `kfold` equals `fold` form the validation
    set and all other rows the training set. A fresh optimizer and model are
    created, the model is trained for `num_epochs` epochs, and after every
    epoch that improves the validation Jaccard score the test set is
    re-predicted.

    Args:
        fold: fold number to hold out for validation.

    Returns:
        (test_preds_start, test_preds_end, test_text, test_sentiment,
        test_offset) taken from the epoch with the best validation score.
    """
    df_train_fold = data_df_5folds[data_df_5folds.kfold != fold].reset_index(drop=True)
    df_valid_fold = data_df_5folds[data_df_5folds.kfold == fold].reset_index(drop=True)

    # Batch counts (rounded up); only used as progress-bar totals.
    num_train_batches = len(df_train_fold) // batch_size + int(len(df_train_fold) % batch_size != 0)
    num_eval_batches = len(df_valid_fold) // batch_size + int(len(df_valid_fold) % batch_size != 0)
    num_test_batches = len(test_df) // batch_size + int(len(test_df) % batch_size != 0)

    # initialize test predictions
    test_preds_start = np.zeros((len(test_df), MAX_SEQUENCE_LENGTH), dtype=np.float32)
    test_preds_end = np.zeros((len(test_df), MAX_SEQUENCE_LENGTH), dtype=np.float32)

    optimizer = tf.keras.optimizers.Adam(learning_rate)
    optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        optimizer, 'dynamic')

    # config = RobertaConfig(output_hidden_states=True, num_labels=2)
    # config = RobertaConfig.from_json_file(os.path.join(PATH, "config.json"))
    # config.output_hidden_states = True
    # config.num_labels = 2
    # RoBertQAModel.DROPOUT_RATE = 0.2
    # RoBertQAModel.NUM_HIDDEN_STATES = 2
    # model = RoBertQAModel.from_pretrained(PATH, config=config)
    config = BertConfig(output_hidden_states=True, num_labels=2)
    BertQAModel.DROPOUT_RATE = 0.2
    BertQAModel.NUM_HIDDEN_STATES = 2
    model = BertQAModel.from_pretrained(PATH, config=config)

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    loss_step = []
    global_step = tf.Variable(0, name="global_step")
    train_dataset = TweetSentimentDataset.create(
        df_train_fold, batch_size, shuffle_buffer_size=2048)
    valid_dataset = TweetSentimentDataset.create(
        df_valid_fold, batch_size, shuffle_buffer_size=-1)
    test_dataset = TweetSentimentDataset.create(
        test_df, batch_size, shuffle_buffer_size=-1)

    best_score = float('-inf')
    for epoch_num in range(num_epochs):
        # train for an epoch
        train(model, train_dataset, loss_fn, optimizer, global_step, loss_step, num_train_batches, fold)

        # predict validation set and compute jaccardian distances
        pred_start, pred_end, text, selected_text, sentiment, offset = \
            predict(model, valid_dataset, loss_fn, optimizer, num_eval_batches, fold)

        selected_text_pred = decode_prediction(
            pred_start, pred_end, text, offset, sentiment)
        jaccards = []
        for i in range(len(selected_text)):
            jaccards.append(
                jaccard(selected_text[i], selected_text_pred[i]))

        score = np.mean(jaccards)

        plt.plot(list(range(global_step.numpy())), loss_step)
        plt.show()
        print("fold = %d , epoch = %d , jaccard = %f" % (fold, epoch_num+1, score))

        if score > best_score:
            best_score = score
            # requires you to have 'fold-{fold_num}' folder in PATH:
            # model.save_pretrained(PATH+f'fold-{fold_num}')
            # or
            # model.save_weights(PATH + f'fold-{fold_num}.h5')

            # predict test set
            test_pred_start, test_pred_end, test_text, _, test_sentiment, test_offset = \
                predict(model, test_dataset, loss_fn, optimizer, num_test_batches, fold)

    # add epoch's best test preds to test preds arrays
    test_preds_start += test_pred_start
    test_preds_end += test_pred_end

    # The model is local to this call and is released on return, so it is not
    # reloaded from PATH here: a fresh copy would be discarded immediately.
    return (test_preds_start, test_preds_end, test_text, test_sentiment, test_offset)
    
# Run run(0) .. run(num_folds - 1) with joblib's threading backend, one job
# per fold; each result is the tuple returned by run().
test_result = Parallel(n_jobs=num_folds, backend="threading", verbose=10)(delayed(run)(i) for i in range(num_folds))

# initialize test predictions
test_preds_start = np.zeros((len(test_df), MAX_SEQUENCE_LENGTH), dtype=np.float32)
test_preds_end = np.zeros((len(test_df), MAX_SEQUENCE_LENGTH), dtype=np.float32)

# Sum the start/end scores of all folds; decode_prediction only takes the
# argmax, so the sum is not divided by the number of folds.
for r in test_result:
    test_preds_start += r[0]
    test_preds_end += r[1]

# decode test set and add to submission file
# `r` is the last fold's result left over from the loop above; text, offsets
# and sentiment are taken from that fold.
test_text = r[2]
test_offset = r[4]
test_sentiment = r[3]
selected_text_pred = decode_prediction(
    test_preds_start, test_preds_end, test_text, test_offset, test_sentiment)

submission_df.loc[:, 'selected_text'] = selected_text_pred
submission_df.to_csv("submission.csv", index=False)


fold 01

epoch 001
valid jaccard epoch 001: 0.6184750805655036               
predicting ... batch 111                    
epoch 002
valid jaccard epoch 002: 0.6207123118625367               
predicting ... batch 111                    
epoch 003
valid jaccard epoch 003: 0.618340574200468               

fold 02

epoch 001
valid jaccard epoch 001: 0.6145207552234808               
predicting ... batch 111                    
epoch 002
valid jaccard epoch 002: 0.624140734730702               
predicting ... batch 111                    
epoch 003
valid jaccard epoch 003: 0.6130763425641519               

fold 03

epoch 001
valid jaccard epoch 001: 0.6064579633537712               
predicting ... batch 111                    
epoch 002
valid jaccard epoch 002: 0.6204347494436645               
predicting ... batch 111                    
epoch 003
valid jaccard epoch 003: 0.6253786957909664               
predicting ... batch 111                    
fold 04

epoch 001
valid jacc