In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from collections import Counter
from itertools import repeat
from bisect import bisect_left
import nltk
import os
import shutil

import tensorflow as tf
# import tensorflow_hub as hub
# import tensorflow_text as text
import keras
#from official.nlp import optimization  # to create AdamW optimizer
from keras import ops
from keras.utils import pad_sequences
import sklearn
from keras import layers
import keras_nlp
# Only let TensorFlow log messages of level ERROR through.
tf.get_logger().setLevel('ERROR')
# Read the training data from data/train.json (path is relative to the working directory).
train = pd.read_json('data/train.json')

In [3]:
def make_tag_lookup_table():
    """Build the tag -> integer id table used to encode NER labels.

    Id 0 is '[PAD]' and id 1 is "O"; after that every entity type contributes
    a "B-<type>" tag immediately followed by its "I-<type>" tag, in the order
    the entity types are listed below.

    returns: dict mapping each tag string to its integer id
    """
    entity_types = ("NAME_STUDENT", "EMAIL", "URL_PERSONAL", "ID_NUM",
                    'USERNAME', 'PHONE_NUM', 'STREET_ADDRESS')
    prefixes = ("B", "I")

    tags = ['[PAD]', "O"]
    for entity in entity_types:
        for prefix in prefixes:
            tags.append(f"{prefix}-{entity}")

    return {tag: tag_id for tag_id, tag in enumerate(tags)}

# `encoding` maps tag -> id; `mapping` is its inverse (id -> tag).
encoding = make_tag_lookup_table()
mapping = {tag_id: tag for tag, tag_id in encoding.items()}

#print(mapping)

In [124]:
# Flatten every document's token list into one stream and count occurrences.
# The count must be taken over the full stream (not the de-duplicated tokens),
# otherwise every count is 1 and `most_common` cannot rank by frequency.
all_tokens = train.tokens.explode().reset_index(drop = True)
counter = Counter(all_tokens)
#print(len(counter))

# Lower-cased copy of the distinct tokens, in order of first appearance
# (a Counter iterates its keys in insertion order).
all_tokens_array = np.array(list(map(str.lower, counter)))

num_tags = len(mapping)
vocab_size = 50000

# We only take the (vocab_size - 2) most common words from the training data since
# the `StringLookup` class uses 2 additional tokens - one denoting an unknown
# token and another one denoting a masking token
vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]

# The StringLookup class will convert tokens to token IDs
lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)
# The flattened token stream is large and no longer needed.
del all_tokens

53985


In [10]:
def create_sample_data(df, test_size, neg_sample_size, rdm_state):
    """
    Build a training sample of de-duplicated sentences and choose documents for testing.

    df: pandas dataframe with the columns correct_label, cm, sentence_text and document
    test_size: the fraction of positive samples to leave for testing
    neg_sample_size: number of negative samples to include in the dataframe
    rdm_state: seed for numpy's global random state and for both pandas sample
        calls; change it to access different samples
    returns: sample dataframe and the set of documents to test
    """
    np.random.seed(rdm_state)

    # Index labels of the first occurrence of each distinct sentence, taken
    # separately for "positive" rows (label is not "O", or cm is missing) and
    # for negative rows (label is "O").
    idx_pos = df.loc[(df.correct_label != "O") | (df.cm.isna()),'sentence_text'].drop_duplicates().index
    idx_neg = df.loc[df.correct_label == "O", 'sentence_text'].drop_duplicates().index

    # These are index *labels*, so select with .loc; positional .iloc only gave
    # the right rows when df carried a default 0..n-1 index.
    df2 = pd.concat([df.loc[idx_pos,:],df.loc[idx_neg,:]],axis = 0).reset_index(drop = True)
    # A row can be picked by both selections above; keep each sentence once.
    total_idx = df2.loc[:,'sentence_text'].drop_duplicates().index
    df2 = df2.loc[total_idx,:]
    pos_df = df2.loc[df2.correct_label != "O",:]
    neg_df = df2.loc[df2.correct_label == "O",:]

    # Training sample: a fraction of the positives plus a fixed number of negatives.
    train_size = 1 - test_size
    pos_sample = pos_df.sample(frac = train_size,random_state = rdm_state)
    neg_sample = neg_df.sample(neg_sample_size,random_state = rdm_state)
    df3 = pd.concat([pos_sample,neg_sample],axis = 0).reset_index(drop = True)

    # Positive test documents: those that contributed no row to pos_sample.
    pos_sample_test = pos_df[~pos_df.document.isin(pos_sample.document.unique())]
    pos_documents = pos_sample_test.document.unique()
    # NOTE(review): negative test documents are drawn at random from ALL
    # documents with negative rows, including documents that contributed rows
    # to neg_sample - confirm whether they should be limited to held-out ones.
    neg_documents = np.random.choice(neg_df.document.unique(),len(pos_documents))
    documents = set(np.append(pos_documents,neg_documents))
    return df3, documents

In [None]:
def get_training_data(tokens, labels, test = 0.2, state = 42, batch_size = 32):
    """Split tokens/labels into train and validation parts and return them as batched datasets.

    tokens: pandas Series whose elements are passed one by one to `lookup_layer`
    labels: iterable of tag sequences; every tag must be a key of `encoding`
    test: fraction of the rows given to the validation split
    state: random state forwarded to `train_test_split`
    batch_size: batch size of both returned datasets
    returns: (train_dataset, val_dataset), each a batched tf.data.Dataset of (token ids, tag ids)
    """
    tok_train, tok_val, tag_train, tag_val = train_test_split(
        tokens, labels, test_size = test, random_state = state
    )

    def encode_tokens(token_series):
        # Tokens -> ids via the lookup layer, then pad to a rectangular array.
        return pad_sequences(token_series.map(lookup_layer))

    def encode_tags(tag_rows):
        # Tags -> ids via `encoding`, then pad to a rectangular array.
        encoded_rows = [[encoding[tag] for tag in tag_row] for tag_row in tag_rows]
        return pad_sequences(pd.Series(encoded_rows))

    def to_batches(features, targets):
        return tf.data.Dataset.from_tensor_slices((features, targets)).batch(batch_size)

    features_train = encode_tokens(tok_train)
    features_val = encode_tokens(tok_val)
    targets_train = encode_tags(tag_train)
    targets_val = encode_tags(tag_val)

    return to_batches(features_train, targets_train), to_batches(features_val, targets_val)

In [None]:
def get_test_data(tokens, labels, batch_size = 32):
    """Encode and pad tokens/labels and return them as one batched dataset (no splitting).

    tokens: pandas Series whose elements are passed one by one to `lookup_layer`
    labels: iterable of tag sequences; every tag must be a key of `encoding`
    batch_size: batch size of the returned dataset
    returns: batched tf.data.Dataset of (token ids, tag ids)
    """
    token_ids = tokens.map(lookup_layer)
    features = pad_sequences(token_ids)

    tag_ids = pd.Series([[encoding[tag] for tag in tag_row] for tag_row in labels])
    targets = pad_sequences(tag_ids)

    pairs = tf.data.Dataset.from_tensor_slices((features, targets))
    return pairs.batch(batch_size)

**Named Entity Recognition Transformer**

Built on code from https://keras.io/examples/nlp/ner_transformers/

In [159]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a dense feed-forward net.

    The output of each of the two sub-layers goes through dropout, is added to
    that sub-layer's input, and the sum is layer-normalised.

    embed_dim: attention key_dim and width of the block's output
    num_heads: number of attention heads
    ff_dim: units of the hidden (relu) feed-forward layer
    rate: dropout rate applied after each sub-layer
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(key_dim=embed_dim, num_heads=num_heads)
        feed_forward_layers = [
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.ffn = keras.Sequential(feed_forward_layers)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        # Self-attention: the inputs act as both query and value.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)
        # Feed-forward sub-layer with its own residual connection.
        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)

In [160]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    maxlen: number of rows in the position-embedding table
    vocab_size: number of rows in the token-embedding table
    embed_dim: width of both embeddings
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(output_dim=embed_dim, input_dim=vocab_size)
        self.pos_emb = layers.Embedding(output_dim=embed_dim, input_dim=maxlen)

    def call(self, inputs):
        # Positions 0..L-1, where L is the size of the last axis of `inputs`.
        seq_len = ops.shape(inputs)[-1]
        position_vectors = self.pos_emb(ops.arange(start=0, stop=seq_len, step=1))
        return self.token_emb(inputs) + position_vectors

In [161]:
class NERModel(keras.Model):
    """Token tagger: embedding -> TransformerBlock -> dense head with softmax over the tags.

    num_tags: units of the final softmax layer
    vocab_size, maxlen, embed_dim: forwarded to TokenAndPositionEmbedding
    embed_dim, num_heads, ff_dim: forwarded to TransformerBlock
    ff_dim: also the units of the intermediate relu layer of the head
    """

    def __init__(
        self, num_tags, vocab_size, maxlen=3298, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        hidden = self.transformer_block(self.embedding_layer(inputs))
        hidden = self.ff(self.dropout1(hidden, training=training))
        hidden = self.dropout2(hidden, training=training)
        # Softmax output: one probability per tag for every position.
        return self.ff_final(hidden)

In [3]:
class TransformerBlockWithLSTM(layers.Layer):
    """Transformer-style block whose feed-forward part starts with a bidirectional LSTM.

    Self-attention is followed by a Bidirectional LSTM (returning the full
    sequence) and a Dense projection to embed_dim. The output of each of the
    two sub-layers goes through dropout, is added to that sub-layer's input,
    and the sum is layer-normalised.

    embed_dim: attention key_dim and width of the block's output
    num_heads: number of attention heads
    ff_dim: units of the LSTM wrapped by the Bidirectional layer
    rate: dropout rate applied after each sub-layer
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(key_dim=embed_dim, num_heads=num_heads)
        recurrent_stack = [
            layers.Bidirectional(layers.LSTM(ff_dim, return_sequences=True)),
            layers.Dense(embed_dim),
        ]
        self.ffn = keras.Sequential(recurrent_stack)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        # Self-attention: the inputs act as both query and value.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)
        # Recurrent sub-layer with its own residual connection.
        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)

In [4]:
class NERModelWithLSTM(keras.Model):
    """Token tagger: embedding -> TransformerBlockWithLSTM -> dense head with softmax over the tags.

    num_tags: units of the final softmax layer
    vocab_size, maxlen, embed_dim: forwarded to TokenAndPositionEmbedding
    embed_dim, num_heads, ff_dim: forwarded to TransformerBlockWithLSTM
    ff_dim: also the units of the intermediate relu layer of the head
    """

    def __init__(
        self, num_tags, vocab_size, maxlen=3298, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlockWithLSTM(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        hidden = self.transformer_block(self.embedding_layer(inputs))
        hidden = self.ff(self.dropout1(hidden, training=training))
        hidden = self.dropout2(hidden, training=training)
        # Softmax output: one probability per tag for every position.
        return self.ff_final(hidden)

In [5]:
class TransformerBlockWithOneLSTM(layers.Layer):
    """Transformer-style block whose feed-forward part starts with a single (one-direction) LSTM.

    Self-attention is followed by an LSTM (returning the full sequence) and a
    Dense projection to embed_dim. The output of each of the two sub-layers
    goes through dropout, is added to that sub-layer's input, and the sum is
    layer-normalised.

    embed_dim: attention key_dim and width of the block's output
    num_heads: number of attention heads
    ff_dim: units of the LSTM
    rate: dropout rate applied after each sub-layer
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(key_dim=embed_dim, num_heads=num_heads)
        recurrent_stack = [
            layers.LSTM(ff_dim, return_sequences=True),
            layers.Dense(embed_dim),
        ]
        self.ffn = keras.Sequential(recurrent_stack)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        # Self-attention: the inputs act as both query and value.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)
        # Recurrent sub-layer with its own residual connection.
        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)

In [6]:
class NERModelWithOneLSTM(keras.Model):
    """Token tagger: embedding -> TransformerBlockWithOneLSTM -> dense head with softmax over the tags.

    num_tags: units of the final softmax layer
    vocab_size, maxlen, embed_dim: forwarded to TokenAndPositionEmbedding
    embed_dim, num_heads, ff_dim: forwarded to TransformerBlockWithOneLSTM
    ff_dim: also the units of the intermediate relu layer of the head
    """

    def __init__(
        self, num_tags, vocab_size, maxlen=3298, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlockWithOneLSTM(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        hidden = self.transformer_block(self.embedding_layer(inputs))
        hidden = self.ff(self.dropout1(hidden, training=training))
        hidden = self.dropout2(hidden, training=training)
        # Softmax output: one probability per tag for every position.
        return self.ff_final(hidden)

In [116]:
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Positions whose true tag id is 0 are masked out: they add nothing to the
    summed loss and are not counted in the divisor.
    """

    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        """Return the mean per-token loss over positions where y_true > 0.

        y_true: integer tag ids, 0 marking padding
        y_pred: predicted probabilities (not logits) over the tags
        returns: scalar loss; 0 when every position in y_true is padding
        """
        # reduction=None keeps one loss value per position so it can be masked.
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction=None
        )
        loss = loss_fn(y_true, y_pred)
        mask = ops.cast((y_true > 0), dtype="float32")
        loss = loss * mask
        # The mask sum is a count of real tokens, so clamping it to >= 1 only
        # changes the all-padding case, which would otherwise be 0 / 0 = NaN.
        return ops.sum(loss) / ops.maximum(ops.sum(mask), 1.0)

loss = CustomNonPaddingTokenLoss()

In [None]:
class RecordDataFrame(pd.DataFrame):
    '''DataFrame with a fixed set of result columns, used to collect one row per model run.'''
    def __init__(self):
        super().__init__(columns = ['Model Description', 'Weighted Precision','Weighted Recall', 'Weighted F1',
                        'Micro F5','True Positive', 'False Negative', 'False Positive', 'Sample Size','Epochs',
                        'Sample Random State','Batch Size','Num Transformer Heads',
                        'ff_dim','embed_dim','Dataset Type'])

    @property
    def row_num(self):
        '''Zero-based position of the last row, or -1 when the frame is empty.

        Computed from the current shape on every access, so it stays correct
        after records are added or removed.
        '''
        return self.shape[0] - 1

    def add_record(self, dict_info):
        '''Append one row, filling the columns named by the keys of dict_info.

        dict_info: dict of column name -> value; columns not mentioned are left unset
        '''
        # Fix the new row's label before the first assignment enlarges the frame.
        new_row = self.shape[0]
        for key, value in dict_info.items():
            self.at[new_row, key] = value

    def clear_record(self):
        '''Remove the most recently added row; do nothing when the frame is empty.'''
        if self.shape[0] > 0:
            self.drop(index = self.index[-1], inplace = True)

    def clear_df(self):
        '''Remove every row, keeping the columns.'''
        self.drop(index = self.index, axis = 0, inplace = True)

In [5]:
# Tags to report on: every tag except the first two entries of `mapping`
# ('[PAD]' and "O").
labels = list(mapping.values())[2:]


def _safe_divide(numerator, denominator):
    '''Element-wise numerator / denominator, giving 0 wherever the denominator is not positive.'''
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    result = np.zeros(denominator.shape, dtype=float)
    np.divide(numerator, denominator, out=result, where=denominator > 0)
    return result


def calculate_metrics(dataset, model, beta = 5):
    '''This function takes a tensorflow dataset and returns a dataframe of individual results
    for the model run as well as a dictionary to use to update the RecordDataFrame().

    dataset: iterable of (x, y) batches; y holds the true tag ids
    model: object whose predict(x, verbose=0) returns per-tag scores on the last axis
    beta: beta of the micro-averaged F-beta score (reported under the key 'Micro F5')
    returns: (result_df, record_dict) where result_df has one row per entry of
        `labels` (precision, recall, f1, total, TP, FP, FN) and record_dict
        holds the aggregate scores and counts. Any score whose denominator is
        zero is reported as 0.
    '''
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        output = model.predict(x, verbose=0)
        predictions = ops.argmax(output, axis=-1)
        predictions = ops.reshape(predictions, [-1])

        true_tag_ids = ops.reshape(y, [-1])

        # Keep only positions where neither the true nor the predicted id is 0.
        # NOTE(review): a real token predicted as id 0 is dropped here rather
        # than counted as an error - confirm this is intended.
        mask = (true_tag_ids > 0) & (predictions > 0)
        all_true_tag_ids.append(true_tag_ids[mask])
        all_predicted_tag_ids.append(predictions[mask])

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    print(f'processed {len(predicted_tags)} tokens')

    # One 2x2 matrix per label, laid out as [[tn, fp], [fn, tp]].
    cm = sklearn.metrics.multilabel_confusion_matrix(real_tags,predicted_tags, labels = labels)

    true_positive = cm[:, 1, 1]
    false_positive = cm[:, 0, 1]
    false_negative = cm[:, 1, 0]
    # Number of true occurrences of each label.
    total = true_positive + false_negative

    precision = _safe_divide(true_positive, true_positive + false_positive)
    recall = _safe_divide(true_positive, total)
    f1 = 2 * _safe_divide(precision * recall, precision + recall)

    total_tp = true_positive.sum()
    total_fn = false_negative.sum()
    total_fp = false_positive.sum()

    result_df = pd.DataFrame({'labels' : labels,
                            'precision': precision,
                            'recall' : recall,
                            'f1' : f1,
                            'total': total,
                            'TP' : true_positive,
                            'FP': false_positive,
                            'FN': false_negative})

    # Micro-averaged F-beta from the pooled counts; 0 when nothing was counted.
    beta_sq = beta**2
    f_beta_denominator = ((1+beta_sq)*total_tp) + (beta_sq*total_fn) + total_fp
    if f_beta_denominator > 0:
        micro_f5 = (1+beta_sq)*total_tp/f_beta_denominator
    else:
        micro_f5 = 0.0

    # Per-label scores averaged with each label's true count as its weight.
    support = sum(result_df.total)
    if support > 0:
        weighted_precision = sum(result_df.precision * result_df.total / support)
        weighted_recall = sum(result_df.recall * result_df.total / support)
        weighted_f1 = sum(result_df.f1 * result_df.total / support)
    else:
        weighted_precision = weighted_recall = weighted_f1 = 0.0

    print(f'True Positive: {total_tp}, False Positive: {total_fp}, False Negative: {total_fn}')
    print(f'Weighted Precision: {weighted_precision}')
    print(f'Weighted Recall: {weighted_recall}')
    print(f'Weighted F1: {weighted_f1}')
    print(f'Micro F5: {micro_f5}')
    record_dict = {'Weighted Precision': weighted_precision,
                   'Weighted Recall': weighted_recall, 
                   'Weighted F1': weighted_f1,
                   'Micro F5': micro_f5,
                   'True Positive': total_tp, 
                   'False Negative': total_fn, 
                   'False Positive': total_fp, 
                   'Sample Size': len(real_tags)}
    return result_df, record_dict