In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from collections import Counter
from itertools import repeat
from bisect import bisect_left
import nltk
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import keras
from official.nlp import optimization  # to create AdamW optimizer
from keras import ops
from keras.utils import pad_sequences
from conlleval import evaluate
import sklearn
from keras import layers
import keras_nlp

tf.get_logger().setLevel('ERROR')
%run nlp-functions.ipynb

In [2]:
#train = pd.read_json('data/train.json')
#test = pd.read_json('data/test.json')

In [4]:
df1 = pd.read_json('initial_predictions.json')

In [5]:
df1.head()

Unnamed: 0,document,token,token_text,correct_label,preds,cm,sentence_text,tokenized_sentence,labels_in_sentence
0,7,0,Design,O,O,TN,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,7,1,Thinking,O,O,TN,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
2,7,2,for,O,O,TN,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
3,7,3,innovation,O,O,TN,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
4,7,4,reflexion,O,O,TN,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."


The dataset df1 contains the results of the Presidio Analyzer Function on each individual token in Learning Agency PII dataset. Each row contains the document number, word token number, word token text, correct label, Presidio Analyzer prediction, and classification as True Negative, True Positive, False Positive or False Negative for the prediction. We also included three columns containing the sentence context of each word.

In the next section, we are going to divide this initial dataset into True Negative and everything else. We will use these Presidio Analyzer results - what the previous model got right and wrong to better balance classes in the data sample for the neural network to learn.

In [6]:
#Find the indices in each category
idx = df1.loc[(df1.cm.isin(['TP','FN','FP'])) | df1.cm.isna(),'sentence_text'].drop_duplicates().index
idx_tn = df1.loc[df1.cm == 'TN', 'sentence_text'].drop_duplicates().index

df2 = pd.concat([df1.iloc[idx,:],df1.iloc[idx_tn,:]],axis = 0).reset_index(drop = True)
total_idx = df2.loc[:,'sentence_text'].drop_duplicates().index
df2 = df2.iloc[total_idx,:]
tn_df = df2.loc[df2.cm == 'TN',:]

#To have more balanced classes, get a random sample of sentences that have all TN tokens and combine this with everything in df2 with a label
tn_sample_size = 4000
tn_sample = tn_df.sample(tn_sample_size,random_state = 42)
df3 = pd.concat([df2.loc[df2.cm != 'TN',:],
                 tn_sample],axis = 0).reset_index(drop = True)
df2.shape

(208248, 9)

In [7]:
df3.shape
#df3 has 8283 unique sentences containing all 

(8283, 9)

In [15]:
df3.cm.value_counts()

cm
TN    4000
FP    3011
TP    1049
FN     221
Name: count, dtype: int64

In [9]:
def make_tag_lookup_table():
    iob_labels = ["B", "I"]
    ner_labels = ["NAME_STUDENT", "EMAIL", "URL_PERSONAL", "ID_NUM",'USERNAME','PHONE_NUM','STREET_ADDRESS']
    all_labels = [(label1, label2) for label2 in ner_labels for label1 in iob_labels]
    all_labels = ["-".join([a, b]) for a, b in all_labels]
    all_labels = ['[PAD]',"O"] + all_labels
    return dict(zip(all_labels,range(0, len(all_labels) + 1)))

encoding = make_tag_lookup_table()
mapping = dict([(value, key) for key, value in encoding.items()])

print(mapping)

{0: '[PAD]', 1: 'O', 2: 'B-NAME_STUDENT', 3: 'I-NAME_STUDENT', 4: 'B-EMAIL', 5: 'I-EMAIL', 6: 'B-URL_PERSONAL', 7: 'I-URL_PERSONAL', 8: 'B-ID_NUM', 9: 'I-ID_NUM', 10: 'B-USERNAME', 11: 'I-USERNAME', 12: 'B-PHONE_NUM', 13: 'I-PHONE_NUM', 14: 'B-STREET_ADDRESS', 15: 'I-STREET_ADDRESS'}


In [124]:
all_tokens = train.tokens.explode().reset_index(drop = True).unique()
all_tokens_array = np.array(list(map(str.lower, all_tokens)))

counter = Counter(all_tokens)
print(len(counter))

num_tags = len(mapping)
vocab_size = 50000

# We only take (vocab_size - 2) most commons words from the training data since
# the `StringLookup` class uses 2 additional tokens - one denoting an unknown
# token and another one denoting a masking token
vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]

# The StringLook class will convert tokens to token IDs
lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)
del all_tokens

53985


1. Take a sample of the TN to balance the positive and negative classes.
2. Get a label for each word, pad and tokenize the sentences, get a label for each label of words in the sentence.
3. Try to run the transformer!

**Named Entity Recognition Transformer**

In [159]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.ffn = keras.Sequential(
            [
                keras.layers.Dense(ff_dim, activation="relu"),
                keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [160]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        maxlen = ops.shape(inputs)[-1]
        positions = ops.arange(start=0, stop=maxlen, step=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(inputs)
        return token_embeddings + position_embeddings

In [161]:
class NERModel(keras.Model):
    def __init__(
        self, num_tags, vocab_size, maxlen=3298, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x

In [116]:
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction=None
        )
        loss = loss_fn(y_true, y_pred)
        mask = ops.cast((y_true > 0), dtype="float32")
        loss = loss * mask
        return ops.sum(loss) / ops.sum(mask)

loss = CustomNonPaddingTokenLoss()

In [131]:
def calculate_metrics(dataset,model, beta = 5):
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        output = model.predict(x, verbose=0)
        predictions = ops.argmax(output, axis=-1)
        predictions = ops.reshape(predictions, [-1])

        true_tag_ids = ops.reshape(y, [-1])

        mask = (true_tag_ids > 0) & (predictions > 0)
        true_tag_ids = true_tag_ids[mask]
        predicted_tag_ids = predictions[mask]

        all_true_tag_ids.append(true_tag_ids)
        all_predicted_tag_ids.append(predicted_tag_ids)

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    print(f'processed {len(predicted_tags)} tokens')
    non_o = 0
    tn = 0
    tp = 0
    fp = 0
    fn = 0
    misclassified = 0
    for i in range(len(real_tags)):
        if real_tags[i] != "O":
            non_o += 1
            if real_tags[i] == predicted_tags[i]:
                #if real tag equals predicted tag and does not equal O
                tp += 1
            else:
                if predicted_tags[i] != "O":
                    #if real tag is not O and predicted tag is not O but does not match real tag
                    misclassified += 1
                if predicted_tags[i] == "O":
                    #if real tag is not O and predicted tag is O
                    fn += 1
        else:
            if predicted_tags[i] == "O":
                #if real tag is O and predicted tag is O
                tn += 1
            else:
                #if real tag is O and predicted tag is not O
                fp += 1

    accuracy_non_o = tp / non_o
    accuracy_score = sklearn.metrics.accuracy_score(real_tags, predicted_tags)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall / (precision + recall))
    s_micro = (1+(beta**2))*tp/(((1+(beta**2))*tp) + ((beta**2)*fn) + fp)

    print(f'True Positive: {tp}, True Negative: {tn}, False Positive: {fp}, False Negative: {fn}')
    print(f'{misclassified} tokens identified as PII but mislabelled.')
    print(f'Accuracy (non-O): {accuracy_non_o}')
    print(f'Accuracy: {accuracy_score}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 score: {f1}')
    print(f'S-micro score: {s_micro}')
    return real_tags, predicted_tags

In [129]:
labels = [x[1] for x in list(mapping.items())][2:]
def calculate_metrics_by_label(y_true, y_pred,labels, beta = 5):
    cm = sklearn.metrics.multilabel_confusion_matrix(real,preds, labels = labels)
    precision = []
    recall = []
    f1 = []
    s_micro = []
    total_tp = 0
    total_fn = 0
    total_fp = 0
    for i in range(len(labels)):
        tn = cm[i][0][0]
        fp = cm[i][0][1]
        total_fp += fp
        fn = cm[i][1][0]
        total_fn += fn
        tp = cm[i][1][1]
        total_tp += tp
        p = tp / (tp + fp)
        r = tp / (tp + fn)
        f = 2 * (p * r / (p + r))
        s = (1+(beta**2))*tp/(((1+(beta**2))*tp) + ((beta**2)*fn) + fp)
        precision.append(p)
        recall.append(r)
        f1.append(f)
        s_micro.append(s)
    df = pd.DataFrame({'labels' : labels,
                        'precision': precision,
                        'recall' : recall,
                        'f1 score' : f1,
                        's_micro' : s_micro})
    print(f'True Positive: {total_tp}, False Positive: {total_fp}, False Negative: {total_fn}')
    return df

**Running the model on a subset of sentences**

Here, we are selecting sentences in df3, the dataset that contains all true positives, false negatives, false positives, and a sample of the true negative tokens from the results of Microsoft Presidio with rule-based NER. Since sentences do contain many negative labels in addition to the positive labels, we have removed duplicates from the dataset to prevent any overlap. We have also included only a small subset of the the sentences containing all true negatives to balance the classes and improve model accuracy.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df3['tokenized_sentence'],df3['labels_by_sentence'],test_size = 0.2, random_state = 42)

X_train = pad_sequences(X_train.map(lookup_layer))
X_test = pad_sequences(X_test.map(lookup_layer))
y_train = pad_sequences(pd.Series([[encoding[r] for r in row] for row in y_train]))
y_test = pad_sequences(pd.Series([[encoding[r] for r in row] for row in y_test]))

batch_size = 32
train_dataset = tf.data.Dataset.from_tensor_slices((X_train,y_train)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test,y_test)).batch(batch_size)

ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
ner_model.compile(optimizer='adam',loss=loss)
ner_model.fit(train_dataset, epochs=5)

def tokenize_and_convert_to_ids(text):
    tokens = text.split()
    return lookup_layer(tokens)

# Sample inference using the trained model
sample_input = tokenize_and_convert_to_ids(
    train.loc[345,'sentence_tokens'][0]
)
sample_input = ops.reshape(sample_input, newshape=[1, -1])
print(sample_input)

output = ner_model.predict(sample_input)
prediction = np.argmax(output, axis=-1)[0]
prediction = [mapping[i] for i in prediction]

print(prediction)

In [218]:
real, preds = calculate_metrics(test_dataset)

processed 39483 tokens
True Positive: 272, True Negative: 38385, False Positive: 641, False Negative: 91
94 tokens identified as PII but mislabelled.
Accuracy (non-O): 0.5951859956236324
Accuracy: 0.9790796038801509
Precision: 0.29791894852135814
Recall: 0.7493112947658402
F1 score: 0.426332288401254
S-micro score: 0.7080496595915098


2024-05-17 11:44:11.737679: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [219]:
results = calculate_metrics_by_label(real, preds, labels)

True Positive: 272, False Positive: 735, False Negative: 185


  p = tp / (tp + fp)
  r = tp / (tp + fn)
  s = (1+(beta**2))*tp/(((1+(beta**2))*tp) + ((beta**2)*fn) + fp)


In [220]:
results

Unnamed: 0,labels,precision,recall,f1 score,s_micro
0,B-NAME_STUDENT,0.358674,0.766667,0.488712,0.734531
1,I-NAME_STUDENT,0.170082,0.453552,0.247392,0.42623
2,B-EMAIL,,0.0,,0.0
3,I-EMAIL,,,,
4,B-URL_PERSONAL,1.0,0.136364,0.24,0.141049
5,I-URL_PERSONAL,,,,
6,B-ID_NUM,0.666667,0.2,0.307692,0.205534
7,I-ID_NUM,,,,
8,B-USERNAME,,,,
9,I-USERNAME,,,,


In [221]:
true_y_train, preds_train = calculate_metrics(train_dataset)

2024-05-17 11:45:14.071143: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


processed 153927 tokens
True Positive: 1728, True Negative: 151955, False Positive: 37, False Negative: 40
167 tokens identified as PII but mislabelled.
Accuracy (non-O): 0.8930232558139535
Accuracy: 0.9984148330052557
Precision: 0.9790368271954675
Recall: 0.9773755656108597
F1 score: 0.9782054910840644
S-micro score: 0.9774393560317634


**Using the full text instead of the sentence context for training dataset**

In [171]:
X_train, X_test, y_train, y_test = train_test_split(train['tokens'],train['labels'],test_size = 0.2, random_state = 42)

X_train = pad_sequences(X_train.map(lookup_layer))
X_test = pad_sequences(X_test.map(lookup_layer))
y_train = pad_sequences(pd.Series([[encoding[r] for r in row] for row in y_train]))
y_test = pad_sequences(pd.Series([[encoding[r] for r in row] for row in y_test]))

batch_size = 32
train_dataset = tf.data.Dataset.from_tensor_slices((X_train,y_train)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test,y_test)).batch(batch_size)

ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
ner_model.compile(optimizer='adam',loss=loss)

ner_model.fit(train_dataset, epochs=5)

Epoch 1/5
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1402s[0m 8s/step - loss: 0.1482
Epoch 2/5
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1367s[0m 8s/step - loss: 0.0064
Epoch 3/5
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1315s[0m 8s/step - loss: 0.0044
Epoch 4/5
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1325s[0m 8s/step - loss: 0.0017
Epoch 5/5
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1336s[0m 8s/step - loss: 0.0014


<keras.src.callbacks.history.History at 0x3f79c1450>

Epoch 1/5
loss: 0.1482
Epoch 2/5
loss: 0.0064
Epoch 3/5
loss: 0.0044
Epoch 4/5
loss: 0.0017
Epoch 5/5
loss: 0.0014

In [206]:
real, preds = calculate_metrics(test_dataset)

2024-05-17 02:31:25.141757: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


processed 1005699 tokens
True Positive: 107, True Negative: 1004925, False Positive: 210, False Negative: 416
41 tokens identified as PII but mislabelled.
Accuracy (non-O): 0.18971631205673758
Accuracy: 0.9993367796925322
Precision: 0.33753943217665616
Recall: 0.2045889101338432
F1 score: 0.12738095238095237
S-micro score: 0.20773596176821985


In [204]:
results = calculate_metrics_by_label(real, preds, labels)

True Positive: 107, False Positive: 251, False Negative: 457


  p = tp / (tp + fp)
  r = tp / (tp + fn)
  s = (1+(beta**2))*tp/(((1+(beta**2))*tp) + ((beta**2)*fn) + fp)


In [202]:
results

Unnamed: 0,labels,precision,recall,f1 score,s_micro
0,B-NAME_STUDENT,0.294118,0.26616,0.139721,0.267136
1,I-NAME_STUDENT,0.308333,0.151639,0.101648,0.154662
2,B-EMAIL,,0.0,,0.0
3,I-EMAIL,,,,
4,B-URL_PERSONAL,,0.0,,0.0
5,I-URL_PERSONAL,,,,
6,B-ID_NUM,,0.0,,0.0
7,I-ID_NUM,,,,
8,B-USERNAME,,,,
9,I-USERNAME,,,,


This model is highly overfit. With such imbalanced classes, it only predicted 107 true positive labels with 457 mislabelled tokens.

In [24]:
#Create document with unique sentence labels
unique_sent_idx = df1[['document','unique_sentence_test']].drop_duplicates().index
sentence_df = df1.loc[unique_sent_idx,['document','tokenized_sentence','labels_by_sentence','unique_sentence_test']]
del unique_sent_idx
sentence_df[['unique_sentence_test','document']].duplicated().sum()

0

In [25]:
del df3

In [26]:
sentence_df.drop('unique_sentence_test',axis = 1,inplace = True)
sentence_df['sentence'] = sentence_df[['tokenized_sentence']].map(lambda x: " ".join(x))
sentence_df['word_labels'] = sentence_df[['labels_by_sentence']].map(lambda x: ",".join(x))

In [26]:
sentence_df.head()

Unnamed: 0,document,tokenized_sentence,labels_by_sentence,sentence,word_labels
0,7,"[Design, Thinking, for, innovation, reflexion,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",Design Thinking for innovation reflexion - Avr...,"O,O,O,O,O,O,O,O,O,B-NAME_STUDENT,I-NAME_STUDENT,O"
12,7,"[Challenge, &, selection, \n\n]","[O, O, O, O]",Challenge & selection \n\n,"O,O,O,O"
16,7,"[The, tool, I, use, to, help, all, stakeholder...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",The tool I use to help all stakeholders findin...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
40,7,"[What, exactly, is, a, mind, map, ?]","[O, O, O, O, O, O, O]",What exactly is a mind map ?,"O,O,O,O,O,O,O"
47,7,"[According, to, the, definition, of, Buzan, T....","[O, O, O, O, O, O, O, O, O, O]",According to the definition of Buzan T. and Bu...,"O,O,O,O,O,O,O,O,O,O"


In [119]:
class TransformerBlockWithLSTM(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.ffn = keras.Sequential(
            [
                keras.layers.Bidirectional(layers.LSTM(ff_dim, return_sequences=True)),
                keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [126]:
class NERModelWithLSTM(keras.Model):
    def __init__(
        self, num_tags, vocab_size, maxlen=3298, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlockWithLSTM(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x

In [125]:
X_train, X_test, y_train, y_test = train_test_split(df3['tokenized_sentence'],df3['labels_by_sentence'],test_size = 0.2, random_state = 42)

X_train = pad_sequences(X_train.map(lookup_layer))
X_test = pad_sequences(X_test.map(lookup_layer))
y_train = pad_sequences(pd.Series([[encoding[r] for r in row] for row in y_train]))
y_test = pad_sequences(pd.Series([[encoding[r] for r in row] for row in y_test]))

batch_size = 32
train_dataset = tf.data.Dataset.from_tensor_slices((X_train,y_train)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test,y_test)).batch(batch_size)

In [127]:
ner_model_lstm = NERModelWithLSTM(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
ner_model_lstm.compile(optimizer='adam',loss=loss)
ner_model_lstm.fit(train_dataset, epochs=5)

Epoch 1/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 116ms/step - loss: 0.2471
Epoch 2/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 109ms/step - loss: 0.0434
Epoch 3/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 108ms/step - loss: 0.0175
Epoch 4/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 117ms/step - loss: 0.0105
Epoch 5/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 110ms/step - loss: 0.0058


<keras.src.callbacks.history.History at 0x33e279d90>

In [132]:
real, preds = calculate_metrics(test_dataset,ner_model_lstm)

processed 39483 tokens
True Positive: 260, True Negative: 38812, False Positive: 214, False Negative: 156
41 tokens identified as PII but mislabelled.
Accuracy (non-O): 0.5689277899343544
Accuracy: 0.9895904566522301
Precision: 0.5485232067510548
Recall: 0.625
F1 score: 0.5842696629213483
S-micro score: 0.6216663601250689


2024-05-18 22:50:38.940457: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [133]:
results = calculate_metrics_by_label(real, preds, labels)
results

True Positive: 260, False Positive: 255, False Negative: 197


  p = tp / (tp + fp)
  r = tp / (tp + fn)
  s = (1+(beta**2))*tp/(((1+(beta**2))*tp) + ((beta**2)*fn) + fp)


Unnamed: 0,labels,precision,recall,f1 score,s_micro
0,B-NAME_STUDENT,0.501326,0.7875,0.612642,0.770582
1,I-NAME_STUDENT,0.507353,0.377049,0.432602,0.380811
2,B-EMAIL,,0.0,,0.0
3,I-EMAIL,,,,
4,B-URL_PERSONAL,1.0,0.045455,0.086957,0.047187
5,I-URL_PERSONAL,,,,
6,B-ID_NUM,1.0,0.1,0.181818,0.103586
7,I-ID_NUM,,,,
8,B-USERNAME,,,,
9,I-USERNAME,,,,


Testing models on a different sample

In [156]:
X_train, X_test, y_train, y_test = train_test_split(df3['tokenized_sentence'],df3['labels_by_sentence'],test_size = 0.2, random_state = 50)

X_train = pad_sequences(X_train.map(lookup_layer))
X_test = pad_sequences(X_test.map(lookup_layer))
y_train = pad_sequences(pd.Series([[encoding[r] for r in row] for row in y_train]))
y_test = pad_sequences(pd.Series([[encoding[r] for r in row] for row in y_test]))

batch_size = 32
train_dataset = tf.data.Dataset.from_tensor_slices((X_train,y_train)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test,y_test)).batch(batch_size)

In [157]:
ner_model_lstm = NERModelWithLSTM(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
ner_model_lstm.compile(optimizer='adam',loss=loss)
ner_model_lstm.fit(train_dataset, epochs=5)

Epoch 1/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 120ms/step - loss: 0.4341
Epoch 2/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 114ms/step - loss: 0.0700
Epoch 3/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 116ms/step - loss: 0.0638
Epoch 4/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 116ms/step - loss: 0.0601
Epoch 5/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 117ms/step - loss: 0.0501


<keras.src.callbacks.history.History at 0x4958d0390>

In [158]:
real, preds = calculate_metrics(test_dataset,ner_model_lstm)

processed 38773 tokens
True Positive: 90, True Negative: 38241, False Positive: 45, False Negative: 377
20 tokens identified as PII but mislabelled.
Accuracy (non-O): 0.18480492813141683
Accuracy: 0.9886003146519485
Precision: 0.6666666666666666
Recall: 0.19271948608137046
F1 score: 0.29900332225913623
S-micro score: 0.19813717188823032


2024-05-19 00:12:03.738409: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [162]:
ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
ner_model.compile(optimizer='adam',loss=loss)
ner_model.fit(train_dataset, epochs=5)

Epoch 1/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 59ms/step - loss: 0.4274
Epoch 2/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 53ms/step - loss: 0.0760
Epoch 3/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 52ms/step - loss: 0.0322
Epoch 4/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - loss: 0.0217
Epoch 5/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 50ms/step - loss: 0.0167


<keras.src.callbacks.history.History at 0x4d745b1d0>

In [163]:
real, preds = calculate_metrics(test_dataset,ner_model)

processed 38773 tokens
True Positive: 283, True Negative: 37802, False Positive: 484, False Negative: 77
127 tokens identified as PII but mislabelled.
Accuracy (non-O): 0.5811088295687885
Accuracy: 0.9822556933948882
Precision: 0.36897001303780963
Recall: 0.7861111111111111
F1 score: 0.5022182786157942
S-micro score: 0.7533531278795945


2024-05-19 00:17:26.542172: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Can we reproduce the random_state = 42 performance of the NER with LSTM?

In [170]:
train_dataset, test_dataset = get_training_data(df3['tokenized_sentence'], df3['labels_by_sentence'])

ner_model_lstm = NERModelWithLSTM(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
ner_model_lstm.compile(optimizer='adam',loss=loss)
ner_model_lstm.fit(train_dataset, epochs=5)

Epoch 1/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 128ms/step - loss: 0.3360
Epoch 2/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 117ms/step - loss: 0.0565
Epoch 3/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 117ms/step - loss: 0.0262
Epoch 4/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 117ms/step - loss: 0.0115
Epoch 5/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 118ms/step - loss: 0.0067


<keras.src.callbacks.history.History at 0x45a92cf50>

In [171]:
real, preds = calculate_metrics(test_dataset,ner_model_lstm)

processed 39483 tokens
True Positive: 164, True Negative: 38950, False Positive: 76, False Negative: 265
28 tokens identified as PII but mislabelled.
Accuracy (non-O): 0.3588621444201313
Accuracy: 0.9906542056074766
Precision: 0.6833333333333333
Recall: 0.3822843822843823
F1 score: 0.49028400597907323
S-micro score: 0.3888736890104879


2024-05-19 00:40:06.232222: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [174]:
ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
ner_model.compile(optimizer='adam',loss=loss)
ner_model.fit(train_dataset, epochs=5)

Epoch 1/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 67ms/step - loss: 0.3491
Epoch 2/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 54ms/step - loss: 0.0656
Epoch 3/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 53ms/step - loss: 0.0340
Epoch 4/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 53ms/step - loss: 0.0252
Epoch 5/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 52ms/step - loss: 0.0238


<keras.src.callbacks.history.History at 0x4b5c9c9d0>

In [175]:
real, preds = calculate_metrics(test_dataset,ner_model)

processed 39483 tokens
True Positive: 197, True Negative: 38815, False Positive: 211, False Negative: 210
50 tokens identified as PII but mislabelled.
Accuracy (non-O): 0.4310722100656455
Accuracy: 0.9880708152875921
Precision: 0.48284313725490197
Recall: 0.48402948402948404
F1 score: 0.4834355828220859
S-micro score: 0.48398374751960693


2024-05-19 00:44:08.645165: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Fine-tune batch size, number of transformer heads, embed_dims, ff_dims, number of epochs

Figure out what sample to use - try measuring model accuracy with all the sentences

Running the model on all sentences

In [137]:
#Create document with unique sentence labels
unique_sent_idx = df1[['document','sentence']].drop_duplicates().index
sentence_df = df1.loc[unique_sent_idx,['document','tokenized_sentence','labels_by_sentence','sentence']]
del unique_sent_idx
print(f"Number of duplicated sentences: {sentence_df[['sentence','document']].duplicated().sum()}")
sentence_df['word_labels'] = sentence_df[['labels_by_sentence']].map(lambda x: ",".join(x))
sentence_df.reset_index(drop = True, inplace=True)

Number of duplicated sentences: 0


In [153]:
np.random.seed(42)
documents = sentence_df.document.unique()
random_sample = np.random.choice(documents, size = 1000, replace = False)
sample_data = sentence_df.loc[sentence_df.document.isin(list(random_sample)),['tokenized_sentence','labels_by_sentence']]

X_train, X_test, y_train, y_test = train_test_split(sample_data['tokenized_sentence'], 
                                                    sample_data['labels_by_sentence'],
                                                    test_size = 0.2, random_state = 42)

X_train = pad_sequences(X_train.map(lookup_layer))
X_test = pad_sequences(X_test.map(lookup_layer))
y_train = pad_sequences(pd.Series([[encoding[r] for r in row] for row in y_train]))
y_test = pad_sequences(pd.Series([[encoding[r] for r in row] for row in y_test]))

batch_size = 32
train_dataset = tf.data.Dataset.from_tensor_slices((X_train,y_train)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test,y_test)).batch(batch_size)

In [154]:
ner_model_lstm = NERModelWithLSTM(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
ner_model_lstm.compile(optimizer='adam',loss=loss)
ner_model_lstm.fit(train_dataset, epochs=5)

Epoch 1/5
[1m886/886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 115ms/step - loss: 0.0862
Epoch 2/5
[1m886/886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 116ms/step - loss: 0.0042
Epoch 3/5
[1m886/886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 118ms/step - loss: 0.0036
Epoch 4/5
[1m886/886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 118ms/step - loss: 0.0013
Epoch 5/5
[1m886/886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 118ms/step - loss: 5.4111e-04


<keras.src.callbacks.history.History at 0x4d216c9d0>

In [155]:
real, preds = calculate_metrics(test_dataset, ner_model_lstm)

2024-05-18 23:55:36.882047: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


processed 147792 tokens
True Positive: 12, True Negative: 147706, False Positive: 1, False Negative: 72
1 tokens identified as PII but mislabelled.
Accuracy (non-O): 0.1411764705882353
Accuracy: 0.9994992963083252
Precision: 0.9230769230769231
Recall: 0.14285714285714285
F1 score: 0.24742268041237114
S-micro score: 0.1476573592049219
