# Named Entity Recognition

In [15]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf


## Import Data

In [14]:
data = pd.read_csv("data/ner_dataset.csv", encoding = "ISO-8859-1") 
print('ORIGINAL DATA:\n', data.head())
del(data)

ORIGINAL DATA:
     Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


In [16]:
def load_data(file_path):
    with open(file_path,'r') as file:
        data = np.array([line.strip() for line in file.readlines()])
    return data

In [18]:

train_sentences = load_data('data/large/train/sentences.txt')
train_labels = load_data('data/large/train/labels.txt')

val_sentences = load_data('data/large/val/sentences.txt')
val_labels = load_data('data/large/val/labels.txt')

test_sentences = load_data('data/large/test/sentences.txt')
test_labels = load_data('data/large/test/labels.txt')


In [19]:
print(train_sentences.shape)
print(val_sentences.shape)
print(test_sentences.shape)
print("\n ",train_sentences[0:2])
print("\n ",train_labels[0:2])

(33570,)
(7194,)
(7194,)

  ['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'
 'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "']

  ['O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O'
 'O O O O O O O O O O O O O O O O O O B-per O O O O O O O O O O O']


## 3 - Encoding

### 3.1 Encoding the sentences

We will use tf.keras.layers.TextVectorization to transform the sentences into integers.

In [20]:

def get_sentence_vectorizer(sentences):
    
    """
    Create a TextVectorization layer for sentence tokenization and adapt it to the provided sentences.

    Parameters:
    sentences (list of str): Sentences for vocabulary adaptation.

    Returns:
    sentence_vectorizer (tf.keras.layers.TextVectorization): TextVectorization layer for sentence tokenization.
    vocab (list of str): Extracted vocabulary.
    """

    # Define TextVectorization object with the appropriate standardize parameter
    sentence_vectorizer = tf.keras.layers.TextVectorization(standardize=None)
    # Adapt the sentence vectorization object to the given sentences
    sentence_vectorizer.adapt(sentences)
    # Get the vocabulary
    vocab = sentence_vectorizer.get_vocabulary()
    
    return sentence_vectorizer, vocab

In [28]:
sentence_vectorizer, vocab = get_sentence_vectorizer(train_sentences)
print(f"vocab size: {len(test_vocab)}")
print(f"Head vocab: {test_vocab[0:6]}")

sentence = "I like learning new NLP models !"
sentence_vectorized = test_vectorizer(sentence)
print(f"Sentence: {sentence}\nSentence vectorized: {sentence_vectorized}")

vocab size: 4650
Head vocab: ['', '[UNK]', 'the', '.', ',', 'in']
Sentence: I like learning new NLP models !
Sentence vectorized: [ 296  314    1   59    1    1 4649]


<a name="3.2"></a>
### 3.2 Encoding the labels

Extract all the different tags in a given set of labels

In [29]:
train_labels

array(['O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O',
       'O O O O O O O O O O O O O O O O O O B-per O O O O O O O O O O O',
       'O O O O O O O O O O O B-geo I-geo O', ...,
       'B-per I-per O O O B-tim O O O O O O O O O O',
       'B-gpe O B-per I-per O O O O O B-org I-org I-org O O O O',
       'O O O O O O B-geo O O O O O O O O O O O O O O O O'], dtype='<U287')

In [30]:
def get_tags(labels):
    tag_set = set() # Define an empty set
    for el in labels:
        for tag in el.split(" "):
            tag_set.add(tag)
    tag_list = list(tag_set) 
    tag_list.sort()
    return tag_list

tags = get_tags(train_labels)
print(tags)

['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


Now you will need to generate a **tag map**, i.e., a mapping between the tags and **positive** integers.

In [31]:
def make_tag_map(tags):
    tag_map = {}
    for i,tag in enumerate(tags):
        tag_map[tag] = i 
    return tag_map

tag_map = make_tag_map(tags)
print(tag_map)

{'B-art': 0, 'B-eve': 1, 'B-geo': 2, 'B-gpe': 3, 'B-nat': 4, 'B-org': 5, 'B-per': 6, 'B-tim': 7, 'I-art': 8, 'I-eve': 9, 'I-geo': 10, 'I-gpe': 11, 'I-nat': 12, 'I-org': 13, 'I-per': 14, 'I-tim': 15, 'O': 16}


<a name="3.3"></a>
### 3.3 Building the label vectorizer


In this section, we will pad the labels. TextVectorization already padded the sentences, so you must ensure that the labels are properly padded as well.         
We will pad the vectorized labels with the value -1. You will not use 0 to simplify loss masking and evaluation in further steps. This is because to properly classify one token, a log softmax transformation will be performed and the index with greater value will be the index label. Since index starts at 0, it is better to keep the label 0 as a valid index

Tensorflow provides the function tf.keras.utils.pad_sequences

In [34]:

def label_vectorizer(labels, tag_map):
    
    """
    Convert list of label strings to padded label IDs using a tag mapping.

    Parameters:
    labels (list of str): List of label strings.
    tag_map (dict): Dictionary mapping tags to IDs.
    Returns:
    label_ids (numpy.ndarray): Padded array of label IDs.
    
    """
    
    label_ids = [] # It can't be a numpy array yet, since each sentence has a different size

    # Each element in labels is a string of tags so for each of them:
    for element in labels:
        # Split it into single tokens. You may use .split function for strings. Be aware to split it by a blank space!
        tokens = element.split(' ')

        # Use the dictionaty tag_map passed as an argument to the label_vectorizer function
        # to make the correspondence between tags and numbers. 
        element_ids = [tag_map[tag] for tag in tokens]


        # Append the found ids to corresponding to the current element to label_ids list
        label_ids.append(element_ids)
        
    # Pad the elements
    label_ids = tf.keras.utils.pad_sequences(sequences=label_ids, padding='post', value=-1)

    return label_ids


In [39]:
print(f"Sentence: {train_sentences[5]}")
print(f"Labels: {train_labels[5]}")
print(f"Vectorized labels: {label_vectorizer([train_labels[5]], tag_map)}")

Sentence: The party is divided over Britain 's participation in the Iraq conflict and the continued deployment of 8,500 British troops in that country .
Labels: O O O O O B-gpe O O O O B-geo O O O O O O O B-gpe O O O O O
Vectorized labels: [[16 16 16 16 16  3 16 16 16 16  2 16 16 16 16 16 16 16  3 16 16 16 16 16]]


<a name="4"></a>
## 4 Building the Dataset

In [41]:
def generate_dataset(sentences, labels, sentence_vectorizer, tag_map):
    sentences_ids = sentence_vectorizer(sentences)
    labels_ids = label_vectorizer(labels, tag_map)
    dataset = tf.data.Dataset.from_tensor_slices((sentences_ids, labels_ids))
    return dataset

In [42]:
train_dataset = generate_dataset(train_sentences,train_labels, sentence_vectorizer, tag_map)
val_dataset = generate_dataset(val_sentences,val_labels,  sentence_vectorizer, tag_map)
test_dataset = generate_dataset(test_sentences, test_labels,  sentence_vectorizer, tag_map)

In [58]:
print(train_dataset)
#Use take(n) to take the first n elements
#Use skip(n) to skip the first n elements
#print('An example of the first sentence is\n', next(iter(train_dataset))[0].numpy())
#print('An example of its corresponding label is\n', next(iter(train_dataset))[1].numpy())

<_TensorSliceDataset element_spec=(TensorSpec(shape=(104,), dtype=tf.int64, name=None), TensorSpec(shape=(104,), dtype=tf.int32, name=None))>


In [52]:
for element in train_dataset.take(2):
    print(element)


(<tf.Tensor: shape=(104,), dtype=int64, numpy=
array([1046,    6, 1121,   18, 1832,  232,  543,    7,  528,    2,  158,
          5,   60,    9,  648,    2,  922,    6,  192,   87,   22,   16,
         54,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0])>, <tf.Tensor: shape=(104,), dtype=int32, numpy=
array([16, 16, 16, 16, 16, 16,  2, 16, 16, 16, 16, 16,  2, 16, 16, 16, 16,
       16,  3, 16, 16, 16, 16, 16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -

2024-07-18 05:39:07.336593: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [33570,104]
	 [[{{node Placeholder/_1}}]]


In [68]:
# Create an iterator for the dataset
dataset_iter = iter(train_dataset)

# Print the first 2 examples
for _ in range(2):
    example_batch = next(dataset_iter)
    sentence, label = example_batch
    print('An example of a sentence is\n', sentence.numpy())
    print('An example of its corresponding label is\n', label.numpy())

An example of a sentence is
 [1046    6 1121   18 1832  232  543    7  528    2  158    5   60    9
  648    2  922    6  192   87   22   16   54    3    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
An example of its corresponding label is
 [16 16 16 16 16 16  2 16 16 16 16 16  2 16 16 16 16 16  3 16 16 16 16 16
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1]
An example of a sentence is
 [10265     6   156    38     5     2   492  1083     

<a name="4"></a>
## 5 Building the model

Inputs are sentences represented as tensors that are fed to a model with:

An Embedding layer           
A LSTM layer           
A Dense layer             
A log softmax layer             

In [71]:

def NER(len_tags, vocab_size, embedding_dim = 50):
    
    """
    Create a Named Entity Recognition (NER) model.

    Parameters:
    len_tags (int): Number of NER tags (output classes).
    vocab_size (int): Vocabulary size.
    embedding_dim (int, optional): Dimension of embedding and LSTM layers (default is 50).

    Returns:
    model (Sequential): NER model.
    """

    model = tf.keras.Sequential(name = 'sequential') 
    # Add the tf.keras.layers.Embedding layer. Do not forget to mask out the zeros!
    model.add(tf.keras.layers.Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim, mask_zero=False))

    # Masking layer to handle -1 padding in labels
    model.add(tf.keras.layers.Masking(mask_value=-1))
    
    # Add the LSTM layer. Make sure you are passing the right dimension
    # and returning every output for the tf.keras.layers.LSTM layer and not the very last one.
    model.add(tf.keras.layers.LSTM(units=embedding_dim, return_sequences=True))
    # Add the final tf.keras.layers.Dense with the appropriate activation function. Remember you must pass the activation function itself ant not its call!
    # You must use tf.nn.log_softmax instead of tf.nn.log_softmax().
    model.add(tf.keras.layers.Dense(units=len_tags, activation=tf.nn.log_softmax))
    

    return model

Before training the model, you need to create your own function to compute the accuracy. Tensorflow has built-in accuracy metrics but you cannot pass values to be ignored. This will impact the calculations, since you must remove the padded values. 

In [73]:

def masked_loss(y_true, y_pred):
    
    """
    Calculate the masked sparse categorical cross-entropy loss.

    Parameters:
    y_true (tensor): True labels.
    y_pred (tensor): Predicted logits.
    
    Returns:
    loss (tensor): Calculated loss.
    """
    
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True , ignore_class = -1)

    loss = loss_fn(y_true,y_pred)
    
    return  loss 

In [74]:
true_labels = [0,1,2,0]
predicted_logits = [[-2.3,-0.51,-1.20] , [-1.61,-0.36,-2.30], [-2.30, -0.69,-0.92], [-0.92,-0.92,-1.61]]
print(masked_loss(true_labels, predicted_logits))

tf.Tensor(1.1242604, shape=(), dtype=float32)


In [76]:

def masked_accuracy(y_true, y_pred):
    
    """
    Calculate masked accuracy for predicted labels.

    Parameters:
    y_true (tensor): True labels.
    y_pred (tensor): Predicted logits.

    Returns:
    accuracy (tensor): Masked accuracy.

    """
    
    # Calculate the loss for each item in the batch.
    # You must always cast the tensors to the same type in order to use them in training. Since you will make divisions, it is safe to use tf.float32 data type.
    y_true = tf.cast(y_true, tf.float32) 
    # Create the mask, i.e., the values that will be ignored
    mask = tf.not_equal(y_true , -1)
    mask = tf.cast(mask, tf.float32) 
    # Perform argmax to get the predicted values
    y_pred_class = tf.math.argmax(y_pred, axis=-1)
    y_pred_class = tf.cast(y_pred_class, tf.float32) 
    # Compare the true values with the predicted ones
    matches_true_pred  = tf.equal(y_pred_class, y_true)
    matches_true_pred = tf.cast(matches_true_pred , tf.float32) 
    # Multiply the acc tensor with the masks
    matches_true_pred *= mask
    # Compute masked accuracy (quotient between the total matches and the total valid values, i.e., the amount of non-masked values)
    masked_acc = tf.reduce_sum(matches_true_pred) / tf.reduce_sum(mask)
    
    return masked_acc



In [77]:
true_labels = [0,1,2,0]
predicted_logits = [[0.1,0.6,0.3] , [0.2,0.7,0.1], [0.1, 0.5,0.4], [0.4,0.4,0.2]]
print(masked_accuracy(true_labels, predicted_logits))

tf.Tensor(0.5, shape=(), dtype=float32)


In [79]:
print(len(vocab))
print(len(tag_map))

29847
17


In [84]:
model = NER(len(tag_map), len(vocab))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 50)          1492400   
                                                                 
 masking_1 (Masking)         (None, None, 50)          0         
                                                                 
 lstm_1 (LSTM)               (None, None, 50)          20200     
                                                                 
 dense_1 (Dense)             (None, None, 17)          867       
                                                                 
Total params: 1,513,467
Trainable params: 1,513,467
Non-trainable params: 0
_________________________________________________________________


In [85]:
model.compile(optimizer=tf.keras.optimizers.Adam(0.01), 
              loss = masked_loss,
               metrics = [masked_accuracy])

<a name="6"></a>
## 6 Training the model

In [86]:
tf.keras.utils.set_random_seed(33) ## Setting again a random seed to ensure reproducibility

BATCH_SIZE = 64

model.fit(train_dataset.batch(BATCH_SIZE),
          validation_data = val_dataset.batch(BATCH_SIZE),
          shuffle=True,
          epochs = 10)

Epoch 1/10


2024-07-18 09:06:34.598839: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [33570,104]
	 [[{{node Placeholder/_1}}]]




2024-07-18 09:07:47.650277: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [7194,73]
	 [[{{node Placeholder/_1}}]]


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1170638f50>

### Accuracy 

In [87]:
test_sentences_id = sentence_vectorizer(test_sentences)
test_labels_id = label_vectorizer(test_labels,tag_map)
y_true = test_labels_id 
y_pred = model.predict(test_sentences_id)



In [90]:
print(f"The model's accuracy in test set is: {masked_accuracy(y_true,y_pred).numpy():.4f}")

The model's accuracy in test set is: 0.9537
