In [1089]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import tensorflow as tf

# set random seeds to make this notebook easier to replicate
tf.keras.utils.set_random_seed(33)

In [1090]:
import w2_unittest

In [1091]:
data = pd.read_csv('data/ner_dataset.csv', encoding='ISO-8859-1')
train_sents = open('data/small/train/sentences.txt').readline()
train_labels = open('data/small/train/labels.txt', 'r').readline()

In [1092]:
print('SENTENCE:', train_sents)
print('SENTENCE LABEL:', train_labels)
print('ORIGINAL DATA:\n', data.head())

SENTENCE: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .

SENTENCE LABEL: O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O

ORIGINAL DATA:
     Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


In [1093]:
del(data, train_sents, train_labels)

# Load data

In [1094]:
def load_data(file_path):
    with open(file_path,'r') as file:
        data = np.array([line.strip() for line in file.readlines()])
    return data

In [1095]:
train_sentences = load_data('data/large/train/sentences.txt')
train_labels = load_data('data/large/train/labels.txt')

val_sentences = load_data('data/large/val/sentences.txt')
val_labels = load_data('data/large/val/labels.txt')

test_sentences = load_data('data/large/test/sentences.txt')
test_labels = load_data('data/large/test/labels.txt')

In [1096]:
train_sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [1097]:
train_labels[0]

'O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O'

# Encoding

Use [`tf.keras.layers.TextVectorization`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization) to transform the sentences into integers,

In [1098]:
def get_sentence_vectorizer(sentences):
    tf.keras.utils.set_random_seed(33) ## Do not change this line. 
    """
    Create a TextVectorization layer for sentence tokenization and adapt it to the provided sentences.

    Parameters:
    sentences (list of str): Sentences for vocabulary adaptation.

    Returns:
    sentence_vectorizer (tf.keras.layers.TextVectorization): TextVectorization layer for sentence tokenization.
    vocab (list of str): Extracted vocabulary.
    """
    ### START CODE HERE ###

    # Define TextVectorization object with the appropriate standardize parameter
    sentence_vectorizer = tf.keras.layers.TextVectorization(standardize=None)
    # Adapt the sentence vectorization object to the given sentences
    sentence_vectorizer.adapt(sentences)
    # Get the vocabulary
    vocab = sentence_vectorizer.get_vocabulary()

    ### END CODE HERE ### 
    
    return sentence_vectorizer, vocab

In [1099]:
test_vectorizer, test_vocab = get_sentence_vectorizer(train_sentences[:1000])
print(f"Test vocab size: {len(test_vocab)}")

sentence = "I like learning new NLP models !"
sentence_vectorized = test_vectorizer(sentence)
print(f"Sentence: {sentence}\nSentence vectorized: {sentence_vectorized}")

Test vocab size: 4650
Sentence: I like learning new NLP models !
Sentence vectorized: [ 296  314    1   59    1    1 4649]


In [1100]:
w2_unittest.test_get_sentence_vectorizer(get_sentence_vectorizer)

[92m All tests passed


In [1101]:
sentence_vectorizer, vocab = get_sentence_vectorizer(train_sentences)

In [1102]:
vocab[0] # Padded token

''

In [1103]:
vocab[1] # Unk token

'[UNK]'

In [1104]:
print(len(vocab))

29847


In [1105]:
print(f"Sentence: {train_sentences[0]}")
print(f"Labels: {train_labels[0]}")

Sentence: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
Labels: O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O


In [1106]:
def get_tags(labels):
    tag_set = set() # Define an empty set
    for el in labels:
        for tag in el.split(" "):
            tag_set.add(tag)
    tag_list = list(tag_set) 
    tag_list.sort()
    return tag_list

In [1107]:
tags = get_tags(train_labels)

In [1108]:
tags

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O']

In [1109]:
def make_tag_map(tags):
    tag_map = {}
    for i,tag in enumerate(tags):
        tag_map[tag] = i 
    return tag_map

In [1110]:
tag_map = make_tag_map(tags)
print(tag_map)

{'B-art': 0, 'B-eve': 1, 'B-geo': 2, 'B-gpe': 3, 'B-nat': 4, 'B-org': 5, 'B-per': 6, 'B-tim': 7, 'I-art': 8, 'I-eve': 9, 'I-geo': 10, 'I-gpe': 11, 'I-nat': 12, 'I-org': 13, 'I-per': 14, 'I-tim': 15, 'O': 16}


In [1111]:
def label_vectorizer(labels, tag_map):
    """
    Convert list of label strings to padded label IDs using a tag mapping.

    Parameters:
    labels (list of str): List of label strings.
    tag_map (dict): Dictionary mapping tags to IDs.
    Returns:
    label_ids (numpy.ndarray): Padded array* of label IDs.
    """
    label_ids = [] # It can't be a numpy array yet, since each sentence has a different size

    ### START CODE HERE ### 

    # Each element in labels is a string of tags so for each of them:
    for element in labels:
        # Split it into single tokens. You may use .split function for strings. Be aware to split it by a blank space!
        tokens = element.split(' ')

        # Use the dictionaty tag_map passed as an argument to the label_vectorizer function
        # to make the correspondence between tags and numbers. 
        element_ids = []

        for token in tokens:
            element_ids.append(tag_map[token])

        # Append the found ids to corresponding to the current element to label_ids list
        label_ids.append(element_ids)
        
    # Pad the elements
    label_ids = tf.keras.utils.pad_sequences(sequences=label_ids, padding='post', value=-1.0)
    
    ### END CODE HERE ### 

    return label_ids
    

In [1112]:
print(f"Sentence: {train_sentences[5]}")
print(f"Labels: {train_labels[5]}")
print(f"Vectorized labels: {label_vectorizer([train_labels[5]], tag_map)}")

Sentence: The party is divided over Britain 's participation in the Iraq conflict and the continued deployment of 8,500 British troops in that country .
Labels: O O O O O B-gpe O O O O B-geo O O O O O O O B-gpe O O O O O
Vectorized labels: [[16 16 16 16 16  3 16 16 16 16  2 16 16 16 16 16 16 16  3 16 16 16 16 16]]


# Generate dataset

In [1113]:
def generate_dataset(sentences, labels, sentence_vectorizer, tag_map):
    sentences_ids = sentence_vectorizer(sentences)
    labels_ids = label_vectorizer(labels, tag_map = tag_map)
    dataset = tf.data.Dataset.from_tensor_slices((sentences_ids, labels_ids))
    return dataset

In [1114]:
train_dataset = generate_dataset(train_sentences,train_labels, sentence_vectorizer, tag_map)
val_dataset = generate_dataset(val_sentences,val_labels,  sentence_vectorizer, tag_map)
test_dataset = generate_dataset(test_sentences, test_labels,  sentence_vectorizer, tag_map)

In [1115]:
# Exploring information about the training data
print(f'The number of outputs is {len(tags)}')
# The number of vocabulary tokens (including <PAD>)
g_vocab_size = len(vocab)
print(f"Num of vocabulary words in the training set: {g_vocab_size}")
print('The training size is', len(train_dataset))
print('The validation size is', len(val_dataset))
print('An example of the first sentence is\n\t', next(iter(train_dataset))[0].numpy())
print('An example of its corresponding label is\n\t', next(iter(train_dataset))[1].numpy())

The number of outputs is 17
Num of vocabulary words in the training set: 29847
The training size is 33570
The validation size is 7194
An example of the first sentence is
	 [1046    6 1121   18 1832  232  543    7  528    2  158    5   60    9
  648    2  922    6  192   87   22   16   54    3    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
An example of its corresponding label is
	 [16 16 16 16 16 16  2 16 16 16 16 16  2 16 16 16 16 16  3 16 16 16 16 16
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 

# Build model

In [1116]:
def NER(len_tags, vocab_size, embedding_dim = 50):
    """
    Create a Named Entity Recognition (NER) model.

    Parameters:
    len_tags (int): Number of NER tags (output classes).
    vocab_size (int): Vocabulary size.
    embedding_dim (int, optional): Dimension of embedding and LSTM layers (default is 50).

    Returns:
    model (Sequential): NER model.
    """
    model = tf.keras.Sequential(name = 'sequential')
    model.add(tf.keras.layers.Embedding(vocab_size + 1, embedding_dim, mask_zero=True))
    model.add(tf.keras.layers.LSTM(units=embedding_dim, return_sequences=True))
    model.add(tf.keras.layers.Dense(units=len_tags, activation=tf.nn.log_softmax))


# Create masked loss function

In [1117]:
def masked_loss(y_true, y_pred):
    """
    Calculate the masked sparse categorical cross-entropy loss.

    Parameters:
    y_true (tensor): True labels.
    y_pred (tensor): Predicted logits.

    Returns:
    loss (tensor): Calculated loss.
    """
    # Define the loss function with from_logits=True as logits are expected
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, ignore_class=-1)
    loss = loss_fn(y_true, y_pred)

    return loss

In [1118]:
true_labels = np.array([0,1,2,0, -1])
predicted_logits = np.array([[0.1,0.6,0.3] , [0.2,0.7,0.1], [0.1, 0.5,0.4], [0.4,0.4,0.2],  [0.4,0.4,0.2]])
print(masked_loss(true_labels, predicted_logits))

tf.Tensor(1.0508583, shape=(), dtype=float32)


In [1119]:
true_labels = np.array([0,1,2,0])
predicted_logits = np.array([[0.1,0.6,0.3] , [0.2,0.7,0.1], [0.1, 0.5,0.4], [0.4,0.4,0.2]])
print(masked_loss(true_labels, predicted_logits))

tf.Tensor(1.0508583, shape=(), dtype=float32)


In [1120]:
def masked_accuracy(y_true, y_pred):
    """
    Calculate masked accuracy for predicted labels.

    Parameters:
    y_true (tensor): True labels.
    y_pred (tensor): Predicted logits.

    Returns:
    accuracy (tensor): Masked accuracy.

    """
    
    ### START CODE HERE ### 
    
    # Calculate the loss for each item in the batch.
    # You must always cast the tensors to the same type in order to use them in training. Since you will make divisions, it is safe to use tf.float32 data type.
    y_true = tf.cast(y_true, tf.float32) 
    # Create the mask, i.e., the values that will be ignored
    mask = tf.not_equal(y_true, -1)
    mask = tf.cast(mask, tf.float32) 
    # Perform argmax to get the predicted values
    y_pred_class = tf.math.argmax(y_pred, axis=-1)
    y_pred_class = tf.cast(y_pred_class, tf.float32) 
    # Compare the true values with the predicted ones
    matches_true_pred  = tf.equal(y_true, y_pred_class)
    matches_true_pred = tf.cast(matches_true_pred , tf.float32) 
    # Multiply the acc tensor with the masks
    matches_true_pred *= mask
    # Compute masked accuracy (quotient between the total matches and the total valid values, i.e., the amount of non-masked values)
    masked_acc = tf.reduce_sum(matches_true_pred)/tf.reduce_sum(mask)
    
    ### END CODE HERE ### 

    return masked_acc

In [1121]:
true_labels = [0,1,2,0]
predicted_logits = [[0.1,0.6,0.3] , [0.2,0.7,0.1], [0.1, 0.5,0.4], [0.4,0.4,0.2]]
print(masked_accuracy(true_labels, predicted_logits))

tf.Tensor(0.5, shape=(), dtype=float32)


In [1122]:
model = NER(len(tag_map), len(vocab))
model.summary()

In [1123]:
tf.keras.utils.set_random_seed(33)
x = np.array([546, 468, 896])
x = tf.expand_dims(x, axis=0)
x_padded = np.array([545, 467, 896, 0, 0, 0])
x_padded = tf.expand_dims(x_padded, axis=0)
pred_x = model(x)
pred_x_padded = model(x_padded)
y_true = tf.expand_dims([16, 6, 12], axis = 0)
y_true_padded = tf.expand_dims([16,6,12,-1,-1,-1], axis = 0)
print(f'masked loss: {masked_loss(y_true,pred_x)}')
print(f'masked loss padded: {masked_loss(y_true_padded,pred_x_padded)}')

masked loss: 2.82684063911438
masked loss padded: 2.833340883255005


In [1124]:

# Assuming model is defined and compiled appropriately
# Example to inspect outputs
x = tf.constant([[546, 468, 896]])  # Original input
x_padded = tf.constant([[545, 467, 896, 0, 0, 0]])  # Padded input

pred_x = model(x)
pred_x_padded = model(x_padded)

print("Predictions for non-padded input:", pred_x.numpy())
print("Predictions for padded input:", pred_x_padded.numpy())

# Calculate loss
y_true = tf.constant([[16, 6, 12]])  # True labels for non-padded
y_true_padded = tf.constant([[16, 6, 12, -1, -1, -1]])  # True labels for padded

loss_non_padded = masked_loss(y_true, pred_x)
loss_padded = masked_loss(y_true_padded, pred_x_padded)

print("Masked loss (non-padded):", loss_non_padded.numpy())
print("Masked loss (padded):", loss_padded.numpy())

Predictions for non-padded input: [[[-2.831167  -2.831565  -2.8388314 -2.8320184 -2.831891  -2.8354623
   -2.8312333 -2.823941  -2.8284352 -2.8338828 -2.838854  -2.8405085
   -2.831732  -2.8385525 -2.8332148 -2.833402  -2.830079 ]
  [-2.8285575 -2.828672  -2.839327  -2.8281872 -2.8355584 -2.8468404
   -2.8253448 -2.8193233 -2.837939  -2.8340418 -2.8385763 -2.8411806
   -2.8300018 -2.8376849 -2.83513   -2.831427  -2.827203 ]
  [-2.8295639 -2.8299117 -2.8346848 -2.829337  -2.8310251 -2.8450518
   -2.8238728 -2.8245683 -2.8445399 -2.833225  -2.8326201 -2.835588
   -2.8250976 -2.8332336 -2.84574   -2.836497  -2.8304324]]]
Predictions for padded input: [[[-2.8332272 -2.83451   -2.8306098 -2.8312786 -2.833474  -2.8220248
   -2.8367913 -2.835969  -2.832457  -2.831888  -2.8376558 -2.828516
   -2.8324287 -2.8304293 -2.8400228 -2.835833  -2.8376544]
  [-2.8399107 -2.8290792 -2.8280978 -2.83051   -2.8320081 -2.831079
   -2.8352644 -2.8339171 -2.835216  -2.8355134 -2.8333132 -2.8212814
   -2.83354

In [1125]:
model.summary()

In [1126]:
print(f"masked_accuracy is the same: {np.allclose(masked_accuracy(y_true,pred_x), masked_accuracy(y_true_padded,pred_x_padded))}")
print(f"masked_loss is the same: {np.allclose(masked_loss(y_true,pred_x), masked_loss(y_true_padded,pred_x_padded))}")

masked_accuracy is the same: True
masked_loss is the same: False


In [1127]:
model.compile(optimizer=tf.keras.optimizers.Adam(0.01), 
              loss = masked_loss,
               metrics = [masked_accuracy])

In [1128]:
## Setting again a random seed to ensure reproducibility

BATCH_SIZE = 64

model.fit(train_dataset.batch(BATCH_SIZE),
          validation_data = val_dataset.batch(BATCH_SIZE),
          shuffle=True,
          epochs = 2)

Epoch 1/2
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 37ms/step - loss: 0.4594 - masked_accuracy: 0.8952 - val_loss: 0.1393 - val_masked_accuracy: 0.9573
Epoch 2/2
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 37ms/step - loss: 0.1299 - masked_accuracy: 0.9612 - val_loss: 0.1359 - val_masked_accuracy: 0.9584


<keras.src.callbacks.history.History at 0x3a4cc51d0>

In [1129]:
# Convert the sentences into ids
test_sentences_id = sentence_vectorizer(test_sentences)
# Convert the labels into token ids
test_labels_id = label_vectorizer(test_labels,tag_map)
# Rename to prettify next function call
y_true = test_labels_id 
y_pred = model.predict(test_sentences_id)

[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [1130]:
print(f"The model's accuracy in test set is: {masked_accuracy(y_true,y_pred).numpy():.4f}")

The model's accuracy in test set is: 0.9576


In [1131]:
def predict(sentence, model, sentence_vectorizer, tag_map):
    """
    Predict NER labels for a given sentence using a trained model.

    Parameters:
    sentence (str): Input sentence.
    model (tf.keras.Model): Trained NER model.
    sentence_vectorizer (tf.keras.layers.TextVectorization): Sentence vectorization layer.
    tag_map (dict): Dictionary mapping tag IDs to labels.

    Returns:
    predictions (list): Predicted NER labels for the sentence.

    """

    ### START CODE HERE ### 

    # Convert the sentence into ids
    sentence_vectorized = sentence_vectorizer(sentence)
    # Expand its dimension to make it appropriate to pass to the model
    #print(f'sentence vectorized: {sentence_vectorized.shape}')
    sentence_vectorized = tf.expand_dims(sentence_vectorized, axis=0) # You want to get it as a row
    #print(f'sentence vectorized: {sentence_vectorized.shape}')
    # Get the model output
    output = model(sentence_vectorized)
    # Get the predicted labels for each token, using argmax function and specifying the correct axis to perform the argmax
    outputs = np.argmax(output, axis=-1)
    # Next line is just to adjust outputs dimension. Since this function expects only one input to get a prediction, outputs will be something like [[1,2,3]]
    # so to avoid heavy notation below, let's transform it into [1,2,3]
    #print(f'outputs: {outputs.shape}')
    outputs = outputs[0] 
    #print(f'outputs: {outputs.shape}')
    # Get a list of all keys, remember that the tag_map was built in a way that each label id matches its index in a list
    labels = list(tag_map.keys()) 
    # print(tag_map)
    pred = [] 
    # Iterating over every predicted token in outputs list
    for tag_idx in outputs:
        #print(tag_idx)
        #print(labels[tag_idx])
        pred_label = labels[tag_idx]
        pred.append(pred_label)
    
    ### END CODE HERE ### 
    
    return pred

In [1132]:
# New york times news:
sentence = "Peter Parker , the White House director of trade and manufacturing policy of U.S , said in an interview on Sunday morning that the White House was working to prepare for the possibility of a second wave of the coronavirus in the fall , though he said it wouldn ’t necessarily come"
predictions = predict(sentence, model, sentence_vectorizer, tag_map)
for x,y in zip(sentence.split(' '), predictions):
    if y != 'O':
        print(x,y)

Peter B-per
Parker I-per
White B-org
House I-org
U.S B-org
Sunday B-tim
morning I-tim
White B-org
House I-org
