# Tatia Project: Named Entity Recognition

In [1]:
from datasets import load_dataset
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from transformers import BertTokenizer, BertForTokenClassification, TFBertModel 
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras import Input
from tensorflow.keras import models, layers
from tensorflow.keras import losses, optimizers, regularizers, callbacks
from tensorflow.keras.utils import plot_model

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf



In [2]:
from datasets import load_dataset
dataset = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 959.94 KiB, generated: 9.78 MiB, post-processed: Unknown size, total: 10.72 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3454 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
num_classes = dataset["train"].features["ner_tags"].feature.num_classes
checkpoint_path = "./ner_model-{epoch:02d}.weights.h5"
max_len = 100
patience = 5
epochs = 4
batch_size = 128
number_of_steps_to_save = int(dataset["train"].num_rows / batch_size) * 2

callbacks_list = [
    callbacks.EarlyStopping(monitor='val_accuracy', 
        mode='max',
        patience=patience,
        restore_best_weights=True),
    callbacks.ModelCheckpoint(filepath=checkpoint_path,
        monitor="val_accuracy",
        verbose=0,
        save_best_only=False,
        save_weights_only=True,
        mode="auto",
        save_freq=number_of_steps_to_save)
]

In [4]:
def tokenize_and_split(hugging_face_dataset, tokenizer):
    tokens = []
    labels = []
    cols = ["tokens", "ner_tags"]
    for line in hugging_face_dataset:
        ner_tags_list = line.get(cols[1])
        tokens.append("[CLS]")
        labels.append(0)
        for i, word in enumerate(line.get(cols[0])):
            token_list = tokenizer.tokenize(word)
            tokens.extend(token_list)
            labels.extend([ner_tags_list[i]] * len(token_list))
        tokens.append("[SEP]")
        labels.append(0)
    return tokens, labels

In [5]:
# def preprocess(hugging_face_dataset):
#     tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
#     tokens, labels = tokenize_and_split(hugging_face_dataset, tokenizer)

#     input_ids = tokenizer.convert_tokens_to_ids(tokens)
#     print("input_ids: ", input_ids)
#     segment_ids = [0] * len(tokens)
#     attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]
#     return tokens, labels, input_ids, segment_ids, attention_masks

In [6]:
def prepare_label_enc():
    label_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=int)
    classes = np.array([i for i in range(num_classes)])
    label_encoder.fit_transform(classes.reshape(-1, 1))
    return label_encoder

def encode_labels(label_enc, labels: list[int]):
    np_labels = np.array(labels, dtype=int)
    return label_enc.transform(np_labels.reshape(-1, 1))


In [7]:
def align_encoded_labels_with_tokens(tokens: list[str], labels: list[int], label_enc):
    """
        completes the labels so that they match the tokenize sentence
    """
    aligned_labels = []
    token_idx = 0
    for token in tokens[1:-1]:
        # Check if the token is a subword piece (starts with "##")
        if token.startswith("##") and token_idx > 0:
            # If it's a subword, use the label of the previous token
            aligned_labels.append(labels[token_idx - 1])
        else:
            # If it's not a subword, use the label of the current token
            aligned_labels.append(labels[token_idx])
            token_idx += 1
    
    # [CLS] separator
    aligned_labels.insert(0, 0)
    # [SEP] separator
    aligned_labels.append(0)

    return encode_labels(label_enc, aligned_labels) 

In [8]:
def preprocess(hugging_face_dataset):

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    label_enc = prepare_label_enc()
    X_train = []
    X_val = []
    X_test = []
    y_train = []
    y_val = []
    y_test = []
    y_train_dec = []
    y_val_dec = []
    y_test_dec = []
    for line in hugging_face_dataset["train"]:
        if line["tokens"] == []:
            continue
        sentence = tokenizer.encode_plus(
            text=line["tokens"],
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding=True,
            return_tensors='np',
            return_token_type_ids=False,
            return_attention_mask=True,
            verbose=True
        )
        X_train.append(sentence)
        y_train_dec.append(line["ner_tags"])
        y_train.append(
            align_encoded_labels_with_tokens(
                tokenizer.convert_ids_to_tokens(sentence["input_ids"].flatten()), 
                line["ner_tags"], 
                label_enc
            )
        )
        
    for line in hugging_face_dataset["validation"]:
        if line["tokens"] == []:
            continue
        sentence = tokenizer.encode_plus(
            text=line["tokens"],
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding=True,
            return_tensors='np',
            return_token_type_ids=False,
            return_attention_mask=True,
            verbose=True
        )
        X_val.append(sentence)
        y_val_dec.append(line["ner_tags"])
        y_val.append(
            align_encoded_labels_with_tokens(
                tokenizer.convert_ids_to_tokens(sentence["input_ids"].flatten()), 
                line["ner_tags"], 
                label_enc
            )
        )
        
    for line in hugging_face_dataset["test"]:
        if line["tokens"] == []:
            continue
        sentence = tokenizer.encode_plus(
            text=line["tokens"],
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding=True,
            return_tensors='np',
            return_token_type_ids=False,
            return_attention_mask=True,
            verbose=True
        )
        X_test.append(sentence)
        y_test_dec.append(line["ner_tags"])
        y_test.append(
            align_encoded_labels_with_tokens(
                tokenizer.convert_ids_to_tokens(sentence["input_ids"].flatten()), 
                line["ner_tags"], 
                label_enc
            )
        )
        
    return X_train, X_val, X_test, y_train, y_val, y_test, y_train_dec, y_val_dec, y_test_dec
#         print(type(X_train))
#         print(X_train)
#     X_train = tokenizer.batch_encode_plus(
#             batch_text_or_text_pairs=hugging_face_dataset["train"]["tokens"],
#             add_special_tokens=True,
#             max_length=max_len,
#             truncation=True,
#             padding=True,
#             return_tensors='np',
#             return_token_type_ids=False,
#             return_attention_mask=True,
#             verbose=True
#         )
#     return X_train
    
#     X_val = tokenizer(
#         text=hugging_face_dataset["validation"]["tokens"],
#         add_special_tokens=True,
#         max_length=max_len,
#         truncation=True,
#         padding=True,
#         return_tensors='tf',
#         return_token_type_ids=False,
#         return_attention_mask=True,
#         verbose=True
#     )
    
#     X_test = tokenizer(
#         text=hugging_face_dataset["test"]["tokens"],
#         add_special_tokens=True,
#         max_length=max_len,
#         truncation=True,
#         padding=True,
#         return_tensors='tf',
#         return_token_type_ids=False,
#         return_attention_mask=True,
#         verbose=True
#     )
#     return X_train, X_val, X_test

In [9]:
X_train, X_val, X_test, y_train, y_val, y_test, y_train_dec, y_val_dec, y_test_dec = preprocess(dataset)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
y_train[:2]

[array([[1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0]]),
 array([[1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0]])]

In [11]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
for sent in X_train[:2]:
    print(tokenizer.batch_decode(sent["input_ids"]))

['[CLS] EU rejects German call to boycott British [UNK]. [SEP]']
['[CLS] Peter Blackburn [SEP]']


In [12]:
# _, labels_train, input_ids_train, segment_ids_train, attention_masks_train = preprocess(dataset["train"])
# _, labels_val, input_ids_val, segment_ids_val, attention_masks_val = preprocess(dataset["validation"])
# _, labels_test, input_ids_test, segment_ids_test, attention_masks_test = preprocess(dataset["test"])

In [12]:
def prepare_batches(tokenized_sentences, aligned_encoded_labels, batch_size, is_test: bool = False):    
    # Extract input_ids and attention_mask lists
    input_ids_list = [sentence['input_ids'].flatten() for sentence in tokenized_sentences]
    attention_mask_list = [sentence['attention_mask'].flatten() for sentence in tokenized_sentences]
    # Calculate max_len dynamically based on the maximum sequence length in your data
    max_len_list = max(len(ids) for ids in input_ids_list)
    
    # Pad sequences to the same length within each batch
    input_ids_padded = pad_sequences(input_ids_list, maxlen=max_len_list, padding='post', value=0)
    attention_mask_padded = pad_sequences(attention_mask_list, maxlen=max_len_list, padding='post', value=0)
    aligned_encoded_labels_padded = pad_sequences(aligned_encoded_labels, maxlen=max_len_list, padding='post', value=0)
    
    # Convert to TensorFlow tensors
    input_ids = tf.convert_to_tensor(input_ids_padded, dtype=tf.int32)
    attention_mask = tf.convert_to_tensor(attention_mask_padded, dtype=tf.int32)
    labels = tf.convert_to_tensor(aligned_encoded_labels_padded, dtype=tf.int32)

    # Create a TensorFlow dataset
    if not is_test:
        dataset = tf.data.Dataset.from_tensor_slices(((input_ids, attention_mask), labels))
    else:
        dataset = tf.data.Dataset.from_tensor_slices((input_ids, attention_mask))

    # Batch the dataset
    dataset = dataset.batch(batch_size)

    return dataset, max_len_list

In [13]:
def prepare_dataset(tokenized_sentences, aligned_encoded_labels):
    # Extract input_ids and attention_mask lists
    input_ids_list = [sentence['input_ids'].flatten() for sentence in tokenized_sentences]
    attention_mask_list = [sentence['attention_mask'].flatten() for sentence in tokenized_sentences]
    # Calculate max_len dynamically based on the maximum sequence length in your data
    max_len_list = max(len(ids) for ids in input_ids_list)
    
    # Pad sequences to the same length within each batch
    input_ids_padded = pad_sequences(input_ids_list, maxlen=max_len_list, padding='post', value=0)
    attention_mask_padded = pad_sequences(attention_mask_list, maxlen=max_len_list, padding='post', value=0)
    aligned_encoded_labels_padded = pad_sequences(aligned_encoded_labels, maxlen=max_len_list, padding='post', value=0)
    
    data = {}
    data["input_ids"] = input_ids_padded
    data["attention_mask"] = attention_mask_padded
    data["labels"] = aligned_encoded_labels_padded
    return data

In [15]:
# train_dataset, max_len_train_list = prepare_batches(X_train, y_train, batch_size)

In [16]:
# val_dataset, max_len_val_list = prepare_batches(X_val, y_val, batch_size)

In [17]:
# test_dataset, max_len_test_list = prepare_batches(X_test, y_test, batch_size,is_test=True)

In [14]:
train_dataset = prepare_dataset(X_train, y_train)
val_dataset = prepare_dataset(X_val, y_val)
test_dataset = prepare_dataset(X_test, y_test)

In [19]:
# test = TFBertModel.from_pretrained('bert-base-cased')
# out = test(Input(shape=(max_len,), dtype=tf.int32, name="input_ids"), Input(shape=(max_len,), dtype=tf.int32, name="attention_mask"))
# out[-2]


In [20]:
# out[-2]

In [15]:
def fine_tune_bert(num_neuron_output: int):
    bert_base = TFBertModel.from_pretrained('bert-base-cased')

    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
    print()
    bert_base.pop()
#     x = bert_base(input_ids, attention_mask = input_mask)[0]
    output = layers.Dense(num_neuron_output, activation="softmax")(x)
    model = models.Model(inputs=[input_ids, input_mask], outputs=output, name= "ner_model")
    return model

def build_model(num_hidden_layers: int, size_hidden_layers: list[int], hidden_activation_func: list[str], num_neuron_output: int):
    if len(size_hidden_layers) != num_hidden_layers or len(hidden_activation_func) != num_hidden_layers:
        raise Exception("The params num_hidden_layers and hidden_activation_func should have a length equal to num_hidden_layers")
    
    bert_base = TFBertModel.from_pretrained('bert-base-cased')

    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
    x = bert_base(input_ids, attention_mask = input_mask)[0]
#     x = out[-2]
    for i in range(num_hidden_layers):
        x = layers.Dense(size_hidden_layers[i], activation=hidden_activation_func[i])(x)
        if i < num_hidden_layers - 2:
            x = layers.Dropout(0.2)(x)
    output = layers.Dense(num_neuron_output, activation="softmax")(x)
    model = models.Model(inputs=[input_ids, input_mask], outputs=output, name= "ner_model")
    return model
    
def configure_model(model, optimizer: str, loss_fn: str):
    model.compile(
        optimizer=optimizer, 
        loss=loss_fn,
        metrics = "accuracy",
    )
    return model

In [16]:
model = build_model(2, [20, 8], ["relu", "relu"], num_classes)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [17]:
# model = build_model(0, [], [], num_classes)
model = configure_model(model, "adam", "categorical_crossentropy")

In [22]:
# history = model.fit(
#     x = {'inputs_ids': input_ids_train, "attention_mask": attention_masks_train},
#     y = labels_train,
#     validation_data = ({"input_ids": input_ids_val, "attention_mask": attention_masks_val}, labels_val),
#     epochs=epochs, batch_size=batch_size, callbacks=callbacks_list, 
    
# )

In [23]:
# predicted = model.predict({'input_ids': input_ids_test, 'attention_mask': attention_masks_test})
# labels_predicted = np.argmax(predicted, axis=1)
# print(classification_report(labels_test, labels_predicted))

In [18]:
model.summary()

Model: "ner_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 100)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 100)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1083102   ['input_ids[0][0]',           
 )                           ngAndCrossAttentions(last_   72         'attention_mask[0][0]']      
                             hidden_state=(None, 100, 7                                   

In [19]:
history = model.fit(
    x = {"input_ids": train_dataset["input_ids"] , "attention_mask": train_dataset["attention_mask"]},
    y = train_dataset["labels"],
    validation_data = ({"input_ids": val_dataset["input_ids"] , "attention_mask": val_dataset["attention_mask"]}, val_dataset["labels"]),
    epochs=epochs, batch_size=batch_size, callbacks=callbacks_list, 
)

Epoch 1/4
Epoch 2/4



Epoch 3/4
Epoch 4/4


In [20]:
# history = model.fit(
#     train_dataset,
#     validation_data = val_dataset,
#     epochs=epochs, batch_size=batch_size, callbacks=callbacks_list, 
# )

In [22]:
predicted = model.predict({"input_ids": test_dataset["input_ids"] , "attention_mask": test_dataset["attention_mask"]})
y_pred = np.argmax(predicted, axis=2)



In [23]:
predicted

array([[[2.5639673e-09, 1.6873159e-12, 4.3568453e-03, ...,
         2.6110591e-31, 9.9564314e-01, 4.3963187e-08],
        [2.5639673e-09, 1.6840236e-12, 4.3568453e-03, ...,
         2.6110591e-31, 9.9564314e-01, 4.3963187e-08],
        [2.5639673e-09, 1.6873159e-12, 4.3568453e-03, ...,
         2.6110591e-31, 9.9564314e-01, 4.3963187e-08],
        ...,
        [2.5639673e-09, 1.6873159e-12, 4.3568453e-03, ...,
         2.6110591e-31, 9.9564314e-01, 4.3963187e-08],
        [2.5639673e-09, 1.6840236e-12, 4.3568453e-03, ...,
         2.6110591e-31, 9.9564314e-01, 4.3963187e-08],
        [2.5639673e-09, 1.6873159e-12, 4.3568453e-03, ...,
         2.6110591e-31, 9.9564314e-01, 4.3963187e-08]],

       [[2.5639673e-09, 1.6873159e-12, 4.3568453e-03, ...,
         2.6110591e-31, 9.9564314e-01, 4.3963187e-08],
        [2.5639673e-09, 1.6873159e-12, 4.3568453e-03, ...,
         2.6110591e-31, 9.9564314e-01, 4.3963187e-08],
        [2.5639673e-09, 1.6873159e-12, 4.3568453e-03, ...,
         2.616

In [None]:
# def preprocess_y_test(align_labels, max_len):
#     return pad_sequences(align_labels, maxlen=max_len, padding='post', value=0)

In [23]:
def postprocess(labels_pred:list[list[int]], y_true: list[list[int]]):
    labels_pred_cut = []
    for i, sentence in enumerate(y_true):
        sen_len = len(sentence)
        labels_pred_cut.append(labels_pred[i][:sen_len])
    return labels_pred_cut

In [None]:
postprocess(y_pred, y_test_dec)

In [51]:
y_test_dec = np.array(y_test_dec, dtype=int)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3453,) + inhomogeneous part.

In [25]:
print(classification_report(y_test_dec, y_pred))

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.