# Part 1. Sequence Tagging: NER

In [1]:
import pandas as pd
import gensim.downloader
import numpy as np

from keras import Sequential
from keras.layers import LSTM, Embedding, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
import keras as k
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam

2023-10-28 12:49:57.977742: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-28 12:49:57.977781: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-28 12:49:57.978729: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-28 12:49:58.057783: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
import tensorflow as tf

# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

## 1.1 Word Embedding

In [None]:
# Load the pre-trained "word2vec-google-news-300" vectors through gensim's downloader API.
# NOTE(review): presumably fetched over the network on first use and cached afterwards — confirm.
w2v = gensim.downloader.load("word2vec-google-news-300")

### Qn 1.1

In [None]:
# Print a small table with the nearest word2vec neighbour of each query word.
words = ["student", "Apple", "apple"]
separator = "-----------------------------------------------------------------------"

print(separator)
print("Word\t\tMost similar word\tCosine similarity")
print(separator)
for word in words:
    most_similar = w2v.most_similar(positive=[word])
    top_word, top_score = most_similar[0]  # first entry of the returned ranking
    print(f"{word}\t\t{top_word}  \t\t{top_score}")
print(separator)

## 1.2 Data

In [None]:
# Location of the CoNLL-2003 English files, relative to this notebook.
CoNLL2003_dir = '../Datasets/CoNLL2003_dataset'
train_dir = CoNLL2003_dir + '/eng.train'  # training split
dev_dir = CoNLL2003_dir + '/eng.testa'    # used as the development split
test_dir = CoNLL2003_dir + '/eng.testb'   # used as the test split

In [None]:
def import_content(path):
    """Read a text file and return its lines.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one string per line (line terminators kept), or ``None``
        when reading failed; in that case the exception is printed.
    """
    try:
        # The context manager closes the file on exit.
        with open(path, 'r') as file:
            return file.readlines()
    except Exception as e:
        # Best effort: report the problem and let the caller deal with None.
        print(e)
        return None

def print_items(item):
    """Print every element of ``item`` on its own line."""
    for element in item:
        print(element)

In [None]:
# Read the raw lines of each split (an entry is None if its file could not be read).
train_content, dev_content, test_content = map(
    import_content, (train_dir, dev_dir, test_dir)
)

### Split data by sentences

In [None]:
def split_sentences(content):
    """Group CoNLL-style token lines into sentences.

    Every non-blank line is split on whitespace; its first field is taken as
    the token text and its last field as the tag. Blank lines (including
    ``\\r\\n``-terminated ones) mark sentence boundaries. Empty sentences are
    never emitted, so a trailing blank line or missing content does not
    produce a spurious ``[]`` entry.

    Args:
        content: Iterable of raw lines, or ``None``.

    Returns:
        A tuple ``(sentences, words)`` where ``sentences`` is a list of
        sentences, each a list of ``[text, tag]`` pairs, and ``words`` is the
        flat list of all ``[text, tag]`` pairs in file order.
    """
    sentences = []
    words = []
    sentence = []

    for raw_line in content or []:
        fields = raw_line.split()

        # Blank line: close the current sentence, if it has any tokens.
        if not fields:
            if sentence:
                sentences.append(sentence)
                sentence = []
            continue

        s_text = fields[0]
        s_tag = fields[-1]
        sentence.append([s_text, s_tag])
        words.append([s_text, s_tag])

    # The input may not end with a blank line, so flush the last sentence.
    if sentence:
        sentences.append(sentence)

    return sentences, words

def split_text_tag(sentences):
    """Flatten sentences into parallel text/tag lists plus per-token records.

    Args:
        sentences: List of sentences, each a list of token entries whose first
            element is the text and whose last element is the tag.

    Returns:
        A tuple ``(text, tag, combined)``: the flat list of token texts, the
        flat list of tags (with any newline characters removed), and a list of
        dicts with keys ``'sentence'`` (1-based sentence number), ``'text'``
        and ``'tag'``.
    """
    text, tag, combined = [], [], []

    # Sentence numbers start at 1 and advance even for empty sentences.
    for sentence_id, tokens in enumerate(sentences, start=1):
        for token in tokens:
            token_text = token[0]
            token_tag = token[-1].replace('\n', '')

            text.append(token_text)
            tag.append(token_tag)
            combined.append({
                'sentence': sentence_id,
                'text': token_text,
                'tag': token_tag,
            })

    return text, tag, combined

In [None]:
# Parse each split into sentences of [text, tag] pairs and a flat word list.
train_sentences, train_words = split_sentences(train_content)
dev_sentences, dev_words = split_sentences(dev_content)
test_sentences, test_words = split_sentences(test_content)

# Flat text/tag lists and per-token records for each split.
train_text, train_tag, train_combined = split_text_tag(train_sentences)
dev_text, dev_tag, dev_combined = split_text_tag(dev_sentences)
test_text, test_tag, test_combined = split_text_tag(test_sentences)

train_df = pd.DataFrame(train_combined)
dev_df = pd.DataFrame(dev_combined)
test_df = pd.DataFrame(test_combined)

# Longest training sentence; used below as the fixed (padded) sequence length.
max_sentence_len = max([len(s) for s in train_sentences])

# Vocabulary of the training tokens plus the two special tokens.
# Sorting makes the indices reproducible: iterating a set of strings can give a
# different order on every kernel start, which would silently change word2idx
# and make it inconsistent with a model saved in an earlier session.
voc = sorted(set(train_text))
voc.append('<UNK>')
voc.append('<PAD>')  # kept last so that its index is n_voc - 1
n_voc = len(voc)
word2idx = {token: idx for idx, token in enumerate(voc)}

# Tag inventory of the training data, sorted for the same reproducibility reason.
tag_set = sorted(set(train_tag))
n_tags = len(tag_set)
tag2idx = {label: idx for idx, label in enumerate(tag_set)}

tag2idx

### Qn 1.2 (a)

#### Describe the size (number of sentences) of the training, development and test files for CoNLL2003.

In [None]:
# Report how many sentences each split contains.
for split_name, split_data in (("training", train_sentences),
                               ("dev", dev_sentences),
                               ("test", test_sentences)):
    print(f"Number of sentences ({split_name}):", len(split_data))

#### Specify the complete set of all possible word labels based on the tagging scheme (IO, BIO, etc.) you chose

In [None]:
# Show every distinct tag observed in the training data.
# NOTE(review): the label says "BIO", but get_named_entities lists no B-PER tag, so the
# files may use a different IOB variant — confirm the scheme against the data.
print("Tag set (BIO):", tag_set)

### Qn 1.2 (b)

#### Choose an example sentence from the training set of CoNLL2003 that has at least two named entities with more than one word.

In [None]:
def get_multiple_ne_sentence(sentences, min_entities=2, min_words=2):
    """Return the first sentence containing enough multi-word named entities.

    Entities are delimited from the tags alone: a tag ``B-X`` always starts a
    new entity, a tag ``I-X`` continues the current entity when that entity
    also has type ``X`` and starts a new one otherwise, and any other tag
    (e.g. ``O``) ends the current entity. Counting ``B-`` tags is not enough,
    because an entity can also be opened by an ``I-`` tag.

    Args:
        sentences: List of sentences, each a list of token entries whose last
            element is the tag.
        min_entities: Minimum number of qualifying entities required.
        min_words: Minimum number of words an entity needs to qualify.

    Returns:
        The first matching sentence, or ``None`` if there is none.
    """
    for sentence in sentences:
        entity_lengths = []  # word count of every entity found so far
        current_type = None  # type of the entity being read, None when outside

        for word_info in sentence:
            prefix, _, entity_type = word_info[-1].partition('-')

            if prefix not in ('B', 'I') or not entity_type:
                current_type = None
                continue

            if prefix == 'B' or entity_type != current_type:
                entity_lengths.append(1)
            else:
                entity_lengths[-1] += 1
            current_type = entity_type

        qualifying = sum(1 for length in entity_lengths if length >= min_words)
        if qualifying >= min_entities:
            return sentence

    return None

In [None]:
# Pick an example sentence from the training set and display it as the cell output.
sentence = get_multiple_ne_sentence(train_sentences)
sentence

#### Explain how to form complete named entities from the label for each word, and list all the named entities in this sentence.

In [None]:
def get_named_entities(sentence):
    """Assemble complete named entities from per-word tags.

    A tag ``B-X`` always opens a new entity. A tag ``I-X`` extends the current
    entity when that entity has the same type ``X`` and opens a new entity
    otherwise. Any other tag (e.g. ``O``) closes the current entity. An entity
    that is still open at the end of the sentence is emitted as well.

    Args:
        sentence: List of dicts, each with a ``'text'`` and a ``'tag'`` key.

    Returns:
        List of entity strings (words joined by single spaces) in order of
        appearance.
    """
    entities = []        # all entities found so far
    entity = []          # words of the entity currently being read
    current_type = None  # type of that entity, None when outside an entity

    for c in sentence:
        prefix, _, entity_type = c['tag'].strip().partition('-')
        is_entity_tag = prefix in ('B', 'I') and bool(entity_type)
        starts_new = is_entity_tag and (prefix == 'B' or entity_type != current_type)

        # Close the running entity on a non-entity tag or when a new one begins.
        if entity and (not is_entity_tag or starts_new):
            entities.append(' '.join(entity))
            entity = []

        if is_entity_tag:
            entity.append(c['text'])
            current_type = entity_type
        else:
            current_type = None

    # Flush an entity that runs up to the last word of the sentence.
    if entity:
        entities.append(' '.join(entity))

    return entities

In [None]:
# Convert the chosen sentence into the list-of-dicts form that get_named_entities reads;
# only the third value returned by split_text_tag is needed here.
_,_,sentence_text_tag = split_text_tag([sentence])
print("Complete named entities in the sentence:", get_named_entities(sentence_text_tag))

## 1.3 Model

### Gensim pre-trained word embeddings

In [None]:
# Raw word2vec weight matrix and its two dimensions.
pretrained_weights = w2v.vectors
google_vocab_size, embedding_size = pretrained_weights.shape

# Copy gensim's word -> index map and give the two special tokens the next
# two indices after the index of the map's last entry.
w2v_voc = w2v.index_to_key
w2v_word2idx = dict(w2v.key_to_index)
w2v_word2idx['<UNK>'] = w2v_word2idx[next(reversed(w2v_word2idx))] + 1
w2v_word2idx['<PAD>'] = w2v_word2idx['<UNK>'] + 1

print("Embedding Size: ", embedding_size)
print("Max Sentence Len: ", max_sentence_len)

### Create embedding matrix

In [None]:
# Map every word2vec vocabulary entry to its vector for dictionary-style lookup.
embeddings_index = {token: w2v[token] for token in w2v_voc}

In [None]:
# Build the embedding matrix for the dataset vocabulary: one row per word,
# filled with its pre-trained vector when available. Words without a
# pre-trained vector keep an all-zero row and are counted as misses.
hits = 0
misses = 0

embedding_matrix = np.zeros((n_voc, embedding_size))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector[:embedding_size]
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

## LSTM

In [None]:
def get_x_sequence(sentences):
    """Encode each sentence as a list of vocabulary indices.

    The first element of every token entry is looked up in the module-level
    ``word2idx``; tokens that are not in it are mapped to the index of
    ``'<UNK>'``.

    Args:
        sentences: List of sentences, each a list of token entries whose first
            element is the word text.

    Returns:
        A list with one list of integer indices per sentence.
    """
    return [
        [word2idx[w[0]] if w[0] in word2idx else word2idx['<UNK>'] for w in s]
        for s in sentences
    ]

In [None]:
def _encode_split(sentences):
    """Return ``(x, y)`` for one split.

    ``x`` holds the word indices of every sentence, post-padded to
    ``max_sentence_len`` with the index ``n_voc - 1``. ``y`` is a list with
    one one-hot tag matrix per sentence; padded positions carry the 'O' tag.
    """
    x = pad_sequences(maxlen=max_sentence_len, sequences=get_x_sequence(sentences),
                      padding="post", value=n_voc - 1)

    tag_ids = [[tag2idx[w[1]] for w in s] for s in sentences]
    tag_ids = pad_sequences(maxlen=max_sentence_len, sequences=tag_ids,
                            padding="post", value=tag2idx['O'])

    return x, [to_categorical(i, num_classes=n_tags) for i in tag_ids]


x_train, y_train = _encode_split(train_sentences)
x_dev, y_dev = _encode_split(dev_sentences)
x_test, y_test = _encode_split(test_sentences)

In [None]:
# Weight per tag index: the inverse of the tag's frequency in the training
# data, scaled by 1e6, so that rarer tags receive larger weights.
class_weights = {
    tag_idx: (1/train_tag.count(tag_name))*1000000
    for tag_name, tag_idx in tag2idx.items()
}

In [None]:
word_embedding_size = max_sentence_len

# Embedding layer initialised from the pre-trained embedding matrix.
embedding_layer = Embedding(input_dim=n_voc,
                            output_dim=embedding_size,
                            input_length=max_sentence_len,
                            weights = [embedding_matrix],
                            embeddings_initializer=k.initializers.Constant(embedding_matrix))

# Bidirectional LSTM that returns one output per time step.
bilstm_layer = Bidirectional(LSTM(units=128,
                                  return_sequences=True,
                                  dropout=0.5,
                                  recurrent_dropout=0.5,
                                  kernel_initializer=k.initializers.he_normal()))

# Per-time-step softmax over the tag set.
output_layer = TimeDistributed(Dense(n_tags, activation="softmax"))

model = Sequential([embedding_layer, bilstm_layer, output_layer])

optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model.compile(optimizer=optimizer,
              loss="categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

In [None]:
import keras
from seqeval.metrics import f1_score

class F1ScoreCallback(keras.callbacks.Callback):
    """Compute the seqeval F1 score on held-out data after every epoch.

    The score is written to ``logs['f1_score']`` so that callbacks which run
    later in the same epoch can monitor it.

    Args:
        val_data: Model inputs to predict on.
        val_labels: One-hot encoded gold tags, one matrix per sentence.
    """

    def __init__(self, val_data, val_labels):
        super().__init__()
        self.validation_data = val_data
        self.validation_labels = val_labels

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out data and store the F1 score in ``logs``."""
        predicted_labels = self.model.predict(self.validation_data)

        # Inverse tag map built once per epoch instead of once per token.
        idx2tag = {idx: tag for tag, idx in tag2idx.items()}

        # For one-hot rows the argmax is the position of the single 1.
        true_ids = np.argmax(np.asarray(self.validation_labels), axis=-1)
        pred_ids = np.argmax(predicted_labels, axis=-1)

        y_test_tags = [[idx2tag[i] for i in row] for row in true_ids.tolist()]
        y_pred_tags = [[idx2tag[i] for i in row] for row in pred_ids.tolist()]

        score = f1_score(y_test_tags, y_pred_tags)

        # Write into the dict that was passed in (even if it is empty) so the
        # value is visible to the other callbacks.
        if logs is not None:
            logs['f1_score'] = score

In [None]:
class F1EarlyStopping(EarlyStopping):
    """EarlyStopping that stays inactive until the monitored value reaches a threshold.

    Args:
        threshold: Value the monitored quantity must reach before the parent
            class's early-stopping logic is applied.
        **kwargs: Forwarded unchanged to ``EarlyStopping``.
    """

    def __init__(self, threshold=0.90, **kwargs):
        super().__init__(**kwargs)
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Delegate to ``EarlyStopping`` only once the threshold is reached."""
        current = self.get_monitor_value(logs)
        if current is None:
            return

        # Below the threshold: reset the no-improvement counter and do nothing else.
        if not (current >= self.threshold):
            self.wait = 0
            return

        super().on_epoch_end(epoch, logs)

    def get_monitor_value(self, logs):
        """Return the monitored value from ``logs``, or ``None`` if it is absent."""
        return (logs or {}).get(self.monitor)

In [None]:
import os

num_epochs = 80
batch_size = 64

# Computes the dev-set F1 after each epoch and stores it in logs['f1_score'].
f1_metric = F1ScoreCallback(val_data=x_dev, val_labels=y_dev)

early_stopping = F1EarlyStopping(monitor='f1_score',
                                 mode='max',
                                 patience=10,
                                 threshold=0.60,
                                 verbose=1,
                                 restore_best_weights = True)

# Save 'best' model
model_path = '../Models/NER/'
os.makedirs(model_path, exist_ok=True)  # make sure the output directory exists before saving
checkpoint = ModelCheckpoint(model_path + 'ner_model_f1_best.h5',  # file that receives the best model so far
                             monitor='f1_score',      # monitor the F1 score logged by f1_metric
                             verbose=1,               # report when a model is saved
                             save_best_only=True,     # only overwrite the file when 'f1_score' has improved
                             mode='max')              # a higher 'f1_score' is better

# f1_metric must come first: callbacks run in list order, and the other two
# read logs['f1_score'], which f1_metric writes during the same epoch.
# NOTE(review): the targets are 3-D one-hot sequences; confirm that the installed
# Keras version accepts `class_weight` for targets of that shape.
model.fit(x_train, np.array(y_train), batch_size=batch_size, epochs=num_epochs, shuffle=True, validation_data=(x_dev, np.array(y_dev)),
         callbacks = [f1_metric, early_stopping, checkpoint], workers = 4, class_weight=class_weights)

# Save latest model (model_path already ends with '/')
save_path = model_path + 'ner_model_last.h5'
model.save(save_path)

### Test model

In [None]:
from keras.models import load_model

# Load the best checkpoint written by the training cell (ModelCheckpoint saves it
# as 'ner_model_f1_best.h5'); the notebook never writes a file named 'ner_model.h5'.
load_path = '../Models/NER/ner_model_f1_best.h5'
loaded_model = load_model(load_path)

In [None]:
# Predict the per-token tag distribution for every padded test sentence.
# NOTE(review): this uses the in-memory `model`, not `loaded_model` from the
# previous cell — confirm which one is intended.
predicted_labels = model.predict(x_test)

In [None]:
# Loss and token-level accuracy on the test set. Padded positions are labelled 'O'
# in y_test, so they are part of these numbers.
loss, accuracy = model.evaluate(x_test, np.array(y_test))
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

In [None]:
# Decode gold and predicted tag indices back into tag strings, one list per sentence.
# The inverse map is built once instead of rebuilding key/value lists for every token.
idx2tag = {idx: tag_name for tag_name, idx in tag2idx.items()}

# For one-hot rows the argmax is the position of the single 1.
true_ids = np.argmax(np.array(y_test), axis=-1)
pred_ids = np.argmax(predicted_labels, axis=-1)

y_test_tags = [[idx2tag[i] for i in row] for row in true_ids.tolist()]
y_pred_tags = [[idx2tag[i] for i in row] for row in pred_ids.tolist()]

In [None]:
# F1 score of the decoded test predictions against the gold tags, computed with seqeval.
from seqeval.metrics import f1_score
f1_score(y_test_tags, y_pred_tags)

In [None]:
# Print "word gold_tag predicted_tag" for every position of every test sentence,
# with a blank line between sentences.
# The inverse lookup tables are built once; searching the key/value lists for
# every token is linear in the vocabulary size per lookup.
idx2word = {idx: token for token, idx in word2idx.items()}
idx2tag = {idx: tag_name for tag_name, idx in tag2idx.items()}

for i in range(len(x_test)):
    for x, y, z in zip(x_test[i], y_test[i], predicted_labels[i]):
        word = idx2word[int(x)]
        true_tag = idx2tag[int(np.where(y == 1)[0][0])]
        pred_tag = idx2tag[int(np.argmax(z))]
        print(word, true_tag, pred_tag)
    print()