# Part 1. Sequence Tagging: NER

In [None]:
import pandas as pd
import os
import gensim.downloader
from gensim.models import Word2Vec
import tensorflow as tf
import numpy as np

from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import InputLayer, TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical


from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from livelossplot.tf_keras import PlotLossesCallback


## 1.1 Word Embedding

In [834]:
w2v = gensim.downloader.load("word2vec-google-news-300")

### Qn 1.1

In [835]:
words = ["student", "Apple", "apple"]
print("-----------------------------------------------------------------------")
print("Word\t\tMost similar word\tCosine similarity")
print("-----------------------------------------------------------------------")
for word in words:
    most_similar = w2v.most_similar(positive=[word])
    print(f"{word}\t\t{most_similar[0][0]}  \t\t{most_similar[0][1]}")
print("-----------------------------------------------------------------------")

-----------------------------------------------------------------------
Word		Most similar word	Cosine similarity
-----------------------------------------------------------------------
student		students  		0.7294865846633911
Apple		Apple_AAPL  		0.7456987500190735
apple		apples  		0.720359742641449
-----------------------------------------------------------------------


## 1.2 Data

In [836]:
CoNLL2003_dir = '../Datasets/CoNLL2003_dataset'
train_dir = f'{CoNLL2003_dir}/eng.train'
dev_dir =  f'{CoNLL2003_dir}/eng.testa'
test_dir =  f'{CoNLL2003_dir}/eng.testb'

In [837]:
def import_content(path):
    try:
        with open(path, 'r') as file:
            content = file.readlines()
        file.close()
    except Exception as e:
        content = None
        print(e)
    
    return content

def print_items(item):
    for s in item: print(s)

In [838]:
train_content = import_content(train_dir)
dev_content = import_content(dev_dir)
test_content = import_content(test_dir)

### Split data by sentences

In [839]:
def split_sentences(content):
    split_data = [c.split(' ') for c in content] if content != None else []
    sentences = []
    sentence = []
    words = []

    for line in split_data:
        # if end of a sentence
        if line == ['\n']:
            sentences.append(sentence)
            sentence = []
        else:
            s_text  = line[0]
            s_tag = line[-1].replace('\n','')

            sentence.append([s_text, s_tag]) 
            words.append([s_text, s_tag])
    
    sentences.append(sentence) # last item in content not new line so must add previous sentence manually after loop           

    return sentences, words

In [840]:
def split_text_tag(sentences):
    text = []
    tag = []
    combined = []
    sentence_count = 1

    for s in sentences:
        for w in s:
            w_text  = w[0]
            w_tag = w[-1].replace('\n','')

            text.append(w_text)
            tag.append(w_tag)        
            combined.append({
                'sentence': sentence_count,
                'text' : w_text,
                'tag' : w_tag
            })   
        sentence_count+=1       
    return text, tag, combined

In [841]:
train_sentences, train_words = split_sentences(train_content)
dev_sentences, dev_words = split_sentences(dev_content)
test_sentences, test_words = split_sentences(test_content)

train_text, train_tag, train_combined = split_text_tag(train_sentences)
dev_text, dev_tag, dev_combined = split_text_tag(dev_sentences)
test_text, test_tag, test_combined = split_text_tag(test_sentences)

In [842]:
train_voc = np.unique(np.array(train_text))
dev_voc = np.unique(np.array(dev_text))


tag_set = np.unique(np.array(train_tag))

### Qn 1.2 (a)

#### Describe the size (number of sentences) of the training, development and test file for CoNLL2003.

In [843]:
print("Number of sentences (training):", len(train_sentences))
print("Number of sentences (dev):", len(dev_sentences))
print("Number of sentences (test):", len(test_sentences))

Number of sentences (training): 14987
Number of sentences (dev): 3466
Number of sentences (test): 3684


#### Specify the complete set of all possible word labels based on the tagging scheme (IO, BIO, etc.) you chose

In [844]:
print("Tag set (BIO):", tag_set)

Tag set (BIO): ['B-LOC' 'B-MISC' 'B-ORG' 'I-LOC' 'I-MISC' 'I-ORG' 'I-PER' 'O']


### Qn 1.2 (b)

#### Choose an example sentence from the training set of CoNLL2003 that has at least two named entities with more than one word.

In [845]:
def get_multiple_ne_sentence(sentences):
    for sentence in sentences:
        ne_count = 0
        for word_info in sentence:
            if "B-" in word_info[-1]:
                ne_count+=1
        if ne_count == 2:
            return sentence
    return None        

In [846]:
sentence = get_multiple_ne_sentence(train_sentences)
sentence

[['Swiss', 'I-MISC'],
 ['Grand', 'B-MISC'],
 ['Prix', 'I-MISC'],
 ['World', 'B-MISC'],
 ['Cup', 'I-MISC'],
 ['cycling', 'O'],
 ['race', 'O'],
 ['on', 'O'],
 ['Sunday', 'O'],
 [':', 'O']]

#### Explain how to form complete named entities from the label for each word, and list all the named entities in this sentence.

In [847]:
def get_named_entities(sentence):
    inside_tags = ['I-ORG', 'I-LOC', 'I-PER', 'I-MISC'] # Tags that require multiple words to form an entity
    begin_tags = ['B-LOC', 'B-ORG', 'B-MISC'] # Tags that are single word entities
    outside_tags = ['O']
    entities = [] # all entities gotten from search
    entity = [] # word group of current entity if any group tags encountered
    
    for c in sentence:
        if (c['tag'] in begin_tags or c['tag'] in outside_tags or c['tag'] == '\n') and len(entity) != 0:
            entities.append(' '.join(entity))
            entity = []
        if c['tag'] in begin_tags or c['tag'] in inside_tags: 
            entity.append(c['text'])

    return entities

In [848]:
_,_,sentence_text_tag = split_text_tag([sentence])
print("Complete named entities in the sentence:", get_named_entities(sentence_text_tag))

Complete named entities in the sentence: ['Swiss', 'Grand Prix', 'World Cup']


#### Tag-text dataset

In [849]:
train_df = pd.DataFrame(train_combined)
dev_df = pd.DataFrame(dev_combined)
test_df = pd.DataFrame(test_combined)

# path = '../Datasets/Processed/'
# file_name = 'CoNLL2003_processed'
# # Export DataFrame to a CSV file
# df.to_csv(f'{path}{file_name}.csv', index=False)

## 1.3 Model

### Create vocabulary index

In [870]:
#Load w2v models for train and dev

path = '../Pretrained_Models/'

train_w2v = Word2Vec.load('../Pretrained_Models/CONLL2003_pretrain.model')

train_pretrained_weights = train_w2v.wv.vectors
train_num_tokens, train_embedding_dim = train_pretrained_weights.shape

word2idx = train_w2v.wv.key_to_index
word2idx['<UNK>'] = word2idx[list(word2idx.keys())[-1]]+1
word2idx['<PAD>'] = word2idx[list(word2idx.keys())[-1]]+1
voc = train_voc
voc = np.append(voc,'<UNK>')
voc = np.append(voc,'<PAD>')

tag2idx = {k: v for v, k in enumerate(tag_set)}

### Gensim pre-trained word embeddings

In [871]:
w2v_word2idx = dict(w2v.key_to_index)
w2v_voc = w2v.index_to_key
w2v_word2idx['<UNK>'] = w2v_word2idx[list(w2v_word2idx.keys())[-1]]+1
w2v_word2idx['<PAD>'] = w2v_word2idx[list(w2v_word2idx.keys())[-1]]+1

### Create embedding matrix

In [872]:
embeddings_index = {}
for v in w2v_voc:
  embeddings_index[v] =  w2v[v] 


In [889]:
num_tokens = len(voc) + 2
embedding_dim = 50
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word2idx.items():    
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector[0:50]
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 17724 words (5902 misses)


## LSTM

In [890]:
def get_x_sequence(sentences):
  sequence = []
  sent_seq = []
  for s in sentences:
    for w in s:
      if w[0] in word2idx.keys():
        sent_seq.append(word2idx[w[0]])
      else:
        sent_seq.append(word2idx['<UNK>'])
    sequence.append(sent_seq)
    sent_seq = []
  
  return sequence

In [891]:
x_train = get_x_sequence(train_sentences)
x_train = pad_sequences(maxlen=embedding_dim, sequences=x_train, padding="post", value=len(word2idx)-1)

y_train = [[tag2idx[w[1]] for w in s] for s in train_sentences]
y_train = pad_sequences(maxlen=embedding_dim, sequences=y_train, padding="post", value=tag2idx['O'])

x_dev = get_x_sequence(dev_sentences)
x_dev = pad_sequences(maxlen=embedding_dim, sequences=x_dev, padding="post", value=len(word2idx)-1)

y_dev = [[tag2idx[w[1]] for w in s] for s in dev_sentences]
y_dev = pad_sequences(maxlen=embedding_dim, sequences=y_dev, padding="post", value=tag2idx['O'])

In [893]:
num_classes = len(tag_set)

model = keras.Sequential()
model.add(InputLayer(embedding_dim))
model.add(Embedding(input_dim=num_tokens, output_dim=embedding_dim,  embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,))
model.add(SpatialDropout1D(0.1))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

Model: "sequential_56"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_58 (Embedding)    (None, 50, 50)            1181400   
                                                                 
 spatial_dropout1d_36 (Spat  (None, 50, 50)            0         
 ialDropout1D)                                                   
                                                                 
 bidirectional_37 (Bidirect  (None, 50, 200)           120800    
 ional)                                                          
                                                                 
 dense_34 (Dense)            (None, 50, 8)             1608      
                                                                 
Total params: 1303808 (4.97 MB)
Trainable params: 122408 (478.16 KB)
Non-trainable params: 1181400 (4.51 MB)
_________________________________________________________________


In [937]:
num_epochs = 40
batch_size = 1000

model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, validation_data=(x_dev, y_dev))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
 1/15 [=>............................] - ETA: 1:03 - loss: 0.2072 - accuracy: 0.9542

KeyboardInterrupt: 

In [933]:
i = 0
predicted_labels = model2.predict(np.array([x_train[i]]))
sentence = x_train[i]
true_labels = y_train[i]
predicted_labels_sentence = predicted_labels[i]
predicted_labels_sentence = np.argmax(predicted_labels_sentence, axis=1)



In [934]:
x_train[0], len(x_train[0])

(array([  973, 12289,   237,   783,     5,  4433,   211,  6498,     0,
        23625, 23625, 23625, 23625, 23625, 23625, 23625, 23625, 23625,
        23625, 23625, 23625, 23625, 23625, 23625, 23625, 23625, 23625,
        23625, 23625, 23625, 23625, 23625, 23625, 23625, 23625, 23625,
        23625, 23625, 23625, 23625, 23625, 23625, 23625, 23625, 23625,
        23625, 23625, 23625, 23625, 23625]),
 50)

In [935]:
y_train[i], len(y_train[i])

(array([5, 7, 4, 7, 7, 7, 4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7]),
 50)

In [936]:
predicted_labels_sentence, len(predicted_labels_sentence)

(array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7], dtype=int64),
 50)