#### NLP Project Part 1

In [1]:
!pip install --upgrade gensim

/bin/bash: /home/burntpie/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/7b/ef/d559c7daebb2f00b881575551b23866ebcbf6eeaf33393d692c7f46d0983/gensim-4.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading gensim-4.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.5 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Obtaining dependency information for smart-open>=1.8.1 from https://files.pythonhosted.org/packages/fc/d9/d97f1db64b09278aba64e8c81b5d322d436132df5741c518f3823824fae0/smart_open-6.4.0-py3-none-any.whl.metadata
  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Downloading gensim-4.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m10.9 MB/s[0m eta [36m0:0

In [2]:
import gensim.downloader

print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [3]:
# Download the 'word2vec-google-news-300' embeddings
word2vec_google = gensim.downloader.load('word2vec-google-news-300')

#### Question 1.1

In [4]:
words = ["student", "Apple", "apple"]
for word in words:
    similar_word, similarity_score = word2vec_google.most_similar(word, topn=1)[0]
    print(f"Word: {word}")
    print(f"Most similar word: {similar_word}")
    print(f"Cosine similarity: {similarity_score}")
    print("--------")

Word: student
Most similar word: students
Cosine similarity: 0.7294867038726807
--------
Word: Apple
Most similar word: Apple_AAPL
Cosine similarity: 0.7456986308097839
--------
Word: apple
Most similar word: apples
Cosine similarity: 0.720359742641449
--------


#### Question 1.2(a)

In [5]:
def read_conll_file(file_path):
    sentences = []  # This will store lists of word-label pairs, one list for each sentence
    sentence = []   # Temporary list to store word-label pairs for the current sentence

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()  # Remove any leading/trailing whitespace
            if line:  # If the line isn't empty (i.e., we're within a sentence)
                word, _, _, label = line.split()  # Split the line to get the word and its associated label
                sentence.append((word, label))
            else:  # An empty line means the end of the current sentence
                sentences.append(sentence)
                sentence = []  # Reset the temporary list for the next sentence

    return sentences


In [6]:
train_data = read_conll_file("eng.train")
dev_data = read_conll_file("eng.testa")
test_data = read_conll_file("eng.testb")

print(f"Number of sentences in training data: {len(train_data)}")
print(f"Number of sentences in development data: {len(dev_data)}")
print(f"Number of sentences in test data: {len(test_data)}")

# Extract all unique labels from training data
labels = set()
for sentence in train_data:
    for _, label in sentence:
        labels.add(label)

print(f"All possible labels: {labels}")


Number of sentences in training data: 14986
Number of sentences in development data: 3465
Number of sentences in test data: 3683
All possible labels: {'I-ORG', 'B-LOC', 'B-ORG', 'I-LOC', 'O', 'I-PER', 'I-MISC', 'B-MISC'}


#### Question 1.2(b)

In [7]:
for sentence in train_data:
    entities = []          # List to store identified entities
    current_entity = []    # Temporary list to store words of the current entity
    current_label = None   # Track the label of the current entity

    for word, label in sentence:
        if label != 'O':  # If the word is part of an entity
            prefix, label_type = label.split('-')  # Split the label into its prefix (B/I) and type (e.g., ORG, PER)
            
            # If it's the beginning of an entity or a continuation of a different entity type
            if prefix == 'B' or (prefix == 'I' and current_label != label_type):
                if current_entity:  # If we've been capturing an entity, add it to our list
                    entities.append((' '.join(current_entity), current_label))
                current_entity = [word]  # Start a new entity
                current_label = label_type
            elif prefix == 'I' and current_label == label_type:  # If it's a continuation of the same entity type
                current_entity.append(word)
        else:  # If the word is not part of any entity
            if current_entity:  # If we've been capturing an entity, add it to our list and reset
                entities.append((' '.join(current_entity), current_label))
                current_entity = []
                current_label = None

    # Check if sentence matches criteria of having at least two multi-word named entities
    entity_texts = [entity[0] for entity in entities if ' ' in entity[0]]
    if len(entity_texts) >= 2:
        print(f"Sentence: {' '.join([word for word, _ in sentence])}")
        print(f"Named Entities: {entity_texts}")
        break


Sentence: Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .
Named Entities: ['European Union', 'Werner Zwingmann']


#### Question 1.3

In [8]:
# Load word2vec_google
# word2vec_google = gensim.downloader.load('word2vec-google-news-300')

In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

2023-10-26 16:10:51.032505: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-26 16:10:51.088230: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
def read_conll2003_file(filepath):
    """Reads the CoNLL2003 file and returns sentences with their corresponding tags."""
    sentences = []
    sentence_words = []
    sentence_tags = []
    
    with open(filepath, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # New sentence starts
                if sentence_words and sentence_tags:
                    sentences.append((sentence_words, sentence_tags))
                    sentence_words = []
                    sentence_tags = []
            else:
                # Extracting word and NER tag
                parts = line.split(' ')
                word = parts[0]
                tag = parts[-1]
                sentence_words.append(word)
                sentence_tags.append(tag)
                
    if sentence_words and sentence_tags:
        sentences.append((sentence_words, sentence_tags))

    return sentences

# Example usage:
train_data = read_conll2003_file("eng.train")
dev_data = read_conll2003_file("eng.testa")
test_data = read_conll2003_file("eng.testb")


In [11]:
# Create a set of all unique words and tags in the training dataset
words = list(set(word for sentence, _ in train_data for word in sentence))
tags = list(set(tag for _, tags in train_data for tag in tags))

# Define your word_to_ix based on your dataset 
word_to_ix = {word: i for i, word in enumerate(words)}
word_to_ix["UNK"] = len(word_to_ix) # Add 'UNK' token
word_to_ix["PAD"] = len(word_to_ix) # Add 'PAD' token

# Create word and tag dictionaries for mapping
tag_to_ix = {tag: i for i, tag in enumerate(tags)}
ix_to_tag = {i: tag for tag, i in tag_to_ix.items()}

In [23]:
print(f"{word_to_ix = }")
print(f"{ix_to_tag = }")

ix_to_tag = {0: 'I-ORG', 1: 'B-LOC', 2: 'B-ORG', 3: 'I-LOC', 4: 'O', 5: 'I-PER', 6: 'I-MISC', 7: 'B-MISC'}


In [13]:
def create_embedding_matrix(word_model, word_index):
    """Create an embedding matrix given a word model (like Word2Vec) and a word_index (like the one from the Tokenizer)"""
    vocab_size = len(word_index) + 1  # Adding 1 because of reserved 0 index
    embedding_dim = word_model.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    
    for word, i in word_index.items():
        try:
            embedding_vector = word_model[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            # Word not in pretrained model will be all-zeros.
            pass

    return embedding_matrix

pretrained_weights = create_embedding_matrix(word2vec_google, word_to_ix)


In [14]:
def prepare_sequence(seq, to_ix, default=None):
    """Convert a sequence of words/tags to a sequence of indices."""
    if default:
        return [to_ix.get(word, default) for word in seq]
    return [to_ix[word] for word in seq]

# Convert datasets into numerical sequences
x_train = [prepare_sequence(sentence, word_to_ix, word_to_ix["UNK"]) for sentence, _ in train_data]
y_train = [prepare_sequence(tags, tag_to_ix) for _, tags in train_data]

x_dev = [prepare_sequence(sentence, word_to_ix, word_to_ix["UNK"]) for sentence, _ in dev_data]
y_dev = [prepare_sequence(tags, tag_to_ix) for _, tags in dev_data]

x_test = [prepare_sequence(sentence, word_to_ix, word_to_ix["UNK"]) for sentence, _ in test_data]
y_test = [prepare_sequence(tags, tag_to_ix) for _, tags in test_data]


In [15]:
# Padding sequences
MAX_LENGTH = max(len(sentence) for sentence, _ in train_data)

x_train = pad_sequences(x_train, maxlen=MAX_LENGTH, padding='post')
y_train = pad_sequences(y_train, maxlen=MAX_LENGTH, padding='post')

x_dev = pad_sequences(x_dev, maxlen=MAX_LENGTH, padding='post')
y_dev = pad_sequences(y_dev, maxlen=MAX_LENGTH, padding='post')

x_test = pad_sequences(x_test, maxlen=MAX_LENGTH, padding='post')
y_test = pad_sequences(y_test, maxlen=MAX_LENGTH, padding='post')


In [16]:
def NERLSTM(vocab_size, embedding_dim, hidden_dim, tagset_size, pretrained_weights):
    # Input layer
    sentence_input = Input(shape=(None, ), dtype='int32')
    
    # Load pre-trained embeddings
    word_embeddings = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[pretrained_weights], trainable=False)(sentence_input)
    
    # LSTM layer
    lstm_out = LSTM(hidden_dim, return_sequences=True)(word_embeddings)
    
    # Dense output layer
    tag_scores = TimeDistributed(Dense(tagset_size, activation='softmax'))(lstm_out)
    
    model = Model(sentence_input, tag_scores)
    return model


#### Custom Callback to implement
#### "Use the development set to evaluate the performance of the model for each epoch during training. Please use f1 score to measure the performance."

In [17]:
from keras.callbacks import Callback
from seqeval.metrics import f1_score

class F1Evaluation(Callback):
    def __init__(self, validation_data=(), patience=3):
        super(Callback, self).__init__()

        self.x_val, self.y_val = validation_data
        self.patience = patience
        self.best_f1 = 0.0
        self.wait = 0  # for early stopping

    def on_epoch_end(self, epoch, logs=None):
        y_pred = self.model.predict(self.x_val, verbose=0)
        y_pred = [list(np.argmax(pred, axis=-1)) for pred in y_pred]
        y_true = [list(sentence) for sentence in self.y_val]
        y_pred_str = [[ix_to_tag[ix] for ix in sentence] for sentence in y_pred]
        y_true_str = [[ix_to_tag[ix] for ix in sentence] for sentence in y_true]

        current_f1 = f1_score(y_true_str, y_pred_str)
        logs['val_f1'] = current_f1

        # print current F1 score for the epoch
        # print(f" - val_f1: {current_f1:.4f}")

        # implement early stopping
        if current_f1 > self.best_f1:
            self.best_f1 = current_f1
            self.wait = 0
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.model.stop_training = True
                print("Early stopping based on F1 score.")

In [20]:
import time

# Set your model parameters
EMBEDDING_DIM = 300
HIDDEN_DIM = 150
VOCAB_SIZE = len(word_to_ix) + 1
TAGSET_SIZE = len(tag_to_ix)

# Initialize model
model = NERLSTM(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE, pretrained_weights)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
f1_evaluation = F1Evaluation(validation_data=(x_dev, y_dev), patience=3)

# Reshape y_train for the sparse_categorical_crossentropy loss
y_train_reshaped = y_train.reshape(*y_train.shape, 1)
y_dev_reshaped = y_dev.reshape(*y_dev.shape, 1)

# Train the model and record the start and end time
start_time = time.time()
history = model.fit(x_train, y_train_reshaped, epochs=10, batch_size=32, validation_data=(x_dev, y_dev_reshaped), callbacks=[f1_evaluation])
end_time = time.time()

# (c) Report how many epochs and running time
print(f"Number of epochs: {len(history.epoch)}")
print(f"Running time: {end_time - start_time} seconds")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Number of epochs: 10
Running time: 45.95535230636597 seconds


In [21]:
# Convert the y_test from a numpy array to list of lists
y_test_list = [list(sentence) for sentence in y_test]
y_test_list_str = [[ix_to_tag[ix] for ix in sentence] for sentence in y_test_list]

# Compute the F1 score on the test set
test_predictions = model.predict(x_test)
test_predictions = [list(np.argmax(pred, axis=-1)) for pred in test_predictions]
test_predictions_str = [[ix_to_tag[ix] for ix in sentence] for sentence in test_predictions]
test_f1 = f1_score(y_test_list_str, test_predictions_str)
print(f"F1 Score on the Test Set: {test_f1:.4f}")

F1 Score on the Test Set: 0.5435
