In [1]:
import gensim.downloader
import torch

In [2]:
# Download the "word2vec-google-news-300" embeddings
glove_vectors = gensim.downloader.load('word2vec-google-news-300')

In [3]:
# Define the words to find similar words for
words_to_find_similarities = ["student", "Apple", "apple"]

# Initialize a dictionary to store the most similar words and their cosine similarities
most_similar_words = {}

# Find the most similar words and their cosine similarities for each word
for word in words_to_find_similarities:
    if word in glove_vectors:
        # Use the model's `most_similar` function to find the most similar words
        similar_words = glove_vectors.most_similar(word)
        
        # The most similar word and its cosine similarity are in the first result
        most_similar_word, cosine_similarity = similar_words[0]
        
        # Store the most similar word and its cosine similarity in the dictionary
        most_similar_words[word] = (most_similar_word, cosine_similarity)
    else:
        most_similar_words[word] = ("Not in vocabulary", 0.0)

# Print the most similar words and their cosine similarities
for word, (most_similar_word, cosine_similarity) in most_similar_words.items():
    print(f"Most similar word to '{word}': '{most_similar_word}' (Cosine Similarity: {cosine_similarity:.4f})")

Most similar word to 'student': 'students' (Cosine Similarity: 0.7295)
Most similar word to 'Apple': 'Apple_AAPL' (Cosine Similarity: 0.7457)
Most similar word to 'apple': 'apples' (Cosine Similarity: 0.7204)


In [4]:
def count_sentences(filename):
    count = 0
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip() == "":
                count += 1
        # Increment count by 1 if the file doesn't end with an empty line
#         if not line.endswith('\n'):
#             count += 1
        if not line:
            count += 1
    return count


def extract_labels(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        data = file.read().strip()
    
    labels = set()
    sentences = data.split('\n\n')
    for sentence in sentences:
        lines = sentence.split('\n')
        for line in lines:
            parts = line.split()
            if len(parts) > 3:
                label = parts[-1]
                labels.add(label)
    
    return labels

# File paths for the CoNLL-2003 dataset
train_file = 'eng.train'
dev_file = 'eng.testa'
test_file = 'eng.testb'

# Count the number of sentences in each file
train_sentences = count_sentences(train_file)
dev_sentences = count_sentences(dev_file)
test_sentences = count_sentences(test_file)

# Extract all possible word labels
train_labels = extract_labels(train_file)
dev_labels = extract_labels(dev_file)
test_labels = extract_labels(test_file)

print(f"Size of the training file (number of sentences): {train_sentences}")
print(f"Size of the development file (number of sentences): {dev_sentences}")
print(f"Size of the test file (number of sentences): {test_sentences}")

print("\nAll possible word labels:")
print("Training Labels:", sorted(train_labels))
print("Development Labels:", sorted(dev_labels))
print("Test Labels:", sorted(test_labels))


Size of the training file (number of sentences): 14986
Size of the development file (number of sentences): 3465
Size of the test file (number of sentences): 3683

All possible word labels:
Training Labels: ['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']
Development Labels: ['B-MISC', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']
Test Labels: ['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


In [5]:
import numpy as np
embedding_matrix = glove_vectors.vectors
embedding_matrix.shape

zero_array = np.zeros((300,), dtype=float)
embedding_matrix = np.vstack((embedding_matrix, zero_array)) # last element a zero array for OOV words

In [14]:
def tokenize(file_name):
    # Initialize empty lists to store sentences and NER tags
    sentences = []
    tags = []

    # Read the content of the CoNLL2003 file
    with open(file_name, 'r') as file:
        current_sentence = []  # Initialize an empty list for the current sentence
        current_tags = []  # Initialize an empty list for the current NER tags
        for line in file:
            line = line.strip()
            if not line:  # Empty line indicates the end of a sentence
                if current_sentence:
                    sentences.append(current_sentence)
                    tags.append(current_tags)
                current_sentence = []  # Reset the current sentence
                current_tags = []  # Reset the current NER tags
            else:
                parts = line.split()
                current_sentence.append(parts[0])
                current_tags.append(parts[-1])

    # Add the last sentence if there's no empty line after it
    if current_sentence:
        sentences.append(current_sentence)
        tags.append(current_tags)

    # Print the result
    # for sentence, tag in zip(sentences, tags):
    #     print(sentence)
    #     print(tag)

    #print(tags)
    
    return sentences, tags

def word_to_embedding(sentences, tags, padding_tag="O"):
    # Convert words to word embeddings
    train_data = []
    padded_tags = []

    # Find the maximum sentence length
    max_sentence_length = max(len(sentence) for sentence in sentences)

    for sentence, tag_sequence in zip(sentences, tags):
        sentence_embeddings = []
        sentence_tags = []
        for word, tag in zip(sentence, tag_sequence):
            if word in glove_vectors:
                embedding = glove_vectors[word]
            else:
                # Handle OOV words (e.g., use a zero vector)
                embedding = np.zeros(300)  # Assuming 300 is the embedding dimension
            sentence_embeddings.append(embedding)
            sentence_tags.append(tag)

        # Padding for both data and tags
        while len(sentence_embeddings) < max_sentence_length:
            sentence_embeddings.append(np.zeros(300))  # Zero padding for the remaining tokens
            sentence_tags.append(padding_tag)

        train_data.append(sentence_embeddings)
        padded_tags.append(sentence_tags)

    # Convert train_data and padded_tags to PyTorch tensors
    train_data = torch.tensor(train_data)
    
    return train_data, padded_tags


# def word_to_embedding(sentences,tags):
#     # Convert words to word embeddings
#     train_data = []
#     for sentence in sentences:
#         sentence_embeddings = []
#         for word in sentence:
#             if word in glove_vectors:
#                 embedding = glove_vectors[word]
#             else:
#                 # Handle OOV words (e.g., use a zero vector)
#                 embedding = np.zeros(300)  # Assuming 300 is the embedding dimension
#             sentence_embeddings.append(embedding)

#         train_data.append(sentence_embeddings)

#     # Padding to ensure all sentences have the same length (if needed)
#     max_sentence_length = max(len(sentence) for sentence in train_data)
    
#     for i, sentence in enumerate(train_data):
#         while len(sentence) < max_sentence_length:
#             sentence.append(np.zeros(300))  # Zero padding for the remaining tokens

#     # Convert train_data to a PyTorch tensor
#     train_data = torch.tensor(train_data)
    
#     return train_data, tags

train_sentences, train_tags = tokenize("eng.train")
dev_sentences, dev_tags = tokenize("eng.testa")
test_sentences, test_tags = tokenize("eng.testb")

# train_data, train_labels = word_to_embedding(train_sentences, train_tags)
# dev_data, dev_labels = word_to_embedding(dev_sentences, dev_tags)
# test_data, test_labels = word_to_embedding(test_sentences, test_tags)
# train_data = word_to_embedding(train_sentences)
# dev_data = word_to_embedding(dev_sentences)
# test_data = word_to_embedding(test_sentences)

In [6]:
# def extract_into_list(fileName):
#     # Initialize empty lists for the first and last columns
#     first_column = []
#     last_column = []

#     # Open and read the file
#     with open(fileName, 'r') as file:
#         for line in file:
#             # Split the line into columns
#             columns = line.strip().split()
#             if columns:  # Check if the line is not empty
#                 # Append the first and last columns to their respective lists
#                 first_column.append(columns[0])
#                 last_column.append(columns[-1])

#     # Print the extracted lists
# #     print("First Column (Words):")
# #     print(first_column)

# #     print("\nLast Column (Labels):")
# #     print(last_column)
#     return first_column, last_column

# train_data, train_labels = extract_into_list(train_file)
# dev_data, dev_labels = extract_into_list(dev_file)
# test_data, test_labels = extract_into_list(test_file)

# def convert_to_embedding(train_data):
#     # Convert words to word embeddings
#     train_data_embeddings = []
#     for word in train_data:
#         if word in glove_vectors:
#             # Check if the word is in the GloVe vocabulary
#             embedding = glove_vectors[word]
#             train_data_embeddings.append(embedding)
#         else:
#             # Handle out-of-vocabulary words (e.g., by using a special token or zero vector)
#             train_data_embeddings.append(zero_array)
#     return train_data_embeddings

# train_data = convert_to_embedding(train_data)
# dev_data = convert_to_embedding(dev_data)
# test_data = convert_to_embedding(test_data)

In [15]:
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = self.data[index]
        y = self.labels[index]
        return x, y
    
batch_size = 32
train_dataset = NERDataset(train_sentences, train_tags)
dev_dataset = NERDataset(dev_sentences, dev_tags)
test_dataset = NERDataset(test_sentences, test_tags)
# train_dataset = NERDataset(train_data, train_labels)
# dev_dataset = NERDataset(dev_data, dev_labels)
# test_dataset = NERDataset(test_data, test_labels)
# train_dataset = NERDataset(np.array(train_data, dtype=float), np.array(train_labels))
# dev_dataset = NERDataset(np.array(dev_data, dtype=float), np.array(dev_labels))
# test_dataset = NERDataset(np.array(test_data, dtype=float), np.array(test_labels))

train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
dev_data_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)  # No need to shuffle the dev set
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)  # No need to shuffle the test set

In [16]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader
from seqeval.metrics import f1_score

# Define the LSTM model
class LSTMNER(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, num_layers, bidirectional, dropout):
        super(LSTMNER, self).__init__()
        
        # Embedding layer (use pretrained word embeddings)
        self.embedding = nn.Embedding(input_dim, embedding_dim)#nn.Embedding.from_pretrained(torch.tensor(embedding_matrix))
        self.embedding.weight.requires_grad = False
        
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        
        # Output layer (softmax classifier)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

    def forward(self, text):
        print("DEBUG...Text:" , text.shape)
        embedded = self.embedding(text.to(torch.long))
        #print(embedded)
        print("DEBUG:" , embedded.shape)
        output, (hidden, cell) = self.lstm(embedded)
        predictions = self.fc(output)
        return predictions

# Define a function to evaluate the model
def evaluate(model, data_loader, tag_vocab):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            text, tags = batch
            text = torch.tensor(text)
            predictions = model(text)
            # Extract predicted labels and true labels, and convert them to human-readable labels
            true_labels = data_loader.labels

    f1 = f1_score(true_labels, predictions, average='micro')  # Adjust 'average' as needed
    return f1


# Define hyperparameters
tag_vocab = ['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']
input_dim = len(glove_vectors.index_to_key)
embedding_dim = 300  # Use the dimension of your pretrained word embeddings
hidden_dim = 256  # Adjust as needed
output_dim = 8#len(tag_vocab)
num_layers = 2  # Adjust as needed
bidirectional = True  # You can change this based on your requirements
dropout = 0.5  # Adjust as needed

model = LSTMNER(input_dim, embedding_dim, hidden_dim, output_dim, num_layers, bidirectional, dropout)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)  # Adjust the learning rate as needed

# Training loop
best_f1 = 0.0
patience = 3  # Number of epochs to wait for F1 improvement
current_patience = 0

num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    
    for batch in train_data_loader:
        text, tags = batch
        text = torch.tensor(text)
        optimizer.zero_grad()
        predictions = model(text)
        #predictions = predictions.view(-1, output_dim)
        #tags = tags.view(-1)
        loss = criterion(predictions, tags)
        loss.backward()
        optimizer.step()
    
    # Evaluate on the development set
    dev_f1 = evaluate(model, dev_data_loader, tag_vocab)
    print(f'Epoch [{epoch+1}/{num_epochs}]. Development F1: {dev_f1:.4f}')
    
    if dev_f1 > best_f1:
        best_f1 = dev_f1
        current_patience = 0
        # Save the model checkpoint here if needed
    else:
        current_patience += 1
        if current_patience >= patience:
            print("Early stopping!")
            break

# Evaluate the best model on the test set
test_f1 = evaluate(model, test_data_loader, tag_vocab)
print(f'Final Test F1: {test_f1:.4f}')

RuntimeError: each element in list of batch should be of equal size