In [1]:
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
import datasets
from sklearn.neighbors import NearestNeighbors
import numpy as np


# R_NUMBER_SEED = 1234567 # Replace this with your own student number
R_NUMBER_SEED = 928036 # my student number 
DOCS_TO_ADD = 1000
query_documents = datasets.load_dataset("parquet", data_files="./acl_anthology_queries.parquet")["train"]
all_documents = datasets.load_dataset("parquet", data_files="./acl_anthology_full.parquet")["train"]
# Shuffle with seed and take only n docs
shuffled_documents = all_documents.shuffle(seed=R_NUMBER_SEED)
random_documents = shuffled_documents.select(range(DOCS_TO_ADD))
# Concatenate relevant documents with random sample and shuffle again
anthology_sample = datasets.concatenate_datasets([query_documents, random_documents]).shuffle(seed=R_NUMBER_SEED)
# Export to Parquet to avoid downloading full anthology
anthology_sample.to_parquet("./anthology_sample.parquet")


Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

27259830

In [2]:
import json
queries = json.load(open("./acl_anthology_queries.json", "r"))

In [3]:
# Download stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mjova\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# !pip install sumy

In [5]:
# Remove stopwords
def remove_stopwords(doc):
    text = ' '.join([word for word in doc.split() if word.lower() not in stop_words])

def preprocess_document(doc):
    # Flatten dictionary and combine relevant text fields
    text = f"{doc.get('title', '')} {doc.get('abstract', '')} {doc.get('full_text', '')}"
    
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    
    return text

def preprocess_documents(documents):
    return [preprocess_document(doc) for doc in documents]

preprocessed_documents = preprocess_documents(anthology_sample)


In [6]:

# Initialize lists to store embeddings
# Initialize the MiniLM model for document embeddings
# minilm_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# Function to get document embedding using MiniLM
# def get_document_embedding(document):
#     return minilm_model.encode(document, convert_to_tensor=False)

# minilm_embeddings = []


bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Function to get document embedding using MiniLM
# def get_document_embedding(document):
#     return minilm_model.encode(document, convert_to_tensor=False)

# Function to get word embeddings using BERT and then aggregate them
def get_aggregated_word_embeddings(document):
    inputs = bert_tokenizer(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    word_embeddings = outputs.last_hidden_state.squeeze(0)  # Removing the batch dimension
    aggregated_embedding = word_embeddings.mean(dim=0)  # Mean aggregation
    return aggregated_embedding.numpy()

In [7]:
# !curl -c -L -O https://nlp.stanford.edu/data/glove.6B.zip
# !curl -C -O https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip

In [8]:

# Load GloVe embeddings
def load_glove_model(file_path):
    model = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.split()
            word = parts[0]
            vector = np.array(parts[1:], dtype='float32')
            model[word] = vector
    return model

glove_model = load_glove_model("glove.6B/glove.6B.100d.txt")

# Example words to embed
# words = ["example", "document", "word", "embedding"]



In [9]:
bert_embeddings = []
glove_embeddings = []

# Generate embeddings for each word
def get_glove_word_embeddings(document):
    word_embeddings = [glove_model[word] for word in document.split() if word in glove_model]
    if not word_embeddings:
        # raise ValueError("None of the words in the input text are in the GloVe model.")

        print("None of the words in the input text are in the GloVe model.")
        print("document:",document)

        zero_array = np.zeros(100, dtype=float) ### document didn't match any words from glove, hence returning zero array
        document_embedding = zero_array
    else:
        document_embedding = np.mean(word_embeddings, axis=0)
        # print("len:", len(document_embedding))
    return document_embedding

# Compute embeddings for the first 1000 documents
count = 1
for document in preprocessed_documents[:]:
    if count % 100 == 0:
        print("Processing doc:",count)
    # Ensure the document is a string
    document = str(document)
    bert_embeddings.append(get_aggregated_word_embeddings(document))
    we = get_glove_word_embeddings(document) ### HAD TO implement this check with we (word embeddings) as some of the documents had 0 hits on glove
    glove_embeddings.append(we)
    count+=1

None of the words in the input text are in the GloVe model.
document: Translation Unit Concerning Timing Simultaneous Translation None None
Processing doc: 100
None of the words in the input text are in the GloVe model.
document: Collection Evaluation Broadcast News Data {A}rabic None None
Processing doc: 200
None of the words in the input text are in the GloVe model.
document: {E}nglish Speech Database Read {J}apanese Learners {CALL} System Development None None
Processing doc: 300
None of the words in the input text are in the GloVe model.
document: Towards Use Word Stems Suffixes Statistical Machine Translation None None
None of the words in the input text are in the GloVe model.
document: Language Resource Creation Distribution {L}inguistic {D}ata {C}onsortium: Progress Report None None
Processing doc: 400
Processing doc: 500
None of the words in the input text are in the GloVe model.
document: 表示法學習技術於節錄式語音文件摘要之研究(A Study Representation Learning Techniques Extractive Spoken Docume

In [10]:
# preprocessed_documents = preprocess_documents(anthology_sample)
# minilm_embeddings2 = [get_mpnet_embedding(doc) for doc in preprocessed_documents[:10]]  # Only first 1000 for k-NN
# from sklearn.neighbors import NearestNeighbors
# knn2 = NearestNeighbors(n_neighbors=5, metric='cosine').fit(minilm_embeddings)

# Fit NearestNeighbors model
nn_bert = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
nn_bert.fit(bert_embeddings)
nn_glove = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
nn_glove.fit(glove_embeddings)


In [11]:
# Get embedding for the query sentence
test_query = queries["queries"][50]['q']
test_query_bert_embed = get_aggregated_word_embeddings(test_query).reshape(1, -1)
test_query_glove_embed = get_glove_word_embeddings(test_query).reshape(1, -1)


distances_bert, indices_bert = nn_bert.kneighbors(test_query_bert_embed)
# Print the indices and distances of the nearest neighbors
print("BERT Indices of nearest neighbors:", indices_bert)
print("BERT Distances to nearest neighbors:", distances_bert)

distances_glove, indices_glove = nn_glove.kneighbors(test_query_glove_embed)

# Print the indices and distances of the nearest neighbors
print("Glove Indices of nearest neighbors:", indices_glove)
print("Glove Distances to nearest neighbors:", distances_glove)

# Print the nearest neighbor sentences
# nearest_neighbors = [preprocessed_documents[idx] for idx in indices_glove[0]]
# print("Nearest neighbor sentences:", nearest_neighbors)
# query_embedding2 = get_mpnet_embedding(test_query).reshape(1, -1)
# distances, indices = knn2.kneighbors(query_embedding2)


BERT Indices of nearest neighbors: [[ 307  998 1063  118  613]]
BERT Distances to nearest neighbors: [[0.3178627  0.31918108 0.31948835 0.33049715 0.33055514]]
Glove Indices of nearest neighbors: [[831 324 194 139 202]]
Glove Distances to nearest neighbors: [[0.06632299 0.06715717 0.06814211 0.07119117 0.07293454]]


# BELOW Cell is VERY IMPORTANT

In [13]:
### Above results tell us following:
print("Ground truth question:", queries["queries"][50]['q'])
print("Ground truth answer:", queries["queries"][50]['a'])
print("Ground truth references:", queries["queries"][50]['r'])
print("OUR BERT references", [anthology_sample[idx]["acl_id"] for idx in indices_bert])
print("OUR Glove references", [anthology_sample[idx]["acl_id"] for idx in indices_glove])


Ground truth question: What is the name of the research initiative creating resources for African languages?
Ground truth answer: Masakhane. (Masakha is also fine.)
Ground truth references: ['2023.acl-long.796', '2023.acl-long.609', '2023.ijcnlp-main.10']
OUR BERT references [['L14-1558', '2009.mtsummit-plenaries.7', '1993.eamt-1.15', 'L08-1141', 'L14-1106']]
OUR Glove references [['L06-1176', 'L16-1719', '2022.semeval-1.0', '2023.acl-long.734', 'W15-4630']]


In [14]:
# import numpy as np

## Retrieving the query
# Function to retrieve the ground truth for a given query
def get_ground_truth(query):
    for q in queries["queries"]:
        if q["q"] == query:
            # return q["r"]
            return [r for r in q["r"]]
    return None
# ground_truth = get_ground_truth(query)

# Function to compare the result with the ground truth for a single query
# Function to calculate average precision for a single query
def average_precision(retrieved_docs, ground_truth_ids):
    if not ground_truth_ids:
        return 0
    retrieved_docs_set = set(retrieved_docs)
    ground_truth_set = set(ground_truth_ids)
    
    num_relevant = 0
    precision_sum = 0
    for i, doc in enumerate(retrieved_docs):
        if doc in ground_truth_set:
            num_relevant += 1
            precision_sum += num_relevant / (i + 1)
    
    return precision_sum / len(ground_truth_set)

# Function to compare the result with the ground truth for a single query
def compare_with_ground_truth(query_text, nn_model, dataset, ground_truth_function, k,get_embeddings_func):
    # print("cgt k=",k)
    # Get nearest neighbors
    indices = get_nearest_neighbors(query_text, nn_model, k,get_embeddings_func)
    
    # Convert numpy.int64 to Python int
    indices = [int(i) for i in indices]
    
    # Retrieve document IDs for nearest neighbors
    retrieved_docs = [dataset[i]['acl_id'] for i in indices]
    
    # Get ground truth
    ground_truth_ids = ground_truth_function(query_text)
    
    if not ground_truth_ids:
        return 0, 0, 0, 0, 0, 0, 0  # Return zeros if no ground truth is available
    
    # Calculate true positives, false positives, and false negatives
    tp = len(set(retrieved_docs) & set(ground_truth_ids))
    fp = len(retrieved_docs) - tp
    fn = len(ground_truth_ids) - tp
    
    # Calculate precision, recall, and F1-score
    precision = tp / len(retrieved_docs) if retrieved_docs else 0
    recall = tp / len(ground_truth_ids) if ground_truth_ids else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0
    
    # Calculate average precision
    ap = average_precision(retrieved_docs, ground_truth_ids)
    
    return precision, recall, f1, ap, tp, fp, fn

# Function to evaluate the model on all queries
def evaluate_model_on_all_queries(queries, embeddings, dataset, ground_truth_function, k, get_embeddings_func):
    # Fit NearestNeighbors model
    print("k=",k)
    nn = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute')
    nn.fit(embeddings)

    precision_scores = []
    recall_scores = []
    f1_scores = []
    ap_scores = []
    total_tp = total_fp = total_fn = 0

    for query in queries["queries"]:
        query_text = query["q"]
        precision, recall, f1, ap, tp, fp, fn = compare_with_ground_truth(query_text, nn, dataset, ground_truth_function, k,get_embeddings_func)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        ap_scores.append(ap)
        total_tp += tp
        total_fp += fp
        total_fn += fn
    
    # Calculate macro average metrics
    macro_precision = np.mean(precision_scores)
    macro_recall = np.mean(recall_scores)
    macro_f1 = np.mean(f1_scores)
    
    # Calculate micro average metrics
    micro_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) else 0
    micro_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) else 0
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if (micro_precision + micro_recall) else 0
    
    # Calculate mean AP
    mean_ap = np.mean(ap_scores)
    
    return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1, mean_ap


In [15]:
def get_nearest_neighbors(query_text, nn_model, k, get_embeddings_func):
    # query_embedding = get_aggregated_word_embeddings([query_text])
    # print(query_text)
    # test_query_word_embed = get_aggregated_word_embeddings(query_text).reshape(1, -1)
    test_query_word_embed = get_embeddings_func(query_text).reshape(1, -1)
    distances, indices = nn_model.kneighbors(test_query_word_embed, n_neighbors=k)
    return indices[0]

def get_bert_nearest_neighbors(query_text, nn_model, k):
    # query_embedding = get_aggregated_word_embeddings([query_text])
    # print(query_text)
    # test_query_word_embed = get_aggregated_word_embeddings(query_text).reshape(1, -1)
    test_query_word_embed = get_aggregated_word_embeddings(query_text).reshape(1, -1)
    distances, indices = nn_model.kneighbors(test_query_word_embed, n_neighbors=k)
    return indices[0]

def get_glove_nearest_neighbors(query_text, nn_model, k):
    # query_embedding = get_aggregated_word_embeddings([query_text])
    # print(query_text)
    # test_query_word_embed = get_aggregated_word_embeddings(query_text).reshape(1, -1)
    test_query_word_embed =  get_glove_word_embeddings(query_text).reshape(1, -1)
    distances, indices = nn_model.kneighbors(test_query_word_embed, n_neighbors=k)
    return indices[0]
    
def nearest_neighbour(embeddings): # can be tfidf_matrix or LSI_matrix doesnt
    nn = NearestNeighbors(n_neighbors=1, metric='cosine',algorithm = 'brute')
    nn.fit(embeddings)
    

# RESULTS when summarizing preprocessing text using ntlk library and BertModel.from_pretrained('bert-base-uncased') for word embeddings.

In [16]:
k = 5
macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1, mean_ap = evaluate_model_on_all_queries(queries, bert_embeddings, anthology_sample, get_ground_truth, k,get_embeddings_func=get_aggregated_word_embeddings )
print(f"Macro Average Precision: {macro_precision:.4f}")
print(f"Macro Average Recall: {macro_recall:.4f}")
print(f"Macro Average F1-Score: {macro_f1:.4f}")
print(f"Micro Average Precision: {micro_precision:.4f}")
print(f"Micro Average Recall: {micro_recall:.4f}")
print(f"Micro Average F1-Score: {micro_f1:.4f}")
print(f"Mean Average Precision (mAP): {mean_ap:.4f}")

k= 5
Macro Average Precision: 0.0020
Macro Average Recall: 0.0102
Macro Average F1-Score: 0.0034
Micro Average Precision: 0.0020
Micro Average Recall: 0.0058
Micro Average F1-Score: 0.0030
Mean Average Precision (mAP): 0.0051


In [17]:
k = 5
macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1, mean_ap = evaluate_model_on_all_queries(queries, glove_embeddings, anthology_sample, get_ground_truth, k,get_embeddings_func=get_glove_word_embeddings)
print(f"Macro Average Precision: {macro_precision:.4f}")
print(f"Macro Average Recall: {macro_recall:.4f}")
print(f"Macro Average F1-Score: {macro_f1:.4f}")
print(f"Micro Average Precision: {micro_precision:.4f}")
print(f"Micro Average Recall: {micro_recall:.4f}")
print(f"Micro Average F1-Score: {micro_f1:.4f}")
print(f"Mean Average Precision (mAP): {mean_ap:.4f}")

k= 5
Macro Average Precision: 0.0102
Macro Average Recall: 0.0374
Macro Average F1-Score: 0.0153
Micro Average Precision: 0.0102
Micro Average Recall: 0.0291
Micro Average F1-Score: 0.0151
Mean Average Precision (mAP): 0.0323


In [18]:
# !pip install chromadb openai

In [19]:
import chromadb
from chromadb.utils import embedding_functions

CHROMA_DATA_PATH = "chroma_data/"
EMBED_MODEL = "all-MiniLM-L6-v2"
COLLECTION_NAME = "demo_docs"
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [20]:
>>> embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
...     model_name=EMBED_MODEL
... )


In [21]:
full_texts = [f'{d["full_text"]}' for d in anthology_sample]
abstracts = [f'{d["abstract"]}' for d in anthology_sample]
ids=[f"{i}" for i in range(len(anthology_sample))]
acl_ids = [f'acl_id:{d["acl_id"]}' for d in anthology_sample]
authors=[{"author":{d["author"]}} for d in anthology_sample]
# collection.add(documents=preprocessed_documents)

In [22]:
client.delete_collection(COLLECTION_NAME)

In [23]:
collection = client.create_collection(
...     name=COLLECTION_NAME,
...     embedding_function=embedding_func,
...     metadata={"hnsw:space": "cosine"},
... )

In [24]:
collection.add(
...     documents=preprocessed_documents,
...     ids=ids,
# ...     metadatas=authors
... )

In [25]:
query_results = collection.query(
...     query_texts=["What is the name of the research initiative creating resources for African languages?"],
...     n_results=5,
... )

# These are the search results on example question (queries["queries"][50]) using ChromaDB

In [26]:
# query_results["ids"][0]

for id in query_results["ids"][0]:
    print(anthology_sample[int(id)]["acl_id"])

L14-1106
2023.sigtyp-1.17
L16-1719
2021.mrl-1.11
W14-2212


# Here are our results for the same question

In [27]:
print("OUR BERT references", [anthology_sample[idx]["acl_id"] for idx in indices_bert])
print("OUR Glove references", [anthology_sample[idx]["acl_id"] for idx in indices_glove])


OUR BERT references [['L14-1558', '2009.mtsummit-plenaries.7', '1993.eamt-1.15', 'L08-1141', 'L14-1106']]
OUR Glove references [['L06-1176', 'L16-1719', '2022.semeval-1.0', '2023.acl-long.734', 'W15-4630']]


# Ground truth for the same query (q50)


In [28]:
print("Ground truth question:", queries["queries"][50]['q'])
print("Ground truth answer:", queries["queries"][50]['a'])
print("Ground truth references:", queries["queries"][50]['r'])

Ground truth question: What is the name of the research initiative creating resources for African languages?
Ground truth answer: Masakhane. (Masakha is also fine.)
Ground truth references: ['2023.acl-long.796', '2023.acl-long.609', '2023.ijcnlp-main.10']


# Conclusion: It seems that our word transformers have low performance due to lack of context.