In [1]:
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
import datasets


# R_NUMBER_SEED = 1234567 # Replace this with your own student number
R_NUMBER_SEED = 928036 # my student number 
DOCS_TO_ADD = 1000
query_documents = datasets.load_dataset("parquet", data_files="./acl_anthology_queries.parquet")["train"]
all_documents = datasets.load_dataset("parquet", data_files="./acl_anthology_full.parquet")["train"]
# Shuffle with seed and take only n docs
shuffled_documents = all_documents.shuffle(seed=R_NUMBER_SEED)
random_documents = shuffled_documents.select(range(DOCS_TO_ADD))
# Concatenate relevant documents with random sample and shuffle again
anthology_sample = datasets.concatenate_datasets([query_documents, random_documents]).shuffle(seed=R_NUMBER_SEED)
# Export to Parquet to avoid downloading full anthology
anthology_sample.to_parquet("./anthology_sample.parquet")


Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

27259830

In [2]:
import json
queries = json.load(open("./acl_anthology_queries.json", "r"))

In [3]:
# Download stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Remove stopwords
def remove_stopwords(doc):
    text = ' '.join([word for word in doc.split() if word.lower() not in stop_words])



[nltk_data] Downloading package stopwords to /home/marko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# !pip install sumy

In [5]:
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

def extractive_summary(text, sentence_count=5):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentence_count)
    return ' '.join(str(sentence) for sentence in summary)

# text = """
# """
# summary = extractive_summary(text)
# print(len(text))
# print(len(summary))



In [6]:
def preprocess_document(doc):
    # Flatten dictionary and combine relevant text fields
    summary = extractive_summary(doc.get('full_text', ''))
    text = f"{doc.get('title', '')} {doc.get('abstract', '')} {summary}"
    
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    
    return text


def preprocess_documents(documents):
    return [preprocess_document(doc) for doc in documents]

preprocessed_documents = preprocess_documents(anthology_sample)


In [7]:

# Initialize lists to store embeddings
# Initialize the MiniLM model for document embeddings
minilm_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# Function to get document embedding using MiniLM
def get_document_embedding(document):
    return minilm_model.encode(document, convert_to_tensor=False)

minilm_embeddings = []
# bert_embeddings = []

In [8]:
# Compute embeddings for the first 1000 documents
count = 1
for document in preprocessed_documents[:]:
    print("Processing doc:",count)
    # Ensure the document is a string
    document = str(document)
    # print("Processing sentence embeddings:",count)
    minilm_embeddings.append(get_document_embedding(document))
    # print("Processing word embeddings:",count)
    # bert_embeddings.append(get_aggregated_word_embeddings(document))
    count+=1

Processing doc: 1
Processing doc: 2
Processing doc: 3
Processing doc: 4
Processing doc: 5
Processing doc: 6
Processing doc: 7
Processing doc: 8
Processing doc: 9
Processing doc: 10
Processing doc: 11
Processing doc: 12
Processing doc: 13
Processing doc: 14
Processing doc: 15
Processing doc: 16
Processing doc: 17
Processing doc: 18
Processing doc: 19
Processing doc: 20
Processing doc: 21
Processing doc: 22
Processing doc: 23
Processing doc: 24
Processing doc: 25
Processing doc: 26
Processing doc: 27
Processing doc: 28
Processing doc: 29
Processing doc: 30
Processing doc: 31
Processing doc: 32
Processing doc: 33
Processing doc: 34
Processing doc: 35
Processing doc: 36
Processing doc: 37
Processing doc: 38
Processing doc: 39
Processing doc: 40
Processing doc: 41
Processing doc: 42
Processing doc: 43
Processing doc: 44
Processing doc: 45
Processing doc: 46
Processing doc: 47
Processing doc: 48
Processing doc: 49
Processing doc: 50
Processing doc: 51
Processing doc: 52
Processing doc: 53
Pr

In [9]:
# preprocessed_documents = preprocess_documents(anthology_sample)
# minilm_embeddings2 = [get_mpnet_embedding(doc) for doc in preprocessed_documents[:10]]  # Only first 1000 for k-NN
from sklearn.neighbors import NearestNeighbors
# knn2 = NearestNeighbors(n_neighbors=5, metric='cosine').fit(minilm_embeddings)

# Fit NearestNeighbors model
nn_minilm = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
nn_minilm.fit(minilm_embeddings)


In [10]:
# Get embedding for the query sentence
test_query = queries["queries"][50]['q']
test_query_sen_embed = get_document_embedding(test_query).reshape(1, -1)
# get_nearest_neighbors(test_query_sen_embed,nn_minilm,1)
# get_document_embedding

distances, indices = nn_minilm.kneighbors(test_query_sen_embed)

# Print the indices and distances of the nearest neighbors
print("Indices of nearest neighbors:", indices)
print("Distances to nearest neighbors:", distances)

# Print the nearest neighbor sentences
nearest_neighbors = [preprocessed_documents[idx] for idx in indices[0]]
# print("Nearest neighbor sentences:", nearest_neighbors)
# query_embedding2 = get_mpnet_embedding(test_query).reshape(1, -1)
# distances, indices = knn2.kneighbors(query_embedding2)


Indices of nearest neighbors: [[613 201 357 651 103]]
Distances to nearest neighbors: [[0.4439807  0.49771446 0.5152395  0.5434519  0.54790807]]


# BELOW Cell is VERY IMPORTANT

In [11]:
### Above results tell us following:
print("Ground truth question:", queries["queries"][50]['q'])
print("Ground truth answer:", queries["queries"][50]['a'])
print("Ground truth references:", queries["queries"][50]['r'])
print("OUR references", [anthology_sample[idx]["acl_id"] for idx in indices])


Ground truth question: What is the name of the research initiative creating resources for African languages?
Ground truth answer: Masakhane. (Masakha is also fine.)
Ground truth references: ['2023.acl-long.796', '2023.acl-long.609', '2023.ijcnlp-main.10']
OUR references [['L14-1106', 'L02-1155', 'L02-1245', '2023.ijcnlp-main.10', '2023.sigtyp-1.17']]


In [12]:
import numpy as np

## Retrieving the query
# Function to retrieve the ground truth for a given query
def get_ground_truth(query):
    for q in queries["queries"]:
        if q["q"] == query:
            # return q["r"]
            return [r for r in q["r"]]
    return None
# ground_truth = get_ground_truth(query)

# Function to compare the result with the ground truth for a single query
# Function to calculate average precision for a single query
def average_precision(retrieved_docs, ground_truth_ids):
    if not ground_truth_ids:
        return 0
    retrieved_docs_set = set(retrieved_docs)
    ground_truth_set = set(ground_truth_ids)
    
    num_relevant = 0
    precision_sum = 0
    for i, doc in enumerate(retrieved_docs):
        if doc in ground_truth_set:
            num_relevant += 1
            precision_sum += num_relevant / (i + 1)
    
    return precision_sum / len(ground_truth_set)

# Function to compare the result with the ground truth for a single query
def compare_with_ground_truth(query_text, nn_model, dataset, ground_truth_function, k):
    # print("cgt k=",k)
    # Get nearest neighbors
    indices = get_nearest_neighbors(query_text, nn_model, k)
    
    # Convert numpy.int64 to Python int
    indices = [int(i) for i in indices]
    
    # Retrieve document IDs for nearest neighbors
    retrieved_docs = [dataset[i]['acl_id'] for i in indices]
    
    # Get ground truth
    ground_truth_ids = ground_truth_function(query_text)
    
    if not ground_truth_ids:
        return 0, 0, 0, 0, 0, 0, 0  # Return zeros if no ground truth is available
    
    # Calculate true positives, false positives, and false negatives
    tp = len(set(retrieved_docs) & set(ground_truth_ids))
    fp = len(retrieved_docs) - tp
    fn = len(ground_truth_ids) - tp
    
    # Calculate precision, recall, and F1-score
    precision = tp / len(retrieved_docs) if retrieved_docs else 0
    recall = tp / len(ground_truth_ids) if ground_truth_ids else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0
    
    # Calculate average precision
    ap = average_precision(retrieved_docs, ground_truth_ids)
    
    return precision, recall, f1, ap, tp, fp, fn

# Function to evaluate the model on all queries
def evaluate_model_on_all_queries(queries, embeddings, dataset, ground_truth_function, k):
    # Fit NearestNeighbors model
    print("k=",k)
    nn = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute')
    nn.fit(embeddings)

    precision_scores = []
    recall_scores = []
    f1_scores = []
    ap_scores = []
    total_tp = total_fp = total_fn = 0

    for query in queries["queries"]:
        query_text = query["q"]
        precision, recall, f1, ap, tp, fp, fn = compare_with_ground_truth(query_text, nn, dataset, ground_truth_function, k)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        ap_scores.append(ap)
        total_tp += tp
        total_fp += fp
        total_fn += fn
    
    # Calculate macro average metrics
    macro_precision = np.mean(precision_scores)
    macro_recall = np.mean(recall_scores)
    macro_f1 = np.mean(f1_scores)
    
    # Calculate micro average metrics
    micro_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) else 0
    micro_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) else 0
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if (micro_precision + micro_recall) else 0
    
    # Calculate mean AP
    mean_ap = np.mean(ap_scores)
    
    return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1, mean_ap


In [13]:
def get_nearest_neighbors(query_text, nn_model, k):
    # query_embedding = get_aggregated_word_embeddings([query_text])
    # print(query_text)
    # test_query_word_embed = get_aggregated_word_embeddings(query_text).reshape(1, -1)
    test_query_word_embed = get_document_embedding(query_text).reshape(1, -1)
    distances, indices = nn_model.kneighbors(test_query_word_embed, n_neighbors=k)
    return indices[0]

def nearest_neighbour(embeddings): # can be tfidf_matrix or LSI_matrix doesnt
    nn = NearestNeighbors(n_neighbors=1, metric='cosine',algorithm = 'brute')
    nn.fit(embeddings)
    
# test_query = queries["queries"][0]['q']
# get_nearest_neighbors(test_query,nn_minilm,1)

# RESULTS when summarizing text using Sumy library before embeddings

In [14]:
k = 5
macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1, mean_ap = evaluate_model_on_all_queries(queries, minilm_embeddings, anthology_sample, get_ground_truth, k)
print(f"Macro Average Precision: {macro_precision:.4f}")
print(f"Macro Average Recall: {macro_recall:.4f}")
print(f"Macro Average F1-Score: {macro_f1:.4f}")
print(f"Micro Average Precision: {micro_precision:.4f}")
print(f"Micro Average Recall: {micro_recall:.4f}")
print(f"Micro Average F1-Score: {micro_f1:.4f}")
print(f"Mean Average Precision (mAP): {mean_ap:.4f}")

k= 5
Macro Average Precision: 0.1878
Macro Average Recall: 0.6245
Macro Average F1-Score: 0.2758
Micro Average Precision: 0.1878
Micro Average Recall: 0.5349
Micro Average F1-Score: 0.2779
Mean Average Precision (mAP): 0.5161


In [15]:
# !pip install chromadb openai

In [16]:
import chromadb
from chromadb.utils import embedding_functions


CHROMA_DATA_PATH = "chroma_data/"
EMBED_MODEL = "all-MiniLM-L6-v2"
COLLECTION_NAME = "demo_docs"
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

ModuleNotFoundError: No module named 'chromadb'

In [None]:
>>> embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
...     model_name=EMBED_MODEL
... )


In [None]:
full_texts = [f'{d["full_text"]}' for d in anthology_sample]
abstracts = [f'{d["abstract"]}' for d in anthology_sample]
ids=[f"{i}" for i in range(len(anthology_sample))]
acl_ids = [f'acl_id:{d["acl_id"]}' for d in anthology_sample]
authors=[{"author":{d["author"]}} for d in anthology_sample]
# collection.add(documents=preprocessed_documents)

In [None]:
client.delete_collection(COLLECTION_NAME)

In [None]:
collection = client.create_collection(
...     name=COLLECTION_NAME,
...     embedding_function=embedding_func,
...     metadata={"hnsw:space": "cosine"},
... )

In [None]:
collection.add(
...     documents=preprocessed_documents,
...     ids=ids,
# ...     metadatas=authors
... )

In [None]:
query_results = collection.query(
...     query_texts=["What is the name of the research initiative creating resources for African languages?"],
...     n_results=5,
... )

# These are the search results on example question (queries["queries"][50]) using CromaDB

In [None]:
# query_results["ids"][0]

for id in query_results["ids"][0]:
    print(anthology_sample[int(id)]["acl_id"])