In [3]:
# from retrievers.bm25_retriever import BM25Retriever
# from retrievers.colbert_retriever import ColBERTRetriever
# from retrievers.dense_retriever import DPRRetriever
#
# documents = ["The sky is blue.", "The sun is bright.", "The grass is green."]
#
# # Initialize the Retriever with the documents
# dense_retriever = DPRRetriever(documents)

In [34]:
import numpy as np
from sentence_transformers import SentenceTransformer

class ColBERTRetriever:
    """Dense retriever that ranks documents by cosine similarity to a query.

    NOTE: despite the class name, this is not ColBERT-style late interaction.
    Each text is encoded into a single vector with a Sentence-Transformers
    model and documents are ranked by the cosine similarity between the
    query vector and each document vector.
    """

    def __init__(self, documents, model_name='bert-base-nli-mean-tokens'):
        """Load the embedding model and pre-compute the document embeddings.

        Args:
            documents: Sequence of document strings to index.
            model_name: Name or path passed to ``SentenceTransformer``.
                Defaults to the checkpoint this class has always used.
        """
        self.model = SentenceTransformer(model_name)

        # Keep the raw documents so that ``retrieve`` can return them by index.
        self.documents = documents
        self.document_embeddings = self.encode_documents(documents)

    def encode_documents(self, documents):
        """Encode ``documents`` with the model and return the result as an array.

        Args:
            documents: Sequence of strings to encode.

        Returns:
            ``np.ndarray`` built from ``self.model.encode(documents)``; one
            row per document is expected.
        """
        embeddings = self.model.encode(documents)
        return np.array(embeddings)

    def retrieve(self, query, top_k=5):
        """Return up to ``top_k`` documents, most similar to ``query`` first.

        Args:
            query: Query string.
            top_k: Maximum number of documents to return. Values ``<= 0``
                yield an empty list.

        Returns:
            List of documents ordered by descending cosine similarity. Ties
            keep their original document order. The list is shorter than
            ``top_k`` when fewer documents are indexed.
        """
        # ``[-0:]`` would select *everything*, so handle the degenerate cases
        # explicitly instead of relying on slicing.
        if top_k <= 0 or len(self.documents) == 0:
            return []

        doc_matrix = np.asarray(self.document_embeddings, dtype=float)
        # ``encode([query])`` returns a (1, dim) batch; flatten it so the
        # matrix-vector product below yields a 1-D score per document.
        query_vector = np.asarray(self.model.encode([query]), dtype=float).reshape(-1)

        # Cosine similarity = dot product divided by the product of the norms.
        # The epsilon floor avoids division by zero for all-zero embeddings.
        doc_norms = np.linalg.norm(doc_matrix, axis=1)
        query_norm = np.linalg.norm(query_vector)
        denominators = np.maximum(doc_norms * query_norm, np.finfo(float).eps)
        similarities = (doc_matrix @ query_vector) / denominators

        # Stable sort on the negated scores gives descending order while
        # preserving document order among equal scores.
        ranked_indexes = np.argsort(-similarities, kind="stable")[:top_k]

        return [self.documents[i] for i in ranked_indexes]

In [35]:
# Small demo corpus: two sentences about the sky plus two distractors.
documents = ["The sky is blue.", "The sky is pink", "The sun is bright.", "The grass is green."]

# Initialize the retriever with the documents (embeddings are computed here)
dense_retriever = ColBERTRetriever(documents)

query = "What is the color of the sky?"

# Retrieve the top_k most relevant documents for the query
top_k = 2
retrieved_docs = dense_retriever.retrieve(query, top_k=top_k)

# Output the retrieved documents; the label names the retriever actually used
print("Top-k retrieved documents (ColBERTRetriever):", retrieved_docs)

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the

Top-k retrieved documents (BM25): ['The sky is blue.', 'The sky is pink']
