In [1]:
from pylate import models, indexes, retrieve
from datasets import load_dataset
import os
import json
from pathlib import Path


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# HotpotQA dev (fullwiki) dump, expected in the `data` directory that sits
# next to the current working directory.
DATASET_PATH = os.path.join(
    os.getcwd(),
    os.pardir,
    "data",
    "hotpot_dev_fullwiki_v1.json",
)

# Stop immediately when the file is absent, so later cells do not fail
# with a less obvious error.
dataset_file_present = os.path.exists(DATASET_PATH)
if not dataset_file_present:
    raise FileNotFoundError(f"Dataset not found at {DATASET_PATH}")


def load_jsonl_or_json(path: Path):
    """Load a file holding either one JSON document or JSON Lines.

    The whole file is first parsed as a single JSON document. Only when that
    fails and the content spans several lines is it parsed as JSON Lines
    (one JSON value per non-blank line). Trying the whole document first
    matters: a pretty-printed JSON object also starts with ``{`` and contains
    newlines, so sniffing the format from those two properties would
    mis-read it as JSON Lines and fail on its first line.

    Args:
        path: Location of the file. A ``pathlib.Path`` or a plain string path.

    Returns:
        ``[]`` when the file is empty or whitespace-only; otherwise the
        decoded JSON value, or, for JSON Lines, a list with one decoded value
        per non-blank line.

    Raises:
        json.JSONDecodeError: If the content is neither a valid JSON document
            nor valid JSON Lines.
        OSError: If the file cannot be opened.
    """
    # Read everything up front so the file handle is released before parsing.
    with Path(path).open("r", encoding="utf-8") as f:
        raw = f.read().strip()

    if not raw:
        return []

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # A single-line payload cannot be JSON Lines; surface the real error.
        if "\n" not in raw:
            raise
        return [json.loads(line) for line in raw.splitlines() if line.strip()]
    

In [3]:

# Instantiate the ColBERT model from the named checkpoint.
model = models.ColBERT(model_name_or_path="lightonai/Reason-ModernColBERT")



In [4]:
# Encode a single text and keep the first element of the result.
token_embeddings = model.encode(["what is the capital of France?"])[0]

# Preview the first 10 components of every row, then the overall shape.
for token_vector in token_embeddings:
    print(token_vector[:10])

print(token_embeddings.shape)


[ 0.01615289  0.10855096  0.04368661 -0.0444998   0.14050989 -0.08097885
  0.07934773 -0.02179314 -0.08743791  0.01667484]
[ 0.01092617  0.11805642  0.04453169 -0.0078429   0.08532196 -0.02074775
  0.09786479 -0.0055049  -0.10680763  0.07232263]
[ 0.06882509  0.1368493   0.03428188 -0.04705479  0.13494694 -0.03523139
  0.16335389 -0.10394149 -0.12982729 -0.01270007]
[ 0.00685698  0.12351122  0.07891472 -0.11566098  0.15681885 -0.00683769
  0.16335769 -0.06396662 -0.08065119 -0.06266539]
[ 0.05137798  0.14946052  0.04974353 -0.09155101  0.15787841 -0.02492066
  0.1425892  -0.07718005 -0.11448385 -0.03326385]
[ 0.06521307  0.07519273 -0.05463459 -0.05925547  0.15255405 -0.02500105
  0.10581504 -0.01879801  0.02976482  0.05567322]
[-0.00715305  0.12537864  0.06596203 -0.07221137  0.14411297 -0.04162292
  0.15013932 -0.05290528 -0.11014893  0.0024587 ]
[-0.00310663  0.03099844  0.04649072  0.03120333  0.12115342 -0.09273321
  0.03230219  0.0636235  -0.07586976  0.02637839]
[-0.01435228  0.

## Prepare sentences for indexing

In [None]:


# Read the dataset file located at DATASET_PATH.
dataset = load_jsonl_or_json(Path(DATASET_PATH))

print(f"Total entries: {len(dataset)}")
print("Sample entry:")

# Show every field of the first entry, one "key value" pair per line.
first_entry = dataset[0]
for field in first_entry:
    print(field, first_entry[field])

    


Total entries: 7405
Sample entry:
_id 5a8b57f25542995d1e6f1371
answer yes
question Were Scott Derrickson and Ed Wood of the same nationality?
supporting_facts [['Scott Derrickson', 0], ['Ed Wood', 0]]
context [['Adam Collis', ['Adam Collis is an American filmmaker and actor.', ' He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010.', ' He also studied cinema at the University of Southern California from 1991 to 1997.', ' Collis first work was the assistant director for the Scott Derrickson\'s short "Love in the Ruins" (1995).', ' In 1998, he played "Crankshaft" in Eric Koyanagi\'s "Hundred Percent".']], ['Ed Wood (film)', ['Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.', " The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin La

In [None]:
import hashlib 
import tqdm

# Only the first MAX_ENTRIES dataset entries contribute sentences.
MAX_ENTRIES = 100

# Parallel lists handed to the index later, plus an id -> text lookup used to
# turn retrieved ids back into readable sentences.
documents = []
documents_ids = []
document_ids_to_sentence = {}
duplicate_count = 0

for record in dataset[:MAX_ENTRIES]:
    # Read here but not used further down in this cell.
    question_id, question = record['_id'], record['question']

    for paragraph in record['context']:
        # Each context item is indexed as [title, list of sentences].
        title, sentences = paragraph[0], paragraph[1]

        for sentence in sentences:
            # The document text is "<title>: <sentence>"; its id is the MD5
            # hex digest of that text, so identical text gets the same id.
            passage = f"{title}: {sentence}"
            passage_id = hashlib.md5(passage.encode()).hexdigest()

            if passage_id not in document_ids_to_sentence:
                document_ids_to_sentence[passage_id] = passage
                documents.append(passage)
                documents_ids.append(passage_id)
            else:
                # Id already seen: count it and keep the first occurrence.
                duplicate_count += 1


print(f"Total duplicate documents: {duplicate_count}")
# The two lists must stay aligned because they are passed to the index together.
assert len(documents) == len(documents_ids)
print(f"Total documents: {len(documents)}")


Total duplicate documents: 32
Total documents: 4215
Sample documents:
72b1a97c5fd6227955a9afb6536e300f -> Adam Collis: Adam Collis is an American filmmaker and actor.
df901e5fe6f8bb2788e5c1e272cf2d61 -> Adam Collis:  He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010.
0a8be5e7a74dbce6697d9047471efc9c -> Adam Collis:  He also studied cinema at the University of Southern California from 1991 to 1997.
ad9d8b05bb6dd7ccad69d3e3a289885d -> Adam Collis:  Collis first work was the assistant director for the Scott Derrickson's short "Love in the Ruins" (1995).
e48c4cee406930960108874b265aa7da -> Adam Collis:  In 1998, he played "Crankshaft" in Eric Koyanagi's "Hundred Percent".
a249d9483fe213c91690aff8cd7df56a -> Ed Wood (film): Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.
1eddda3f130dd9ca3d12b2368cc68f3e -> Ed Wood (film):  

## Inspecting sample documents

In [29]:

print("Sample documents:")
# First 10 id -> text pairs, in insertion order of the lookup table.
for sample_id, sample_text in list(document_ids_to_sentence.items())[:10]:
    print(f"{sample_id} -> {sample_text}")



Sample documents:
72b1a97c5fd6227955a9afb6536e300f -> Adam Collis: Adam Collis is an American filmmaker and actor.
df901e5fe6f8bb2788e5c1e272cf2d61 -> Adam Collis:  He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010.
0a8be5e7a74dbce6697d9047471efc9c -> Adam Collis:  He also studied cinema at the University of Southern California from 1991 to 1997.
ad9d8b05bb6dd7ccad69d3e3a289885d -> Adam Collis:  Collis first work was the assistant director for the Scott Derrickson's short "Love in the Ruins" (1995).
e48c4cee406930960108874b265aa7da -> Adam Collis:  In 1998, he played "Crankshaft" in Eric Koyanagi's "Hundred Percent".
a249d9483fe213c91690aff8cd7df56a -> Ed Wood (film): Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.
1eddda3f130dd9ca3d12b2368cc68f3e -> Ed Wood (film):  The film concerns the period in Wood's life when he 

In [30]:
# Create the PLAID index named "hotpotqa-colbert-index" in the "index" folder.
# NOTE(review): override=True presumably replaces an existing index of the
# same name on every run — confirm against the PyLate documentation.
print("Creating PLAID index...")
index = indexes.PLAID(index_folder="index", index_name="hotpotqa-colbert-index", override=True)

# Encode all prepared sentences as documents (is_query=False), 32 per batch.
print("Encoding documents...")
documents_embeddings = model.encode(documents, batch_size=32, is_query=False, show_progress_bar=True)

# Register the embeddings under their ids; both sequences are passed together.
print("Adding documents to index...")
index.add_documents(documents_ids=documents_ids, documents_embeddings=documents_embeddings)


Creating PLAID index...
Encoding documents...


Encoding documents (bs=32): 100%|██████████| 132/132 [00:38<00:00,  3.42it/s]


Adding documents to index...


Encoding queries (bs=32): 100%|██████████| 1/1 [00:00<00:00,  5.78it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism 

[[{'id': 'a249d9483fe213c91690aff8cd7df56a', 'score': 11.185302734375}, {'id': 'c9ac9bbe6fe678d10b9bf82574684350', 'score': 11.082275390625}, {'id': '06fbbec8ecb084e33f019b539dbef512', 'score': 10.936767578125}, {'id': 'b28cd28c89fffa230ab80feec6c079c9', 'score': 10.8861083984375}, {'id': 'ad9d8b05bb6dd7ccad69d3e3a289885d', 'score': 10.767578125}], [{'id': 'e75cc939d7b7fc7f4f42b0d6ac4bdcd7', 'score': 17.854248046875}, {'id': '87719ac778fffb42bab718868727e54f', 'score': 16.807861328125}, {'id': '30ec97c98ff6206b86eb65abeb6fae08', 'score': 16.489013671875}, {'id': 'b78c15d9dfdd114c3023beb222f85310', 'score': 16.313720703125}, {'id': 'd3c642ea0baa9fac2ea2797ac3336009', 'score': 16.058837890625}]]


In [33]:

# Wrap the index built above in a ColBERT retriever.
retriever = retrieve.ColBERT(index=index)

# The questions of the first two dataset entries serve as test queries.
queries = [dataset[position]['question'] for position in (0, 1)]

# Encode the questions as queries (is_query=True).
queries_embeddings = model.encode(queries, batch_size=32, is_query=True, show_progress_bar=True)

# Ask for 5 results per query.
scores = retriever.retrieve(queries_embeddings=queries_embeddings, k=5)


Encoding queries (bs=32): 100%|██████████| 1/1 [00:00<00:00,  2.57it/s]


In [40]:
# Print each question followed by its raw retrieval results.
for query_idx, hits in enumerate(scores):
    header = f"\nQuery: {queries[query_idx]}\n"
    print(header)
    for hit in hits:
        print(hit)



Query: Were Scott Derrickson and Ed Wood of the same nationality?

{'id': 'a249d9483fe213c91690aff8cd7df56a', 'score': 11.185302734375}
{'id': 'c9ac9bbe6fe678d10b9bf82574684350', 'score': 11.082275390625}
{'id': '06fbbec8ecb084e33f019b539dbef512', 'score': 10.936767578125}
{'id': 'b28cd28c89fffa230ab80feec6c079c9', 'score': 10.8861083984375}
{'id': 'ad9d8b05bb6dd7ccad69d3e3a289885d', 'score': 10.767578125}

Query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?

{'id': 'e75cc939d7b7fc7f4f42b0d6ac4bdcd7', 'score': 17.854248046875}
{'id': '87719ac778fffb42bab718868727e54f', 'score': 16.807861328125}
{'id': '30ec97c98ff6206b86eb65abeb6fae08', 'score': 16.489013671875}
{'id': 'b78c15d9dfdd114c3023beb222f85310', 'score': 16.313720703125}
{'id': 'd3c642ea0baa9fac2ea2797ac3336009', 'score': 16.058837890625}


## Map IDs to documents

In [42]:
# Print each question followed by the text stored for every retrieved id.
for query_idx, hits in enumerate(scores):
    header = f"\nQuery: {queries[query_idx]}\n"
    print(header)
    for hit in hits:
        retrieved_text = document_ids_to_sentence[hit['id']]
        print(retrieved_text)


Query: Were Scott Derrickson and Ed Wood of the same nationality?

Ed Wood (film): Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.
The Exorcism of Emily Rose: The Exorcism of Emily Rose is a 2005 American legal drama horror film directed by Scott Derrickson and starring Laura Linney and Tom Wilkinson.
Doctor Strange (2016 film):  The film was directed by Scott Derrickson, who wrote it with Jon Spaihts and C. Robert Cargill, and stars Benedict Cumberbatch as Stephen Strange, along with Chiwetel Ejiofor, Rachel McAdams, Benedict Wong, Michael Stuhlbarg, Benjamin Bratt, Scott Adkins, Mads Mikkelsen, and Tilda Swinton.
Sinister (film): Sinister is a 2012 supernatural horror film directed by Scott Derrickson and written by Derrickson and C. Robert Cargill.
Adam Collis:  Collis first work was the assistant director for the Scott Derrickson's short "Love in the Ruins" (1995).

Query: Wha