In [1]:
!pip install datasets ir_datasets neural-cherche

Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting ir_datasets
  Downloading ir_datasets-0.5.11-py3-none-any.whl.metadata (12 kB)
Collecting neural-cherche
  Downloading neural_cherche-1.4.3.tar.gz (31 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any

In [2]:
# Fetch the muvera-py repository into ./muvera-py; a later cell changes the
# working directory into that checkout.
!git clone https://github.com/sionic-ai/muvera-py.git

Cloning into 'muvera-py'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 10 (delta 0), reused 10 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (10/10), 1.91 MiB | 7.10 MiB/s, done.


In [1]:
import os

MUVERA_PY_PATH = "./muvera-py"

# Only change directory when we are not already inside the checkout. The path is
# relative, so without this check re-running the cell after a successful chdir
# would raise FileNotFoundError even though the repository is present.
_repo_dir_name = os.path.basename(os.path.normpath(MUVERA_PY_PATH))
if os.path.basename(os.getcwd()) != _repo_dir_name:
    if not os.path.isdir(MUVERA_PY_PATH):
        raise FileNotFoundError(f"The directory '{MUVERA_PY_PATH}' was not found. Please clone the repository first.")

    # Change current directory so later cells can read files from the repository.
    os.chdir(MUVERA_PY_PATH)

In [2]:
import os
import sys
import torch
import numpy as np
from tqdm import tqdm
import time
import torch
import numpy as np
from tqdm import tqdm
from typing import Dict, List
from dataclasses import replace
import json
# Import third-party libraries
from neural_cherche import models, rank
import logging
import itertools

In [3]:
import ir_datasets
from tqdm import tqdm
import json

def create_corpus(corpus_path='./corpus_v1.json', queries_path='./Queries.json'):
    """Load the document corpus and the query set from JSON files.

    Args:
        corpus_path: Path to a JSON object mapping document id to a record
            that has a 'text' field.
        queries_path: Path to the queries JSON. Accepted layouts:
            * a list of objects with a 'query' field and an optional
              'query_id' field,
            * a list of plain query strings,
            * an object mapping query id to either of the above item forms.
            List items without an explicit id are named "query_<n>", where n
            is the 1-based position in the list.

    Returns:
        A ``(corpus, queries)`` tuple. ``corpus`` maps document id to
        ``{"title": "", "text": <text>}``; ``queries`` maps query id to the
        query text. ``queries`` is empty when the queries file is missing or
        when an item lacks the 'query' key (a message is printed in both cases).

    Raises:
        FileNotFoundError: if ``corpus_path`` does not exist.
        KeyError: if a corpus record has no 'text' field.
    """
    with open(corpus_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Titles are not available in the corpus file, so they are left empty.
    corpus = {
        doc_id: {"title": "", "text": item['text']}
        for doc_id, item in data.items()
    }

    queries = {}
    try:
        with open(queries_path, 'r', encoding='utf-8') as file:
            rag_queries_data = json.load(file)
    except FileNotFoundError:
        # Report the path that was actually tried.
        print(f"Warning: {queries_path} not found. Queries dictionary will be empty.")
        return corpus, queries

    # Normalise both supported top-level layouts to (fallback_id, item) pairs.
    if isinstance(rag_queries_data, dict):
        entries = list(rag_queries_data.items())
    else:
        entries = [
            (f"query_{position + 1}", item)
            for position, item in enumerate(rag_queries_data)
        ]

    try:
        for fallback_id, item in entries:
            if isinstance(item, str):
                # Bare query string: there is no embedded id to look up.
                queries[fallback_id] = item
            else:
                queries[item.get('query_id', fallback_id)] = item['query']
    except KeyError:
        print(f"Error: 'query' key not found in one of the items in {queries_path}.")
        # Discard the partial result so callers never see a half-loaded query set.
        queries = {}

    return corpus, queries

In [None]:
import time
import torch
import numpy as np
from tqdm import tqdm
from typing import Dict, List
from dataclasses import replace

# Import third-party libraries
from neural_cherche import models, rank


# Import our custom modules
from fde_generator import (
    FixedDimensionalEncodingConfig,
    generate_document_fde_batch,
    generate_query_fde, # We need this for the search step
    EncodingType,
)
import logging

from multiprocessing import Pool, cpu_count

import time
import torch
import numpy as np
from tqdm import tqdm
from typing import Dict, List
from dataclasses import replace

# Import third-party libraries
from neural_cherche import models, rank
from ir_datasets.util import Cache

# Import our custom modules
from fde_generator import (
    FixedDimensionalEncodingConfig,
    generate_document_fde_batch,
    generate_query_fde, # We need this for the search step
    EncodingType,
    ProjectionType,
)
import logging

from multiprocessing import Pool, cpu_count

class ColbertFdeRetriever:
    """
    Retrieve documents by compressing ColBERT multi-vector embeddings into
    Fixed Dimensional Encodings (FDEs).

    Documents are embedded through the `rank.ColBERT` interface, converted to
    one FDE row each by `generate_document_fde_batch`, and scored against a
    query FDE with a single matrix product.

    Attributes:
        ranker: `rank.ColBERT` wrapper used to encode documents and queries.
        doc_config: FDE configuration used for documents; the query
            configuration is derived from it at search time.
        fde_index: Document FDE matrix, one row per entry of `doc_ids`;
            None until `index` has been called.
        doc_ids: Document ids in the same order as the rows of `fde_index`.
        doc_embeddings_list: Per-document multi-vector embeddings kept from
            the last `index` call; None before indexing.
    """
    def __init__(self, model_name="colbert-ir/colbertv2.0"):
        # --- Model Initialization ---
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = models.ColBERT(model_name_or_path=model_name, device=device)

        # Documents are dicts keyed by "id"; "title" and "text" are the encoded fields.
        self.ranker = rank.ColBERT(key="id", on=["title", "text"], model=model)

        # --- FDE Configuration ---
        # NOTE(review): the recorded run of this notebook produced 10240-dimensional
        # FDEs with these settings; re-check downstream consumers if they change.
        self.doc_config = FixedDimensionalEncodingConfig(
            dimension=128,
            num_repetitions=20,
            num_simhash_projections=5,
            projection_type=ProjectionType.AMS_SKETCH,  # Enable the projection
            projection_dimension=16,  # The new, smaller dimension
            seed=42,
            encoding_type=EncodingType.AVERAGE,
            fill_empty_partitions=True,
        )

        self.fde_index, self.doc_ids = None, []
        self.doc_embeddings_list = None  # Populated by index()
        print("ColbertFdeRetriever initialized successfully.")

    def index(self, corpus: dict, batch_size: int = 32):
        """
        Embed every document in `corpus` and build the FDE index.

        Args:
            corpus: Mapping of document id to a dict holding the fields the
                ranker encodes ("title" and "text").
            batch_size: Number of documents passed to the ColBERT encoder per
                batch.

        Raises:
            ValueError: if `corpus` is empty; there is nothing to encode and
                the downstream batch functions are not known to accept an
                empty input.
        """
        if not corpus:
            raise ValueError("Cannot index an empty corpus.")

        print("\n--- Starting Corpus Indexing ---")
        start_time = time.time()

        # Freeze the document order: row i of the index belongs to doc_ids[i].
        self.doc_ids = list(corpus.keys())
        documents_for_ranker = [
            {"id": doc_id, **corpus[doc_id]} for doc_id in self.doc_ids
        ]

        print(f"Generating native multi-vector embeddings for {len(documents_for_ranker)} documents...")
        doc_embeddings_map = self.ranker.encode_documents(
            documents=documents_for_ranker, batch_size=batch_size
        )
        # Re-order the encoder output to match doc_ids.
        self.doc_embeddings_list = [
            doc_embeddings_map[doc_id] for doc_id in self.doc_ids
        ]

        duration_embed = time.time() - start_time
        print(f"Multi-vector embedding generation took {duration_embed:.2f} seconds.")

        print("Generating FDEs from ColBERT embeddings using the optimized BATCH function...")
        start_fde_time = time.time()

        # One batch call converts all document embeddings into FDE rows.
        self.fde_index = generate_document_fde_batch(
            self.doc_embeddings_list, self.doc_config
        )

        duration_fde = time.time() - start_fde_time
        print(f"FDE generation took {duration_fde:.2f} seconds.")
        print(f"--- Corpus Indexing Finished in {time.time() - start_time:.2f} seconds ---")
        print(f"Final FDE Index Shape: {self.fde_index.shape}")

    def search(self, query: str) -> Dict[str, float]:
        """
        Encode `query` to an FDE and score it against every indexed document.

        Args:
            query: Query text.

        Returns:
            Dict of {doc_id: score} ordered from highest to lowest score.

        Raises:
            RuntimeError: if `index` has not been called yet.
        """
        # Fail before the (expensive) query encoding with a clear message,
        # rather than with a TypeError from `None @ query_fde` further down.
        if self.fde_index is None:
            raise RuntimeError("The FDE index is empty. Call index(corpus) before search().")

        # Encode the query into a multi-vector embedding.
        query_embeddings_map = self.ranker.encode_queries(queries=[query])
        query_embeddings = list(query_embeddings_map.values())[0]

        # The query uses the document configuration, except that
        # fill_empty_partitions must be False on the query side.
        query_config = replace(self.doc_config, fill_empty_partitions=False)
        query_fde = generate_query_fde(query_embeddings, query_config)

        # One matrix product scores the query FDE against all document FDEs.
        scores = self.fde_index @ query_fde

        # Pair scores with ids (same order as the index rows) and sort descending.
        return dict(
            sorted(zip(self.doc_ids, scores), key=lambda item: item[1], reverse=True)
        )

if __name__ == '__main__':
    # Load the corpus and the queries from the JSON files.
    corpus, queries = create_corpus()

    # Build the retriever, then embed and index the whole corpus.
    fde_retriever = ColbertFdeRetriever()
    fde_retriever.index(corpus)

    # Example search, left disabled. Uncomment to run the first loaded query
    # against the index and print the five best-scoring documents.
    #
    # print("\n--- Performing Search ---")
    # example_query_id = list(queries.keys())[0]
    # example_query_text = queries[example_query_id]
    # print(f"Searching for query ID '{example_query_id}': '{example_query_text}'")
    #
    # search_start_time = time.time()
    # results = fde_retriever.search(query=example_query_text)
    # search_duration = time.time() - search_start_time
    # print(f"Search completed in {search_duration:.4f} seconds.")
    #
    # print("\nTop 5 results:")
    # for i, (doc_id, score) in enumerate(list(results.items())[:5]):
    #     print(f"{i+1}. Doc ID: {doc_id}, Score: {score:.4f}")
    #     # print(f"   Title: {corpus[doc_id]['title']}")

Some weights of BertForMaskedLM were not initialized from the model checkpoint at colbert-ir/colbertv2.0 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ColbertFdeRetriever initialized successfully.

--- Starting Corpus Indexing ---
Generating native multi-vector embeddings for 91411 documents...


ColBERT documents embeddings: 100%|██████████| 2857/2857 [06:11<00:00,  7.69it/s]


Multi-vector embedding generation took 371.79 seconds.
Generating FDEs from ColBERT embeddings using the optimized BATCH function...
FDE generation took 1048.20 seconds.
--- Corpus Indexing Finished in 1419.99 seconds ---
Final FDE Index Shape: (91411, 10240)


In [None]:
import numpy as np
import json

# Persist the index built by `fde_retriever` so it can be reloaded without
# repeating the embedding and FDE generation steps.
fde_index = fde_retriever.fde_index
doc_ids = fde_retriever.doc_ids

# Validate before touching the disk: np.save would otherwise write None as a
# pickled object array and only fail afterwards on `.shape`.
if fde_index is None:
    raise RuntimeError("No FDE index to save: run fde_retriever.index(corpus) first.")
# Row i of the index belongs to doc_ids[i]; a length mismatch would make the
# saved pair unusable for lookups.
if len(doc_ids) != fde_index.shape[0]:
    raise RuntimeError(
        f"FDE index has {fde_index.shape[0]} rows but there are {len(doc_ids)} document IDs."
    )

print("\n--- Saving Index and Document IDs ---")

# 1. Save the FDE index to a binary .npy file
np.save("fde_index.npy", fde_index)
print(f"FDE index saved to fde_index.npy with shape {fde_index.shape}")

# 2. Save the document IDs to a JSON file (same order as the index rows)
with open("doc_ids.json", "w", encoding="utf-8") as f:
    json.dump(doc_ids, f)
print("Document IDs saved to doc_ids.json")