In [1]:
!pip install datasets transformers chromadb

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting chromadb
  Downloading chromadb-0.5.18-py3-none-any.whl.metadata (6.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  

In [2]:
# Fetch the Stanford Question Answering Dataset (SQuAD) from the Hugging Face Hub.
from datasets import load_dataset

dataset = load_dataset(path="rajpurkar/squad")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [3]:
# Keep only the first N_TRAIN_EXAMPLES rows of the training split so that
# embedding and indexing stay quick. Only the "train" split is selected here.
N_TRAIN_EXAMPLES = 1000
train_dataset = dataset["train"].select(range(N_TRAIN_EXAMPLES))

In [4]:
# Show the dataset summary (its features and row count) as the cell output.
train_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1000
})

In [5]:
# Number of examples kept in the subset.
train_dataset.num_rows

1000

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch

# Single source of truth for the checkpoint so the tokenizer and the model
# are always loaded from the same pretrained weights/vocabulary.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def embed_text(text):
    """Embed ``text`` as one vector using the module-level tokenizer and model.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``model``, and the last hidden states are averaged over the token axis.

    Args:
        text: The text to embed. Callers in this notebook pass a single string.

    Returns:
        A NumPy array: the mean-pooled last hidden state of the first
        sequence in the batch.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: disabling autograd avoids building a computation graph
    # on every call, which saves memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    # mean(dim=1) pools over the token dimension; [0] picks the first sequence.
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()[0]

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [7]:
import chromadb

# Create a ChromaDB client instance
client = chromadb.Client()

# Fetch the 'squad_contexts' collection, creating it only if it does not exist
# yet. create_collection() raises when the name is already taken, which made
# this cell fail when re-run in the same kernel.
collection = client.get_or_create_collection(name="squad_contexts")

In [8]:
# Embed every example's context and store it in the ChromaDB collection,
# sending the rows in batches of `batch_size`.
batch_size = 32
batch_embeddings = []
batch_contexts = []
batch_metadatas = []
batch_ids = []  # To store unique IDs
count_batch = 0
# Ceiling division: counts a trailing partial batch, but does not add a
# phantom extra batch when the dataset size is an exact multiple of batch_size.
len_batches = (len(train_dataset) + batch_size - 1) // batch_size
# Several rows can share one context paragraph; remember each context's
# embedding so the model is run only once per distinct text.
embedding_cache = {}

for i, example in enumerate(train_dataset):
    context = example["context"]
    if context not in embedding_cache:
        # Convert the numpy ndarray embedding to a list of floats
        embedding_cache[context] = embed_text(context).tolist()

    batch_embeddings.append(embedding_cache[context])
    batch_contexts.append(context)
    batch_metadatas.append({"id": example["id"]})
    batch_ids.append(example["id"])  # Add the unique ID to the batch

    # Flush when the batch is full or when the last example has been reached
    if (i + 1) % batch_size == 0 or (i + 1) == len(train_dataset):
        count_batch += 1
        print(f"Batch {count_batch}/{len_batches}")

        # Pass 'ids' argument along with other arguments
        collection.add(
            embeddings=batch_embeddings,
            documents=batch_contexts,
            metadatas=batch_metadatas,
            ids=batch_ids  # Add the list of IDs here
        )

        # Start fresh lists for the next batch
        batch_embeddings = []
        batch_contexts = []
        batch_metadatas = []
        batch_ids = []  # Reset the IDs list


Batch 1/32
Batch 2/32
Batch 3/32
Batch 4/32
Batch 5/32
Batch 6/32
Batch 7/32
Batch 8/32
Batch 9/32
Batch 10/32
Batch 11/32
Batch 12/32
Batch 13/32
Batch 14/32
Batch 15/32
Batch 16/32
Batch 17/32
Batch 18/32
Batch 19/32
Batch 20/32
Batch 21/32
Batch 22/32
Batch 23/32
Batch 24/32
Batch 25/32
Batch 26/32
Batch 27/32
Batch 28/32
Batch 29/32
Batch 30/32
Batch 31/32
Batch 32/32


In [9]:
def search_query(query, n_results=1):
    """Query the collection with the embedding of ``query``.

    Args:
        query: Text to embed with ``embed_text`` and search for.
        n_results: How many results to request from the collection.

    Returns:
        Whatever ``collection.query`` returns for this single query embedding.
    """
    return collection.query(
        query_embeddings=[embed_text(query)],
        n_results=n_results,
    )

In [10]:
# Example search
query = "What is the capital of France?"
results = search_query(query)

# Print the documents returned for the (single) query, numbered from 1
for rank, document in enumerate(results['documents'][0], start=1):
    print(f"Rank {rank}: {document}")


Rank 1: As of 2012[update] research continued in many fields. The university president, John Jenkins, described his hope that Notre Dame would become "one of the pre–eminent research institutions in the world" in his inaugural address. The university has many multi-disciplinary institutes devoted to research in varying fields, including the Medieval Institute, the Kellogg Institute for International Studies, the Kroc Institute for International Peace studies, and the Center for Social Concerns. Recent research includes work on family conflict and child development, genome mapping, the increasing trade deficit of the United States with China, studies in fluid mechanics, computational science and engineering, and marketing trends on the Internet. As of 2013, the university is home to the Notre Dame Global Adaptation Index which ranks countries annually based on how vulnerable they are to climate change and how prepared they are to adapt.


In [37]:
def evaluate_search_engine(n_results=20):
    """Score retrieval quality over ``train_dataset``.

    For each example, the question is sent through ``search_query`` and the
    example counts as a hit when at least one of the returned passages
    contains the first reference answer as a substring. The score is the
    share of examples that were hits, so despite the name used by callers it
    is a hit rate over the top ``n_results`` rather than a per-document
    precision.

    Args:
        n_results: Number of passages to retrieve per question (default 20).

    Returns:
        The fraction of questions with at least one matching passage, or 0.0
        when ``train_dataset`` is empty.
    """
    n_questions = len(train_dataset)
    if n_questions == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0

    hits = 0
    for example in train_dataset:
        question = example["question"]
        # Only the first reference answer is checked.
        true_answer = example["answers"]["text"][0]

        # Reuse the shared helper so evaluation queries exactly like search does.
        results = search_query(question, n_results=n_results)
        retrieved_passages = results['documents'][0]

        # Hit if any retrieved passage contains the answer text verbatim.
        hits += any(true_answer in passage for passage in retrieved_passages)

    return hits / n_questions


In [40]:
# Run the evaluation over train_dataset and report the score to four decimals.
# The value is the share of questions for which a retrieved passage contains
# the first reference answer.
precision = evaluate_search_engine()
print(f"Précision: {precision:.4f}")

Précision: 0.4620
