In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
pip install -U minsearch qdrant_client tqdm dlt python-dotenv  -q

In [22]:
from dotenv import load_dotenv
# Load variables from a local .env file, if one exists, into the environment;
# QD_API_KEY and QD_URL are read from the environment further down.
load_dotenv()

False

In [23]:
import dlt
# Record the installed dlt version next to the results.
print(dlt.__version__)

1.15.0


In [24]:
import requests
import pandas as pd

# Base URL of the evaluation assets (FAQ documents and ground-truth questions).
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# FAQ documents, parsed from JSON.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
# Fail with a clear HTTP error instead of an opaque JSON decode error when the
# download did not succeed.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground truth as a list of dicts, one per CSV row; the evaluation code reads
# the 'question', 'course' and 'document' fields of each record.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')


In [25]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: One entry per query; each entry is a sequence of
            booleans that is ``True`` at the ranks where the returned
            document is the expected one.

    Returns:
        The share of entries containing ``True``, as a float. An empty
        ``relevance_total`` yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the Mean Reciprocal Rank over all queries.

    Each query contributes ``1 / rank`` of its FIRST relevant result (ranks
    start at 1) and ``0`` when no result is relevant.

    Args:
        relevance_total: One entry per query; each entry is a sequence of
            booleans that is ``True`` at the ranks where the returned
            document is the expected one.

    Returns:
        The mean of the per-query reciprocal ranks, as a float in ``[0, 1]``.
        An empty ``relevance_total`` yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant == True:
                total_score += 1 / rank
                # Only the first relevant hit counts; adding later hits as
                # well would let a single query score more than 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    For every record the search function is called with the record itself,
    and each returned result is flagged by whether its ``'id'`` equals the
    record's ``'document'`` value.

    Args:
        ground_truth: Iterable of records that have a ``'document'`` key.
        search_function: Callable that takes one record and returns an
            iterable of results, each with an ``'id'`` key.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    per_query_flags = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        hits = search_function(record)
        per_query_flags.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(per_query_flags)
    metrics['mrr'] = mrr(per_query_flags)
    return metrics

In [26]:
import minsearch

# Text index over the FAQ documents: question/section/text are declared as
# text fields, course and id as keyword fields (search_minsearch_text filters
# on course).
index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

# Fit the index on the downloaded documents.
index.fit(documents)

def search_minsearch_text(q):
    """Run a text search for one ground-truth record.

    Searches the module-level ``index`` with the record's question, filtered
    to the record's course, boosting ``question`` (1.5) and ``section`` (0.1),
    and asks for 5 results.

    Args:
        q: Record with ``'question'`` and ``'course'`` keys.

    Returns:
        The result of ``index.search`` for this query.
    """
    course_filter = {"course": q['course']}
    field_boosts = {'question': 1.5, 'section': 0.1}
    return index.search(
        q['question'],
        filter_dict=course_filter,
        boost_dict=field_boosts,
        num_results=5,
    )


In [27]:
# Baseline: hit rate and MRR of the minsearch text search on the ground truth.
print(evaluate(ground_truth, search_minsearch_text))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}


In [28]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# One text per document: the question only.
texts = [doc['question'] for doc in documents]

# TF-IDF features (min_df=3) reduced to 128 components with a fixed seed.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)


In [29]:
# Vector index fitted on the embeddings X together with the documents they
# were built from; `course` is declared as a keyword field for filtering.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7a6b6c787a90>

In [30]:
def search_minsearch_vector(q):
    """Run a vector search for one ground-truth record.

    The question is embedded with the module-level ``pipeline`` and looked up
    in the module-level ``vindex``. Both names are resolved at call time, so
    re-fitting them in a later cell changes what this function searches.

    Args:
        q: Record with ``'question'`` and ``'course'`` keys.

    Returns:
        The result of ``vindex.search`` for the embedded question, filtered
        to the record's course and limited to 5 results.
    """
    filter_dict = {"course": q['course']}

    # transform() takes a collection of texts: pass a one-element list rather
    # than the set literal the original used by accident.
    query = [q['question']]
    vectors = pipeline.transform(query)
    return vindex.search(vectors, filter_dict=filter_dict, num_results=5)

In [31]:
# Vector search with embeddings built from the question text only.
print(evaluate(ground_truth, search_minsearch_vector))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3571284489590088}


In [32]:
# One text per document: question and answer text joined by a single space.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Same TF-IDF + SVD setup as before, refitted on the longer texts; this
# rebinds the module-level `pipeline` and `X`.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)


In [33]:
# Rebuild the vector index on the current X; search_minsearch_vector reads the
# module-level `vindex`, so it uses this new index from here on.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7a6cdca0e0d0>

In [34]:
# Re-run the vector evaluation; it now uses the rebuilt `pipeline` and `vindex`.
print(evaluate(ground_truth, search_minsearch_vector))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}


In [35]:
!pip install -q "dlt[qdrant]" "qdrant-client[fastembed]"

In [36]:
from sentence_transformers import SentenceTransformer

# Name of the embedding model; passed as `model=` to the dlt qdrant
# destination further down.
model = "jinaai/jina-embeddings-v2-small-en"
# NOTE(review): left-over experiment; `model_name` is not assigned in any
# visible cell of this notebook.
#model = SentenceTransformer(model_name, device='cuda')


In [37]:
# Show one document to confirm the record layout before loading.
print(documents[0])

import dlt
@dlt.resource(parallelized=False)
def homework3_data():
    """Yield the downloaded FAQ documents one by one, in list order."""
    yield from documents

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp', 'id': 'c02e79ef'}


For local development, start a Qdrant server with Docker:
```sh
docker run -p 6333:6333 -p 6334:6334 -v "$(pwd)/qdrant_storage:/qdrant/storage:z" qdrant/qdrant
```

In [38]:
from dlt.destinations import qdrant
from dlt.destinations.adapters import qdrant_adapter

# dlt runtime settings: single workers and small batches.
# NOTE(review): the recorded failure reports `max_retry_count=5`, so
# "runtime.max_retries" does not appear to control load-job retries; confirm
# the intended configuration key.
dlt.config["embedders.openai.batch_size"] = 10
dlt.config["extract.workers"] = 1
dlt.config["load.workers"] = 1
dlt.config["load.batch_size"]= 10
dlt.config["runtime.max_retries"] = 15

# Not read by any visible cell; kept (with the value the original cell ended
# up with) in case code outside this view depends on it.
qd_db = "db.qdrant"
qd_collection= "homework3_dataset"

# Settings come from Colab's user data when running on Colab, otherwise from
# the process environment.
try:
    from google.colab import userdata
    env = userdata
except ImportError:
    import os
    env = os.environ


qd_api_key = env.get('QD_API_KEY')
qd_url = env.get('QD_URL')

# Never echo the API key itself: cell outputs are saved with the notebook and
# end up wherever the file is shared or committed.
print(f"QD_URL={qd_url!r}, QD_API_KEY set: {bool(qd_api_key)}")

credentials = { 'api_key': qd_api_key, 'location': qd_url }
qd_destination = qdrant( model = model,
                        credentials = credentials,
                        payload_storage_index = {
                            "id": "keyword",
                            "course": "keyword",
                            "pipeline_name": "keyword",
                        }

                )


qdrant_pipeline = dlt.pipeline( pipeline_name="homework3",
                                  destination=qd_destination,
                                  dataset_name=qd_collection,
                                  progress=dlt.progress.tqdm(colour="yellow")
                                )

from qdrant_client import QdrantClient
client = QdrantClient(location=qd_url, api_key=qd_api_key)

# Start from a clean slate: delete every existing collection whose name
# starts with `qd_collection`.
result = client.get_collections()
for collection in result.collections:
    if collection.name.startswith(qd_collection):
        client.delete_collection(collection.name)


# Load the documents, embedding the 'question' and 'text' fields.
load_info = qdrant_pipeline.run(qdrant_adapter(homework3_data(), embed=['question', 'text']),
              primary_key="id",
              write_disposition="replace"
            )



<QD_API_KEY redacted> https://59a60c41-e31e-4e0d-ab82-2aaca54b7d4c.europe-west3-0.gcp.cloud.qdrant.io:6333


[destination.qdrant]
location="value"
. Deprecated in dlt 1.0.0 to be removed in 2.0.0.
Extract homework3: Resources:   0%|[33m          [0m| 0/1 [00:00<?, ?it/s]
homework3_data: 0it [00:00, ?it/s][A
                                                                           
homework3_data: 948it [00:00, 3098.72it/s][A
Extract homework3: Resources:   0%|[33m          [0m| 0/1 [00:00<?, ?it/s]
_dlt_pipeline_state: 0it [00:00, ?it/s][A
                                                                           
_dlt_pipeline_state: 1it [00:00, 35.57it/s][A
Normalize homework3 in 1755121868.0425618: Files:   0%|[33m          [0m| 0/2 [00:00<?, ?it/s]
Items: 0it [00:00, ?it/s][A
Normalize homework3 in 1755121868.0425618: Files: 100%|[33m██████████[0m| 2/2 [00:00<00:00,  4.44it/s]
                                                                                               
Items: 949it [00:00, 2130.35it/s][A
Load homework3 in 1755121868.0425618: Jobs:   0%|[33m          [0m

PipelineStepFailed: Pipeline execution failed at `step=load` when processing package with `load_id=1755121868.0425618` with exception:

<class 'dlt.load.exceptions.LoadClientJobRetry'>
Job with `job_id=homework3_data.11770d77f4.jsonl.gz` had 5 retries which is a multiple of `max_retry_count=5`. Exiting retry loop. You can still rerun the load package to retry this job. Last failure message was: Worker PID: 10810 terminated unexpectedly with code -9

In [None]:
# Create a keyword payload index on `course`, the field search_qdrant filters on.
# NOTE(review): the cleanup code matches collections by the *prefix*
# `qd_collection`, which suggests the real collection name may carry a
# suffix; confirm that a collection with exactly this name exists.
client.create_payload_index( field_name= "course",
                             field_schema="keyword",
                             collection_name = qd_collection)

In [None]:
# Name of the vector to query; passed as `using=` in search_qdrant.
vector_name="fast-jina-embeddings-v2-small-en"

In [None]:
from fastembed import TextEmbedding

# Embedding models already constructed in this kernel, keyed by model name.
_EMBEDDING_MODELS = {}

def embeddingsFor(docs, model_name):
    """Return the query embedding of the first text in ``docs``.

    The ``TextEmbedding`` model for ``model_name`` is constructed on first use
    and reused afterwards, instead of being rebuilt on every call (the
    evaluation calls this function once per ground-truth question).

    Args:
        docs: The texts to embed; only the first embedding is returned.
        model_name: Model name passed to ``TextEmbedding(model_name=...)``.

    Returns:
        The first item produced by ``model.query_embed(docs)``.

    Raises:
        IndexError: If ``query_embed`` yields nothing (e.g. ``docs`` is empty).
    """
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = TextEmbedding(model_name=model_name)
        _EMBEDDING_MODELS[model_name] = model
    return list(model.query_embed(docs))[0]

In [None]:
from qdrant_client.models import Filter, FieldCondition, MatchValue
import numpy as np

def search_qdrant(q):
    """Run a Qdrant vector search for one ground-truth record.

    The question is embedded with ``embeddingsFor`` using the module-level
    ``model`` name, then queried against the ``qd_collection`` collection on
    the vector named ``vector_name``, restricted to the record's course.

    Args:
        q: Record with ``'question'`` and ``'course'`` keys.

    Returns:
        A list with the payload of each returned point (at most 5).
    """
    course = q['course']
    query = q['question']

    # Create an embedding for the entire query string. The notebook defines
    # the model name as `model`; the previously referenced `model_name` is
    # never assigned and raised NameError.
    query_vector = embeddingsFor([query], model)

    query_filter=Filter(
        must=[FieldCondition(key="course", match=MatchValue(value=course))]
    )

    # Use the generated vector in the search query.
    result = client.query_points(collection_name=qd_collection,
                               query_filter=query_filter,
                               query=query_vector,
                               using=vector_name,
                               with_payload= True,
                               limit=5)

    payloads = [point.payload for point in result.points]

    return payloads

In [None]:
# Evaluate the Qdrant-backed search on the same ground truth.
evaluation = evaluate(ground_truth, search_qdrant)

In [None]:
# Show hit rate and MRR for the Qdrant search.
print(evaluation)