In [68]:
# Load IPython's autoreload extension; mode 2 re-imports edited modules before each cell runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
pip install -U minsearch qdrant_client tqdm dlt -q

In [70]:
import dlt
# Record the installed dlt version next to the results.
print(dlt.__version__)

1.15.0


In [71]:
import requests
import pandas as pd

# Evaluation fixtures published with the LLM Zoomcamp course material.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
# Bound the wait and fail loudly on an HTTP error instead of trying to
# JSON-decode an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground truth is read as a DataFrame and converted to a list of row dicts.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')


In [72]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: one list of booleans per query, where each boolean
            tells whether the result at that rank is the expected document.

    Returns:
        Number of lists containing ``True`` divided by the number of lists.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    queries_with_hit = sum(1 for flags in relevance_total if True in flags)
    return queries_with_hit / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    Args:
        relevance_total: one list of booleans per query, where each boolean
            tells whether the result at that rank is the expected document.

    Returns:
        The average, over all queries, of ``1 / rank`` of the first relevant
        result (ranks start at 1). A query with no relevant result adds 0.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts; without this a query
                # with several hits would contribute more than 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth queries.

    Args:
        ground_truth: iterable of query records; each record's ``'document'``
            entry is the id of the document the search should return.
        search_function: callable taking one query record and returning a
            sequence of result dicts that each have an ``'id'`` entry.

    Returns:
        Dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    def relevance_for(query):
        # One boolean per returned result, in ranking order.
        expected_id = query['document']
        return [hit['id'] == expected_id for hit in search_function(query)]

    per_query_relevance = [relevance_for(query) for query in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(per_query_relevance),
        'mrr': mrr(per_query_relevance)
    }

In [73]:
import minsearch

# Text index over the FAQ documents: question/section/text are searched as
# text, course/id are declared as keyword fields (course is used for filtering below).
index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

def search_minsearch_text(q):
    """Search the module-level text ``index`` for one ground-truth record.

    Args:
        q: query record with ``'question'`` (the search text) and ``'course'``
            (results are restricted to this course).

    Returns:
        Whatever ``index.search`` returns for the top 5 results.
    """
    course_filter = {"course": q['course']}
    # Matches in the question field weigh more, matches in the section name less.
    field_boosts = {'question': 1.5, 'section': 0.1}

    return index.search(
        q['question'],
        filter_dict=course_filter,
        boost_dict=field_boosts,
        num_results=5,
    )


In [74]:
# Baseline: hit rate and MRR of the minsearch text index over the ground-truth queries.
print(evaluate(ground_truth, search_minsearch_text))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}


In [75]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# One text per document, built from the question field only; `texts` keeps
# the order of `documents`.
# NOTE: `doc` and `t` are module-level names and stay bound to the last
# document/text after the loop finishes.
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

# TF-IDF (terms must appear in at least 3 documents) followed by truncated
# SVD down to 128 components; random_state fixes the SVD for reproducibility.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
# Fit on the document texts and keep the resulting document vectors.
X = pipeline.fit_transform(texts)


In [76]:
# Vector index over the question-only vectors in X, with `course` declared as a keyword field.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7b0db45bf4d0>

In [77]:
def search_minsearch_vector(q):
    """Search the module-level vector index for one ground-truth record.

    The question is embedded with whatever ``pipeline`` is bound at call time
    and looked up in whatever ``vindex`` is bound at call time, so re-running
    the cells that rebuild them changes what this function evaluates.

    Args:
        q: query record with ``'question'`` (the search text) and ``'course'``
            (results are restricted to this course).

    Returns:
        Whatever ``vindex.search`` returns for the top 5 results.
    """
    filter_dict = {"course": q['course']}

    # A one-element list: the pipeline expects a sequence of documents, and a
    # list (unlike the previous set literal) keeps that batch ordered.
    query = [q['question']]
    vectors = pipeline.transform(query)
    return vindex.search(vectors, filter_dict=filter_dict, num_results=5)

In [78]:
# Scores the vector search with `pipeline`/`vindex` fitted on question-only texts.
print(evaluate(ground_truth, search_minsearch_vector))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}


In [79]:
# Rebuild the texts, this time concatenating each document's question and answer text.
# NOTE: this re-binds the module-level `texts`, `pipeline` and `X`; `doc` and
# `t` stay bound to the last document/text after the loop finishes.
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

# Same TF-IDF + truncated SVD configuration as the question-only variant.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
# Fit on the combined texts and keep the resulting document vectors.
X = pipeline.fit_transform(texts)


In [80]:
# Re-bind `vindex` to an index over the question+text vectors in X.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7b0dd3d86ad0>

In [81]:
# Same search function as before; it now reads the re-bound `pipeline`/`vindex`
# fitted on question+text.
print(evaluate(ground_truth, search_minsearch_vector))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}


In [82]:
!pip install -q "dlt[qdrant]" "qdrant-client[fastembed]"

In [83]:
# Embedding model identifier handed to the dlt qdrant destination (`model=`).
model="jinaai/jina-embeddings-v2-small-en"

In [84]:
import dlt
@dlt.resource
def homework3_data():
    """Yield one "<question> <text>" string per entry of the module-level ``documents``.

    NOTE(review): only the combined string is yielded, so the documents'
    ``id`` and ``course`` values are not part of the yielded items, while
    ``search_qdrant`` filters on ``course`` and ``evaluate`` compares ``id``.
    Confirm the loaded payload carries the fields those functions need.
    """
    for document in documents:
        # Read from the loop variable. The previous body read `doc`, a global
        # left over from an earlier cell, and so yielded the same document
        # on every iteration.
        yield document['question'] + ' ' + document['text']

In [85]:
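# To use a local Qdrant server instead, start it with Docker (run in a terminal, not in this notebook):
# docker run -p 6333:6333 -p 6334:6334 -v "$(pwd)/qdrant_storage:/qdrant/storage:z" qdrant/qdrant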

In [92]:
from dlt.destinations import qdrant
from google.colab import userdata

qd_db = "qdrant_storage"
qd_collection= "homework3_dataset"
# API key and URL are read through google.colab's `userdata`, so they are not
# hard-coded in the notebook.
qd_api_key = userdata.get('QD_API_KEY')
qd_url =  userdata.get('QD_URL')


credentials = { 'api_key': qd_api_key, 'location': qd_url }
# Never echo the API key: cell output is saved with the notebook and would
# leak the secret to anyone who can read the file.
print({**credentials, 'api_key': '***'})
qd_destination = qdrant(model=model, credentials=credentials)
# The collection name is passed as the dlt dataset name.
qdrant_pipeline = dlt.pipeline(
    pipeline_name="homework3",
    destination=qd_destination,
    dataset_name=qd_collection,
)
# Runs the pipeline with the homework3_data resource as its source.
load_info = qdrant_pipeline.run(homework3_data())

{'api_key': '<redacted>', 'location': 'https://<cluster-id>.europe-west3-0.gcp.cloud.qdrant.io:6333'}


[destination.qdrant]
location="value"
. Deprecated in dlt 1.0.0 to be removed in 2.0.0.


PipelineStepFailed: Pipeline execution failed at `step=sync` with exception:

<class 'qdrant_client.http.exceptions.UnexpectedResponse'>
Unexpected Response: 400 (Bad Request)
Raw response content:
b'{"status":{"error":"Bad request: Index required but not found for \\"pipeline_name\\" of one of the following types: [keyword]. Help: Create an index for this key or use a different filter."},"time": ...'

In [89]:
from qdrant_client import QdrantClient
# Re-binds `qd_db`, which the dlt pipeline cell set to "qdrant_storage".
qd_db = "db.qdrant"
# Client for a Qdrant server on localhost:6333.
# NOTE(review): the dlt pipeline cell targets the URL read from QD_URL, not
# localhost — confirm both point at the instance that holds the collection.
client = QdrantClient(host="localhost", port=6333)

In [None]:
# Named vector to query in the Qdrant collection (passed as `using=` in search_qdrant).
# NOTE(review): presumably the name created for this embedding model at load time — confirm against the collection.
vector_name="fast-jina-embeddings-v2-small-en"

In [None]:
# Embedding model used for query vectors in search_qdrant; same identifier as `model` above.
model_name="jinaai/jina-embeddings-v2-small-en"

In [None]:
from fastembed import TextEmbedding

# Loaded embedding models keyed by model name, so repeated calls reuse one
# instance instead of constructing the model again for every query.
_EMBEDDING_MODELS = {}

def embeddingsFor(docs, model_name):
    """Return the query embedding of the first entry in ``docs``.

    Args:
        docs: sequence of query strings; only the embedding of the first one
            is returned.
        model_name: model identifier passed to ``TextEmbedding``.

    Returns:
        The first item produced by ``TextEmbedding.query_embed(docs)``.

    Raises:
        IndexError: if no embedding is produced (e.g. ``docs`` is empty).
    """
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = TextEmbedding(model_name=model_name)
        _EMBEDDING_MODELS[model_name] = model
    return list(model.query_embed(docs))[0]

In [None]:
from qdrant_client.models import Filter, FieldCondition, MatchValue
import numpy as np

def search_qdrant(q):
    """Query the Qdrant collection for one ground-truth record.

    Args:
        q: query record with ``'question'`` (embedded and used as the query
            vector) and ``'course'`` (results must match this course).

    Returns:
        List with the payload of each of the up to 5 returned points.
    """
    course, question = q['course'], q['question']

    # Embed the whole question as a single query vector.
    embedding = embeddingsFor([question], model_name)

    # Restrict the search to points whose `course` payload equals the query's course.
    course_only = Filter(
        must=[FieldCondition(key="course", match=MatchValue(value=course))]
    )

    response = client.query_points(
        collection_name=qd_collection,
        query_filter=course_only,
        query=embedding,
        using=vector_name,
        with_payload=True,
        limit=5,
    )

    return [point.payload for point in response.points]

In [None]:
# Score the Qdrant-backed search with the same hit-rate/MRR evaluation as the minsearch variants.
evaluation = evaluate(ground_truth, search_qdrant)

In [None]:
# Show the Qdrant hit rate and MRR.
print(evaluation)