<a href="https://colab.research.google.com/github/nickprock/appunti_data_science/blob/master/semantic-search/nested_prefetch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install qdrant-client
!pip install -U sentence-transformers

# Dataset

In [None]:
import pandas as pd

In [None]:
# Read the ';'-separated FAQ file and turn every missing value into "".
df = pd.read_csv('/content/airc.csv', sep=";").fillna(value="")

# Remove leading/trailing whitespace from the question column.
df["Domanda"] = df["Domanda"].map(lambda text: text.strip())

df.head()

# Embedder

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding sizes used in this notebook, smallest first. The last entry
# (768) is the size of the "full" vectors stored later on.
matryoshka_dim = [64, 256, 768]

# Single place to change the checkpoint for all three encoders.
MODEL_NAME = "nickprock/sentence-BERTino-sts-matryoshka"


def load_truncated_model(dim):
    """Load the sentence embedder with its output truncated to ``dim`` values.

    Args:
        dim: number of embedding dimensions the returned model emits.

    Returns:
        A ``SentenceTransformer`` configured with ``truncate_dim=dim``.
    """
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally - confirm this checkpoint actually needs it.
    return SentenceTransformer(
        MODEL_NAME,
        trust_remote_code=True,
        truncate_dim=dim,
    )


# Same three encoders as before, built in the same order (64, 256, 768).
model_64, model_256, model_full = (
    load_truncated_model(dim) for dim in matryoshka_dim
)

In [None]:
# encode() is documented to take a str or a list of str; handing it a plain
# list (not the pandas Series) keeps element access purely positional.
answers = df["Risposta"].tolist()

# Encode the corpus once at full size. truncate_dim keeps the leading
# dimensions of the embedding, so the 64- and 256-dim vectors are the
# leading columns of the full matrix and need no extra forward passes.
vec_full = model_full.encode(answers)
vec_64 = vec_full[:, : matryoshka_dim[0]]
vec_256 = vec_full[:, : matryoshka_dim[1]]

## Qdrant

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

# Local in-memory Qdrant instance (the ":memory:" location).
client = QdrantClient(":memory:", timeout=None)


def cosine_params(dim):
    """Return vector parameters for a ``dim``-sized vector compared by cosine distance."""
    return VectorParams(size=dim, distance=Distance.COSINE)


# Collection with three named vectors per point, one per embedding size.
client.create_collection(
    collection_name="multiple_vectors",
    vectors_config={
        "vec_64": cosine_params(64),
        "vec_256": cosine_params(256),
        "vec_768": cosine_params(768),
    },
)

# Baseline collection: a single unnamed 64-dim vector per point.
client.create_collection(
    collection_name="single_vector_64",
    vectors_config=cosine_params(64),
)

# Baseline collection: a single unnamed 768-dim vector per point.
client.create_collection(
    collection_name="single_vector_full",
    vectors_config=cosine_params(768),
)

In [None]:
from qdrant_client.http.models import PointStruct

def make_payload(question, answer):
    """Build the payload stored with a point (a fresh dict on every call)."""
    return {"Question": question, "Answer": answer}


# Build all points first, then send ONE upsert per collection instead of
# three client calls per dataframe row.
points_multi, points_64, points_full = [], [], []

# enumerate() gives the row position, which is what the embedding matrices
# are aligned to; it equals the default index produced by read_csv.
for position, (question, answer) in enumerate(zip(df["Domanda"], df["Risposta"])):
    # .tolist() hands the client plain Python floats, the form used in the
    # qdrant-client examples, instead of numpy rows.
    small = vec_64[position].tolist()
    mid = vec_256[position].tolist()
    full = vec_full[position].tolist()

    points_multi.append(
        PointStruct(
            id=position,
            vector={"vec_64": small, "vec_256": mid, "vec_768": full},
            payload=make_payload(question, answer),
        )
    )
    points_64.append(
        PointStruct(id=position, vector=small, payload=make_payload(question, answer))
    )
    points_full.append(
        PointStruct(id=position, vector=full, payload=make_payload(question, answer))
    )

for collection_name, points in (
    ("multiple_vectors", points_multi),
    ("single_vector_64", points_64),
    ("single_vector_full", points_full),
):
    # Nothing to send for an empty dataframe.
    if points:
        client.upsert(collection_name=collection_name, points=points)

## Single query (retrieve, then rerank)

In [None]:
from datetime import datetime

# Test question used by every search below.
# English: "Is there a relationship between diet and tumors?"
query = "C'è relazione tra l'alimentazione e i tumori?"

In [None]:
# Baseline retrieval: embed the query with the 64-dim model and fetch the
# 100 nearest points. The timed span covers both the encoding and the search.
start_time = datetime.now()

query_vector_64 = model_64.encode(query)
response_64 = client.query_points(
    collection_name="single_vector_64",
    query=query_vector_64,
    limit=100,
    with_payload=True,
)
search_result_64 = response_64.points

end_time = datetime.now()
retrieve_time = end_time - start_time

print('Duration: {}'.format(retrieve_time))
print("\n")
print(search_result_64)

In [None]:
from sentence_transformers import CrossEncoder

# Cross-encoder used below to score [query, answer] pairs for reranking.
ranker = CrossEncoder("nickprock/cross-encoder-italian-bert-stsb")

In [None]:
# Number of reranked results to keep.
N = 10

start_time = datetime.now()

# Pair the query with every retrieved answer and score each pair.
retrieved_documents = [[query, point.payload["Answer"]] for point in search_result_64]
scores = ranker.predict(retrieved_documents)

# Attach each score to its pair, order by score (highest first), keep the top N.
results = []
for pair, pair_score in zip(retrieved_documents, scores):
    results.append({"input": pair, "score": pair_score})
results.sort(key=lambda item: item["score"], reverse=True)
results = results[:N]

end_time = datetime.now()
reranking_time = end_time - start_time

print('Duration: {}'.format(reranking_time))

In [None]:
# Second measurement: time only the cross-encoder call on the same pairs
# (building the pair list and sorting are outside the timed span).
start_time = datetime.now()
scores = ranker.predict(retrieved_documents)
end_time = datetime.now()
reranking_time_2 = end_time - start_time

print('Duration: {}'.format(reranking_time_2))

In [None]:
# Baseline end-to-end time: retrieval plus the first reranking measurement.
total_duration = retrieve_time + reranking_time
print('Total Duration: {}'.format(total_duration))

In [None]:
# Same total, using the predict-only reranking measurement instead.
total_duration_2 = retrieve_time + reranking_time_2
print('Total Duration: {}'.format(total_duration_2))

In [None]:
# Fraction of the total time spent in the cross-encoder
# (dividing two timedeltas yields a float).
reranking_time_2/total_duration_2

In [None]:
# Display the top-N reranked [query, answer] pairs with their scores.
results

## Nested query (multi-stage prefetch)

In [None]:
from qdrant_client.models import Prefetch

In [None]:
start_time = datetime.now()

# Stage 1: 100 candidates using the 64-dim vectors.
coarse_stage = Prefetch(
    query=model_64.encode(query),
    using="vec_64",
    limit=100,
)

# Stage 2: query the stage-1 candidates with the 256-dim vectors, keep 50.
middle_stage = Prefetch(
    prefetch=coarse_stage,
    query=model_256.encode(query),
    using="vec_256",
    limit=50,
)

# Stage 3: final top 10 using the full 768-dim vectors, payload included.
nested_response = client.query_points(
    collection_name="multiple_vectors",
    prefetch=middle_stage,
    query=model_full.encode(query),
    using="vec_768",
    with_payload=True,
    limit=10,
)
search_result_MV = nested_response.points

end_time = datetime.now()

print('Duration: {}'.format(end_time - start_time))
print("\n")
print(search_result_MV)

In [None]:
# Ratio of the baseline total (retrieve + rerank) to the nested-query time.
# start_time / end_time hold the values set by the most recently executed
# timing cell; when run top to bottom that is the nested-query cell.
total_duration/(end_time - start_time)

In [None]:
# Print each answer returned by the nested query together with its score,
# followed by blank lines as a separator.
for point in search_result_MV:
    answer_text = point.payload['Answer']
    print("'", answer_text, "', 'score: '", point.score)
    print("\n")