This example demonstrates the shortcomings of naive cosine similarity search without any document optimization techniques.

The results could be improved by using "smarter" embeddings (for example, _jina_embeddings_v3_) or by enriching the queries and the data.

In [5]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams
from qdrant_client.http.models.models import Distance
from openai import OpenAI
import uuid
import os

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups for the API keys and Qdrant URL further down can find them.
load_dotenv()

True

In [25]:

class VectorService:
    """Small helper that stores and searches OpenAI text embeddings in Qdrant.

    Texts are embedded with ``EMBEDDING_MODEL`` through the supplied OpenAI
    client, and the resulting vectors are written to / queried from a Qdrant
    collection using cosine distance.
    """

    # Model used for both indexing and querying; the two must match, otherwise
    # stored vectors and query vectors live in different embedding spaces.
    EMBEDDING_MODEL = "text-embedding-3-small"
    # Vector size of collections created by this service. It has to equal the
    # output dimension of EMBEDDING_MODEL (1536 for text-embedding-3-small).
    VECTOR_SIZE = 1536

    def __init__(self, url, key, ai: "OpenAI"):
        """Connect to Qdrant and remember the OpenAI client.

        Args:
            url: URL of the Qdrant instance.
            key: API key for the Qdrant instance.
            ai: OpenAI client used to compute embeddings.
        """
        self.url = url
        self.key = key
        self.client = QdrantClient(url=url, api_key=key)
        self.ai = ai

    def _embed(self, text):
        """Return the embedding vector of ``text`` computed with ``EMBEDDING_MODEL``."""
        response = self.ai.embeddings.create(model=self.EMBEDDING_MODEL, input=text)
        return response.data[0].embedding

    def create_collection(self, name):
        """Create the cosine-distance collection ``name`` unless it already exists."""
        if self.client.collection_exists(name):
            return
        self.client.create_collection(
            collection_name=name,
            vectors_config=VectorParams(size=self.VECTOR_SIZE, distance=Distance.COSINE),
        )

    def add_points(self, col_name, points: list[dict]):
        """Embed every point and upsert all of them into ``col_name``.

        Each dict must contain ``"text"`` and may contain ``"role"``. As a side
        effect, every dict receives a freshly generated ``"id"`` (random UUID4
        string) and its ``"embedding"``, so callers can inspect them afterwards.
        Because ids are random, calling this twice with the same texts stores
        duplicates.

        Args:
            col_name: Name of the target collection.
            points: Points to store; an empty list is a no-op.
        """
        if not points:
            # Nothing to embed or send; avoid a pointless round trip.
            return

        records = []
        for point in points:
            point["id"] = str(uuid.uuid4())
            point["embedding"] = self._embed(point["text"])
            # Build the record inside the loop so that *every* point is sent,
            # not just the last one visited.
            records.append(
                {
                    "id": point["id"],
                    "vector": point["embedding"],
                    "payload": {"role": point.get("role"), "text": point["text"]},
                }
            )
        self.client.upsert(collection_name=col_name, points=records, wait=True)

    def search(self, col_name, query, limit):
        """Return up to ``limit`` nearest points to ``query`` in ``col_name``.

        The query text is embedded with the same model used for indexing and
        the hits are returned with their payloads.
        """
        return self.client.search(
            col_name, query_vector=self._embed(query), limit=limit, with_payload=True
        )

In [27]:
# Build the OpenAI client and the vector service from settings held in the
# environment; a missing variable is passed through as None.
openai = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
vector_service = VectorService(
    os.environ.get("QDRANT_REMOTE_URL"),
    os.environ.get("QDRANT_API_KEY"),
    openai,
)

In [11]:
# Documents to index: one short "<company> (<sector>)" description each.
data = [
    "Apple (Consumer Electronics)",
    "Tesla (Automotive)",
    "Microsoft (Software)",
    "Google (Internet Services)",
    "Nvidia (Semiconductors)",
    "Meta (Social Media)",
    "X Corp (Social Media)",
    "Tech•sistence (Newsletter)",
]

# Free-text queries matched against the documents above.
queries = ["Car company", "Macbooks", "Facebook", "Newsletter"]

In [12]:
# Create the "emb-01" collection; create_collection checks for existence first,
# so re-running this cell is harmless.
vector_service.create_collection("emb-01")

In [23]:
# Embed and store every description, one point per call.
# NOTE: add_points assigns a fresh random UUID to each point, so re-running
# this cell adds duplicate entries to the collection.
for item in data:
    vector_service.add_points("emb-01", [{"text": item, "role": "normal"}])

In [33]:
# For each query, fetch the 3 nearest points and print the payload of the top hit.
for query in queries:
    print("Matchning query: ", query)
    res = vector_service.search("emb-01", query, 3)
    # res[0] assumes the search returned at least one hit; an empty collection
    # would raise IndexError here.
    print(f"Best match: {res[0].payload}")

Matchning query:  Car company
Best match: {'role': 'normal', 'text': 'Tesla (Automotive)'}
Matchning query:  Macbooks
Best match: {'role': 'normal', 'text': 'Apple (Consumer Electronics)'}
Matchning query:  Facebook
Best match: {'role': 'normal', 'text': 'X Corp (Social Media)'}
Matchning query:  Newsletter
Best match: {'role': 'normal', 'text': 'Tech•sistence (Newsletter)'}
